Daniel Hladek 2020-05-10 11:48:17 +02:00
parent 9e36952563
commit e5a7fc1e99
7 changed files with 86 additions and 19 deletions

Dockerfile (new file)

@@ -0,0 +1,11 @@
+FROM python:3.8.0-alpine
+RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev
+RUN addgroup -S appgroup -g 1000 && \
+    adduser -u 1000 -S appuser -G appgroup
+ADD requirements.txt /
+RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /requirements.txt
+RUN pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
+ENTRYPOINT ["websuck"]
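The image installs the package from the repository archive and runs the websuck console script as its entrypoint. A hypothetical sketch of how such an entrypoint could be wired up in Python packaging; the target module path websucker.cli:cli is an assumption for illustration, not confirmed by this diff:

import setuptools

# Hypothetical packaging sketch: ENTRYPOINT ["websuck"] only works if the
# installed package declares a matching console script; the module path
# below is assumed, not taken from this commit.
setuptools.setup(
    name="websucker",
    version="0.0.0",
    packages=["websucker"],
    entry_points={
        "console_scripts": [
            "websuck=websucker.cli:cli",  # assumed target function
        ],
    },
)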

requirements.txt

@@ -1,6 +1,8 @@
 BeautifulSoup4
 justext
-cassandra-driver
 python-dateutil
 click
 pycurl
+lz4
+lxml
+cassandra-driver

setup.py

@@ -32,11 +32,12 @@ setuptools.setup(
     install_requires=[
         "BeautifulSoup4",
         "justext",
-        "cassandra-driver",
         "python-dateutil",
         "click",
         "pycurl",
-        "greenstalk"
+        "greenstalk",
+        "lz4",
+        "cassandra-driver",
     ],
 )
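The comma added after "greenstalk" is not cosmetic: without it, Python's implicit string literal concatenation would silently merge it with the new "lz4" entry. A minimal demonstration:

# Adjacent string literals are concatenated at compile time, so a missing
# comma produces one bogus requirement instead of two.
deps = [
    "greenstalk"
    "lz4",
]
print(deps)  # ['greenstalklz4']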

@@ -60,7 +60,7 @@ class Response:
     # HTML metarefresh redirect
     def get_metarefresh(self):
-        if self.content is None:
+        if self.bs is None:
             return None
         metarefresh = None
         t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
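The guard now checks self.bs, the parsed BeautifulSoup tree the method actually uses, rather than self.content; a response can have content (e.g. a non-HTML body) while no tree was built. A standalone sketch of meta-refresh extraction with BeautifulSoup, illustrative rather than the exact method:

from bs4 import BeautifulSoup

html = '<meta http-equiv="refresh" content="5; url=https://example.com/next">'
bs = BeautifulSoup(html, "lxml")
for tag in bs.find_all("meta", attrs={"http-equiv": "refresh"}):
    # content looks like "seconds; url=target" -- pick out the url= part
    for part in tag.get("content", "").split(";"):
        part = part.strip()
        if part.lower().startswith("url="):
            print(part[4:])  # https://example.com/next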
@@ -132,11 +132,12 @@ class Connection:
             key = l[0:s - 1]
             value = l[s + 1:].rstrip()
             self.headers[key] = value
-            if key.lower() == "refresh":
+            kl = key.lower()
+            if kl == "refresh":
                 self.add_redirect(value)
-            elif key.lower() == "location":
+            elif kl == "location":
                 self.add_redirect(value)
-            elif key == "Content-Type" and "text" not in value:
+            elif kl == "content-type" and "text" not in value:
                 # pycurl then raises error 23, failed writing header
                 return 0
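Lower-casing the key once fixes a real bug: HTTP header names are case-insensitive, so the old exact match on "Content-Type" missed servers sending "content-type". A self-contained sketch of the same pattern as a pycurl header callback; the URL is a placeholder:

import pycurl

headers = {}

def header_function(line_bytes):
    line = line_bytes.decode("iso-8859-1")  # pycurl passes raw header bytes
    if ":" not in line:
        return None  # status line or trailing blank line
    key, value = line.split(":", 1)
    kl = key.strip().lower()  # header names are case-insensitive
    headers[kl] = value.strip()
    if kl == "content-type" and "text" not in headers[kl]:
        return 0  # wrong byte count: libcurl aborts with error 23

c = pycurl.Curl()
c.setopt(pycurl.URL, "https://example.com/")  # placeholder URL
c.setopt(pycurl.HEADERFUNCTION, header_function)
c.setopt(pycurl.WRITEFUNCTION, len)  # consume the body
try:
    c.perform()
except pycurl.error as e:
    print(e.args)  # (23, 'Failed writing header') when aborted early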
@@ -209,12 +210,18 @@ class Connection:
             elif errno == 60:
                 # 60 bad ssl certificate
                 link_status = "bad_connection"
+            elif errno == 56:
+                # 56 Connection reset by peer
+                link_status = "bad_connection"
             elif errno == 16:
                 # 16 HTTP2
                 link_status = "bad_connection"
             elif errno == 6:
                 # 6 Unable to resolve dns
                 link_status = "bad_connection"
+            elif errno == 7:
+                # 7 Connection refused
+                link_status = "bad_connection"
             else:
                 raise e
         except UnicodeDecodeError as e:
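The elif chain keeps growing as new transient failures show up, and all five codes map to the same status, so a set expresses the intent more directly. A runnable sketch using the same curl error numbers; the probe URL is a placeholder chosen to fail:

import pycurl

# curl error numbers treated as bad connections in the diff above
BAD_CONNECTION = {
    6,   # could not resolve host
    7,   # connection refused
    16,  # HTTP/2 error
    56,  # connection reset by peer
    60,  # bad ssl certificate
}

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost:1/")  # nothing should listen here
try:
    c.perform()
except pycurl.error as e:
    errno, message = e.args
    link_status = "bad_connection" if errno in BAD_CONNECTION else "error"
    print(errno, message, link_status)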
@@ -300,7 +307,7 @@ class ParsedDocument:
         self.paragraph_sizes = pszs
         if bs is None:
             return
-        self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_og(bs)
+        self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)
         # Extract links from the page
         base = self.work_link

@@ -26,15 +26,15 @@ def create_queue_from_context(ctx):
 @click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
 @click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
 @click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
-@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
 @click.option("--parser",metavar="file_name",help="zzz")
 @click.option("--visit",is_flag=True)
 @click.option("--queue",is_flag=True)
-def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,justext_language,parser,visit,queue):
+def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
     ctx.ensure_object(dict)
     p = BaseParser()
-    p.justext_language = justext_language
-    suckerfile = os.getcwd() + "/Suckerfile.py"
-    if os.path.isfile(suckerfile):
-        parser = suckerfile
+    if parser is not None:
+        assert os.path.isfile(parser)
+    else:
+        suckerfile = os.getcwd() + "/Suckerfile.py"
+        if os.path.isfile(suckerfile):
+            parser = suckerfile
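With --justext-language gone, the remaining options all follow one click pattern: a flag whose value falls back to an environment variable and then a default. A minimal standalone sketch of that pattern, reusing one option name for illustration:

import click

@click.command()
@click.option("--beanstalkd-port", metavar="BEANSTALKD_PORT",
              envvar="BEANSTALKD_PORT", default=11300, show_default=True,
              help="beanstalkd port (value read from BEANSTALKD_PORT env variable)")
def main(beanstalkd_port):
    # Resolution order: command-line flag, then env variable, then default.
    click.echo(beanstalkd_port)

if __name__ == "__main__":
    main()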
@@ -130,12 +130,22 @@ def check(ctx,domain):
 def report(ctx):
     db = create_database_from_context(ctx)
     db.daily_report()
-    if ctx.obj["queue"]:
+    try:
         q = create_queue_from_context(ctx)
         stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
         buried = stats["current-jobs-buried"]
-        ready = stats["current-jobs-buried"]
+        ready = stats["current-jobs-ready"]
         print("{} ready jobs, {} burried jobs".format(ready,buried))
+    except Error as err:
+        print(err)
+
+@cli.command(help="Database summary")
+@click.pass_context
+def summary(ctx):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    db.summary(p)
+
 @cli.command(help="Print keyspace schema")
 def schema():
     schema = get_schema()
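The report command now reads current-jobs-ready instead of counting buried jobs twice, and wraps the queue lookup so an unreachable beanstalkd no longer kills the whole report (the Error name must be imported for the except clause to work). A minimal greenstalk sketch showing where those counters come from, assuming a local beanstalkd:

import greenstalk

# stats_tube returns a dict of counters for one tube.
with greenstalk.Client(("127.0.0.1", 11300)) as queue:
    stats = queue.stats_tube("websucker")
    print("{} ready jobs, {} buried jobs".format(
        stats["current-jobs-ready"], stats["current-jobs-buried"]))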

@@ -161,6 +161,42 @@ INSERT INTO content(
         )
         self.session.execute(self.index_response_insert_html,d)
+
+    def summary(self,parser):
+        gs = 0  # good characters (good_size)
+        cs = 0  # fetched characters (content_size)
+        fc = 0  # fetched documents (fetched_count)
+        vd = 0  # visited domains
+        ud = 0  # unvisited domains
+        sl = 0  # new links (seen_count)
+        fd = 0  # finished domains
+        jd = 0  # junk domains
+        rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
+        for row in rows:
+            if not parser.is_domain_good(row[0]):
+                jd += 1
+            if row[1] is not None:
+                gs += row[1]
+            if row[2] is not None:
+                cs += row[2]
+            if row[3] is not None:
+                fc += row[3]
+            if row[4] is not None:
+                sl += row[4]
+            if row[3] is None or row[3] == 0:
+                ud += 1
+            else:
+                vd += 1
+            if row[4] is None or row[4] == 0:
+                fd += 1
+        print("Good characters: {}".format(gs))
+        print("Fetched characters: {}".format(cs))
+        print("Fetched documents: {}".format(fc))
+        print("Visited domains: {}".format(vd))
+        print("Unvisited domains: {}".format(ud))
+        print("Junk domains: {}".format(jd))
+        print("New links : {}".format(sl))
+        print("Finished domains : {}".format(fd))
     def daily_report(self):
         #rows = self.session.execute(self.daily_links_select)
         rows = self.session.execute("SELECT domain_name,count(link_status) FROM daily_links WHERE day=toDate(now()) GROUP BY day,domain_name")
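The new summary relies on PER PARTITION LIMIT 1 to fetch a single row per domain partition, i.e. one snapshot per domain instead of its full history. A minimal cassandra-driver sketch of the same query; the host and keyspace name are assumptions:

from cassandra.cluster import Cluster

cluster = Cluster(["127.0.0.1"])        # assumed host
session = cluster.connect("websucker")  # assumed keyspace name

# One row per domain_name partition; which row you get depends on the
# table's clustering order (typically the most recent entry).
rows = session.execute(
    "SELECT domain_name, good_size, fetched_count "
    "FROM domain_quality PER PARTITION LIMIT 1")
for row in rows:
    print(row.domain_name, row.good_size, row.fetched_count)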

@@ -237,7 +237,7 @@ class BaseParser:
     # Extracts meta information from html
     # First it looks for name, content in meta tags
     # then it looks for opengraph
-    def extract_og(self, bs):
+    def extract_meta(self, bs):
         tags = set()
         authors = set()
         title = ""
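The rename from extract_og to extract_meta matches what the comment says the method does: it reads plain <meta name=...> pairs first and OpenGraph <meta property=...> tags second. A standalone sketch of that two-step lookup, illustrative rather than the method's exact logic:

from bs4 import BeautifulSoup

html = """
<meta name="author" content="Jane Doe">
<meta property="og:title" content="Example title">
<meta property="article:tag" content="news">
"""
bs = BeautifulSoup(html, "lxml")

tags, authors, title = set(), set(), ""
for meta in bs.find_all("meta"):
    # plain meta tags use name=, OpenGraph tags use property=
    name = meta.get("name") or meta.get("property") or ""
    content = meta.get("content", "")
    if name == "author":
        authors.add(content)
    elif name == "article:tag":
        tags.add(content)
    elif name == "og:title":
        title = content
print(tags, authors, title)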