zz
This commit is contained in:
parent
9e36952563
commit
e5a7fc1e99
11
Dockerfile
Normal file
11
Dockerfile
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
FROM python:3.8.0-alpine
|
||||||
|
|
||||||
|
RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev
|
||||||
|
|
||||||
|
RUN addgroup -S appgroup -g 1000 && \
|
||||||
|
adduser -u 1000 -S appuser -G appgroup
|
||||||
|
|
||||||
|
ADD requirements.txt /
|
||||||
|
RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /requirements.txt
|
||||||
|
RUN pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
|
||||||
|
ENTRYPOINT ["websuck"]
|
@ -1,6 +1,8 @@
|
|||||||
BeautifulSoup4
|
BeautifulSoup4
|
||||||
justext
|
justext
|
||||||
cassandra-driver
|
|
||||||
python-dateutil
|
python-dateutil
|
||||||
click
|
click
|
||||||
pycurl
|
pycurl
|
||||||
|
lz4
|
||||||
|
lxml
|
||||||
|
cassandra-driver
|
||||||
|
5
setup.py
5
setup.py
@ -32,11 +32,12 @@ setuptools.setup(
|
|||||||
install_requires=[
|
install_requires=[
|
||||||
"BeautifulSoup4",
|
"BeautifulSoup4",
|
||||||
"justext",
|
"justext",
|
||||||
"cassandra-driver",
|
|
||||||
"python-dateutil",
|
"python-dateutil",
|
||||||
"click",
|
"click",
|
||||||
"pycurl",
|
"pycurl",
|
||||||
"greenstalk"
|
"greenstalk",
|
||||||
|
"lz4",
|
||||||
|
"cassandra-driver",
|
||||||
],
|
],
|
||||||
|
|
||||||
)
|
)
|
||||||
|
@ -60,7 +60,7 @@ class Response:
|
|||||||
|
|
||||||
# HMTL metarefresh redirect
|
# HMTL metarefresh redirect
|
||||||
def get_metarefresh(self):
|
def get_metarefresh(self):
|
||||||
if self.content is None:
|
if self.bs is None:
|
||||||
return None
|
return None
|
||||||
metarefresh = None
|
metarefresh = None
|
||||||
t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
|
t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
|
||||||
@ -132,11 +132,12 @@ class Connection:
|
|||||||
key = l[0:s - 1]
|
key = l[0:s - 1]
|
||||||
value = l[s + 1:].rstrip()
|
value = l[s + 1:].rstrip()
|
||||||
self.headers[key] = value
|
self.headers[key] = value
|
||||||
if key.lower() == "refresh":
|
kl = key.lower()
|
||||||
|
if kl == "refresh":
|
||||||
self.add_redirect(value)
|
self.add_redirect(value)
|
||||||
elif key.lower() == "location":
|
elif kl == "location":
|
||||||
self.add_redirect(value)
|
self.add_redirect(value)
|
||||||
elif key == "Content-Type" and "text" not in value:
|
elif kl == "content-type" and "text" not in value:
|
||||||
# Pycurl potom vyhodi 23, failed writing header
|
# Pycurl potom vyhodi 23, failed writing header
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@ -209,12 +210,18 @@ class Connection:
|
|||||||
elif errno == 60:
|
elif errno == 60:
|
||||||
# 60 bad ssl certificate
|
# 60 bad ssl certificate
|
||||||
link_status = "bad_connection"
|
link_status = "bad_connection"
|
||||||
|
elif errno == 56:
|
||||||
|
# 56 Connection reset by peer
|
||||||
|
link_status = "bad_connection"
|
||||||
elif errno == 16:
|
elif errno == 16:
|
||||||
# 16 HTTP2
|
# 16 HTTP2
|
||||||
link_status = "bad_connection"
|
link_status = "bad_connection"
|
||||||
elif errno == 6:
|
elif errno == 6:
|
||||||
# 60 Unable to resolve dns
|
# 60 Unable to resolve dns
|
||||||
link_status = "bad_connection"
|
link_status = "bad_connection"
|
||||||
|
elif errno == 7:
|
||||||
|
# 7 Connection refused
|
||||||
|
link_status = "bad_connection"
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError as e:
|
||||||
@ -300,7 +307,7 @@ class ParsedDocument:
|
|||||||
self.paragraph_sizes = pszs
|
self.paragraph_sizes = pszs
|
||||||
if bs is None:
|
if bs is None:
|
||||||
return
|
return
|
||||||
self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_og(bs)
|
self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)
|
||||||
|
|
||||||
# Extrakcia linkov zo stranky
|
# Extrakcia linkov zo stranky
|
||||||
base = self.work_link
|
base = self.work_link
|
||||||
|
@ -26,18 +26,18 @@ def create_queue_from_context(ctx):
|
|||||||
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
|
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
|
||||||
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
|
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
|
||||||
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
|
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
|
||||||
@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
|
|
||||||
@click.option("--parser",metavar="file_name",help="zzz")
|
@click.option("--parser",metavar="file_name",help="zzz")
|
||||||
@click.option("--visit",is_flag=True)
|
@click.option("--visit",is_flag=True)
|
||||||
@click.option("--queue",is_flag=True)
|
@click.option("--queue",is_flag=True)
|
||||||
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,justext_language,parser,visit,queue):
|
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
|
||||||
ctx.ensure_object(dict)
|
ctx.ensure_object(dict)
|
||||||
p = BaseParser()
|
p = BaseParser()
|
||||||
p.justext_language = justext_language
|
if parser is not None:
|
||||||
|
assert os.path.isfile(parser)
|
||||||
suckerfile = os.getcwd() + "/Suckerfile.py"
|
else:
|
||||||
if os.path.isfile(suckerfile):
|
suckerfile = os.getcwd() + "/Suckerfile.py"
|
||||||
parser = suckerfile
|
if os.path.isfile(suckerfile):
|
||||||
|
parser = suckerfile
|
||||||
if parser is not None:
|
if parser is not None:
|
||||||
p = load_parser(parser)
|
p = load_parser(parser)
|
||||||
assert p is not None
|
assert p is not None
|
||||||
@ -130,12 +130,22 @@ def check(ctx,domain):
|
|||||||
def report(ctx):
|
def report(ctx):
|
||||||
db = create_database_from_context(ctx)
|
db = create_database_from_context(ctx)
|
||||||
db.daily_report()
|
db.daily_report()
|
||||||
if ctx.obj["queue"]:
|
try:
|
||||||
q = create_queue_from_context(ctx)
|
q = create_queue_from_context(ctx)
|
||||||
stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
|
stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
|
||||||
buried = stats["current-jobs-buried"]
|
buried = stats["current-jobs-buried"]
|
||||||
ready = stats["current-jobs-buried"]
|
ready = stats["current-jobs-ready"]
|
||||||
print("{} ready jobs, {} burried jobs".format(ready,buried))
|
print("{} ready jobs, {} burried jobs".format(ready,buried))
|
||||||
|
except Error as err:
|
||||||
|
print(err)
|
||||||
|
|
||||||
|
@cli.command(help="Database summary")
|
||||||
|
@click.pass_context
|
||||||
|
def summary(ctx):
|
||||||
|
db = create_database_from_context(ctx)
|
||||||
|
p = ctx.obj["parser"]
|
||||||
|
db.summary(p)
|
||||||
|
|
||||||
@cli.command(help="Print keyspace schema")
|
@cli.command(help="Print keyspace schema")
|
||||||
def schema():
|
def schema():
|
||||||
schema = get_schema()
|
schema = get_schema()
|
||||||
|
@ -161,6 +161,42 @@ INSERT INTO content(
|
|||||||
)
|
)
|
||||||
self.session.execute(self.index_response_insert_html,d)
|
self.session.execute(self.index_response_insert_html,d)
|
||||||
|
|
||||||
|
def summary(self,parser):
|
||||||
|
gs = 0
|
||||||
|
cs = 0
|
||||||
|
fc = 0
|
||||||
|
vd = 0
|
||||||
|
ud = 0
|
||||||
|
sl = 0
|
||||||
|
fd = 0
|
||||||
|
jd = 0
|
||||||
|
rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
|
||||||
|
for row in rows:
|
||||||
|
if not parser.is_domain_good(row[0]):
|
||||||
|
jd += 1
|
||||||
|
if row[1] is not None:
|
||||||
|
gs += row[1]
|
||||||
|
if row[2] is not None:
|
||||||
|
cs += row[2]
|
||||||
|
if row[3] is not None:
|
||||||
|
fc += row[3]
|
||||||
|
if row[4] is not None:
|
||||||
|
sl += row[4]
|
||||||
|
if row[3] is None or row[3] == 0:
|
||||||
|
ud += 1
|
||||||
|
else:
|
||||||
|
vd += 1
|
||||||
|
if row[4] is None or row[4] == 0:
|
||||||
|
fd += 1
|
||||||
|
print("Good characters: {}".format(gs))
|
||||||
|
print("Fetched characters: {}".format(cs))
|
||||||
|
print("Fetched documents: {}".format(fc))
|
||||||
|
print("Visited domains: {}".format(vd))
|
||||||
|
print("Unvisited domains: {}".format(ud))
|
||||||
|
print("Junk domains: {}".format(jd))
|
||||||
|
print("New links : {}".format(sl))
|
||||||
|
print("Finished domains : {}".format(fd))
|
||||||
|
|
||||||
def daily_report(self):
|
def daily_report(self):
|
||||||
#rows = self.session.execute(self.daily_links_select)
|
#rows = self.session.execute(self.daily_links_select)
|
||||||
rows = self.session.execute("SELECT domain_name,count(link_status) FROM daily_links WHERE day=toDate(now()) GROUP BY day,domain_name")
|
rows = self.session.execute("SELECT domain_name,count(link_status) FROM daily_links WHERE day=toDate(now()) GROUP BY day,domain_name")
|
||||||
|
@ -237,7 +237,7 @@ class BaseParser:
|
|||||||
# Extracts matainformation from html
|
# Extracts matainformation from html
|
||||||
# First it looks for name, content in meta tags
|
# First it looks for name, content in meta tags
|
||||||
# then it looks for opengraph
|
# then it looks for opengraph
|
||||||
def extract_og(self, bs):
|
def extract_meta(self, bs):
|
||||||
tags = set()
|
tags = set()
|
||||||
authors = set()
|
authors = set()
|
||||||
title = ""
|
title = ""
|
||||||
|
Loading…
Reference in New Issue
Block a user