diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..06a6bbd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.8.0-alpine + +RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev + +RUN addgroup -S appgroup -g 1000 && \ + adduser -u 1000 -S appuser -G appgroup + +ADD requirements.txt / +RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /requirements.txt +RUN pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip +ENTRYPOINT ["websuck"] diff --git a/requirements.txt b/requirements.txt index 6341dae..eb2d635 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ BeautifulSoup4 justext -cassandra-driver python-dateutil click pycurl +lz4 +lxml +cassandra-driver diff --git a/setup.py b/setup.py index 1297b72..852e2f5 100644 --- a/setup.py +++ b/setup.py @@ -32,11 +32,12 @@ setuptools.setup( install_requires=[ "BeautifulSoup4", "justext", - "cassandra-driver", "python-dateutil", "click", "pycurl", - "greenstalk" + "greenstalk", + "lz4", + "cassandra-driver", ], ) diff --git a/websucker/agent.py b/websucker/agent.py index de1efd5..4632c9f 100755 --- a/websucker/agent.py +++ b/websucker/agent.py @@ -60,7 +60,7 @@ class Response: # HMTL metarefresh redirect def get_metarefresh(self): - if self.content is None: + if self.bs is None: return None metarefresh = None t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"}) @@ -132,11 +132,12 @@ class Connection: key = l[0:s - 1] value = l[s + 1:].rstrip() self.headers[key] = value - if key.lower() == "refresh": + kl = key.lower() + if kl == "refresh": self.add_redirect(value) - elif key.lower() == "location": + elif kl == "location": self.add_redirect(value) - elif key == "Content-Type" and "text" not in value: + elif kl == "content-type" and "text" not in value: # Pycurl potom vyhodi 23, failed writing header return 0 @@ -209,12 +210,18 @@ class Connection: elif errno == 60: # 60 bad ssl certificate link_status = "bad_connection" + elif errno == 56: + # 56 Connection reset by peer + link_status = "bad_connection" elif errno == 16: # 16 HTTP2 link_status = "bad_connection" elif errno == 6: # 60 Unable to resolve dns link_status = "bad_connection" + elif errno == 7: + # 7 Connection refused + link_status = "bad_connection" else: raise e except UnicodeDecodeError as e: @@ -300,7 +307,7 @@ class ParsedDocument: self.paragraph_sizes = pszs if bs is None: return - self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_og(bs) + self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs) # Extrakcia linkov zo stranky base = self.work_link diff --git a/websucker/cli.py b/websucker/cli.py index 94a85ec..c7f9544 100644 --- a/websucker/cli.py +++ b/websucker/cli.py @@ -26,18 +26,18 @@ def create_queue_from_context(ctx): @click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True) @click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True) @click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True) -@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True) @click.option("--parser",metavar="file_name",help="zzz") @click.option("--visit",is_flag=True) @click.option("--queue",is_flag=True) -def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,justext_language,parser,visit,queue): +def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue): ctx.ensure_object(dict) p = BaseParser() - p.justext_language = justext_language - - suckerfile = os.getcwd() + "/Suckerfile.py" - if os.path.isfile(suckerfile): - parser = suckerfile + if parser is not None: + assert os.path.isfile(parser) + else: + suckerfile = os.getcwd() + "/Suckerfile.py" + if os.path.isfile(suckerfile): + parser = suckerfile if parser is not None: p = load_parser(parser) assert p is not None @@ -130,12 +130,22 @@ def check(ctx,domain): def report(ctx): db = create_database_from_context(ctx) db.daily_report() - if ctx.obj["queue"]: + try: q = create_queue_from_context(ctx) stats = q.stats_tube(ctx.obj["beanstalkd_tube"]) buried = stats["current-jobs-buried"] - ready = stats["current-jobs-buried"] + ready = stats["current-jobs-ready"] print("{} ready jobs, {} burried jobs".format(ready,buried)) + except Error as err: + print(err) + +@cli.command(help="Database summary") +@click.pass_context +def summary(ctx): + db = create_database_from_context(ctx) + p = ctx.obj["parser"] + db.summary(p) + @cli.command(help="Print keyspace schema") def schema(): schema = get_schema() diff --git a/websucker/db.py b/websucker/db.py index 83116d6..be0f214 100644 --- a/websucker/db.py +++ b/websucker/db.py @@ -160,7 +160,43 @@ INSERT INTO content( VERSION, ) self.session.execute(self.index_response_insert_html,d) - + + def summary(self,parser): + gs = 0 + cs = 0 + fc = 0 + vd = 0 + ud = 0 + sl = 0 + fd = 0 + jd = 0 + rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1") + for row in rows: + if not parser.is_domain_good(row[0]): + jd += 1 + if row[1] is not None: + gs += row[1] + if row[2] is not None: + cs += row[2] + if row[3] is not None: + fc += row[3] + if row[4] is not None: + sl += row[4] + if row[3] is None or row[3] == 0: + ud += 1 + else: + vd += 1 + if row[4] is None or row[4] == 0: + fd += 1 + print("Good characters: {}".format(gs)) + print("Fetched characters: {}".format(cs)) + print("Fetched documents: {}".format(fc)) + print("Visited domains: {}".format(vd)) + print("Unvisited domains: {}".format(ud)) + print("Junk domains: {}".format(jd)) + print("New links : {}".format(sl)) + print("Finished domains : {}".format(fd)) + def daily_report(self): #rows = self.session.execute(self.daily_links_select) rows = self.session.execute("SELECT domain_name,count(link_status) FROM daily_links WHERE day=toDate(now()) GROUP BY day,domain_name") diff --git a/websucker/parser.py b/websucker/parser.py index 05b6428..7fac7d1 100644 --- a/websucker/parser.py +++ b/websucker/parser.py @@ -237,7 +237,7 @@ class BaseParser: # Extracts matainformation from html # First it looks for name, content in meta tags # then it looks for opengraph - def extract_og(self, bs): + def extract_meta(self, bs): tags = set() authors = set() title = ""