diff --git a/requirements.txt b/requirements.txt
index eb2d635..4f3417c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ pycurl
 lz4
 lxml
 cassandra-driver
+trafilatura
+py3langid
diff --git a/websucker/cli.py b/websucker/cli.py
index 0cb010e..80a31b8 100644
--- a/websucker/cli.py
+++ b/websucker/cli.py
@@ -1,7 +1,7 @@
 from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
-from websucker.agent import ParsedDocument
 from websucker.parser import BaseParser
-from websucker.parser import normalize_link,urlunparse
+from websucker.parser import SoupParser
+from websucker.parser import TrafilaturaParser
 from websucker.parser import load_parser
 from websucker.db import Data
 from websucker.db import get_schema
@@ -36,7 +36,7 @@ def create_queue_from_context(ctx):
 @click.option("--queue",is_flag=True)
 def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
     ctx.ensure_object(dict)
-    p = BaseParser()
+    p = TrafilaturaParser()
     if parser is not None:
         assert os.path.isfile(parser)
     else:
@@ -220,7 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = parser.full_extract(res.content,res.bs,parser,target_link)
+        pd = parser.full_extract(res.content,res.bs,target_link)
         print(pd)
 
 if __name__ == "__main__":
diff --git a/websucker/db.py b/websucker/db.py
index 04d9095..e6ab4ff 100644
--- a/websucker/db.py
+++ b/websucker/db.py
@@ -14,7 +14,7 @@ import json
 
 VERSION = "sucker6"
 
-def calculate_checksums(self, text):
+def calculate_checksums(text):
     """
     @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
     """
@@ -295,13 +295,7 @@ INSERT INTO content(
 
     def index_follow_links(self,parser,links,connection):
         # Index seen links
-        follow_links = set()
-        for l in links:
-            if parser.is_link_good(l):
-                #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
-                #    continue
-                link = normalize_link(l,strip_query=parser.strip_query)
-                follow_links.add(urlunparse(link))
+        follow_links = parser.filter_links(links)
 
         newlinkdomains = set()
         newlinkcount = 0
@@ -342,9 +336,10 @@ INSERT INTO content(
             pd.text_date,
             pd.body,
             body_length,
-            VERSION,
+            VERSION, 
             pd.current_time
         )
+        print(value)
         content_future = self.session.execute_async(self.index_content_content_insert,value)
         # result later
 
@@ -361,7 +356,7 @@ INSERT INTO content(
 
         if link_status == "good":
             futures = []
-            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
+            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
             for pc,psz in zip(paragraph_checksums,paragraph_sizes):
                 fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                 futures.append(fut)
diff --git a/websucker/parser.py b/websucker/parser.py
index 01e565a..fad0bf5 100644
--- a/websucker/parser.py
+++ b/websucker/parser.py
@@ -146,10 +146,8 @@ class BaseParser:
         self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz", ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
         self.skipchars = re.compile(r"[();:@& ]")
-        self.store = True
         self.verbose = verbose
         self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?
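
Note (reviewer sketch, not part of this diff): Data.index_follow_links now delegates to parser.filter_links, but no hunk above adds that method. A minimal sketch of such a helper on BaseParser in websucker/parser.py, reconstructed from the inline loop removed from db.py (method placement and exact signature are assumptions; the commented-out robots.txt check is left out, matching the removed code):

    def filter_links(self, links):
        # Keep only links that pass is_link_good, normalize them, and deduplicate,
        # mirroring the loop removed from Data.index_follow_links.
        # normalize_link and urlunparse are assumed to be available in
        # websucker/parser.py's module scope (they were previously imported
        # from websucker.parser in cli.py).
        follow_links = set()
        for l in links:
            if not self.is_link_good(l):
                continue
            link = normalize_link(l, strip_query=self.strip_query)
            follow_links.add(urlunparse(link))
        return follow_links

Likewise, SoupParser and TrafilaturaParser are imported in cli.py without being defined in these hunks; TrafilaturaParser presumably builds on the newly added trafilatura and py3langid dependencies.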