diff --git a/websucker/agent.py b/websucker/agent.py
index 96a050e..e352a5a 100755
--- a/websucker/agent.py
+++ b/websucker/agent.py
@@ -282,96 +282,6 @@ class Connection:
         res = self.robots[domain].can_fetch("Agent", url)
         return res

-class ParsedDocument:
-    """
-    One document in the database
-    """
-    def __init__(self, parser,work_link):
-        self.parser = parser
-        self.work_link = work_link
-
-        self.content = None
-        self.bs = None
-        self.paragraph_checksums = None
-        self.paragraph_sizes = None
-
-        self.link_set = set()
-        self.body = None
-        self.text_date = None
-        self.tags = None
-        self.authors = None
-        self.title = None
-        self.description = None
-        self.section = None
-        self.article_published_time = None
-        self.current_time = datetime.date.today()
-
-
-    def extract(self,content,bs):
-        """
-        Parse content and fill the object
-        """
-        self.content = content
-        self.bs = bs
-
-        # Extract text and metatext
-        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
-        # Paragraph Checksums
-        pch,pszs = self.parser.calculate_checksums(self.body)
-        self.paragraph_checksums = pch
-        self.paragraph_sizes = pszs
-        if bs is None:
-            return
-        self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
-
-        # Extrakcia linkov zo stranky
-        base = self.work_link
-        if bs.base is not None and "href" in bs.base.attrs:
-            base = bs.base["href"]
-        # Normalizacia linkov
-        for l in bs.find_all("a", href=True):
-            if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-                continue
-            href = l["href"]
-            try:
-                nl = normalize_link(href, base)
-                link = urlunparse(nl)
-                if link == base:
-                    continue
-                self.link_set.add(link)
-            except ValueError:
-                pass
-
-    def get_links(self):
-        """
-        @return all links
-        """
-        return self.link_set
-
-    def get_follow_links(self):
-        """
-        @return good normalized links
-        """
-        follow_links = set()
-        for l in self.link_set:
-            if self.parser.is_link_good(l):
-                link = normalize_link(l,strip_query=self.parser.strip_query)
-                follow_links.add(urlunparse(link))
-        return follow_links
-
-
-    def __str__(self):
-        r = []
-        if self.authors is not None:
-            r.append(",".join(self.authors))
-        if self.title is not None:
-            r.append(self.title)
-        if self.body is not None:
-            if (len(self.body) < 20):
-                r.append(self.body)
-            else:
-                r.append(self.body[0:20] + " ....")
-        return ">>> ".join(r)


 def parse_and_index(work_link,parser,responses,db):
@@ -389,10 +299,9 @@ def parse_and_index(work_link,parser,responses,db):
     lr = responses[-1]
     if lr.bs is not None:
         target_link = lr.get_canonical()
-        parsed = ParsedDocument(parser,target_link)
-        parsed.extract(lr.content, lr.bs)
+        parsed = parser.full_extract(lr.content,lr.bs,target_link)
         db.index_content(target_link,parsed)
-        links = parsed.get_links()
+        links = parsed.link_set
     return target_link,links

 def visit_sitemap(domain,connection,parser,db):
diff --git a/websucker/cli.py b/websucker/cli.py
index dcdeae8..0cb010e 100644
--- a/websucker/cli.py
+++ b/websucker/cli.py
@@ -220,8 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = ParsedDocument(parser,target_link)
-        pd.extract(res.content, res.bs)
+        pd = parser.full_extract(res.content,res.bs,target_link)
         print(pd)

 if __name__ == "__main__":
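For reference, the caller-side shape of the new API looks roughly like the sketch below. The response and db objects are assumptions standing in for the objects used in agent.py and cli.py above (content, bs, get_canonical(), index_content()); treat this as an illustration of the new convention, not code from the repository.

```python
# Illustrative sketch of the new calling convention (not part of the diff).
# Assumes: a response object exposing content, bs and get_canonical() as in
# agent.py/cli.py, and a db object exposing index_content().
from websucker.parser import BaseParser

def index_one(response, db):
    parser = BaseParser()
    target_link = response.get_canonical()
    # full_extract() replaces ParsedDocument(parser, link) + extract(content, bs)
    pd = parser.full_extract(response.content, response.bs, target_link)
    db.index_content(target_link, pd)
    # links are read from the attribute instead of the old get_links() method
    return pd.link_set
```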
diff --git a/websucker/db.py b/websucker/db.py
index ec5d2b5..04d9095 100644
--- a/websucker/db.py
+++ b/websucker/db.py
@@ -14,6 +14,35 @@ import json

 VERSION = "sucker6"

+def calculate_checksums(text):
+    """
+    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes
+
 def get_schema():
     with pkg_resources.resource_stream(__name__,"schema.sql") as f:
         schema = f.read()
@@ -303,7 +332,7 @@ INSERT INTO content(
         value = (
             domain_name,
             target_link,
-            pd.get_links(),
+            pd.link_set,
             pd.title,
             pd.description,
             pd.section,
@@ -332,12 +361,13 @@ INSERT INTO content(

         if link_status == "good":
             futures = []
-            for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
+            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
+            for pc,psz in zip(paragraph_checksums,paragraph_sizes):
                 fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                 futures.append(fut)
             for fut in futures:
                 fut.result()
-            originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
+            originality = self.check_document(paragraph_checksums,paragraph_sizes)
             if originality < 0.8:
                 link_status = "bad_copy"
         self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
diff --git a/websucker/parser.py b/websucker/parser.py
index 72925fc..01e565a 100644
--- a/websucker/parser.py
+++ b/websucker/parser.py
@@ -53,6 +53,26 @@ def normalize_link(link, base=None,strip_query=False):
         path = dirname + "/" + filename
     return schema, netloc, path, query

+def get_bs_links(work_link,bs):
+    # Extract links from the page
+    base = work_link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    link_set = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            nl = normalize_link(href, base)
+            link = urlunparse(nl)
+            if link == base:
+                continue
+            link_set.add(link)
+        except ValueError:
+            pass
+    return link_set

 def get_date(te):
     dates = []
@@ -86,6 +106,39 @@ def get_date(te):
             pass
     return dates

+class ParsedDocument:
+    """
+    One document in the database
+    """
+    def __init__(self):
+        self.work_link = None
+
+        self.content = None
+        self.bs = None
+
+        self.link_set = set()
+        self.body = None
+        self.text_date = None
+        self.tags = None
+        self.authors = None
+        self.title = None
+        self.description = None
+        self.section = None
+        self.article_published_time = None
+        self.current_time = datetime.date.today()
+
+    def __str__(self):
+        r = []
+        if self.authors is not None:
+            r.append(",".join(self.authors))
+        if self.title is not None:
+            r.append(self.title)
+        if self.body is not None:
+            if (len(self.body) < 20):
+                r.append(self.body)
+            else:
+                r.append(self.body[0:20] + " ....")
+        return ">>> ".join(r)

 class BaseParser:
     def __init__(self, verbose=False):
@@ -106,6 +159,21 @@ class BaseParser:
         self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
         self.justext_language = "Slovak"

+    def full_extract(self,content,bs,work_link):
+        """
+        Parse content and fill the object
+        """
+        pd = ParsedDocument()
+        pd.work_link = work_link
+
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
+        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
+
     def is_domain_good(self, domain):
         r = None
         # Netloc
@@ -192,7 +260,7 @@ class BaseParser:
         except lxml.etree.XMLSyntaxError:
             print("XML Syntax parse error")
         except lxml.etree.ParserError:
-            print("XML Parse parse error")
+            pass
         except justext.core.JustextError:
             print("Justext error")
@@ -295,34 +362,6 @@ class BaseParser:

         return tags,authors,title.strip(),article_published_time.strip(),description,section.strip()

-    def calculate_checksums(self, text):
-        """
-        @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
-        """
-        checksums = []
-        sizes = []
-        hval = 0
-        hsz = 0
-        sz = 0
-        for c in text:
-            cv = ord(c)
-            sz += 1
-            if cv > 64:
-                hval += (hval << 3) + cv
-                zv = hval >> 31
-                hval &= 0x7fffffff
-                hval += zv
-                hsz += 1
-            if c == "\n" and hsz > 0:
-                if hsz > 100:
-                    checksums.append(hval)
-                    sizes.append(sz)
-                sz = 0
-                hsz = 0
-        if hsz > 100:
-            checksums.append(hval)
-            sizes.append(sz)
-        return checksums, sizes

 class EnglishParser(BaseParser):
     def __init__(self):