wiprefactor

Daniel Hládek 2023-02-28 19:29:51 +01:00
parent 30d51944f4
commit 437d4f9684
4 changed files with 104 additions and 127 deletions

View File

@@ -282,96 +282,6 @@ class Connection:
        res = self.robots[domain].can_fetch("Agent", url)
        return res
class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self, parser, work_link):
        self.parser = parser
        self.work_link = work_link
        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self, content, bs):
        """
        Parse content and fill the object
        """
        self.content = content
        self.bs = bs
        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch, pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        self.tags, self.authors, self.title, self.article_published_time, self.description, self.section = self.parser.extract_meta(bs)
        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize the links
        for l in bs.find_all("a", href=True):
            if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
                continue
            href = l["href"]
            try:
                nl = normalize_link(href, base)
                link = urlunparse(nl)
                if link == base:
                    continue
                self.link_set.add(link)
            except ValueError:
                pass

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l, strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        r = []
        if self.authors is not None:
            r.append(",".join(self.authors))
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)

def parse_and_index(work_link,parser,responses,db):
@@ -389,10 +299,9 @@ def parse_and_index(work_link,parser,responses,db):
    lr = responses[-1]
    if lr.bs is not None:
        target_link = lr.get_canonical()
-       parsed = ParsedDocument(parser,target_link)
-       parsed.extract(lr.content, lr.bs)
+       parsed = parser.full_extract(lr.content, lr.bs, target_link)
        db.index_content(target_link, parsed)
-       links = parsed.get_links()
+       links = parsed.link_set
    return target_link, links

def visit_sitemap(domain,connection,parser,db):
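
For orientation, here is a minimal sketch of the call-site change this refactor makes in parse_and_index; the parser instance, the last response lr, and its canonical target_link are assumed to exist exactly as in the surrounding code.

# Before the refactor: the document object drove its own extraction.
parsed = ParsedDocument(parser, target_link)
parsed.extract(lr.content, lr.bs)
links = parsed.get_links()

# After the refactor: the parser builds and fills a plain ParsedDocument.
parsed = parser.full_extract(lr.content, lr.bs, target_link)
links = parsed.link_set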

View File

@@ -220,8 +220,7 @@ def fetch(ctx,urls):
    responses = connection.html_download2(urls)
    for res in responses:
        target_link = res.get_canonical()
-       pd = ParsedDocument(parser, target_link)
-       pd.extract(res.content, res.bs)
+       pd = parser.full_extract(res.content, res.bs, target_link)
        print(pd)

if __name__ == "__main__":

View File

@@ -14,6 +14,35 @@ import json
VERSION = "sucker6"

def calculate_checksums(text):
    """
    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
    """
    checksums = []
    sizes = []
    hval = 0
    hsz = 0
    sz = 0
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > 100:
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes

def get_schema():
    with pkg_resources.resource_stream(__name__,"schema.sql") as f:
        schema = f.read()
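
To illustrate what the relocated checksum routine produces, here is a small sketch with a made-up input, run as a module-level call in this file: only segments with more than 100 hashed characters receive a fingerprint, so each paragraph below is padded well past that threshold.

# Two paragraphs separated by a blank line, each long enough to be fingerprinted.
text = "A" * 150 + "\n\n" + "B" * 150 + "\n"
checksums, sizes = calculate_checksums(text)
print(len(checksums))  # expected: 2, one rolling-hash fingerprint per paragraph
print(sizes)           # character counts of the fingerprinted spans, newlines included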
@@ -303,7 +332,7 @@ INSERT INTO content(
        value = (
            domain_name,
            target_link,
-           pd.get_links(),
+           pd.link_set,
            pd.title,
            pd.description,
            pd.section,
@@ -332,12 +361,13 @@ INSERT INTO content(
        if link_status == "good":
            futures = []
-           for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
+           paragraph_checksums, paragraph_sizes = calculate_checksums(pd.body)
+           for pc, psz in zip(paragraph_checksums, paragraph_sizes):
                fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                futures.append(fut)
            for fut in futures:
                fut.result()
-           originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
+           originality = self.check_document(paragraph_checksums, paragraph_sizes)
            if originality < 0.8:
                link_status = "bad_copy"
        self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
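
check_document is not part of this diff; purely as an illustration of the idea behind the originality threshold above (this is not the repository's implementation), originality can be read as the size-weighted share of paragraphs whose checksum has not been seen before.

def illustrative_originality(checksums, sizes, seen_checksums):
    # seen_checksums: hypothetical set of paragraph fingerprints already indexed
    total = sum(sizes)
    if total == 0:
        return 1.0
    fresh = sum(sz for ch, sz in zip(checksums, sizes) if ch not in seen_checksums)
    return fresh / total

A document whose paragraphs are mostly already indexed would score well below 0.8 and be flagged as bad_copy by the branch above.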

View File

@@ -53,6 +53,26 @@ def normalize_link(link, base=None,strip_query=False):
    path = dirname + "/" + filename
    return schema, netloc, path, query

def get_bs_links(work_link, bs):
    # Extract links from the page
    base = work_link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
    link_set = set()
    # Normalize the links
    for l in bs.find_all("a", href=True):
        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
            continue
        href = l["href"]
        try:
            nl = normalize_link(href, base)
            link = urlunparse(nl)
            if link == base:
                continue
            link_set.add(link)
        except ValueError:
            pass
    return link_set

def get_date(te):
    dates = []
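
A short usage sketch for the new get_bs_links helper (assuming bs4 is installed and that normalize_link and urlunparse from this module behave as the helper expects); the URL and markup are invented for illustration.

from bs4 import BeautifulSoup

html = '<html><body><a href="/about">About</a> <a rel="nofollow" href="/login">Login</a></body></html>'
bs = BeautifulSoup(html, "lxml")
# Relative hrefs are resolved against the page URL (or <base href="..."> if present);
# anchors marked rel="nofollow" are skipped.
links = get_bs_links("http://example.com/index.html", bs)
print(links)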
@@ -86,6 +106,39 @@ def get_date(te):
            pass
    return dates

class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self):
        self.work_link = None
        self.content = None
        self.bs = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def __str__(self):
        r = []
        if self.authors is not None:
            r.append(",".join(self.authors))
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)

class BaseParser:
    def __init__(self, verbose=False):
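
The ParsedDocument added above is now a plain data holder that the parser fills; its __str__ gives a short preview of authors, title and the first 20 characters of the body. A hypothetical filled instance (field values invented for illustration) would print like this:

pd = ParsedDocument()
pd.authors = ["Jane Doe"]
pd.title = "Example title"
pd.body = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
print(pd)  # Jane Doe>>> Example title>>> Lorem ipsum dolor si ....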
@@ -106,6 +159,21 @@ class BaseParser:
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
        self.justext_language = "Slovak"

    def full_extract(self, content, bs, work_link):
        """
        Parse the content and return a filled ParsedDocument
        """
        pd = ParsedDocument()
        pd.work_link = work_link
        pd.current_time = datetime.date.today()
        # Extract text and metatext
        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
        pd.tags, pd.authors, pd.title, pd.article_published_time, pd.description, pd.section = self.extract_meta(bs)
        pd.link_set = get_bs_links(work_link, bs)
        return pd

    def is_domain_good(self, domain):
        r = None
        # Netloc
@@ -192,7 +260,6 @@ class BaseParser:
        except lxml.etree.XMLSyntaxError:
            print("XML Syntax parse error")
        except lxml.etree.ParserError:
            print("XML Parse parse error")
        except justext.core.JustextError:
            print("Justext error")
@@ -295,34 +362,6 @@ class BaseParser:
        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()

    def calculate_checksums(self, text):
        """
        @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
        """
        checksums = []
        sizes = []
        hval = 0
        hsz = 0
        sz = 0
        for c in text:
            cv = ord(c)
            sz += 1
            if cv > 64:
                hval += (hval << 3) + cv
                zv = hval >> 31
                hval &= 0x7fffffff
                hval += zv
                hsz += 1
            if c == "\n" and hsz > 0:
                if hsz > 100:
                    checksums.append(hval)
                    sizes.append(sz)
                sz = 0
                hsz = 0
        if hsz > 100:
            checksums.append(hval)
            sizes.append(sz)
        return checksums, sizes

class EnglishParser(BaseParser):
    def __init__(self):