wiprefactor

commit 437d4f9684
parent 30d51944f4
@@ -282,96 +282,6 @@ class Connection:
         res = self.robots[domain].can_fetch("Agent", url)
         return res
-
-class ParsedDocument:
-    """
-    One document in the database
-    """
-    def __init__(self, parser,work_link):
-        self.parser = parser
-        self.work_link = work_link
-
-        self.content = None
-        self.bs = None
-        self.paragraph_checksums = None
-        self.paragraph_sizes = None
-
-        self.link_set = set()
-        self.body = None
-        self.text_date = None
-        self.tags = None
-        self.authors = None
-        self.title = None
-        self.description = None
-        self.section = None
-        self.article_published_time = None
-        self.current_time = datetime.date.today()
-
-    def extract(self,content,bs):
-        """
-        Parse content and fill the object
-        """
-        self.content = content
-        self.bs = bs
-
-        # Extract text and metatext
-        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
-        # Paragraph checksums
-        pch,pszs = self.parser.calculate_checksums(self.body)
-        self.paragraph_checksums = pch
-        self.paragraph_sizes = pszs
-        if bs is None:
-            return
-        self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
-
-        # Extract links from the page
-        base = self.work_link
-        if bs.base is not None and "href" in bs.base.attrs:
-            base = bs.base["href"]
-        # Link normalization
-        for l in bs.find_all("a", href=True):
-            if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-                continue
-            href = l["href"]
-            try:
-                nl = normalize_link(href, base)
-                link = urlunparse(nl)
-                if link == base:
-                    continue
-                self.link_set.add(link)
-            except ValueError:
-                pass
-
-    def get_links(self):
-        """
-        @return all links
-        """
-        return self.link_set
-
-    def get_follow_links(self):
-        """
-        @return good normalized links
-        """
-        follow_links = set()
-        for l in self.link_set:
-            if self.parser.is_link_good(l):
-                link = normalize_link(l,strip_query=self.parser.strip_query)
-                follow_links.add(urlunparse(link))
-        return follow_links
-
-    def __str__(self):
-        r = []
-        if self.authors is not None:
-            r.append(",".join(self.authors))
-        if self.title is not None:
-            r.append(self.title)
-        if self.body is not None:
-            if (len(self.body) < 20):
-                r.append(self.body)
-            else:
-                r.append(self.body[0:20] + " ....")
-        return ">>> ".join(r)
-
 def parse_and_index(work_link,parser,responses,db):
@@ -389,10 +299,9 @@ def parse_and_index(work_link,parser,responses,db):
     lr = responses[-1]
     if lr.bs is not None:
         target_link = lr.get_canonical()
-        parsed = ParsedDocument(parser,target_link)
-        parsed.extract(lr.content, lr.bs)
+        parsed = parser.full_extract(lr.content,lr.bs,target_link)
         db.index_content(target_link,parsed)
-        links = parsed.get_links()
+        links = parsed.link_set
         return target_link,links

 def visit_sitemap(domain,connection,parser,db):
@@ -220,8 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = ParsedDocument(parser,target_link)
-        pd.extract(res.content, res.bs)
+        pd = parser.full_extract(res.content,res.bs,parser,target_link)
         print(pd)

 if __name__ == "__main__":
@@ -14,6 +14,35 @@ import json
 VERSION = "sucker6"


+def calculate_checksums(self, text):
+    """
+    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes
+
 def get_schema():
     with pkg_resources.resource_stream(__name__,"schema.sql") as f:
         schema = f.read()
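The moved checksum routine hashes only characters with code point above 64 and emits one (fingerprint, size) pair per newline-terminated run of more than 100 hashed characters. The following is a minimal usage sketch, not part of the commit; the sample text is an assumption, and a placeholder first argument is passed because the function as committed still carries the old self parameter (the later call in index_content passes only the text).

    # Assumed usage sketch: two long paragraphs, each ending in a newline,
    # should each yield one checksum and one size.
    paragraph = "A reasonably long sentence with plenty of letters to hash. " * 5
    sample = paragraph + "\n\n" + paragraph + "\n"
    checksums, sizes = calculate_checksums(None, sample)  # None stands in for the leftover self
    print(len(checksums), sizes)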
@@ -303,7 +332,7 @@ INSERT INTO content(
         value = (
             domain_name,
             target_link,
-            pd.get_links(),
+            pd.link_set,
             pd.title,
             pd.description,
             pd.section,
@@ -332,12 +361,13 @@ INSERT INTO content(
         if link_status == "good":

             futures = []
-            for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
+            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
+            for pc,psz in zip(paragraph_checksums,paragraph_sizes):
                 fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                 futures.append(fut)
             for fut in futures:
                 fut.result()
-            originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
+            originality = self.check_document(paragraph_checksums,paragraph_sizes)
             if originality < 0.8:
                 link_status = "bad_copy"
         self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
@@ -53,6 +53,26 @@ def normalize_link(link, base=None,strip_query=False):
     path = dirname + "/" + filename
     return schema, netloc, path, query

+def get_bs_links(work_link,bs):
+    # Extract links from the page
+    base = work_link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    link_set = set()
+    # Link normalization
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            nl = normalize_link(href, base)
+            link = urlunparse(nl)
+            if link == base:
+                continue
+            link_set.add(link)
+        except ValueError:
+            pass
+    return link_set
+
 def get_date(te):
     dates = []
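A minimal sketch of how the new module-level helper might be exercised, not part of the commit: the HTML snippet, URLs, and the html.parser choice are assumptions, while normalize_link and urlunparse come from the same module, as in the function above.

    # Assumed usage sketch for get_bs_links.
    import bs4

    html = (
        '<html><head><base href="https://example.org/section/"></head>'
        '<body><a href="article1.html">one</a>'
        '<a href="https://example.org/other">two</a></body></html>'
    )
    bs = bs4.BeautifulSoup(html, "html.parser")
    links = get_bs_links("https://example.org/section/", bs)
    print(links)  # normalized links, excluding the base URL itself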
@@ -86,6 +106,39 @@ def get_date(te):
             pass
     return dates

+class ParsedDocument:
+    """
+    One document in the database
+    """
+    def __init__(self):
+        self.work_link = None
+
+        self.content = None
+        self.bs = None
+
+        self.link_set = set()
+        self.body = None
+        self.text_date = None
+        self.tags = None
+        self.authors = None
+        self.title = None
+        self.description = None
+        self.section = None
+        self.article_published_time = None
+        self.current_time = datetime.date.today()
+
+    def __str__(self):
+        r = []
+        if self.authors is not None:
+            r.append(",".join(self.authors))
+        if self.title is not None:
+            r.append(self.title)
+        if self.body is not None:
+            if (len(self.body) < 20):
+                r.append(self.body)
+            else:
+                r.append(self.body[0:20] + " ....")
+        return ">>> ".join(r)
+
 class BaseParser:
     def __init__(self, verbose=False):
@@ -106,6 +159,21 @@ class BaseParser:
         self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
         self.justext_language = "Slovak"

+    def full_extract(self,content,bs,work_link):
+        """
+        Parse content and fill the object
+        """
+        pd = ParsedDocument()
+        pd.work_link = work_link
+
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
+        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
     def is_domain_good(self, domain):
         r = None
         # Netloc
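With this method the parser owns the whole extraction step: callers pass the raw content, the BeautifulSoup tree and the working link, and get back a filled ParsedDocument whose attributes are read directly. The following is a minimal sketch of the refactored call, not part of the commit; the HTML, URL and html.parser choice are assumptions, and extract_raw_text and extract_meta are expected to behave as elsewhere in this module.

    # Assumed usage sketch for the refactored extraction flow.
    import bs4

    parser = BaseParser()
    html = "<html><head><title>Example page</title></head><body><p>Hello crawler.</p></body></html>"
    bs = bs4.BeautifulSoup(html, "html.parser")
    pd = parser.full_extract(html, bs, "https://example.org/page")
    print(pd)           # ParsedDocument.__str__ joins authors, title and a body preview
    print(pd.link_set)  # attributes are read directly, e.g. by db.index_content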
@@ -192,7 +260,6 @@ class BaseParser:
         except lxml.etree.XMLSyntaxError:
             print("XML Syntax parse error")
         except lxml.etree.ParserError:
-
             print("XML Parse parse error")
         except justext.core.JustextError:
             print("Justext error")
@@ -295,34 +362,6 @@ class BaseParser:
         return tags,authors,title.strip(),article_published_time.strip(),description,section.strip()

-    def calculate_checksums(self, text):
-        """
-        @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
-        """
-        checksums = []
-        sizes = []
-        hval = 0
-        hsz = 0
-        sz = 0
-        for c in text:
-            cv = ord(c)
-            sz += 1
-            if cv > 64:
-                hval += (hval << 3) + cv
-                zv = hval >> 31
-                hval &= 0x7fffffff
-                hval += zv
-                hsz += 1
-            if c == "\n" and hsz > 0:
-                if hsz > 100:
-                    checksums.append(hval)
-                    sizes.append(sz)
-                sz = 0
-                hsz = 0
-        if hsz > 100:
-            checksums.append(hval)
-            sizes.append(sz)
-        return checksums, sizes
-
 class EnglishParser(BaseParser):
     def __init__(self):