unvisited strategy

Daniel Hladek 2020-06-06 11:29:36 +02:00
parent 3687403184
commit 75e1b0cd6d
3 changed files with 69 additions and 37 deletions

View File

@@ -40,9 +40,12 @@ class Response:
         self.redirects = redirects
         self.visited_time = datetime.date.today()
         self.bs = None
-        self.link_status = link_status
         if content is not None and link_status == "good":
-            self.bs = bs4.BeautifulSoup(content, "lxml")
+            try:
+                self.bs = bs4.BeautifulSoup(content, "lxml")
+            except ValueError:
+                link_status = "bad_parse"
+        self.link_status = link_status

     def __str__(self):
         return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)
@@ -374,7 +377,7 @@ def parse_and_index(work_link,parser,responses,db):
     if len(responses) > 0:
         db.index_responses(work_link,responses)
         lr = responses[-1]
-        if lr.content is not None:
+        if lr.bs is not None:
             target_link = lr.get_canonical()
             parsed = ParsedDocument(parser,target_link)
             parsed.extract(lr.content, lr.bs)
@@ -384,11 +387,12 @@ def parse_and_index(work_link,parser,responses,db):

 def visit_sitemap(domain,connection,parser,db):
     link = "http://" + domain
+    print("Sitemap visit: " + link)
     responses = connection.html_download2(link)
     if len(responses) == 0:
         return False
     lr = responses[-1]
-    if lr.link_status.startswith("bad_"):
+    if lr.bs is None:
         return False

     target_link,outlinks = parse_and_index(link,parser,responses,db)
@@ -397,17 +401,18 @@ def visit_sitemap(domain,connection,parser,db):
     return True

-def visit_links(links,connection,parser,db):
+def visit_links(links,connection,parser,db,is_online):
+    # If it is not online, then just check the links
     outlinks = []
     junklinks = []
     badrobotlinks = []
     for work_link in links:
         responses = []
         if not parser.is_link_good(work_link):
-            db.update_link_status(parser,work_link,"junk")
-        elif connection.is_robot_good(work_link):
-            db.update_link_status(parser,work_link,"bad_robot")
-        else:
+            db.update_link_status(work_link,"bad_link")
+        elif is_online and not connection.is_robot_good(work_link):
+            db.update_link_status(work_link,"bad_robot")
+        elif is_online:
             responses = connection.html_download2(work_link)
         target_link,links = parse_and_index(work_link,parser,responses,db)
         nl = normalize_link(target_link)
@@ -421,13 +426,14 @@ def visit_domain(domain,parser,db):
     p = parser
     # Get links from frontpage
     # TODO Sitemap
-    res = visit_sitemap(domain,c,parser,db)
-    if not res:
-        return False
+    is_online = False
+    if parser.is_domain_good(domain):
+        # Is domain online?
+        is_online = visit_sitemap(domain,c,parser,db)
     for i in range(p.crawl_rounds):
         # Visit links from frontpage
         links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
-        visit_links(links,c,p,db)
+        visit_links(links,c,p,db,is_online)
     db.check_domain(domain)
     return True
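
Note: a minimal sketch of the offline path that the new is_online flag enables. The stub classes below are hypothetical stand-ins for the real websucker parser and database objects; only the branching mirrors the diff above.

    # Hypothetical stubs; only the is_online branching mirrors visit_links above.
    class ParserStub:
        def is_link_good(self, link):
            return not link.endswith(".jpg")

    class DbStub:
        def update_link_status(self, link, status, redirect_target=None):
            print(link, status)

    def check_links_offline(links, parser, db):
        # With is_online=False, visit_links never calls connection.html_download2;
        # it only re-labels the stored links, e.g. as "bad_link".
        for work_link in links:
            if not parser.is_link_good(work_link):
                db.update_link_status(work_link, "bad_link")

    check_links_offline(["http://example.sk/a.jpg", "http://example.sk/page"],
                        ParserStub(), DbStub())
    # prints: http://example.sk/a.jpg bad_link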

View File

@@ -5,6 +5,8 @@ import os
 import pkg_resources
 import datetime
 from websucker.parser import normalize_link,urlunparse
+import collections
+import math

 VERSION = "sucker6"
@@ -140,14 +142,16 @@ INSERT INTO content(
     def index_responses(self,source_link,responses):
         # Redirect links
         pl = normalize_link(source_link)
+        domain = pl[1]
+        npl = urlunparse(pl)
         for response in responses:
             tl = response.get_canonical()
-            if pl != tl:
-                self.update_link_status(source_link,"redirect",tl)
+            if npl != tl:
+                self.update_link_status(npl,"redirect",tl)
             d = (
-                pl[1],
-                source_link,
-                response.get_canonical(),
+                domain,
+                npl,
+                tl,
                 response.redirects,
                 response.status,
                 response.headers,
@@ -166,8 +170,16 @@ INSERT INTO content(
         fd = 0
         jd = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
+        # TODO subdomain analysis
+        #dd = collections.defaultdict(set)
         for row in rows:
-            if not parser.is_domain_good(row[0]):
+            domain = row[0]
+            #subdomains = domain.split(".")
+            #d2 = subdomains[-2] + "." + subdomains[-1]
+            #if len(subdomains) > 2:
+            #    d3 = ".".join(subdomains[0:-2])
+            #    dd[d2].add(d3)
+            if not parser.is_domain_good(domain):
                 jd += 1
             if row[1] is not None:
                 gs += row[1]
@@ -191,6 +203,9 @@ INSERT INTO content(
         print("Junk domains: {}".format(jd))
         print("New links : {}".format(sl))
         print("Finished domains : {}".format(fd))
+        #for d,sd in dd.items():
+        #    if len(sd) > 1:
+        #        print(d + " " + ",".join(sd))

     def daily_report(self):
         #rows = self.session.execute(self.daily_links_select)
@@ -222,8 +237,8 @@ INSERT INTO content(
         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))

-    def update_link_status(self,parser,links,status,redirect_target=None):
-        pl = normalize_link(source_link)
+    def update_link_status(self,link,status,redirect_target=None):
+        pl = normalize_link(link)
         r = (
             status,
             redirect_target,
@@ -244,6 +259,7 @@ INSERT INTO content(
             follow_links.add(urlunparse(link))

         newlinkdomains = set()
+        newlinkcount = 0
         for link in follow_links:
             value = []
             nl = normalize_link(link)
@@ -253,8 +269,10 @@ INSERT INTO content(
             row = rows.one()
             if row.applied:
                 newlinkdomains.add(nl[1])
+                newlinkcount += 1
         for domain in newlinkdomains:
             self.check_domain(domain)
+        print("{} new links, {} new domains".format(newlinkcount,len(newlinkdomains)))

     def index_content(self,target_link,parsed_document):
@@ -306,7 +324,6 @@ INSERT INTO content(
         originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
         if originality < 0.8:
             link_status = "bad_copy"
-        print(nl)
         self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
         content_future.result()
         print("<<<< " + link_status + " " + str(originality))
@@ -428,7 +445,6 @@ INSERT INTO content(
             domain)
         if fetched_count > 0 or seen_count > 0:
             self.session.execute(self.domain_quality_update,uv)
-            print(uv)
         return average_fetched_good_characters

     def all_domains(self,count):
@@ -476,16 +492,34 @@ INSERT INTO content(
         # get all domains
         rows = self.session.execute(self.domains_select)
         domains = []
+        # Analyze third level domains
+        dd = collections.defaultdict(set)
+        third_count = 0
         for row in rows:
             domain = row[0]
             seen_count = row[1]
             fetched_count = row[2]
             gain_ratio = row[3]
             afg = row[4]
-            if seen_count and not fetched_count and parser.is_domain_good(domain):
-                domains.append((domain,0))
-        ss = min(len(domains),count)
-        return random.sample(domains,ss)
+            if seen_count and not fetched_count:
+                subdomains = domain.split(".")
+                d2 = subdomains[-2] + "." + subdomains[-1]
+                dd[d2].add(domain)
+        # Select second level first
+        result = []
+        # then select third level
+        ll = list(dd.items())
+        random.shuffle(ll)
+        domain_weight = count / len(ll)
+        for domain,subdomains in ll:
+            dl = list(subdomains)
+            link_weight = domain_weight / len(dl)
+            random.shuffle(dl)
+            for d in dl:
+                r = random.random()
+                if r < link_weight:
+                    result.append((d,0))
+        return result

     def get_visit_links(self,domain,recent_count,old_count,random_count):
         dblinks = []
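
Note on the new selection logic above: every second-level domain receives an equal share of the requested count (domain_weight = count / number of second-level domains), and each of its subdomains is then kept with probability domain_weight / len(subdomains). The expected result size therefore stays near count, and a second-level domain with many subdomains cannot dominate the sample. A self-contained sketch with synthetic domain names (the real rows come from the domains_select query):

    import collections
    import random

    # Standalone sketch of the sampling introduced in the diff above;
    # the domain list here is synthetic.
    unvisited = ["a.example.sk", "b.example.sk", "other.sk", "blog.other.sk"]
    count = 2

    dd = collections.defaultdict(set)
    for domain in unvisited:
        subdomains = domain.split(".")
        d2 = subdomains[-2] + "." + subdomains[-1]   # second-level domain
        dd[d2].add(domain)

    result = []
    ll = list(dd.items())
    random.shuffle(ll)
    domain_weight = count / len(ll)            # equal budget per second-level domain
    for d2, subdomains in ll:
        dl = list(subdomains)
        link_weight = domain_weight / len(dl)  # split that budget across its subdomains
        random.shuffle(dl)
        for d in dl:
            if random.random() < link_weight:
                result.append((d, 0))

    print(result)  # expected size is about `count`; each second-level domain
                   # contributes roughly the same number of entries in expectation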

View File

@@ -113,7 +113,7 @@ class BaseParser:
             r = "Port in domain"
         elif len(domain) < 4:
             r = "Too short domain"
-        elif len(domain) > 50:
+        elif len(domain) > 127:
             r = "Too long location"
         elif domain.startswith(".") or domain.endswith("."):
             r = "Malformed domain"
@@ -152,16 +152,8 @@ class BaseParser:
             return False
         for c in link:
             if ord(c) >= 128:
-                r = "Bad domain character"
+                r = "Bad link character"
                 break
-        for p in self.skipdomains:
-            if domain.endswith(p):
-                r = "Bad domain"
-                break
-        if ".b-" in domain:
-            r = "Bad domain"
-        if len(domain) > 127:
-            r = "Too long path"
         # Path
         for t in self.skiptypes:
             if path.lower().endswith(t):