unvisited strategy
parent 3687403184
commit 75e1b0cd6d
@@ -40,9 +40,12 @@ class Response:
         self.redirects = redirects
         self.visited_time = datetime.date.today()
         self.bs = None
-        self.link_status = link_status
         if content is not None and link_status == "good":
-            self.bs = bs4.BeautifulSoup(content, "lxml")
+            try:
+                self.bs = bs4.BeautifulSoup(content, "lxml")
+            except ValueError:
+                link_status = "bad_parse"
+        self.link_status = link_status

     def __str__(self):
         return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)
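Note: a minimal sketch of the guarded-parse pattern introduced above, assuming bs4 and lxml are installed; the function name and the single caught exception are illustrative, not an exhaustive list of parser failure modes.

import bs4

def parse_or_flag(content, link_status="good"):
    # Mirrors Response.__init__: only parse responses marked good, and
    # downgrade the status instead of raising if the parse fails.
    bs = None
    if content is not None and link_status == "good":
        try:
            bs = bs4.BeautifulSoup(content, "lxml")
        except ValueError:
            link_status = "bad_parse"
    return bs, link_status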
@@ -374,7 +377,7 @@ def parse_and_index(work_link,parser,responses,db):
     if len(responses) > 0:
         db.index_responses(work_link,responses)
         lr = responses[-1]
-        if lr.content is not None:
+        if lr.bs is not None:
             target_link = lr.get_canonical()
             parsed = ParsedDocument(parser,target_link)
             parsed.extract(lr.content, lr.bs)
@@ -384,11 +387,12 @@ def parse_and_index(work_link,parser,responses,db):

 def visit_sitemap(domain,connection,parser,db):
     link = "http://" + domain
+    print("Sitemap visit: " + link)
     responses = connection.html_download2(link)
     if len(responses) == 0:
         return False
     lr = responses[-1]
-    if lr.link_status.startswith("bad_"):
+    if lr.bs is None:
         return False

     target_link,outlinks = parse_and_index(link,parser,responses,db)
@@ -397,17 +401,18 @@ def visit_sitemap(domain,connection,parser,db):
     return True


-def visit_links(links,connection,parser,db):
+def visit_links(links,connection,parser,db,is_online):
+    # if it is not online, then just check the links
     outlinks = []
     junklinks = []
     badrobotlinks = []
     for work_link in links:
         responses = []
         if not parser.is_link_good(work_link):
-            db.update_link_status(parser,work_link,"junk")
-        elif connection.is_robot_good(work_link):
-            db.update_link_status(parser,work_link,"bad_robot")
-        else:
+            db.update_link_status(work_link,"bad_link")
+        elif is_online and not connection.is_robot_good(work_link):
+            db.update_link_status(work_link,"bad_robot")
+        elif is_online:
             responses = connection.html_download2(work_link)
         target_link,links = parse_and_index(work_link,parser,responses,db)
         nl = normalize_link(target_link)
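Note: a hedged sketch of the new is_online gating in visit_links, with stand-in callables instead of the real parser/connection/db objects; the function and parameter names are illustrative.

def classify_link(work_link, is_link_good, is_robot_good, is_online):
    # Returns (status, should_download), mirroring the branch order above:
    # bad links are flagged even offline, robots.txt is only consulted online,
    # and downloads happen only when the domain is online.
    if not is_link_good(work_link):
        return "bad_link", False
    if is_online and not is_robot_good(work_link):
        return "bad_robot", False
    return "good", is_online

# Offline runs only reclassify stored links; they never download:
print(classify_link("http://example.com/page", lambda l: True, lambda l: True, False))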
@@ -421,13 +426,14 @@ def visit_domain(domain,parser,db):
     p = parser
     # Get links from frontpage
     # TODO Sitemap
-    res = visit_sitemap(domain,c,parser,db)
-    if not res:
-        return False
+    is_online = False
+    if parser.is_domain_good(domain):
+        # Is domain online?
+        is_online = visit_sitemap(domain,c,parser,db)
     for i in range(p.crawl_rounds):
         # Visit links from frontpage
         links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
-        visit_links(links,c,p,db)
+        visit_links(links,c,p,db,is_online)
         db.check_domain(domain)
     return True

@@ -5,6 +5,8 @@ import os
 import pkg_resources
 import datetime
 from websucker.parser import normalize_link,urlunparse
+import collections
+import math

 VERSION = "sucker6"

@@ -140,14 +142,16 @@ INSERT INTO content(
     def index_responses(self,source_link,responses):
         # Redirect links
         pl = normalize_link(source_link)
+        domain = pl[1]
+        npl = urlunparse(pl)
         for response in responses:
             tl = response.get_canonical()
-            if pl != tl:
-                self.update_link_status(source_link,"redirect",tl)
+            if npl != tl:
+                self.update_link_status(npl,"redirect",tl)
             d = (
-                pl[1],
-                source_link,
-                response.get_canonical(),
+                domain,
+                npl,
+                tl,
                 response.redirects,
                 response.status,
                 response.headers,
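Note: an approximate illustration of the redirect check above, using urllib.parse in place of websucker's normalize_link/urlunparse helpers, which are assumed to return a parse-like tuple with the domain at index 1; the canonical value is a stand-in for response.get_canonical().

from urllib.parse import urlparse, urlunparse

source_link = "http://Example.com/a"
pl = urlparse(source_link.lower())
domain = pl[1]                       # netloc, reused for every response in the batch
npl = urlunparse(pl)                 # normalized source link, stored instead of the raw one
canonical = "http://example.com/b"   # stand-in for response.get_canonical()
if npl != canonical:
    print("redirect:", npl, "->", canonical)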
@@ -166,8 +170,16 @@ INSERT INTO content(
         fd = 0
         jd = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
+        # TODO subdomain analysis
+        #dd = collections.defaultdict(set)
         for row in rows:
-            if not parser.is_domain_good(row[0]):
+            domain = row[0]
+            #subdomains = domain.split(".")
+            #d2 = subdomains[-2] + "." + subdomains[-1]
+            #if len(subdomains) > 2:
+            #    d3 = ".".join(subdomains[0:-2])
+            #    dd[d2].add(d3)
+            if not parser.is_domain_good(domain):
                 jd += 1
             if row[1] is not None:
                 gs += row[1]
@@ -191,6 +203,9 @@ INSERT INTO content(
         print("Junk domains: {}".format(jd))
         print("New links : {}".format(sl))
         print("Finished domains : {}".format(fd))
+        #for d,sd in dd.items():
+        #    if len(sd) > 1:
+        #        print(d + " " + ",".join(sd))

     def daily_report(self):
         #rows = self.session.execute(self.daily_links_select)
@@ -222,8 +237,8 @@ INSERT INTO content(

         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))

-    def update_link_status(self,parser,links,status,redirect_target=None):
-        pl = normalize_link(source_link)
+    def update_link_status(self,link,status,redirect_target=None):
+        pl = normalize_link(link)
         r = (
             status,
             redirect_target,
@@ -244,6 +259,7 @@ INSERT INTO content(
             follow_links.add(urlunparse(link))

         newlinkdomains = set()
+        newlinkcount = 0
         for link in follow_links:
             value = []
             nl = normalize_link(link)
@@ -253,8 +269,10 @@ INSERT INTO content(
             row = rows.one()
             if row.applied:
                 newlinkdomains.add(nl[1])
+                newlinkcount += 1
         for domain in newlinkdomains:
             self.check_domain(domain)
+        print("{} new links, {} new domains".format(newlinkcount,len(newlinkdomains)))


     def index_content(self,target_link,parsed_document):
@@ -306,7 +324,6 @@ INSERT INTO content(
         originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
         if originality < 0.8:
             link_status = "bad_copy"
-        print(nl)
         self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
         content_future.result()
         print("<<<< " + link_status + " " + str(originality))
@@ -428,7 +445,6 @@ INSERT INTO content(
             domain)
         if fetched_count > 0 or seen_count > 0:
             self.session.execute(self.domain_quality_update,uv)
-            print(uv)
         return average_fetched_good_characters

     def all_domains(self,count):
@@ -476,16 +492,34 @@ INSERT INTO content(
         # get all domains
         rows = self.session.execute(self.domains_select)
         domains = []
+        # Analyze third level domains
+        dd = collections.defaultdict(set)
+        third_count = 0
         for row in rows:
             domain = row[0]
             seen_count = row[1]
             fetched_count = row[2]
             gain_ratio = row[3]
             afg = row[4]
-            if seen_count and not fetched_count and parser.is_domain_good(domain):
-                domains.append((domain,0))
-        ss = min(len(domains),count)
-        return random.sample(domains,ss)
+            if seen_count and not fetched_count:
+                subdomains = domain.split(".")
+                d2 = subdomains[-2] + "." + subdomains[-1]
+                dd[d2].add(domain)
+        # Select second level first
+        result = []
+        # then select third level
+        ll = list(dd.items())
+        random.shuffle(ll)
+        domain_weight = count / len(ll)
+        for domain,subdomains in ll:
+            dl = list(subdomains)
+            link_weight = domain_weight / len(dl)
+            random.shuffle(dl)
+            for d in dl:
+                r = random.random()
+                if r < link_weight:
+                    result.append((d,0))
+        return result

     def get_visit_links(self,domain,recent_count,old_count,random_count):
         dblinks = []
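Note: a standalone sketch of the unvisited-domain selection introduced above: unfetched domains are grouped by their second-level domain, each group gets an equal share of the budget, and subdomains are then sampled with probability proportional to that share, so a single hosting domain with many subdomains can no longer crowd out other sites. The function name and input shape are illustrative, and the expected result size is roughly count rather than exactly count.

import collections
import random

def sample_unvisited(domains, count):
    # domains: iterable of (domain_name, seen_count, fetched_count)
    dd = collections.defaultdict(set)
    for name, seen, fetched in domains:
        if seen and not fetched:
            parts = name.split(".")
            dd[".".join(parts[-2:])].add(name)
    if not dd:
        return []
    groups = list(dd.items())
    random.shuffle(groups)
    domain_weight = count / len(groups)
    result = []
    for _, subdomains in groups:
        candidates = list(subdomains)
        link_weight = domain_weight / len(candidates)
        random.shuffle(candidates)
        for d in candidates:
            if random.random() < link_weight:
                result.append((d, 0))
    return result

# Example: two blog subdomains and one independent site share the budget evenly.
print(sample_unvisited([("a.blog.sk", 3, 0), ("b.blog.sk", 1, 0), ("example.com", 2, 0)], 2))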
|
@ -113,7 +113,7 @@ class BaseParser:
|
|||||||
r = "Port in domain"
|
r = "Port in domain"
|
||||||
elif len(domain) < 4:
|
elif len(domain) < 4:
|
||||||
r = "Too short domain"
|
r = "Too short domain"
|
||||||
elif len(domain) > 50:
|
elif len(domain) > 127:
|
||||||
r = "Too long location"
|
r = "Too long location"
|
||||||
elif domain.startswith(".") or domain.endswith("."):
|
elif domain.startswith(".") or domain.endswith("."):
|
||||||
r = "Malformed domain"
|
r = "Malformed domain"
|
||||||
@@ -152,16 +152,8 @@ class BaseParser:
             return False
         for c in link:
             if ord(c) >= 128:
-                r = "Bad domain character"
+                r = "Bad link character"
                 break
-        for p in self.skipdomains:
-            if domain.endswith(p):
-                r = "Bad domain"
-                break
-        if ".b-" in domain:
-            r = "Bad domain"
-        if len(domain) > 127:
-            r = "Too long path"
         # Path
         for t in self.skiptypes:
             if path.lower().endswith(t):
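Note: the per-link blacklist checks (the skipdomains suffix loop, the ".b-" pattern, and the duplicated length check) are dropped above; what remains for the link itself is the ASCII check sketched below. The function name is illustrative.

def link_character_problem(link):
    # Flags links containing non-ASCII characters, as in the remaining check.
    for c in link:
        if ord(c) >= 128:
            return "Bad link character"
    return None

print(link_character_problem("http://example.sk/clanok-š"))  # -> "Bad link character"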