Daniel Hladek 2020-06-08 16:09:47 +02:00
parent 75e1b0cd6d
commit 85c0380e16
3 changed files with 43 additions and 17 deletions

View File

@@ -15,6 +15,7 @@ import bs4
 import pycurl
 import urllib.robotparser
 import collections
+import random
 from websucker.parser import normalize_link,urlunparse
@@ -430,15 +431,19 @@ def visit_domain(domain,parser,db):
     if parser.is_domain_good(domain):
         # Is domain online?
         is_online = visit_sitemap(domain,c,parser,db)
-        for i in range(p.crawl_rounds):
-            # Visit links from frontpage
-            links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
-            visit_links(links,c,p,db,is_online)
-            db.check_domain(domain)
+        if is_online:
+            for i in range(p.crawl_rounds):
+                # Visit links from frontpage
+                links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
+                visit_links(links,c,p,db,is_online)
+                db.check_domain(domain)
+        else:
+            db.check_domain(domain)
     return True

 def process_domains(domains,visit,parser,db,queue):
     print("Websucker Agenda>>")
+    random.shuffle(domains)
     for domain in domains:
         assert len(domain[0]) > 1
         print(domain)
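In short: crawl rounds now run only when the sitemap visit reports the domain as online, offline domains are merely re-checked, and process_domains shuffles its agenda so repeated runs do not always start with the same domains. A minimal runnable sketch of the new control flow, with stubs standing in for the real websucker components (FakeDB, the stub visit_sitemap, and the parameter values are illustrative, not the actual API):

import random

class FakeDB:
    """Stand-in for the Cassandra-backed database used by the crawler."""
    def get_visit_links(self, domain, recent, old, rnd):
        return []  # the real method returns candidate links to fetch
    def check_domain(self, domain):
        print("checked", domain)

def visit_sitemap(domain):
    # Stub online check; the real function fetches the sitemap over HTTP.
    return domain != "offline.example"

def visit_domain(domain, crawl_rounds, db):
    is_online = visit_sitemap(domain)
    if is_online:
        # Crawl rounds run only for reachable domains.
        for _ in range(crawl_rounds):
            links = db.get_visit_links(domain, 5, 5, 5)
            # visit_links(links, ...) would fetch each link here
            db.check_domain(domain)
    else:
        # Offline domains are still re-checked so their state is recorded.
        db.check_domain(domain)
    return True

domains = [("example.com", 0), ("offline.example", 0)]
random.shuffle(domains)  # new behaviour: randomize the agenda order
for name, _ in domains:
    visit_domain(name, crawl_rounds=1, db=FakeDB())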

View File

@@ -97,6 +97,21 @@ def unvisited(ctx, count):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)

+@cli.command(help="Visit domains from file")
+@click.pass_context
+@click.argument("name")
+def file(ctx, name):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = []
+    with open(name) as f:
+        for l in f:
+            domains.append((l.strip(),0))
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p,db,q)
+
 @cli.command(help="Visit url and get links. Start here")
 @click.pass_context
 @click.argument("link")

View File

@@ -163,45 +163,51 @@ INSERT INTO content(
     def summary(self,parser):
         gs = 0
         cs = 0
-        fc = 0
+        fetched_documents = 0
         vd = 0
-        ud = 0
+        unvisited_domains = 0
+        unvisited_junk_domains = 0
         sl = 0
         fd = 0
-        jd = 0
+        junk_domains = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
         # TODO subdomain analysis
         #dd = collections.defaultdict(set)
+        second_level = set()
         for row in rows:
             domain = row[0]
-            #subdomains = domain.split(".")
+            subdomains = domain.split(".")
             #d2 = subdomains[-2] + "." + subdomains[-1]
-            #if len(subdomains) > 2:
-            #    d3 = ".".join(subdomains[0:-2])
-            #    dd[d2].add(d3)
+            if len(subdomains) > 2:
+                d3 = ".".join(subdomains[0:-2])
+                second_level.add(d3)
             if not parser.is_domain_good(domain):
-                jd += 1
+                junk_domains += 1
             if row[1] is not None:
                 gs += row[1]
             if row[2] is not None:
                 cs += row[2]
             if row[3] is not None:
-                fc += row[3]
+                fetched_documents += row[3]
             if row[4] is not None:
                 sl += row[4]
             if row[3] is None or row[3] == 0:
-                ud += 1
+                unvisited_domains += 1
+                if not parser.is_domain_good(domain):
+                    unvisited_junk_domains += 1
             else:
                 vd += 1
             if row[4] is None or row[4] == 0:
                 fd += 1
         print("Good characters: {}".format(gs))
         print("Fetched characters: {}".format(cs))
-        print("Fetched documents: {}".format(fc))
+        print("Fetched documents: {}".format(fetched_documents))
         print("Visited domains: {}".format(vd))
-        print("Unvisited domains: {}".format(ud))
-        print("Junk domains: {}".format(jd))
+        print("Unvisited domains: {}".format(unvisited_domains))
+        print("Junk domains: {}".format(junk_domains))
+        print("Unvisited junk domains: {}".format(unvisited_junk_domains))
         print("New links : {}".format(sl))
+        print("Second level domains: {}".format(len(second_level)))
         print("Finished domains : {}".format(fd))
         #for d,sd in dd.items():
         #    if len(sd) > 1:
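Besides the counter renames, the substantive change in summary() is the subdomain bookkeeping: for every domain name with more than two labels, everything left of the last two labels goes into the second_level set, and the count of distinct prefixes is printed. A self-contained illustration (the domain names are made up):

# Illustration of the new subdomain counting in summary().
domains = ["example.com", "blog.example.com",
           "shop.example.com", "blog.example.org"]  # made-up names
second_level = set()
for domain in domains:
    subdomains = domain.split(".")
    if len(subdomains) > 2:
        d3 = ".".join(subdomains[0:-2])  # "blog" from "blog.example.com"
        second_level.add(d3)
print("Second level domains: {}".format(len(second_level)))  # -> 2

Note the set keys on the prefix alone, so blog.example.com and blog.example.org count once.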