zz

commit 85c0380e16
parent 75e1b0cd6d
@@ -15,6 +15,7 @@ import bs4
 import pycurl
 import urllib.robotparser
 import collections
+import random
 
 
 from websucker.parser import normalize_link,urlunparse
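Note: the only line added in this hunk is `import random`; it backs the `random.shuffle(domains)` call introduced in `process_domains` below. Since `urllib.robotparser` appears among the unchanged imports, here is a minimal sketch of how that stdlib module is typically used to honour robots.txt before fetching; the user agent and URLs are placeholders, and this exact call site does not appear in the hunks shown:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://example.com/robots.txt")   # placeholder URL
    rp.read()                                      # download and parse robots.txt
    if rp.can_fetch("websucker", "https://example.com/some/page.html"):
        pass  # this URL may be fetched under the site's robots policy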
@@ -430,15 +431,19 @@ def visit_domain(domain,parser,db):
     if parser.is_domain_good(domain):
         # Is domain online?
         is_online = visit_sitemap(domain,c,parser,db)
         if is_online:
+            for i in range(p.crawl_rounds):
+                # Visit links from frontpage
+                links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
+                visit_links(links,c,p,db,is_online)
+                db.check_domain(domain)
         else:
             db.check_domain(domain)
     return True
 
 def process_domains(domains,visit,parser,db,queue):
     print("Websucker Agenda>>")
+    random.shuffle(domains)
     for domain in domains:
         assert len(domain[0]) > 1
         print(domain)
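The change wraps the per-domain link visiting in `for i in range(p.crawl_rounds)`, so one `visit_domain` call now fetches up to `crawl_rounds` batches of links and re-runs `db.check_domain` after each batch, while `random.shuffle(domains)` makes `process_domains` walk the agenda in random order. Note that `p` and `c` are not parameters of `visit_domain(domain,parser,db)`; they are presumably bound to the parser and a pycurl handle earlier in the function, outside this hunk. A minimal, self-contained sketch of the round-based pattern (all names below are stand-ins, not the project's API):

    import random

    def get_visit_links(domain, recent, old, rand):
        # stand-in for db.get_visit_links: mix recent, old and random link pools
        return ["https://{}/page{}".format(domain, i) for i in range(recent + old + rand)]

    def crawl_domain(domain, crawl_rounds=3):
        for _ in range(crawl_rounds):              # repeat the batch, as in the diff
            links = get_visit_links(domain, recent=2, old=1, rand=1)
            for link in links:
                print("visiting", link)            # stand-in for visit_links(...)

    domains = ["example.com", "example.org"]
    random.shuffle(domains)                        # randomized agenda order
    for d in domains:
        crawl_domain(d)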
@@ -97,6 +97,21 @@ def unvisited(ctx, count):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)
+
+@cli.command(help="Visit domains from file")
+@click.pass_context
+@click.argument("name")
+def file(ctx, name):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = []
+    with open(name) as f:
+        for l in f:
+            domains.append((l.strip(),0))
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p,db,q)
 
 @cli.command(help="Visit url and get links. Start here")
 @click.pass_context
 @click.argument("link")
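A review note on the new `file` command: `domains.append((l.strip(),0))` runs for every input line, so a blank line in the file becomes `("",0)`, which would later trip the `assert len(domain[0]) > 1` in `process_domains`. A defensive variant of the read loop (a sketch, not what the commit does; "domains.txt" is a hypothetical input file with one domain per line):

    domains = []
    with open("domains.txt") as f:
        for l in f:
            l = l.strip()
            if len(l) > 1:             # skip blank or too-short lines up front
                domains.append((l, 0))
    # e.g. [("example.com", 0), ("example.org", 0)]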
@@ -163,45 +163,51 @@ INSERT INTO content(
     def summary(self,parser):
         gs = 0
         cs = 0
-        fc = 0
+        fetched_documents = 0
         vd = 0
-        ud = 0
+        unvisited_domains = 0
+        unvisited_junk_domains = 0
         sl = 0
         fd = 0
-        jd = 0
+        junk_domains = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
         # TODO subdomain analysis
         #dd = collections.defaultdict(set)
+        second_level = set()
         for row in rows:
             domain = row[0]
-            #subdomains = domain.split(".")
+            subdomains = domain.split(".")
             #d2 = subdomains[-2] + "." + subdomains[-1]
             #if len(subdomains) > 2:
             #    d3 = ".".join(subdomains[0:-2])
             #    dd[d2].add(d3)
+            if len(subdomains) > 2:
+                d3 = ".".join(subdomains[0:-2])
+                second_level.add(d3)
             if not parser.is_domain_good(domain):
-                jd += 1
+                junk_domains += 1
             if row[1] is not None:
                 gs += row[1]
             if row[2] is not None:
                 cs += row[2]
             if row[3] is not None:
-                fc += row[3]
+                fetched_documents += row[3]
             if row[4] is not None:
                 sl += row[4]
             if row[3] is None or row[3] == 0:
-                ud += 1
+                unvisited_domains += 1
+                if not parser.is_domain_good(domain):
+                    unvisited_junk_domains += 1
             else:
                 vd += 1
             if row[4] is None or row[4] == 0:
                 fd += 1
         print("Good characters: {}".format(gs))
         print("Fetched characters: {}".format(cs))
-        print("Fetched documents: {}".format(fc))
+        print("Fetched documents: {}".format(fetched_documents))
         print("Visited domains: {}".format(vd))
-        print("Unvisited domains: {}".format(ud))
-        print("Junk domains: {}".format(jd))
+        print("Unvisited domains: {}".format(unvisited_domains))
+        print("Junk domains: {}".format(junk_domains))
+        print("Unvisited junk domains: {}".format(unvisited_junk_domains))
         print("New links : {}".format(sl))
+        print("Second level domains: {}".format(len(second_level)))
         print("Finished domains : {}".format(fd))
         #for d,sd in dd.items():
         #    if len(sd) > 1:
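A note on the new bookkeeping in `summary`: `d3 = ".".join(subdomains[0:-2])` joins everything left of the last two labels, so for "www.example.com" the set collects "www", not "example.com". The "Second level domains" line therefore counts distinct subdomain prefixes rather than registrable second-level domains. If the latter was intended, a sketch of the fix (sample domains are made up):

    second_level = set()
    for domain in ["www.example.com", "blog.example.com", "example.org"]:
        labels = domain.split(".")
        if len(labels) >= 2:
            second_level.add(".".join(labels[-2:]))   # e.g. "example.com"
    print("Second level domains: {}".format(len(second_level)))   # prints 2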