Commit 85c0380e16 (parent 75e1b0cd6d)
Commit message: zz
@@ -15,6 +15,7 @@ import bs4
 import pycurl
 import urllib.robotparser
 import collections
+import random
 
 
 from websucker.parser import normalize_link,urlunparse
@@ -430,15 +431,19 @@ def visit_domain(domain,parser,db):
     if parser.is_domain_good(domain):
         # Is domain online?
         is_online = visit_sitemap(domain,c,parser,db)
-        for i in range(p.crawl_rounds):
-            # Visit links from frontpage
-            links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
-            visit_links(links,c,p,db,is_online)
+        if is_online:
+            for i in range(p.crawl_rounds):
+                # Visit links from frontpage
+                links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
+                visit_links(links,c,p,db,is_online)
+                db.check_domain(domain)
+        else:
             db.check_domain(domain)
     return True
 
 def process_domains(domains,visit,parser,db,queue):
     print("Websucker Agenda>>")
+    random.shuffle(domains)
     for domain in domains:
         assert len(domain[0]) > 1
         print(domain)
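Reviewer note: the crawl rounds are now gated on the sitemap check, so an offline domain is recorded once via db.check_domain and no links are fetched, while process_domains shuffles its agenda before visiting. A minimal sketch of the new control flow, using a hypothetical StubDB in place of the real database object (the stub, the sketch function and the literal link budgets are illustrative only, not repository code):

# Sketch of the new visit_domain flow; StubDB is a stand-in, not repository code.
class StubDB:
    def get_visit_links(self, domain, recent, old, rnd):
        return []                          # pretend the frontier is empty
    def check_domain(self, domain):
        print("check_domain:", domain)     # domain quality is re-evaluated in both branches

def visit_domain_sketch(domain, is_online, crawl_rounds, db):
    if is_online:
        for _ in range(crawl_rounds):      # crawl only when the sitemap was reachable
            links = db.get_visit_links(domain, 5, 5, 5)
            # visit_links(links, ...) would fetch and store the pages here
            db.check_domain(domain)
    else:
        db.check_domain(domain)            # offline: record the state once, skip crawling
    return True

visit_domain_sketch("example.org", is_online=False, crawl_rounds=3, db=StubDB())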
@@ -97,6 +97,21 @@ def unvisited(ctx, count):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)
 
+@cli.command(help="Visit domains from file")
+@click.pass_context
+@click.argument("name")
+def file(ctx, name):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = []
+    with open(name) as f:
+        for l in f:
+            domains.append((l.strip(),0))
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p,db,q)
+
 @cli.command(help="Visit url and get links. Start here")
 @click.pass_context
 @click.argument("link")
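Reviewer note: the new file command reads one domain per line and seeds every entry with a seen count of 0; process_domains later asserts len(domain[0]) > 1, so a trailing blank line in the input would trip that assert. A minimal sketch of the parsing step, with an inline, made-up domains.txt (the file name and its contents are illustrative only):

# Illustrative only: build the (domain, seen_count) tuples the way the new `file` command does.
with open("domains.txt", "w") as f:
    f.write("example.org\nblog.example.org\n")

domains = []
with open("domains.txt") as f:
    for l in f:
        domains.append((l.strip(), 0))   # strip the newline, start with count 0

print(domains)  # [('example.org', 0), ('blog.example.org', 0)]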
@@ -163,45 +163,51 @@ INSERT INTO content(
     def summary(self,parser):
         gs = 0
         cs = 0
-        fc = 0
+        fetched_documents = 0
         vd = 0
-        ud = 0
+        unvisited_domains = 0
+        unvisited_junk_domains = 0
         sl = 0
         fd = 0
-        jd = 0
+        junk_domains = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
         # TODO submdomain analysis
         #dd = collections.defaultdict(set)
+        second_level = set()
         for row in rows:
             domain = row[0]
-            #subdomains = domain.split(".")
+            subdomains = domain.split(".")
             #d2 = subdomains[-2] + "." + subdomains[-1]
-            #if len(subdomains) > 2:
-            #    d3 = ".".join(subdomains[0:-2])
-            #    dd[d2].add(d3)
+            if len(subdomains) > 2:
+                d3 = ".".join(subdomains[0:-2])
+                second_level.add(d3)
             if not parser.is_domain_good(domain):
-                jd += 1
+                junk_domains += 1
             if row[1] is not None:
                 gs += row[1]
             if row[2] is not None:
                 cs += row[2]
             if row[3] is not None:
-                fc += row[3]
+                fetched_documents += row[3]
             if row[4] is not None:
                 sl += row[4]
             if row[3] is None or row[3] == 0:
-                ud += 1
+                unvisited_domains += 1
+                if not parser.is_domain_good(domain):
+                    unvisited_junk_domains += 1
             else:
                 vd += 1
             if row[4] is None or row[4] == 0:
                 fd += 1
         print("Good characters: {}".format(gs))
         print("Fetched characters: {}".format(cs))
-        print("Fetched documents: {}".format(fc))
+        print("Fetched documents: {}".format(fetched_documents))
         print("Visited domains: {}".format(vd))
-        print("Unvisited domains: {}".format(ud))
-        print("Junk domains: {}".format(jd))
+        print("Unvisited domains: {}".format(unvisited_domains))
+        print("Junk domains: {}".format(junk_domains))
+        print("Unvisited junk domains: {}".format(unvisited_junk_domains))
         print("New links : {}".format(sl))
+        print("Second level domains: {}".format(len(second_level)))
         print("Finished domains : {}".format(fd))
         #for d,sd in dd.items():
         #    if len(sd) > 1:
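Reviewer note: the un-commented subdomain analysis keeps everything to the left of a domain's last two labels and reports the number of distinct prefixes as "Second level domains", so www.example.sk and www.portal.sk collapse to the same entry "www" (the commented-out dd[d2].add(d3) variant kept prefixes per registered domain). A small standalone illustration of that bookkeeping, with made-up sample domains:

# Illustrative only: replicate the subdomain bookkeeping from summary().
second_level = set()
for domain in ["www.example.sk", "blog.example.sk", "example.sk", "news.portal.sk"]:
    subdomains = domain.split(".")
    if len(subdomains) > 2:                # only domains with a prefix beyond name.tld
        d3 = ".".join(subdomains[0:-2])    # everything except the last two labels
        second_level.add(d3)

print("Second level domains: {}".format(len(second_level)))  # 3 -> www, blog, news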