From 85c0380e16132d25806273f26dfc22e99c7d0a3e Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Mon, 8 Jun 2020 16:09:47 +0200 Subject: [PATCH] zz --- websucker/agent.py | 13 +++++++++---- websucker/cli.py | 15 +++++++++++++++ websucker/db.py | 32 +++++++++++++++++++------------- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/websucker/agent.py b/websucker/agent.py index e385e4a..860a5ac 100755 --- a/websucker/agent.py +++ b/websucker/agent.py @@ -15,6 +15,7 @@ import bs4 import pycurl import urllib.robotparser import collections +import random from websucker.parser import normalize_link,urlunparse @@ -430,15 +431,19 @@ def visit_domain(domain,parser,db): if parser.is_domain_good(domain): # Is domain online? is_online = visit_sitemap(domain,c,parser,db) - for i in range(p.crawl_rounds): - # Visit links from frontpage - links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links) - visit_links(links,c,p,db,is_online) + if is_online: + for i in range(p.crawl_rounds): + # Visit links from frontpage + links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links) + visit_links(links,c,p,db,is_online) + db.check_domain(domain) + else: db.check_domain(domain) return True def process_domains(domains,visit,parser,db,queue): print("Websucker Agenda>>") + random.shuffle(domains) for domain in domains: assert len(domain[0]) > 1 print(domain) diff --git a/websucker/cli.py b/websucker/cli.py index 99ea05a..b094b04 100644 --- a/websucker/cli.py +++ b/websucker/cli.py @@ -97,6 +97,21 @@ def unvisited(ctx, count): q = create_queue_from_context(ctx) process_domains(domains,ctx.obj["visit"],p,db,q) +@cli.command(help="Visit domains from file") +@click.pass_context +@click.argument("name") +def file(ctx, name): + db = create_database_from_context(ctx) + p = ctx.obj["parser"] + domains = [] + with open(name) as f: + for l in f: + domains.append((l.strip(),0)) + q = None + if ctx.obj["queue"]: + q = create_queue_from_context(ctx) + process_domains(domains,ctx.obj["visit"],p,db,q) + @cli.command(help="Visit url and get links. Start here") @click.pass_context @click.argument("link") diff --git a/websucker/db.py b/websucker/db.py index 58207b8..e1aa8aa 100644 --- a/websucker/db.py +++ b/websucker/db.py @@ -163,45 +163,51 @@ INSERT INTO content( def summary(self,parser): gs = 0 cs = 0 - fc = 0 + fetched_documents = 0 vd = 0 - ud = 0 + unvisited_domains = 0 + unvisited_junk_domains = 0 sl = 0 fd = 0 - jd = 0 + junk_domains = 0 rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1") # TODO submdomain analysis #dd = collections.defaultdict(set) + second_level = set() for row in rows: domain = row[0] - #subdomains = domain.split(".") + subdomains = domain.split(".") #d2 = subdomains[-2] + "." + subdomains[-1] - #if len(subdomains) > 2: - # d3 = ".".join(subdomains[0:-2]) - # dd[d2].add(d3) + if len(subdomains) > 2: + d3 = ".".join(subdomains[0:-2]) + second_level.add(d3) if not parser.is_domain_good(domain): - jd += 1 + junk_domains += 1 if row[1] is not None: gs += row[1] if row[2] is not None: cs += row[2] if row[3] is not None: - fc += row[3] + fetched_documents += row[3] if row[4] is not None: sl += row[4] if row[3] is None or row[3] == 0: - ud += 1 + unvisited_domains += 1 + if not parser.is_domain_good(domain): + unvisited_junk_domains += 1 else: vd += 1 if row[4] is None or row[4] == 0: fd += 1 print("Good characters: {}".format(gs)) print("Fetched characters: {}".format(cs)) - print("Fetched documents: {}".format(fc)) + print("Fetched documents: {}".format(fetched_documents)) print("Visited domains: {}".format(vd)) - print("Unvisited domains: {}".format(ud)) - print("Junk domains: {}".format(jd)) + print("Unvisited domains: {}".format(unvisited_domains)) + print("Junk domains: {}".format(junk_domains)) + print("Unvisited junk domains: {}".format(unvisited_junk_domains)) print("New links : {}".format(sl)) + print("Second level domains: {}".format(len(second_level))) print("Finished domains : {}".format(fd)) #for d,sd in dd.items(): # if len(sd) > 1: