From 8d4a873005d909d5a3f08f5b6a028ca6c8113ac5 Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Wed, 20 May 2020 09:22:19 +0200 Subject: [PATCH] zz --- websucker/cli.py | 11 ++++++----- websucker/db.py | 10 +++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/websucker/cli.py b/websucker/cli.py index 93d9d79..34bdbd8 100644 --- a/websucker/cli.py +++ b/websucker/cli.py @@ -77,11 +77,12 @@ def work(ctx): #@click.option("visit",is_flag=True) def best(ctx, count): db = create_database_from_context(ctx) - domains = db.get_best_domains(count) + p = ctx.obj["parser"] + domains = db.get_best_domains(count,p) q = None if ctx.obj["queue"]: q = create_queue_from_context(ctx) - process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q) + process_domains(domains,ctx.obj["visit"],p ,db,q) @cli.command(help="Find unvisited domains, Visit a site, get links and crawl") @@ -89,12 +90,12 @@ def best(ctx, count): @click.argument("count",type=int,default=20) def unvisited(ctx, count): db = create_database_from_context(ctx) - domains = db.get_unvisited_domains(count) - + p = ctx.obj["parser"] + domains = db.get_unvisited_domains(count,p) q = None if ctx.obj["queue"]: q = create_queue_from_context(ctx) - process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q) + process_domains(domains,ctx.obj["visit"],p,db,q) @cli.command(help="Visit url and get links. Start here") @click.pass_context diff --git a/websucker/db.py b/websucker/db.py index 22bb7f1..37fa9ce 100644 --- a/websucker/db.py +++ b/websucker/db.py @@ -445,7 +445,7 @@ INSERT INTO content( # returns sorted list of tuples domain,gain_ratio return res - def get_best_domains(self,count): + def get_best_domains(self,count,parser): # get all domains rows = self.session.execute(self.domains_select) domains = [] @@ -455,7 +455,7 @@ INSERT INTO content( fetched_count = row[2] gain_ratio = row[3] afg = row[4] - if seen_count and fetched_count and gain_ratio: + if seen_count and fetched_count and gain_ratio and parser.is_domain_good(domain): domains.append(list(row)) l = len(domains) ss = min(l,count) @@ -466,7 +466,7 @@ INSERT INTO content( # returns sorted list of tuples domain,gain_ratio return res - def get_unvisited_domains(self,count): + def get_unvisited_domains(self,count,parser): # get all domains rows = self.session.execute(self.domains_select) domains = [] @@ -476,8 +476,8 @@ INSERT INTO content( fetched_count = row[2] gain_ratio = row[3] afg = row[4] - if seen_count and not fetched_count: - domains.append(row) + if seen_count and not fetched_count and parser.is_domain_good(domain): + domains.append(domain) ss = min(len(domains),count) return random.sample(domains,ss)