This commit is contained in:
Daniel Hladek 2020-05-20 09:22:19 +02:00
parent 52118b38d3
commit 8d4a873005
2 changed files with 11 additions and 10 deletions

View File

@ -77,11 +77,12 @@ def work(ctx):
#@click.option("visit",is_flag=True) #@click.option("visit",is_flag=True)
def best(ctx, count): def best(ctx, count):
db = create_database_from_context(ctx) db = create_database_from_context(ctx)
domains = db.get_best_domains(count) p = ctx.obj["parser"]
domains = db.get_best_domains(count,p)
q = None q = None
if ctx.obj["queue"]: if ctx.obj["queue"]:
q = create_queue_from_context(ctx) q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q) process_domains(domains,ctx.obj["visit"],p ,db,q)
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl") @cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@ -89,12 +90,12 @@ def best(ctx, count):
@click.argument("count",type=int,default=20) @click.argument("count",type=int,default=20)
def unvisited(ctx, count): def unvisited(ctx, count):
db = create_database_from_context(ctx) db = create_database_from_context(ctx)
domains = db.get_unvisited_domains(count) p = ctx.obj["parser"]
domains = db.get_unvisited_domains(count,p)
q = None q = None
if ctx.obj["queue"]: if ctx.obj["queue"]:
q = create_queue_from_context(ctx) q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q) process_domains(domains,ctx.obj["visit"],p,db,q)
@cli.command(help="Visit url and get links. Start here") @cli.command(help="Visit url and get links. Start here")
@click.pass_context @click.pass_context

View File

@ -445,7 +445,7 @@ INSERT INTO content(
# returns sorted list of tuples domain,gain_ratio # returns sorted list of tuples domain,gain_ratio
return res return res
def get_best_domains(self,count): def get_best_domains(self,count,parser):
# get all domains # get all domains
rows = self.session.execute(self.domains_select) rows = self.session.execute(self.domains_select)
domains = [] domains = []
@ -455,7 +455,7 @@ INSERT INTO content(
fetched_count = row[2] fetched_count = row[2]
gain_ratio = row[3] gain_ratio = row[3]
afg = row[4] afg = row[4]
if seen_count and fetched_count and gain_ratio: if seen_count and fetched_count and gain_ratio and parser.is_domain_good(domain):
domains.append(list(row)) domains.append(list(row))
l = len(domains) l = len(domains)
ss = min(l,count) ss = min(l,count)
@ -466,7 +466,7 @@ INSERT INTO content(
# returns sorted list of tuples domain,gain_ratio # returns sorted list of tuples domain,gain_ratio
return res return res
def get_unvisited_domains(self,count): def get_unvisited_domains(self,count,parser):
# get all domains # get all domains
rows = self.session.execute(self.domains_select) rows = self.session.execute(self.domains_select)
domains = [] domains = []
@ -476,8 +476,8 @@ INSERT INTO content(
fetched_count = row[2] fetched_count = row[2]
gain_ratio = row[3] gain_ratio = row[3]
afg = row[4] afg = row[4]
if seen_count and not fetched_count: if seen_count and not fetched_count and parser.is_domain_good(domain):
domains.append(row) domains.append(domain)
ss = min(len(domains),count) ss = min(len(domains),count)
return random.sample(domains,ss) return random.sample(domains,ss)