This commit is contained in:
Daniel Hladek 2020-05-20 09:22:19 +02:00
parent 52118b38d3
commit 8d4a873005
2 changed files with 11 additions and 10 deletions

View File

@@ -77,11 +77,12 @@ def work(ctx):
#@click.option("visit",is_flag=True)
def best(ctx, count):
db = create_database_from_context(ctx)
domains = db.get_best_domains(count)
p = ctx.obj["parser"]
domains = db.get_best_domains(count,p)
q = None
if ctx.obj["queue"]:
q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q)
process_domains(domains,ctx.obj["visit"],p ,db,q)
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@@ -89,12 +90,12 @@ def best(ctx, count):
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
db = create_database_from_context(ctx)
domains = db.get_unvisited_domains(count)
p = ctx.obj["parser"]
domains = db.get_unvisited_domains(count,p)
q = None
if ctx.obj["queue"]:
q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q)
process_domains(domains,ctx.obj["visit"],p,db,q)
@cli.command(help="Visit url and get links. Start here")
@click.pass_context

View File

@@ -445,7 +445,7 @@ INSERT INTO content(
# returns sorted list of tuples domain,gain_ratio
return res
def get_best_domains(self,count):
def get_best_domains(self,count,parser):
# get all domains
rows = self.session.execute(self.domains_select)
domains = []
@@ -455,7 +455,7 @@ INSERT INTO content(
fetched_count = row[2]
gain_ratio = row[3]
afg = row[4]
if seen_count and fetched_count and gain_ratio:
if seen_count and fetched_count and gain_ratio and parser.is_domain_good(domain):
domains.append(list(row))
l = len(domains)
ss = min(l,count)
@@ -466,7 +466,7 @@ INSERT INTO content(
# returns sorted list of tuples domain,gain_ratio
return res
def get_unvisited_domains(self,count):
def get_unvisited_domains(self,count,parser):
# get all domains
rows = self.session.execute(self.domains_select)
domains = []
@@ -476,8 +476,8 @@ INSERT INTO content(
fetched_count = row[2]
gain_ratio = row[3]
afg = row[4]
if seen_count and not fetched_count:
domains.append(row)
if seen_count and not fetched_count and parser.is_domain_good(domain):
domains.append(domain)
ss = min(len(domains),count)
return random.sample(domains,ss)