This commit is contained in:
Daniel Hladek 2020-07-04 09:34:31 +02:00
parent 2ac0b911f5
commit 2941bf8a4c
2 changed files with 24 additions and 0 deletions

View File

@@ -84,6 +84,18 @@ def best(ctx, count):
q = create_queue_from_context(ctx) q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],p ,db,q) process_domains(domains,ctx.obj["visit"],p ,db,q)
@cli.command(help="select random domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def blind(ctx, count):
    """Pick random domains from the database and hand them to process_domains.

    Args:
        ctx: click context; ``ctx.obj`` must provide the "parser", "queue"
            and "visit" entries read below.
        count: number of domains requested from ``get_random_domains``
            (defaults to 20 on the command line).
    """
    # NOTE: no per-command "visit" option is declared here; the flag is
    # read from ctx.obj when calling process_domains.
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    selected = database.get_random_domains(count, parser)
    # A queue is only created when the context asks for one; otherwise
    # process_domains receives None.
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], parser, database, queue)
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl") @cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context @click.pass_context

View File

@@ -494,6 +494,18 @@ INSERT INTO content(
# returns sorted list of tuples domain,gain_ratio # returns sorted list of tuples domain,gain_ratio
return res return res
def get_random_domains(self, count, parser):
    """Return a random sample of at most ``count`` accepted domain rows.

    Every row produced by executing ``self.domains_select`` is read. A row
    is kept when ``parser.is_domain_good`` accepts its first column, and
    each kept row is converted to a list.

    Args:
        count: maximum number of rows to return.
        parser: object providing ``is_domain_good(domain)``.

    Returns:
        A list of rows (each a list) chosen with ``random.sample``; it
        holds fewer than ``count`` rows when fewer rows were accepted.
    """
    candidates = [
        list(record)
        for record in self.session.execute(self.domains_select)
        if parser.is_domain_good(record[0])
    ]
    # random.sample rejects a sample larger than the population, so cap it.
    sample_size = min(len(candidates), count)
    return random.sample(candidates, sample_size)
def get_unvisited_domains(self,count,parser): def get_unvisited_domains(self,count,parser):
# get all domains # get all domains
rows = self.session.execute(self.domains_select) rows = self.session.execute(self.domains_select)