diff --git a/websucker/cli.py b/websucker/cli.py
index b094b04..a141c2b 100644
--- a/websucker/cli.py
+++ b/websucker/cli.py
@@ -84,6 +84,19 @@ def best(ctx, count):
         q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p ,db,q)
 
+@cli.command(help="select random domains")
+@click.pass_context
+@click.argument("count", type=int, default=20)
+def blind(ctx, count):
+    """Process a random selection of at most ``count`` domains."""
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = db.get_random_domains(count, p)
+    # A queue is optional: only build one when the context enables it.
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains, ctx.obj["visit"], p, db, q)
 
 @cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
 @click.pass_context
diff --git a/websucker/db.py b/websucker/db.py
index e1aa8aa..03f51c2 100644
--- a/websucker/db.py
+++ b/websucker/db.py
@@ -494,6 +494,24 @@ INSERT INTO content(
         # returns sorted list of tuples domain,gain_ratio
         return res
 
+    def get_random_domains(self, count, parser):
+        """Return a random sample of at most ``count`` acceptable domains.
+
+        Every row from ``self.domains_select`` whose first column passes
+        ``parser.is_domain_good`` is a candidate; each result is the row
+        converted to a list.
+        """
+        # Local import: this patch adds no module-level import of random.
+        import random
+
+        rows = self.session.execute(self.domains_select)
+        candidates = [
+            list(row) for row in rows if parser.is_domain_good(row[0])
+        ]
+        # random.sample raises ValueError when asked for more items than exist.
+        sample_size = min(len(candidates), count)
+        return random.sample(candidates, sample_size)
+
     def get_unvisited_domains(self,count,parser):
         # get all domains
         rows = self.session.execute(self.domains_select)