zz
This commit is contained in:
parent
52118b38d3
commit
8d4a873005
@ -77,11 +77,12 @@ def work(ctx):
|
||||
#@click.option("visit",is_flag=True)
|
||||
def best(ctx, count):
    # Fetch up to `count` domains from db.get_best_domains and process them.
    #
    # ctx   -- click context; ctx.obj supplies "parser", "queue" and "visit".
    # count -- maximum number of domains to request from the database.
    #
    # NOTE(review): documented with comments rather than a docstring because
    # the command decorator is not visible here; click falls back to the
    # docstring for help text when no explicit help= is given.
    db = create_database_from_context(ctx)
    # Look the parser up once: the database query uses it to filter domains
    # (parser.is_domain_good) and process_domains reuses the same object.
    p = ctx.obj["parser"]
    domains = db.get_best_domains(count, p)
    # A queue is only created when one is configured in the context.
    q = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(domains, ctx.obj["visit"], p, db, q)
|
||||
|
||||
|
||||
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
|
||||
@ -89,12 +90,12 @@ def best(ctx, count):
|
||||
@click.argument("count",type=int,default=20)
|
||||
def unvisited(ctx, count):
    """Fetch up to ``count`` unvisited domains from the database and process them.

    ``ctx`` is the click context; ``ctx.obj`` supplies the ``"parser"``,
    ``"queue"`` and ``"visit"`` settings. ``count`` is the maximum number of
    domains requested from ``db.get_unvisited_domains``.
    """
    db = create_database_from_context(ctx)
    # Look the parser up once: the database query uses it to filter domains
    # (parser.is_domain_good) and process_domains reuses the same object.
    p = ctx.obj["parser"]
    domains = db.get_unvisited_domains(count, p)
    # A queue is only created when one is configured in the context.
    q = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(domains, ctx.obj["visit"], p, db, q)
|
||||
|
||||
@cli.command(help="Visit url and get links. Start here")
|
||||
@click.pass_context
|
||||
|
@ -445,7 +445,7 @@ INSERT INTO content(
|
||||
# returns sorted list of tuples domain,gain_ratio
|
||||
return res
|
||||
|
||||
def get_best_domains(self,count):
|
||||
def get_best_domains(self,count,parser):
|
||||
# get all domains
|
||||
rows = self.session.execute(self.domains_select)
|
||||
domains = []
|
||||
@ -455,7 +455,7 @@ INSERT INTO content(
|
||||
fetched_count = row[2]
|
||||
gain_ratio = row[3]
|
||||
afg = row[4]
|
||||
if seen_count and fetched_count and gain_ratio:
|
||||
if seen_count and fetched_count and gain_ratio and parser.is_domain_good(domain):
|
||||
domains.append(list(row))
|
||||
l = len(domains)
|
||||
ss = min(l,count)
|
||||
@ -466,7 +466,7 @@ INSERT INTO content(
|
||||
# returns sorted list of tuples domain,gain_ratio
|
||||
return res
|
||||
|
||||
def get_unvisited_domains(self,count):
|
||||
def get_unvisited_domains(self,count,parser):
|
||||
# get all domains
|
||||
rows = self.session.execute(self.domains_select)
|
||||
domains = []
|
||||
@ -476,8 +476,8 @@ INSERT INTO content(
|
||||
fetched_count = row[2]
|
||||
gain_ratio = row[3]
|
||||
afg = row[4]
|
||||
if seen_count and not fetched_count:
|
||||
domains.append(row)
|
||||
if seen_count and not fetched_count and parser.is_domain_good(domain):
|
||||
domains.append(domain)
|
||||
ss = min(len(domains),count)
|
||||
return random.sample(domains,ss)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user