zz
This commit is contained in:
parent
52118b38d3
commit
8d4a873005
@ -77,11 +77,12 @@ def work(ctx):
|
|||||||
#@click.option("visit",is_flag=True)
|
#@click.option("visit",is_flag=True)
|
||||||
def best(ctx, count):
|
def best(ctx, count):
|
||||||
db = create_database_from_context(ctx)
|
db = create_database_from_context(ctx)
|
||||||
domains = db.get_best_domains(count)
|
p = ctx.obj["parser"]
|
||||||
|
domains = db.get_best_domains(count,p)
|
||||||
q = None
|
q = None
|
||||||
if ctx.obj["queue"]:
|
if ctx.obj["queue"]:
|
||||||
q = create_queue_from_context(ctx)
|
q = create_queue_from_context(ctx)
|
||||||
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q)
|
process_domains(domains,ctx.obj["visit"],p ,db,q)
|
||||||
|
|
||||||
|
|
||||||
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
|
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
|
||||||
@ -89,12 +90,12 @@ def best(ctx, count):
|
|||||||
@click.argument("count",type=int,default=20)
|
@click.argument("count",type=int,default=20)
|
||||||
def unvisited(ctx, count):
|
def unvisited(ctx, count):
|
||||||
db = create_database_from_context(ctx)
|
db = create_database_from_context(ctx)
|
||||||
domains = db.get_unvisited_domains(count)
|
p = ctx.obj["parser"]
|
||||||
|
domains = db.get_unvisited_domains(count,p)
|
||||||
q = None
|
q = None
|
||||||
if ctx.obj["queue"]:
|
if ctx.obj["queue"]:
|
||||||
q = create_queue_from_context(ctx)
|
q = create_queue_from_context(ctx)
|
||||||
process_domains(domains,ctx.obj["visit"],ctx.obj["parser"],db,q)
|
process_domains(domains,ctx.obj["visit"],p,db,q)
|
||||||
|
|
||||||
@cli.command(help="Visit url and get links. Start here")
|
@cli.command(help="Visit url and get links. Start here")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
|
@ -445,7 +445,7 @@ INSERT INTO content(
|
|||||||
# returns sorted list of tuples domain,gain_ratio
|
# returns sorted list of tuples domain,gain_ratio
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def get_best_domains(self,count):
|
def get_best_domains(self,count,parser):
|
||||||
# get all domains
|
# get all domains
|
||||||
rows = self.session.execute(self.domains_select)
|
rows = self.session.execute(self.domains_select)
|
||||||
domains = []
|
domains = []
|
||||||
@ -455,7 +455,7 @@ INSERT INTO content(
|
|||||||
fetched_count = row[2]
|
fetched_count = row[2]
|
||||||
gain_ratio = row[3]
|
gain_ratio = row[3]
|
||||||
afg = row[4]
|
afg = row[4]
|
||||||
if seen_count and fetched_count and gain_ratio:
|
if seen_count and fetched_count and gain_ratio and parser.is_domain_good(domain):
|
||||||
domains.append(list(row))
|
domains.append(list(row))
|
||||||
l = len(domains)
|
l = len(domains)
|
||||||
ss = min(l,count)
|
ss = min(l,count)
|
||||||
@ -466,7 +466,7 @@ INSERT INTO content(
|
|||||||
# returns sorted list of tuples domain,gain_ratio
|
# returns sorted list of tuples domain,gain_ratio
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def get_unvisited_domains(self,count):
|
def get_unvisited_domains(self,count,parser):
|
||||||
# get all domains
|
# get all domains
|
||||||
rows = self.session.execute(self.domains_select)
|
rows = self.session.execute(self.domains_select)
|
||||||
domains = []
|
domains = []
|
||||||
@ -476,8 +476,8 @@ INSERT INTO content(
|
|||||||
fetched_count = row[2]
|
fetched_count = row[2]
|
||||||
gain_ratio = row[3]
|
gain_ratio = row[3]
|
||||||
afg = row[4]
|
afg = row[4]
|
||||||
if seen_count and not fetched_count:
|
if seen_count and not fetched_count and parser.is_domain_good(domain):
|
||||||
domains.append(row)
|
domains.append(domain)
|
||||||
ss = min(len(domains),count)
|
ss = min(len(domains),count)
|
||||||
return random.sample(domains,ss)
|
return random.sample(domains,ss)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user