This commit is contained in:
Daniel Hládek 2021-01-21 10:47:29 +01:00
parent 370cd1536f
commit 0278bed4ff
2 changed files with 26 additions and 44 deletions

View File

@ -371,25 +371,6 @@ class ParsedDocument:
return ">>> ".join(r)
def get_domains(arg):
    """
    Get domains from argument or stdin.

    If arg is "-", one domain is read per line of stdin; otherwise arg is
    split on commas. In both cases surrounding whitespace is stripped and
    empty entries are skipped, so the two input paths behave the same way.
    @param arg dash or domains separated by comma
    @return list of non-empty domain strings
    """
    # Both sources are plain iterables of strings, so they share one
    # normalisation path instead of two diverging branches.
    candidates = sys.stdin if arg == "-" else arg.split(",")
    stripped = (candidate.strip() for candidate in candidates)
    # Skipping empties avoids returning [""] for an empty argument or a
    # trailing comma, matching how blank stdin lines are already ignored.
    return [domain for domain in stripped if domain]
def parse_and_index(work_link,parser,responses,db):
"""
Take all responses from work link, parse and store in db

View File

@ -52,7 +52,7 @@ def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,bea
ctx.obj["queue"] = queue
@cli.command(help="All domains")
@cli.command(help="Get visited domains from db")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
@ -63,28 +63,7 @@ def all(ctx,count):
q = create_queue_from_context(ctx)
process_domains(res,ctx.obj["visit"],ctx.obj["parser"],db,q)
@cli.command(help="Work queue")
@click.pass_context
def work(ctx):
    """
    Build the database and queue from the click context and hand them,
    together with the parser stored in ctx.obj, to work_domains.
    @param ctx click context; ctx.obj["parser"] is read
    """
    database = create_database_from_context(ctx)
    work_queue = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], database, work_queue)
@cli.command(help="find best domains")
@click.pass_context
@click.argument("count",type=int,default=20)
# NOTE: a "visit" flag option was sketched here but is disabled:
#   @click.option("visit",is_flag=True)
def best(ctx, count):
    """
    Ask the database for its best domains and pass them to process_domains.
    @param ctx click context; ctx.obj["parser"], ["queue"] and ["visit"] are read
    @param count number of domains requested from db.get_best_domains
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    best_domains = database.get_best_domains(count, parser)
    # A queue is only created when the context's "queue" entry is truthy.
    domain_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(best_domains, ctx.obj["visit"], parser, database, domain_queue)
@cli.command(help="select random domains")
@cli.command(help="Get random domains")
@click.pass_context
@click.argument("count",type=int,default=20)
#@click.option("visit",is_flag=True)
@ -97,7 +76,29 @@ def blind(ctx, count):
q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],p ,db,q)
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@cli.command(help="Visit domains from queue")
@click.pass_context
def work(ctx):
    """
    Build the database and queue from the click context and hand them,
    together with the parser stored in ctx.obj, to work_domains.
    @param ctx click context; ctx.obj["parser"] is read
    """
    database = create_database_from_context(ctx)
    work_queue = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], database, work_queue)
@cli.command(help="Get best domains from db")
@click.pass_context
@click.argument("count",type=int,default=20)
# NOTE: a "visit" flag option was sketched here but is disabled:
#   @click.option("visit",is_flag=True)
def best(ctx, count):
    """
    Ask the database for its best domains and pass them to process_domains.
    @param ctx click context; ctx.obj["parser"], ["queue"] and ["visit"] are read
    @param count number of domains requested from db.get_best_domains
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    best_domains = database.get_best_domains(count, parser)
    # A queue is only created when the context's "queue" entry is truthy.
    domain_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(best_domains, ctx.obj["visit"], parser, database, domain_queue)
@cli.command(help="Get unvisited domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
@ -124,7 +125,7 @@ def file(ctx, name):
q = create_queue_from_context(ctx)
process_domains(domains,ctx.obj["visit"],p,db,q)
@cli.command(help="Visit url and get links. Start here")
@cli.command(help="Visit one url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):