# websucker-pip/websucker/cli.py
# 2020-07-04 09:34:31 +02:00
#
# 199 lines
# 7.2 KiB
# Python

from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.parser import load_parser
from websucker.db import Data
from websucker.db import get_schema
import click
import pprint
import greenstalk
import os
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings stored in ``ctx.obj``.

    The ``cassandra_keyspace``, ``cassandra_host`` and ``cassandra_port``
    entries of ``ctx.obj`` are passed to ``Data`` in that order.
    """
    settings = ctx.obj
    keyspace = settings["cassandra_keyspace"]
    host = settings["cassandra_host"]
    port = settings["cassandra_port"]
    return Data(keyspace, host, port)
def create_queue_from_context(ctx):
    """Open a ``greenstalk.Client`` from the beanstalkd settings in ``ctx.obj``.

    The configured tube is passed both as the ``use`` and the ``watch``
    argument of the client, and the client is created with ``utf8`` encoding.
    """
    settings = ctx.obj
    host = settings["beanstalkd_host"]
    port = settings["beanstalkd_port"]
    tube = settings["beanstalkd_tube"]
    return greenstalk.Client(host, port, use=tube, watch=tube, encoding="utf8")
@click.group()
@click.pass_context
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
@click.option("--parser",metavar="file_name",help="path to a parser file (defaults to Suckerfile.py in the current directory when that file exists)")
@click.option("--visit",is_flag=True)
@click.option("--queue",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
    # Root command group: resolves the parser to use and stores it, together
    # with the Cassandra/beanstalkd connection settings and the --visit/--queue
    # flags, in ctx.obj for the subcommands.
    #
    # NOTE: documented with comments instead of a docstring on purpose; click
    # would otherwise show the docstring as the group's --help text.
    #
    # Raises click.BadParameter when --parser does not name an existing file,
    # and click.ClickException when load_parser() returns None.
    ctx.ensure_object(dict)
    p = BaseParser()
    if parser is not None:
        # Explicit check instead of ``assert``: asserts are stripped under -O.
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file {} does not exist".format(parser),
                param_hint="--parser",
            )
    else:
        # No --parser given: fall back to a Suckerfile.py in the working directory.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile
    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "could not load a parser from {}".format(parser)
            )
    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
@cli.command(help="All domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
    """Fetch up to ``count`` domains via ``db.all_domains`` and process them.

    A beanstalkd queue client is created and handed to ``process_domains``
    only when the group-level ``--queue`` flag is set; otherwise ``None`` is
    passed.
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(domains, settings["visit"], settings["parser"], database, queue)
@cli.command(help="Work queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the configured parser, database and queue."""
    database = create_database_from_context(ctx)
    queue = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], database, queue)
@cli.command(help="find best domains")
@click.pass_context
@click.argument("count",type=int,default=20)
# NOTE: there is no per-command "visit" option; the flag is read from the
# group-level --visit option stored in ctx.obj.
def best(ctx, count):
    """Select ``count`` domains via ``db.get_best_domains`` and process them.

    A beanstalkd queue client is passed to ``process_domains`` only when the
    group-level ``--queue`` flag is set; otherwise ``None`` is passed.
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_best_domains(count, parser)
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(selected, settings["visit"], parser, database, queue)
@cli.command(help="select random domains")
@click.pass_context
@click.argument("count",type=int,default=20)
# NOTE: there is no per-command "visit" option; the flag is read from the
# group-level --visit option stored in ctx.obj.
def blind(ctx, count):
    """Select ``count`` domains via ``db.get_random_domains`` and process them.

    A beanstalkd queue client is passed to ``process_domains`` only when the
    group-level ``--queue`` flag is set; otherwise ``None`` is passed.
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_random_domains(count, parser)
    if settings["queue"]:
        queue = create_queue_from_context(ctx)
    else:
        queue = None
    process_domains(selected, settings["visit"], parser, database, queue)
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
    """Select ``count`` domains via ``db.get_unvisited_domains`` and process them.

    A beanstalkd queue client is passed to ``process_domains`` only when the
    group-level ``--queue`` flag is set; otherwise ``None`` is passed.
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_unvisited_domains(count, parser)
    queue = None
    if settings["queue"]:
        queue = create_queue_from_context(ctx)
    process_domains(selected, settings["visit"], parser, database, queue)
@cli.command(help="Visit domains from file")
@click.pass_context
@click.argument("name")
def file(ctx, name):
    """Read domain names from the text file ``name`` and process them.

    Each non-blank line, stripped of surrounding whitespace, becomes one
    ``(domain, 0)`` tuple handed to ``process_domains``. Blank lines are
    skipped so that empty domain names are never submitted.

    A beanstalkd queue client is passed to ``process_domains`` only when the
    group-level ``--queue`` flag is set; otherwise ``None`` is passed.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    with open(name) as f:
        stripped_lines = (line.strip() for line in f)
        # NOTE(review): the meaning of the constant 0 in each tuple is defined
        # by process_domains -- confirm against websucker.agent.
        domains = [(domain, 0) for domain in stripped_lines if domain]
    q = None
    if ctx.obj["queue"]:
        q = create_queue_from_context(ctx)
    process_domains(domains,ctx.obj["visit"],p,db,q)
@cli.command(help="Visit url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit ``link`` and then update the statistics of its domain.

    The link is passed to ``visit_links`` as a one-element list. Afterwards
    ``db.check_domain`` is called for the domain taken from the link.
    """
    # Function-scope import keeps this fix self-contained.
    from urllib.parse import urlsplit

    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    visit_links([link],c,p,db)
    # The original referenced an undefined name ``domain`` here and raised
    # NameError after the visit had already completed.
    # NOTE(review): assumes domains are keyed by the URL's network location
    # (host[:port]) -- confirm against websucker.db.
    domain = urlsplit(link).netloc
    if domain:
        db.check_domain(domain)
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links already recorded for ``domain`` and refresh its statistics.

    Links are selected with ``db.get_visit_links`` using the parser's
    ``recent_links``, ``old_links`` and ``random_links`` attributes, visited
    with ``visit_links``, and then ``db.check_domain`` is called.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    connection = Connection()
    to_visit = database.get_visit_links(
        domain,
        parser.recent_links,
        parser.old_links,
        parser.random_links,
    )
    visit_links(to_visit, connection, parser, database)
    database.check_domain(domain)
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx,domain):
    """Call ``db.check_domain`` for ``domain`` and print its return value."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best-effort: any error while connecting to beanstalkd
    or reading the tube statistics is printed instead of being raised.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        q = create_queue_from_context(ctx)
        try:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
            buried = stats["current-jobs-buried"]
            ready = stats["current-jobs-ready"]
            print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
            print("{} ready jobs, {} buried jobs".format(ready,buried))
        finally:
            # Release the beanstalkd connection even when reading stats fails.
            q.close()
    except Exception as err:
        # Deliberate best-effort: a queue failure must not abort the report.
        print(err)
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser."""
    database = create_database_from_context(ctx)
    database.summary(ctx.obj["parser"])
@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema``."""
    print(get_schema())
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx,urls):
    """Download ``urls`` and print the parsed document for every response.

    For each response returned by ``Connection.html_download2`` a
    ``ParsedDocument`` is built for the response's canonical link, filled via
    ``extract`` from the response content and ``bs`` attribute, and printed.
    """
    page_parser = ctx.obj["parser"]
    downloader = Connection()
    for response in downloader.html_download2(urls):
        canonical_link = response.get_canonical()
        document = ParsedDocument(page_parser, canonical_link)
        document.extract(response.content, response.bs)
        print(document)
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()