websucker-pip/websucker/cli.py
2023-02-28 19:29:51 +01:00

228 lines
8.8 KiB
Python

from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.parser import load_parser
from websucker.db import Data
from websucker.db import get_schema
import websucker.db
import click
import pprint
import greenstalk
import os
import websucker.schema
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings on the click context.

    The ``cassandra_*`` entries of ``ctx.obj`` are passed positionally to
    ``Data`` in the order keyspace, host, port, username, password.
    """
    settings = ctx.obj
    return Data(
        settings["cassandra_keyspace"],
        settings["cassandra_host"],
        settings["cassandra_port"],
        settings["cassandra_username"],
        settings["cassandra_password"],
    )
def create_queue_from_context(ctx):
    """Open a ``greenstalk.Client`` from the beanstalkd settings on the click context.

    The configured tube is both used and watched by the returned client.
    """
    settings = ctx.obj
    address = (settings["beanstalkd_host"], settings["beanstalkd_port"])
    tube = settings["beanstalkd_tube"]
    return greenstalk.Client(address, use=tube, watch=tube, encoding="utf8")
@click.group()
@click.pass_context
@click.option(
    "--cassandra-keyspace",
    metavar="CASSANDRA_KEYSPACE",
    help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",
    envvar="CASSANDRA_KEYSPACE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--cassandra-host",
    metavar="CASSANDRA_HOST",
    help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",
    envvar="CASSANDRA_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--cassandra-port",
    metavar="CASSANDRA_PORT",
    help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",
    envvar="CASSANDRA_PORT",
    default=9042,
    show_default=True,
)
@click.option(
    "--cassandra-username",
    metavar="CASSANDRA_USERNAME",
    help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",
    envvar="CASSANDRA_USERNAME",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--cassandra-password",
    metavar="CASSANDRA_PASSWORD",
    help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",
    envvar="CASSANDRA_PASSWORD",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--beanstalkd-tube",
    metavar="BEANSTALKD_TUBE",
    help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",
    envvar="BEANSTALKD_TUBE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--beanstalkd-host",
    metavar="BEANSTALKD_HOST",
    help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",
    envvar="BEANSTALKD_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--beanstalkd-port",
    metavar="BEANSTALKD_PORT",
    help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",
    envvar="BEANSTALKD_PORT",
    default=11300,
    show_default=True,
)
@click.option(
    "--parser",
    metavar="file_name",
    help="parser definition file (defaults to Suckerfile.py in the current directory, if present)",
)
@click.option("--visit", is_flag=True)
@click.option("--queue", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port, cassandra_username, cassandra_password, beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Websucker command line interface."""
    # The docstring above is kept to one line on purpose: the group has no
    # explicit help=, so click shows the docstring as the --help text.
    #
    # This callback runs before every subcommand. It selects the parser and
    # stores it together with all connection settings and flags in ctx.obj,
    # where the subcommands read them back.
    #
    # Raises:
    #   click.BadParameter   if --parser names a path that is not a file.
    #   click.ClickException if load_parser() returns None.
    ctx.ensure_object(dict)

    # A plain BaseParser is the fallback when no parser file is available.
    p = BaseParser()
    if parser is not None:
        # Explicit validation instead of assert: asserts vanish under -O and
        # produce a raw traceback rather than a usage error.
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file {!r} does not exist".format(parser),
                param_hint="--parser",
            )
    else:
        # No --parser given: pick up a Suckerfile.py from the working directory.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile

    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "could not load parser from {}".format(parser)
            )

    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_username"] = cassandra_username
    ctx.obj["cassandra_password"] = cassandra_password
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
@cli.command(help="Get visited domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def all(ctx, count):
    """Ask the database for ``count`` domains and pass them to ``process_domains``.

    A beanstalkd queue client is created only when the group-level ``--queue``
    flag is set; otherwise ``None`` is passed as the queue.
    """
    # NOTE: this name shadows the builtin all() inside this module; it is kept
    # because click derives the subcommand name from the function name.
    database = create_database_from_context(ctx)
    found = database.all_domains(count)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(found, ctx.obj["visit"], ctx.obj["parser"], database, queue)
@cli.command(help="Get random domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def blind(ctx, count):
    """Request ``count`` random domains from the database and process them.

    The domains come from ``db.get_random_domains(count, parser)``. A queue
    client is only created when the group-level ``--queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    selected = database.get_random_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], parser, database, queue)
@cli.command(help="Visit domains from queue")
@click.pass_context
def work(ctx):
    """Connect to the database and the queue, then run ``work_domains``."""
    database = create_database_from_context(ctx)
    queue = create_queue_from_context(ctx)
    parser = ctx.obj["parser"]
    work_domains(parser, database, queue)
@cli.command(help="Get best domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def best(ctx, count):
    """Request ``count`` best domains from the database and process them.

    The domains come from ``db.get_best_domains(count, parser)``. A queue
    client is only created when the group-level ``--queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    selected = database.get_best_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], parser, database, queue)
@cli.command(help="Get unvisited domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def unvisited(ctx, count):
    """Request ``count`` unvisited domains from the database and process them.

    The domains come from ``db.get_unvisited_domains(count, parser)``. A queue
    client is only created when the group-level ``--queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    selected = database.get_unvisited_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], parser, database, queue)
@cli.command(help="Visit domains from file")
@click.pass_context
@click.argument("name")
def file(ctx, name):
    """Read domains from the text file ``name`` (one per line) and process them.

    Each non-empty line is stripped of surrounding whitespace and paired with
    the value ``0`` before being handed to ``process_domains``. Blank lines
    are skipped so that no empty domain name is processed.

    Args:
        ctx: click context carrying the settings stored by the ``cli`` group.
        name: path of the file to read.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    with open(name) as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines: an empty string is not a usable domain.
        domains = [(domain, 0) for domain in stripped_lines if domain]
    q = None
    if ctx.obj["queue"]:
        q = create_queue_from_context(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)
@cli.command(help="Visit one url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit the single URL ``link`` through ``visit_links`` with a fresh connection."""
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    connection = Connection()
    seed_links = [link]
    visit_links(seed_links, connection, parser, database)
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit stored links of ``domain`` and then run ``check_domain`` on it.

    The links are selected by ``db.get_visit_links`` using the parser's
    ``recent_links``, ``old_links`` and ``random_links`` attributes.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    connection = Connection()
    to_visit = database.get_visit_links(
        domain,
        parser.recent_links,
        parser.old_links,
        parser.random_links,
    )
    visit_links(to_visit, connection, parser, database)
    database.check_domain(domain)
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx, domain):
    """Run ``check_domain`` for ``domain`` and print whatever it returns."""
    database = create_database_from_context(ctx)
    outcome = database.check_domain(domain)
    print(outcome)
@cli.command(help="Export domain as JSON doc per line")
@click.pass_context
@click.argument("domain")
def tojson(ctx, domain):
    """Delegate the export of ``domain`` to ``db.export_domain``."""
    database = create_database_from_context(ctx)
    database.export_domain(domain)
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best effort: any error raised while connecting to the
    queue or reading its statistics is printed instead of propagated, so the
    database report is still shown when beanstalkd is unavailable.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        # Use the client as a context manager so the connection is closed
        # even when the stats request fails.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
        buried = stats["current-jobs-buried"]
        ready = stats["current-jobs-ready"]
        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Exception as err:
        # Deliberately broad: the queue summary is optional.
        print(err)
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the parser stored on the context."""
    database = create_database_from_context(ctx)
    database.summary(ctx.obj["parser"])
@cli.command(help="Create database")
@click.pass_context
@click.argument("replication", default=1)
def create_database(ctx, replication):
    """Drop and recreate the configured keyspace, then create the schema in it.

    WARNING: the existing keyspace is dropped first via ``drop_keyspace``.

    ``replication`` is forwarded to ``create_keyspace_simple`` as its second
    argument.
    """
    settings = ctx.obj
    cluster = websucker.db.connect_cluster(
        settings["cassandra_host"],
        settings["cassandra_port"],
        settings["cassandra_username"],
        settings["cassandra_password"],
    )
    with cluster.connect() as session:
        import cassandra.cqlengine.connection
        import cassandra.cqlengine.management as man

        keyspace = settings["cassandra_keyspace"]
        # NOTE(review): the keyspace is selected before it is (re)created -
        # confirm this works on a cluster where the keyspace does not exist yet.
        session.set_keyspace(keyspace)
        cassandra.cqlengine.connection.set_session(session)
        man.drop_keyspace(keyspace)
        man.create_keyspace_simple(keyspace, replication)
        websucker.schema.create_database(keyspace, session)
@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema()``."""
    print(get_schema())
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx, urls):
    """Download ``urls`` and print the parser's extraction for each response.

    For each response returned by ``Connection.html_download2`` the canonical
    link is looked up and ``parser.full_extract`` is called; its result is
    printed.
    """
    parser = ctx.obj["parser"]
    connection = Connection()
    for response in connection.html_download2(urls):
        canonical_link = response.get_canonical()
        extracted = parser.full_extract(
            response.content, response.bs, parser, canonical_link
        )
        print(extracted)
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()