websucker-pip/websucker/cli.py

226 lines
8.7 KiB
Python
Raw Normal View History

2020-05-09 09:50:50 +00:00
from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
2020-05-07 14:09:45 +00:00
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
2020-05-08 05:53:50 +00:00
from websucker.parser import load_parser
2020-05-07 14:09:45 +00:00
from websucker.db import Data
from websucker.db import get_schema
2023-02-28 07:56:35 +00:00
import websucker.db
2020-05-07 14:09:45 +00:00
import click
import pprint
2020-05-09 09:50:50 +00:00
import greenstalk
import os
2020-05-07 14:09:45 +00:00
2023-02-28 07:56:35 +00:00
from websucker import schema
2020-05-07 14:09:45 +00:00
def create_database_from_context(ctx):
    """Build the database handle from the Cassandra settings on the click context.

    Reads the keyspace, host, port, username and password entries of
    ``ctx.obj`` and passes them, in that order, as positional arguments
    to ``Data``.
    """
    settings = ctx.obj
    connection_keys = (
        "cassandra_keyspace",
        "cassandra_host",
        "cassandra_port",
        "cassandra_username",
        "cassandra_password",
    )
    return Data(*[settings[key] for key in connection_keys])
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
def create_queue_from_context(ctx):
    """Open a beanstalkd client configured from the click context.

    The client is created for the host/port stored in ``ctx.obj`` and both
    uses and watches the tube stored under ``beanstalkd_tube``.
    """
    settings = ctx.obj
    address = (settings["beanstalkd_host"], settings["beanstalkd_port"])
    tube = settings["beanstalkd_tube"]
    return greenstalk.Client(address, use=tube, watch=tube, encoding="utf8")
2020-05-09 09:50:50 +00:00
2020-05-07 14:09:45 +00:00
@click.group()
@click.pass_context
@click.option(
    "--cassandra-keyspace",
    metavar="CASSANDRA_KEYSPACE",
    help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",
    envvar="CASSANDRA_KEYSPACE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--cassandra-host",
    metavar="CASSANDRA_HOST",
    help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",
    envvar="CASSANDRA_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--cassandra-port",
    metavar="CASSANDRA_PORT",
    help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",
    envvar="CASSANDRA_PORT",
    default=9042,
    show_default=True,
)
@click.option(
    "--cassandra-username",
    metavar="CASSANDRA_USERNAME",
    help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",
    envvar="CASSANDRA_USERNAME",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--cassandra-password",
    metavar="CASSANDRA_PASSWORD",
    help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",
    envvar="CASSANDRA_PASSWORD",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--beanstalkd-tube",
    metavar="BEANSTALKD_TUBE",
    help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",
    envvar="BEANSTALKD_TUBE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--beanstalkd-host",
    metavar="BEANSTALKD_HOST",
    help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",
    envvar="BEANSTALKD_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--beanstalkd-port",
    metavar="BEANSTALKD_PORT",
    help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",
    envvar="BEANSTALKD_PORT",
    default=11300,
    show_default=True,
)
@click.option(
    "--parser",
    metavar="file_name",
    help="path to a parser file (defaults to Suckerfile.py in the current directory, if present)",
)
@click.option("--visit", is_flag=True)
@click.option("--queue", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port, cassandra_username, cassandra_password, beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Websucker command line interface."""
    # NOTE: the one-line docstring above is shown by click as the group's
    # help text, so keep it short and user-facing.
    #
    # The group callback only collects configuration: it resolves the parser
    # object and stores every connection setting and flag in ``ctx.obj`` for
    # the subcommands to read.
    ctx.ensure_object(dict)

    if parser is not None:
        # An explicitly requested parser file must exist. Raise a click error
        # instead of using ``assert``, which is stripped under ``python -O``.
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file {!r} does not exist".format(parser),
                param_hint="--parser",
            )
    else:
        # Fall back to a Suckerfile.py in the working directory when present.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile

    if parser is None:
        # No parser file available: use the built-in default parser.
        p = BaseParser()
    else:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "could not load a parser from {!r}".format(parser)
            )

    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_username"] = cassandra_username
    ctx.obj["cassandra_password"] = cassandra_password
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help="Get visited domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def all(ctx, count):
    """Process the domains returned by ``db.all_domains(count)``.

    The domains are handed to ``process_domains`` together with the
    ``visit`` flag, the configured parser, the database handle and, when
    the ``queue`` flag is set, a beanstalkd client (otherwise ``None``).
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    # A queue connection is only opened when --queue was requested.
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(domains, settings["visit"], settings["parser"], database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help="Get random domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def blind(ctx, count):
    """Process the domains returned by ``db.get_random_domains(count, parser)``.

    The result is passed to ``process_domains`` with the ``visit`` flag, the
    parser, the database handle and an optional queue client (``None``
    unless the ``queue`` flag is set).
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_random_domains(count, parser)
    # A queue connection is only opened when --queue was requested.
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(selected, settings["visit"], parser, database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help="Visit domains from queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the parser, a database handle and a queue client."""
    database = create_database_from_context(ctx)
    queue = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], database, queue)
@cli.command(help="Get best domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def best(ctx, count):
    """Process the domains returned by ``db.get_best_domains(count, parser)``.

    The result is passed to ``process_domains`` with the ``visit`` flag, the
    parser, the database handle and an optional queue client (``None``
    unless the ``queue`` flag is set).
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_best_domains(count, parser)
    # A queue connection is only opened when --queue was requested.
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(selected, settings["visit"], parser, database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help="Get unvisited domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def unvisited(ctx, count):
    """Process the domains returned by ``db.get_unvisited_domains(count, parser)``.

    The result is passed to ``process_domains`` with the ``visit`` flag, the
    parser, the database handle and an optional queue client (``None``
    unless the ``queue`` flag is set).
    """
    settings = ctx.obj
    database = create_database_from_context(ctx)
    parser = settings["parser"]
    selected = database.get_unvisited_domains(count, parser)
    # A queue connection is only opened when --queue was requested.
    queue = create_queue_from_context(ctx) if settings["queue"] else None
    process_domains(selected, settings["visit"], parser, database, queue)
2020-05-09 09:50:50 +00:00
2020-06-08 14:09:47 +00:00
@cli.command(help="Visit domains from file")
@click.pass_context
@click.argument("name")
def file(ctx, name):
    """Process the domains listed, one per line, in the file ``name``.

    Each non-blank line is stripped of surrounding whitespace and paired
    with ``0`` before being passed to ``process_domains`` along with the
    ``visit`` flag, the parser, the database handle and an optional queue
    client (``None`` unless the ``queue`` flag is set).
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    with open(name) as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines (including a trailing newline at end of file) so
        # that no empty domain name is handed to the crawler.
        # NOTE(review): the second tuple element is always 0; its meaning is
        # defined by process_domains - confirm there.
        domains = [(domain, 0) for domain in stripped_lines if domain]
    q = None
    if ctx.obj["queue"]:
        q = create_queue_from_context(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)
2021-01-21 09:47:29 +00:00
@cli.command(help="Visit one url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit the single URL ``link`` using ``visit_links``.

    A fresh ``Connection`` is created for the visit; the parser comes from
    the click context.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    visit_links([link], Connection(), parser, database)
    # NOTE: no check_domain call is made here; the original one is disabled.
2020-05-09 09:50:50 +00:00
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links selected by ``db.get_visit_links`` for ``domain``.

    The selection is parameterised by the parser's ``recent_links``,
    ``old_links`` and ``random_links`` attributes. After the links are
    visited, ``db.check_domain`` is called for the domain.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj["parser"]
    connection = Connection()
    to_visit = database.get_visit_links(
        domain,
        parser.recent_links,
        parser.old_links,
        parser.random_links,
    )
    visit_links(to_visit, connection, parser, database)
    database.check_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx, domain):
    """Call ``db.check_domain`` for ``domain`` and print its return value."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
2023-02-26 13:10:58 +00:00
@cli.command(help="Export domain as JSON doc per line")
@click.pass_context
@click.argument("domain")
def tojson(ctx, domain):
    """Call ``db.export_domain`` for ``domain``."""
    database = create_database_from_context(ctx)
    database.export_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by queue statistics.

    The queue part is best-effort: any error while connecting to beanstalkd
    or reading the tube statistics is printed instead of aborting the
    command.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        # Use the client as a context manager so the connection is closed
        # once the statistics have been read, even if a lookup fails.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
        buried = stats["current-jobs-buried"]
        ready = stats["current-jobs-ready"]
        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Exception as err:
        # Deliberately broad: the queue may simply be unavailable.
        print(err)
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the parser stored on the click context."""
    database = create_database_from_context(ctx)
    database.summary(ctx.obj["parser"])
2023-02-28 07:56:35 +00:00
@cli.command(help="Create database")
@click.pass_context
@click.argument("replication",default=1)
@click.argument("strategy",default="SimpleStrategy")
def create_database(ctx,replication,strategy):
    """Create the configured Cassandra keyspace and switch the session to it.

    ``replication`` is used as the replication factor and ``strategy`` as
    the replication class of the new keyspace.
    """
    keyspace = ctx.obj["cassandra_keyspace"]
    cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
    # Literal braces of the CQL map must be doubled for str.format; with
    # single braces the format call raised before any query was executed.
    query = "CREATE KEYSPACE {} WITH replication = {{'class':'{}', 'replication_factor' : {}}}".format(keyspace,strategy,replication)
    with cluster.connect() as session:
        session.execute(query)
        session.set_keyspace(keyspace)
        # NOTE(review): `model` is not defined or imported anywhere in the
        # visible part of this module, so this line raises NameError as
        # written. Confirm which module provides create_database().
        model.create_database()
2020-05-07 14:09:45 +00:00
@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema()``."""
    # NOTE(review): defining this command rebinds the module-level name
    # `schema`, which the file also imports from `websucker`.
    print(get_schema())
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx, urls):
    """Download ``urls`` and print the parsed document for every response.

    Each response from ``Connection.html_download2`` is wrapped in a
    ``ParsedDocument`` keyed by the response's canonical link, extracted
    from the response content, and printed.
    """
    parser = ctx.obj["parser"]
    downloader = Connection()
    for response in downloader.html_download2(urls):
        canonical = response.get_canonical()
        document = ParsedDocument(parser, canonical)
        document.extract(response.content, response.bs)
        print(document)
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()