websucker-pip/websucker/cli.py

172 lines
6.4 KiB
Python
Raw Normal View History

2020-05-09 09:50:50 +00:00
from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
2020-05-07 14:09:45 +00:00
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
2020-05-08 05:53:50 +00:00
from websucker.parser import load_parser
2020-05-07 14:09:45 +00:00
from websucker.db import Data
from websucker.db import get_schema
import click
import pprint
2020-05-09 09:50:50 +00:00
import greenstalk
import os
2020-05-07 14:09:45 +00:00
def create_database_from_context(ctx):
    """Build the ``Data`` handle from the Cassandra settings kept in ``ctx.obj``.

    Reads the ``cassandra_keyspace``, ``cassandra_host`` and
    ``cassandra_port`` entries and passes them to ``Data`` in that order.
    """
    settings = ctx.obj
    keyspace = settings["cassandra_keyspace"]
    host = settings["cassandra_host"]
    port = settings["cassandra_port"]
    return Data(keyspace, host, port)
2020-05-09 09:50:50 +00:00
def create_queue_from_context(ctx):
    """Open a greenstalk client from the beanstalkd settings kept in ``ctx.obj``.

    The configured tube is both used (for puts) and watched (for reserves).
    """
    settings = ctx.obj
    host = settings["beanstalkd_host"]
    port = settings["beanstalkd_port"]
    tube = settings["beanstalkd_tube"]
    return greenstalk.Client(host, port, use=tube, watch=tube, encoding="utf8")
2020-05-07 14:09:45 +00:00
@click.group()
@click.pass_context
@click.option(
    "--cassandra-keyspace",
    metavar="CASSANDRA_KEYSPACE",
    help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",
    envvar="CASSANDRA_KEYSPACE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--cassandra-host",
    metavar="CASSANDRA_HOST",
    help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",
    envvar="CASSANDRA_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--cassandra-port",
    metavar="CASSANDRA_PORT",
    help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",
    envvar="CASSANDRA_PORT",
    default=9042,
    show_default=True,
)
@click.option(
    "--beanstalkd-tube",
    metavar="BEANSTALKD_TUBE",
    help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",
    envvar="BEANSTALKD_TUBE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--beanstalkd-host",
    metavar="BEANSTALKD_HOST",
    help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",
    envvar="BEANSTALKD_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--beanstalkd-port",
    metavar="BEANSTALKD_PORT",
    help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",
    envvar="BEANSTALKD_PORT",
    default=11300,
    show_default=True,
)
@click.option(
    "--parser",
    metavar="file_name",
    help="parser definition file (defaults to Suckerfile.py in the current directory when present)",
)
@click.option("--visit", is_flag=True)
@click.option("--queue", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port, beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Websucker crawler command line interface."""
    # NOTE: click shows the docstring above as the group's --help text,
    # so it is kept short and user-facing.
    #
    # This callback resolves the parser and stores every connection
    # setting on ctx.obj, where the subcommands read them.
    ctx.ensure_object(dict)
    p = BaseParser()
    if parser is not None:
        # Validate explicitly instead of with `assert`, which is removed
        # when Python runs with -O and would let a bad path through.
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file {!r} does not exist".format(parser),
                param_hint="--parser",
            )
    else:
        # No explicit parser: fall back to a Suckerfile.py in the cwd.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile
    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "could not load a parser from {!r}".format(parser)
            )
    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
@cli.command(help="All domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def all(ctx, count):
    """Process up to ``count`` domains returned by ``db.all_domains``.

    A beanstalkd client is created only when the ``--queue`` flag was given;
    otherwise ``None`` is handed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], database, job_queue)
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
@cli.command(help="Work queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the configured parser, database and queue."""
    database = create_database_from_context(ctx)
    job_queue = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], database, job_queue)
2020-05-07 14:09:45 +00:00
@cli.command(help="find best domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def best(ctx, count):
    """Process up to ``count`` domains returned by ``db.get_best_domains``.

    A beanstalkd client is created only when the ``--queue`` flag was given;
    otherwise ``None`` is handed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    active_parser = ctx.obj["parser"]
    candidates = database.get_best_domains(count, active_parser)
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(candidates, ctx.obj["visit"], active_parser, database, job_queue)
2020-05-07 14:09:45 +00:00
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count", type=int, default=20)
def unvisited(ctx, count):
    """Process up to ``count`` domains returned by ``db.get_unvisited_domains``.

    A beanstalkd client is created only when the ``--queue`` flag was given;
    otherwise ``None`` is handed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    active_parser = ctx.obj["parser"]
    candidates = database.get_unvisited_domains(count, active_parser)
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(candidates, ctx.obj["visit"], active_parser, database, job_queue)
2020-05-09 09:50:50 +00:00
@cli.command(help="Visit url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit ``link`` and then refresh the statistics of its domain.

    Args:
        ctx: click context carrying the parser and connection settings.
        link: URL to visit first.
    """
    from urllib.parse import urlparse

    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    visit_links([link], c, p, db)
    # This command receives a link, not a domain, so the domain has to be
    # derived from it (a bare `domain` name is not defined in this scope).
    # NOTE(review): assumes check_domain expects the URL's network location,
    # matching the bare domain taken by `crawl` and `check` - confirm.
    domain = urlparse(link).netloc
    if domain:
        db.check_domain(domain)
    else:
        # A link without a scheme has no netloc; skip rather than update
        # statistics for an empty domain.
        click.echo(
            "cannot determine domain of {!r}, statistics not updated".format(link),
            err=True,
        )
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links selected by ``db.get_visit_links`` for ``domain``.

    The selection is driven by the parser's ``recent_links``, ``old_links``
    and ``random_links`` attributes; afterwards ``db.check_domain`` is
    called for the domain.
    """
    database = create_database_from_context(ctx)
    active_parser = ctx.obj["parser"]
    connection = Connection()
    pending = database.get_visit_links(
        domain,
        active_parser.recent_links,
        active_parser.old_links,
        active_parser.random_links,
    )
    visit_links(pending, connection, active_parser, database)
    database.check_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx, domain):
    """Call ``db.check_domain`` for ``domain`` and print what it returns."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best-effort: any error raised while contacting
    beanstalkd is printed instead of aborting the command, so the database
    report is still useful when the queue is unreachable.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        q = create_queue_from_context(ctx)
        try:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
        finally:
            # Release the connection as soon as the statistics are read,
            # including when stats_tube fails.
            q.close()
        buried = stats["current-jobs-buried"]
        ready = stats["current-jobs-ready"]
        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Exception as err:
        # Deliberately broad: this is a reporting boundary and the queue
        # may simply not be running.
        print(err)
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser."""
    create_database_from_context(ctx).summary(ctx.obj["parser"])
2020-05-07 14:09:45 +00:00
@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema``."""
    print(get_schema())
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx, urls):
    """Download ``urls`` and print the parsed document for each response.

    Every response from ``Connection.html_download2`` is parsed against its
    canonical link with the configured parser.
    """
    active_parser = ctx.obj["parser"]
    downloader = Connection()
    for response in downloader.html_download2(urls):
        canonical = response.get_canonical()
        document = ParsedDocument(active_parser, canonical)
        document.extract(response.content, response.bs)
        print(document)
# Run the click group only when the module is executed as a script.
if __name__ == "__main__":
    cli()