170 lines
6.2 KiB
Python
170 lines
6.2 KiB
Python
from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
|
|
from websucker.agent import ParsedDocument
|
|
from websucker.parser import BaseParser
|
|
from websucker.parser import normalize_link,urlunparse
|
|
from websucker.parser import load_parser
|
|
from websucker.db import Data
|
|
from websucker.db import get_schema
|
|
import click
|
|
import pprint
|
|
import greenstalk
|
|
import os
|
|
|
|
|
|
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings kept in the click context.

    Args:
        ctx: click context whose ``obj`` mapping holds the keys
            ``cassandra_keyspace``, ``cassandra_host`` and ``cassandra_port``.

    Returns:
        ``Data(keyspace, host, port)``.
    """
    settings = ctx.obj
    keyspace = settings["cassandra_keyspace"]
    host = settings["cassandra_host"]
    port = settings["cassandra_port"]
    return Data(keyspace, host, port)
|
|
|
|
def create_queue_from_context(ctx):
    """Open a beanstalkd client from the settings kept in the click context.

    Args:
        ctx: click context whose ``obj`` mapping holds the keys
            ``beanstalkd_host``, ``beanstalkd_port`` and ``beanstalkd_tube``.

    Returns:
        A ``greenstalk.Client`` that both uses and watches the configured tube.
    """
    settings = ctx.obj
    host = settings["beanstalkd_host"]
    port = settings["beanstalkd_port"]
    tube = settings["beanstalkd_tube"]
    # The same tube serves for producing (use) and consuming (watch).
    return greenstalk.Client(host, port, use=tube, watch=tube, encoding="utf8")
|
|
|
|
|
|
@click.group()
@click.pass_context
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
@click.option("--parser",metavar="file_name",help="path to a parser file (default: Suckerfile.py in the current directory, if it exists)")
@click.option("--visit",is_flag=True)
@click.option("--queue",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
    # Root command group: resolves the parser and stores every global option
    # in ctx.obj so that the subcommands can read them.
    #
    # Deliberately documented with comments rather than a docstring: the group
    # is declared without help=, so click would print a docstring as the
    # group's help text and change the --help output.
    ctx.ensure_object(dict)

    # Default parser; replaced below when a parser file is found.
    p = BaseParser()

    if parser is None:
        # No explicit parser: fall back to ./Suckerfile.py when it exists.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile
    elif not os.path.isfile(parser):
        # Explicit check instead of assert: asserts vanish under `python -O`
        # and a click error gives the user a readable message.
        raise click.BadParameter(
            "parser file {!r} does not exist".format(parser),
            param_hint="--parser",
        )

    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "could not load a parser from {!r}".format(parser)
            )

    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
|
|
|
|
|
|
@cli.command(help="All domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
    """Hand the result of ``db.all_domains(count)`` to ``process_domains``.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        count: integer forwarded to ``db.all_domains`` (defaults to 20).
    """
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    # A queue connection is opened only when --queue was given.
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], database, job_queue)
|
|
|
|
@cli.command(help="Work queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the configured parser, database and queue.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
    """
    database = create_database_from_context(ctx)
    job_queue = create_queue_from_context(ctx)
    active_parser = ctx.obj["parser"]
    work_domains(active_parser, database, job_queue)
|
|
|
|
|
|
@cli.command(help="find best domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def best(ctx, count):
    """Hand the result of ``db.get_best_domains(count)`` to ``process_domains``.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        count: integer forwarded to ``db.get_best_domains`` (defaults to 20).
    """
    database = create_database_from_context(ctx)
    best_domains = database.get_best_domains(count)
    # A queue connection is opened only when --queue was given.
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(best_domains, ctx.obj["visit"], ctx.obj["parser"], database, job_queue)
|
|
|
|
|
|
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
    """Hand the result of ``db.get_unvisited_domains(count)`` to ``process_domains``.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        count: integer forwarded to ``db.get_unvisited_domains`` (defaults to 20).
    """
    database = create_database_from_context(ctx)
    pending_domains = database.get_unvisited_domains(count)
    # A queue connection is opened only when --queue was given.
    job_queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(pending_domains, ctx.obj["visit"], ctx.obj["parser"], database, job_queue)
|
|
|
|
@cli.command(help="Visit url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit a single link, then update the statistics of its domain.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        link: URL passed to ``visit_links`` as a one-element list.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    visit_links([link], c, p, db)

    # The original called db.check_domain(domain) with `domain` undefined,
    # which raised NameError after every visit. Derive it from the link.
    # NOTE(review): assumes the database keys domains by the URL's network
    # location (netloc) - confirm against visit_links / Data.check_domain.
    domain = urlparse(link).netloc
    if domain:
        db.check_domain(domain)
    else:
        # A link without a scheme has an empty netloc; nothing to check.
        click.echo("Cannot determine domain of {}".format(link), err=True)
|
|
|
|
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links selected by ``db.get_visit_links`` for a domain, then re-check it.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        domain: domain whose links are fetched from the database and visited.
    """
    database = create_database_from_context(ctx)
    active_parser = ctx.obj["parser"]
    connection = Connection()
    # The three link quotas come from attributes of the parser object.
    pending_links = database.get_visit_links(
        domain,
        active_parser.recent_links,
        active_parser.old_links,
        active_parser.random_links,
    )
    visit_links(pending_links, connection, active_parser, database)
    database.check_domain(domain)
|
|
|
|
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx,domain):
    """Call ``db.check_domain`` for a domain and print what it returns.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        domain: domain passed to ``db.check_domain``.
    """
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
|
|
|
|
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best-effort: if beanstalkd cannot be reached or
    answers with an error, the error is printed and the command ends
    normally.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        # The context manager closes the client connection, which the
        # original left open.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
    except (greenstalk.Error, OSError) as err:
        # The original caught the undefined name `Error`, which turned any
        # queue failure into a NameError. greenstalk.Error covers protocol
        # errors, OSError covers connection failures.
        print(err)
        return
    buried = stats["current-jobs-buried"]
    ready = stats["current-jobs-ready"]
    print("{} ready jobs, {} burried jobs".format(ready,buried))
|
|
|
|
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
    """
    # The database is created before the parser is looked up.
    create_database_from_context(ctx).summary(ctx.obj["parser"])
|
|
|
|
@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema()``."""
    print(get_schema())
|
|
|
|
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx,urls):
    """Download the given URL(s), parse each response and print the parsed document.

    Args:
        ctx: click context carrying the global options in ``ctx.obj``.
        urls: command-line argument forwarded unchanged to
            ``Connection.html_download2``.
    """
    active_parser = ctx.obj["parser"]
    downloader = Connection()
    for response in downloader.html_download2(urls):
        # Each document is built from the response's canonical link.
        canonical_link = response.get_canonical()
        document = ParsedDocument(active_parser, canonical_link)
        document.extract(response.content, response.bs)
        print(document)
|
|
|
|
# Run the click command group when this file is executed as a script.
if __name__ == "__main__":
    cli()
|