228 lines
8.8 KiB
Python
228 lines
8.8 KiB
Python
from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
|
|
from websucker.agent import ParsedDocument
|
|
from websucker.parser import BaseParser
|
|
from websucker.parser import normalize_link,urlunparse
|
|
from websucker.parser import load_parser
|
|
from websucker.db import Data
|
|
from websucker.db import get_schema
|
|
import websucker.db
|
|
import click
|
|
import pprint
|
|
import greenstalk
|
|
import os
|
|
|
|
import websucker.schema
|
|
|
|
|
|
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings stored in ``ctx.obj``.

    The keyspace, host, port, username and password entries are read from
    the context dictionary and passed to ``Data`` in that order.
    """
    settings = ctx.obj
    return Data(
        settings["cassandra_keyspace"],
        settings["cassandra_host"],
        settings["cassandra_port"],
        settings["cassandra_username"],
        settings["cassandra_password"],
    )
|
|
|
|
def create_queue_from_context(ctx):
    """Open a ``greenstalk.Client`` using the beanstalkd settings in ``ctx.obj``.

    The configured tube is both used and watched by the returned client,
    and job bodies are encoded as ``utf8``.
    """
    settings = ctx.obj
    address = (settings["beanstalkd_host"], settings["beanstalkd_port"])
    tube = settings["beanstalkd_tube"]
    return greenstalk.Client(address, use=tube, watch=tube, encoding="utf8")
|
|
|
|
|
|
@click.group()
@click.pass_context
@click.option(
    "--cassandra-keyspace",
    metavar="CASSANDRA_KEYSPACE",
    help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",
    envvar="CASSANDRA_KEYSPACE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--cassandra-host",
    metavar="CASSANDRA_HOST",
    help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",
    envvar="CASSANDRA_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--cassandra-port",
    metavar="CASSANDRA_PORT",
    help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",
    envvar="CASSANDRA_PORT",
    default=9042,
    show_default=True,
)
@click.option(
    "--cassandra-username",
    metavar="CASSANDRA_USERNAME",
    help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",
    envvar="CASSANDRA_USERNAME",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--cassandra-password",
    metavar="CASSANDRA_PASSWORD",
    help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",
    envvar="CASSANDRA_PASSWORD",
    default="cassandra",
    show_default=True,
)
@click.option(
    "--beanstalkd-tube",
    metavar="BEANSTALKD_TUBE",
    help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",
    envvar="BEANSTALKD_TUBE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--beanstalkd-host",
    metavar="BEANSTALKD_HOST",
    help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",
    envvar="BEANSTALKD_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--beanstalkd-port",
    metavar="BEANSTALKD_PORT",
    help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",
    envvar="BEANSTALKD_PORT",
    default=11300,
    show_default=True,
)
@click.option(
    "--parser",
    metavar="file_name",
    help="parser file to load (default: Suckerfile.py in the current directory, if present)",
)
@click.option("--visit", is_flag=True)
@click.option("--queue", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port, cassandra_username, cassandra_password, beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Command-line entry point for the websucker crawler."""
    # NOTE: the docstring above is shown by click as the group's help text,
    # so it is intentionally kept to a single line.
    #
    # This callback stores every option in ``ctx.obj`` so that subcommands
    # can build database/queue connections and reach the parser object.
    ctx.ensure_object(dict)
    p = BaseParser()
    if parser is not None:
        # Explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let a bad path slip through.
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file does not exist: {}".format(parser),
                param_hint="--parser",
            )
    else:
        # No parser given: fall back to a Suckerfile.py in the working
        # directory when one exists, otherwise keep the BaseParser above.
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser = suckerfile
    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "no parser could be loaded from {}".format(parser)
            )
    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_username"] = cassandra_username
    ctx.obj["cassandra_password"] = cassandra_password
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue
|
|
|
|
|
|
@cli.command(help="Get visited domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def all(ctx, count):
    """Fetch up to ``count`` domains via ``db.all_domains`` and process them.

    A beanstalkd queue client is created only when the ``queue`` flag is set
    in the context; otherwise ``None`` is passed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], database, queue)
|
|
|
|
@cli.command(help="Get random domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def blind(ctx, count):
    """Select domains with ``db.get_random_domains`` and process them.

    ``count`` and the parser from the context are passed to the selection
    call. The queue client is only created when the ``queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    selected = database.get_random_domains(count, page_parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], page_parser, database, queue)
|
|
|
|
@cli.command(help="Visit domains from queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the context parser, a database and a queue.

    Unlike the domain-selection commands, the queue client is always
    created here, regardless of the ``queue`` flag.
    """
    database = create_database_from_context(ctx)
    queue = create_queue_from_context(ctx)
    page_parser = ctx.obj["parser"]
    work_domains(page_parser, database, queue)
|
|
|
|
|
|
@cli.command(help="Get best domains from db")
@click.pass_context
@click.argument("count", type=int, default=20)
def best(ctx, count):
    """Select domains with ``db.get_best_domains`` and process them.

    ``count`` and the parser from the context are passed to the selection
    call. The queue client is only created when the ``queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    selected = database.get_best_domains(count, page_parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], page_parser, database, queue)
|
|
|
|
|
|
@cli.command(help="Get unvisited domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def unvisited(ctx, count):
    """Select domains with ``db.get_unvisited_domains`` and process them.

    ``count`` and the parser from the context are passed to the selection
    call. The queue client is only created when the ``queue`` flag is set.
    """
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    selected = database.get_unvisited_domains(count, page_parser)
    queue = create_queue_from_context(ctx) if ctx.obj["queue"] else None
    process_domains(selected, ctx.obj["visit"], page_parser, database, queue)
|
|
|
|
@cli.command(help="Visit domains from file")
@click.pass_context
@click.argument("name")
def file(ctx, name):
    """Read one domain per line from the file ``name`` and process them.

    Each non-blank line is stripped of surrounding whitespace and paired
    with ``0`` before being handed to ``process_domains``. Blank lines are
    skipped so that no empty domain name is processed.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    with open(name) as f:
        stripped_lines = (line.strip() for line in f)
        # NOTE(review): the second tuple element is always 0; presumably a
        # score/priority slot expected by process_domains - confirm.
        domains = [(domain, 0) for domain in stripped_lines if domain]
    q = None
    if ctx.obj["queue"]:
        q = create_queue_from_context(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)
|
|
|
|
@cli.command(help="Visit one url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit the single ``link`` through ``visit_links`` with a new connection."""
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    connection = Connection()
    visit_links([link], connection, page_parser, database)
    # A db.check_domain(...) call used to follow here but was commented out
    # in the original; no domain check is performed by this command.
|
|
|
|
@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links selected for ``domain`` and then run ``db.check_domain``.

    The links come from ``db.get_visit_links``, called with the parser's
    ``recent_links``, ``old_links`` and ``random_links`` attributes.
    """
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    connection = Connection()
    to_visit = database.get_visit_links(
        domain,
        page_parser.recent_links,
        page_parser.old_links,
        page_parser.random_links,
    )
    visit_links(to_visit, connection, page_parser, database)
    database.check_domain(domain)
|
|
|
|
@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx, domain):
    """Run ``db.check_domain`` for ``domain`` and print its result."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
|
|
|
|
@cli.command(help="Export domain as JSON doc per line")
@click.pass_context
@click.argument("domain")
def tojson(ctx, domain):
    """Call ``db.export_domain`` for the given ``domain``."""
    database = create_database_from_context(ctx)
    database.export_domain(domain)
|
|
|
|
@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best-effort: any error raised while connecting to
    beanstalkd or reading the tube statistics is printed instead of
    aborting the command, so the database report is still produced.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        q = create_queue_from_context(ctx)
        try:
            stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
        finally:
            # Release the beanstalkd connection as soon as the statistics
            # have been read (or the request failed).
            q.close()
        buried = stats["current-jobs-buried"]
        ready = stats["current-jobs-ready"]
        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"], ctx.obj["beanstalkd_host"], ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} buried jobs".format(ready, buried))
    except Exception as err:
        # Deliberately broad: the queue may be unreachable and the report
        # should still finish; the error is shown to the user.
        print(err)
|
|
|
|
@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the parser stored in the context."""
    database = create_database_from_context(ctx)
    page_parser = ctx.obj["parser"]
    database.summary(page_parser)
|
|
|
|
@cli.command(help="Create database")
@click.pass_context
@click.argument("replication", default=1)
def create_database(ctx, replication):
    """Drop and re-create the configured keyspace, then create the schema.

    Connects to the cluster with the Cassandra settings from the context,
    drops the keyspace, creates it again with ``create_keyspace_simple``
    using ``replication``, and finally calls
    ``websucker.schema.create_database``.

    WARNING: the existing keyspace is dropped as part of this command.
    """
    settings = ctx.obj
    keyspace = settings["cassandra_keyspace"]
    cluster = websucker.db.connect_cluster(
        settings["cassandra_host"],
        settings["cassandra_port"],
        settings["cassandra_username"],
        settings["cassandra_password"],
    )
    with cluster.connect() as session:
        from cassandra.cqlengine import connection as cql_connection
        from cassandra.cqlengine import management

        # NOTE(review): the keyspace is selected before it is (re)created;
        # this presumably fails when the keyspace does not exist yet -
        # confirm against a fresh cluster before changing the order.
        session.set_keyspace(keyspace)
        cql_connection.set_session(session)
        management.drop_keyspace(keyspace)
        management.create_keyspace_simple(keyspace, replication)
        websucker.schema.create_database(keyspace, session)
|
|
|
|
@cli.command(help="Print keyspace schema")
def schema():
    """Print whatever ``get_schema()`` returns."""
    print(get_schema())
|
|
|
|
@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx, urls):
    """Download ``urls`` and print the parser's extraction for each response.

    NOTE(review): despite the plural name, ``urls`` is a single positional
    argument (one string) - confirm that ``html_download2`` expects that.
    """
    parser = ctx.obj["parser"]
    connection = Connection()
    for response in connection.html_download2(urls):
        # The canonical link is resolved first and passed to the extractor.
        canonical_link = response.get_canonical()
        document = parser.full_extract(
            response.content, response.bs, parser, canonical_link
        )
        print(document)
|
|
|
|
# Run the click command group when this file is executed as a script.
if __name__ == "__main__":
    cli()
|