import os
import pprint

import click
import greenstalk

import websucker.db
import websucker.schema
from websucker.agent import Connection, visit_links, visit_domain, process_domains, work_domains
from websucker.db import Data
from websucker.db import get_schema
from websucker.parser import BaseParser
from websucker.parser import SoupParser
from websucker.parser import TrafilaturaParser
from websucker.parser import load_parser


def create_database_from_context(ctx):
    """Build a ``Data`` handle from the Cassandra settings stored in ``ctx.obj``."""
    settings = ctx.obj
    return Data(
        settings["cassandra_keyspace"],
        settings["cassandra_host"],
        settings["cassandra_port"],
        settings["cassandra_username"],
        settings["cassandra_password"],
    )


def create_queue_from_context(ctx):
    """Open a beanstalkd client that both uses and watches the configured tube."""
    settings = ctx.obj
    tube = settings["beanstalkd_tube"]
    address = (settings["beanstalkd_host"], settings["beanstalkd_port"])
    return greenstalk.Client(address, use=tube, watch=tube, encoding="utf8")


def _optional_queue(ctx):
    """Return a beanstalkd client when ``--queue`` was given, otherwise ``None``."""
    if ctx.obj["queue"]:
        return create_queue_from_context(ctx)
    return None


def _resolve_parser(parser):
    """Return the parser object selected by the ``--parser`` option.

    ``parser`` is the option value (a file name) or ``None``.  Without an
    explicit value, ``Suckerfile.py`` in the current working directory is used
    when it exists; otherwise a default ``TrafilaturaParser`` is returned.

    Raises:
        click.BadParameter: the explicitly given parser file does not exist.
        click.ClickException: ``load_parser`` returned ``None``.
    """
    if parser is None:
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if not os.path.isfile(suckerfile):
            return TrafilaturaParser()
        parser = suckerfile
    elif not os.path.isfile(parser):
        # Explicit exception instead of `assert`: asserts vanish under `python -O`.
        raise click.BadParameter(
            "parser file {} does not exist".format(parser),
            param_hint="--parser",
        )
    loaded = load_parser(parser)
    if loaded is None:
        raise click.ClickException("no parser could be loaded from {}".format(parser))
    return loaded


# Root command group: resolves the parser and stores every connection setting
# in ctx.obj so that the sub-commands can build their database / queue handles.
# (Kept as a comment, not a docstring, so the generated --help text is unchanged.)
@click.group()
@click.pass_context
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
@click.option("--cassandra-username",metavar="CASSANDRA_USERNAME",help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",envvar="CASSANDRA_USERNAME",default="cassandra",show_default=True)
@click.option("--cassandra-password",metavar="CASSANDRA_PASSWORD",help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",envvar="CASSANDRA_PASSWORD",default="cassandra",show_default=True)
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
@click.option("--parser",metavar="file_name",help="parser file to load (defaults to Suckerfile.py in the current directory if it exists)")
@click.option("--visit",is_flag=True)
@click.option("--queue",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
    ctx.ensure_object(dict)
    ctx.obj["parser"] = _resolve_parser(parser)
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_username"] = cassandra_username
    ctx.obj["cassandra_password"] = cassandra_password
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue


@cli.command(help="Get visited domains from db")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
    """Pass up to ``count`` domains from ``db.all_domains`` to ``process_domains``."""
    db = create_database_from_context(ctx)
    domains = db.all_domains(count)
    q = _optional_queue(ctx)
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], db, q)


@cli.command(help="Get random domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def blind(ctx, count):
    """Pass ``count`` domains from ``db.get_random_domains`` to ``process_domains``."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    domains = db.get_random_domains(count, p)
    q = _optional_queue(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)


@cli.command(help="Visit domains from queue")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the configured parser, database and queue."""
    db = create_database_from_context(ctx)
    q = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], db, q)


@cli.command(help="Get best domains from db")
@click.pass_context
@click.argument("count",type=int,default=20)
def best(ctx, count):
    """Pass ``count`` domains from ``db.get_best_domains`` to ``process_domains``."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    domains = db.get_best_domains(count, p)
    q = _optional_queue(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)


@cli.command(help="Get unvisited domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
    """Pass ``count`` domains from ``db.get_unvisited_domains`` to ``process_domains``."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    domains = db.get_unvisited_domains(count, p)
    q = _optional_queue(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)


@cli.command(help="Visit domains from file")
@click.pass_context
@click.argument("name")
def file(ctx, name):
    """Read one domain per line from file ``name`` and pass them to ``process_domains``.

    Each domain is paired with ``0`` as ``(domain, 0)``.  Blank lines are
    skipped so that an empty domain name is never submitted.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    with open(name) as f:
        stripped = (line.strip() for line in f)
        domains = [(domain, 0) for domain in stripped if domain]
    q = _optional_queue(ctx)
    process_domains(domains, ctx.obj["visit"], p, db, q)


@cli.command(help="Visit one url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Call ``visit_links`` on the single ``link`` with a fresh ``Connection``."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    visit_links([link], c, p, db)


@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit the links ``db.get_visit_links`` selects for ``domain``, then re-check it.

    The parser's ``recent_links``, ``old_links`` and ``random_links``
    attributes are forwarded to ``db.get_visit_links``.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    links = db.get_visit_links(domain, p.recent_links, p.old_links, p.random_links)
    visit_links(links, c, p, db)
    db.check_domain(domain)


@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx,domain):
    """Print the result of ``db.check_domain(domain)``."""
    db = create_database_from_context(ctx)
    res = db.check_domain(domain)
    print(res)


@cli.command(help="Export domain as JSON doc per line")
@click.pass_context
@click.argument("domain")
def tojson(ctx,domain):
    """Call ``db.export_domain(domain)``."""
    db = create_database_from_context(ctx)
    db.export_domain(domain)


@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Run ``db.daily_report()`` and then print ready/buried job counts of the tube.

    The queue part is best effort: any failure while talking to beanstalkd is
    printed instead of aborting the command.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    tube = ctx.obj["beanstalkd_tube"]
    try:
        # Context manager closes the beanstalkd connection once the stats are read.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(tube)
            buried = stats["current-jobs-buried"]
            ready = stats["current-jobs-ready"]
            print("queue {} at {}:{}".format(tube,ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
            print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Exception as err:
        # Deliberately broad: the report must not fail because the queue is unreachable.
        print(err)


@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    db.summary(p)


@cli.command(help="Create database")
@click.pass_context
@click.argument("replication",default=1)
def create_database(ctx,replication):
    """Drop and recreate the configured keyspace, then create the schema in it.

    ``replication`` is passed to ``create_keyspace_simple`` as the replication
    factor.  NOTE(review): this drops the existing keyspace first; confirm
    that this is always intended.
    """
    cluster = websucker.db.connect_cluster(
        ctx.obj["cassandra_host"],
        ctx.obj["cassandra_port"],
        ctx.obj["cassandra_username"],
        ctx.obj["cassandra_password"],
    )
    keyspace = ctx.obj["cassandra_keyspace"]
    with cluster.connect() as session:
        import cassandra.cqlengine.connection
        import cassandra.cqlengine.management as man
        # NOTE(review): set_keyspace runs before the keyspace is (re)created;
        # presumably this fails on a fresh cluster where it does not exist yet - confirm.
        session.set_keyspace(keyspace)
        cassandra.cqlengine.connection.set_session(session)
        man.drop_keyspace(keyspace)
        man.create_keyspace_simple(keyspace, replication)
        websucker.schema.create_database(keyspace, session)


@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema()``."""
    schema_text = get_schema()
    print(schema_text)


@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx,urls):
    """Download ``urls`` and print the parser's extraction for every response."""
    parser = ctx.obj["parser"]
    # Visit first page
    connection = Connection()
    responses = connection.html_download2(urls)
    for res in responses:
        target_link = res.get_canonical()
        pd = parser.full_extract(res.content, res.bs, target_link)
        print(pd)


if __name__ == "__main__":
    cli()