"""Command-line interface for websucker, built on click.

The ``cli`` group collects connection settings for Cassandra and beanstalkd
plus the parser to use, stores them in the click context, and the individual
sub-commands read them back from ``ctx.obj``.
"""
import os
import pprint
from urllib.parse import urlsplit

import click
import greenstalk

from websucker.agent import (
    Connection,
    ParsedDocument,
    process_domains,
    visit_domain,
    visit_links,
    work_domains,
)
from websucker.db import Data, get_schema
from websucker.parser import BaseParser, load_parser, normalize_link, urlunparse


def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings stored in ``ctx.obj``."""
    return Data(
        ctx.obj["cassandra_keyspace"],
        ctx.obj["cassandra_host"],
        ctx.obj["cassandra_port"],
    )


def create_queue_from_context(ctx):
    """Build a beanstalkd client from the settings stored in ``ctx.obj``.

    The configured tube is both used (for puts) and watched (for reserves).
    """
    return greenstalk.Client(
        ctx.obj["beanstalkd_host"],
        ctx.obj["beanstalkd_port"],
        use=ctx.obj["beanstalkd_tube"],
        watch=ctx.obj["beanstalkd_tube"],
        encoding="utf8",
    )


def _queue_if_requested(ctx):
    """Return a beanstalkd client when ``--queue`` was given, otherwise ``None``."""
    if ctx.obj["queue"]:
        return create_queue_from_context(ctx)
    return None


def _select_parser(parser_path):
    """Return the parser object the CLI should use.

    An explicit ``--parser`` path must point to an existing file. Without one,
    ``Suckerfile.py`` in the current working directory is used when present;
    otherwise a plain ``BaseParser`` is returned.

    Raises:
        click.BadParameter: the explicit parser path is not a file.
        click.ClickException: ``load_parser`` returned ``None``.
    """
    if parser_path is not None:
        # Explicit exceptions instead of ``assert``: asserts vanish under -O.
        if not os.path.isfile(parser_path):
            raise click.BadParameter(
                "parser file not found: {}".format(parser_path),
                param_hint="--parser",
            )
    else:
        suckerfile = os.path.join(os.getcwd(), "Suckerfile.py")
        if os.path.isfile(suckerfile):
            parser_path = suckerfile

    if parser_path is None:
        return BaseParser()

    loaded = load_parser(parser_path)
    if loaded is None:
        raise click.ClickException(
            "could not load a parser from {}".format(parser_path)
        )
    return loaded


@click.group()
@click.pass_context
@click.option(
    "--cassandra-keyspace",
    metavar="CASSANDRA_KEYSPACE",
    help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",
    envvar="CASSANDRA_KEYSPACE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--cassandra-host",
    metavar="CASSANDRA_HOST",
    help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",
    envvar="CASSANDRA_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--cassandra-port",
    metavar="CASSANDRA_PORT",
    help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",
    envvar="CASSANDRA_PORT",
    default=9042,
    show_default=True,
)
@click.option(
    "--beanstalkd-tube",
    metavar="BEANSTALKD_TUBE",
    help="beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable)",
    envvar="BEANSTALKD_TUBE",
    default="websucker",
    show_default=True,
)
@click.option(
    "--beanstalkd-host",
    metavar="BEANSTALKD_HOST",
    help="beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable)",
    envvar="BEANSTALKD_HOST",
    default="127.0.0.1",
    show_default=True,
)
@click.option(
    "--beanstalkd-port",
    metavar="BEANSTALKD_PORT",
    help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",
    envvar="BEANSTALKD_PORT",
    default=11300,
    show_default=True,
)
@click.option(
    "--parser",
    metavar="file_name",
    help="parser file to load (defaults to ./Suckerfile.py when that file exists)",
)
@click.option("--visit", is_flag=True)
@click.option("--queue", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port,
        beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Command-line entry point for the websucker crawler."""
    # Everything the sub-commands need is stashed in ctx.obj under the same
    # keys they read back: connection settings, the parser and the two flags.
    ctx.ensure_object(dict)
    ctx.obj["parser"] = _select_parser(parser)
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["beanstalkd_host"] = beanstalkd_host
    ctx.obj["beanstalkd_port"] = beanstalkd_port
    ctx.obj["beanstalkd_tube"] = beanstalkd_tube
    ctx.obj["visit"] = visit
    ctx.obj["queue"] = queue


# NOTE: the function name doubles as the command name and is part of the
# module interface, so it is kept even though it shadows the builtin ``all``.
@cli.command(help="All domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def all(ctx, count):
    """Fetch up to ``count`` domains via ``db.all_domains`` and process them."""
    db = create_database_from_context(ctx)
    res = db.all_domains(count)
    q = _queue_if_requested(ctx)
    process_domains(res, ctx.obj["visit"], ctx.obj["parser"], db, q)


@cli.command(help="Work queue")
@click.pass_context
def work(ctx):
    """Hand the parser, database and queue client to ``work_domains``."""
    db = create_database_from_context(ctx)
    q = create_queue_from_context(ctx)
    work_domains(ctx.obj["parser"], db, q)


@cli.command(help="find best domains")
@click.pass_context
@click.argument("count", type=int, default=20)
def best(ctx, count):
    """Fetch up to ``count`` domains via ``db.get_best_domains`` and process them."""
    db = create_database_from_context(ctx)
    domains = db.get_best_domains(count)
    q = _queue_if_requested(ctx)
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], db, q)


@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count", type=int, default=20)
def unvisited(ctx, count):
    """Fetch up to ``count`` domains via ``db.get_unvisited_domains`` and process them."""
    db = create_database_from_context(ctx)
    domains = db.get_unvisited_domains(count)
    q = _queue_if_requested(ctx)
    process_domains(domains, ctx.obj["visit"], ctx.obj["parser"], db, q)


@cli.command(help="Visit url and get links. Start here")
@click.pass_context
@click.argument("link")
def start(ctx, link):
    """Visit ``link`` and then refresh the statistics of its domain."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    visit_links([link], c, p, db)
    # The original referenced an undefined ``domain`` here (NameError).
    # NOTE(review): assumes the database identifies a domain by the URL's
    # network location - confirm against websucker.db.
    domain = urlsplit(link).netloc
    if not domain:
        click.echo(
            "cannot determine domain of {!r}; skipping domain check".format(link),
            err=True,
        )
        return
    db.check_domain(domain)


@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    """Visit links selected by ``db.get_visit_links`` for ``domain``, then re-check it."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    # The parser supplies the three link-count settings used for selection.
    links = db.get_visit_links(domain, p.recent_links, p.old_links, p.random_links)
    visit_links(links, c, p, db)
    db.check_domain(domain)


@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx, domain):
    """Run ``db.check_domain`` for ``domain`` and print its result."""
    db = create_database_from_context(ctx)
    res = db.check_domain(domain)
    print(res)


@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    """Run the database daily report, then print the queue's job counts.

    Queue problems are reported on stdout rather than aborting the command.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    tube = ctx.obj["beanstalkd_tube"]
    try:
        # The client is a context manager; leaving the block closes the socket.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(tube)
    except (greenstalk.Error, OSError) as err:
        # greenstalk.Error covers protocol errors; connection failures are
        # OSError subclasses. The original caught an undefined name ``Error``.
        print(err)
        return
    buried = stats["current-jobs-buried"]
    ready = stats["current-jobs-ready"]
    print("queue {} at {}:{}".format(
        tube, ctx.obj["beanstalkd_host"], ctx.obj["beanstalkd_port"]))
    print("{} ready jobs, {} burried jobs".format(ready, buried))


@cli.command(help="Database summary")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser."""
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    db.summary(p)


@cli.command(help="Print keyspace schema")
def schema():
    """Print the value returned by ``get_schema``."""
    keyspace_schema = get_schema()
    print(keyspace_schema)


@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx, urls):
    """Download ``urls`` and print the parsed document for each response."""
    parser = ctx.obj["parser"]
    # Visit first page
    connection = Connection()
    responses = connection.html_download2(urls)
    for res in responses:
        target_link = res.get_canonical()
        pd = ParsedDocument(parser, target_link)
        pd.extract(res.content, res.bs)
        print(pd)


if __name__ == "__main__":
    cli()