import os
import sys

import click
import redis
import rq

import mongocrawler

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
QUEUES = os.getenv("QUEUES", "high,default,low")


@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    # Propagate the option to the library module so all commands use it.
    # (The original assigned a local DBNAME, which had no effect; this
    # assumes mongocrawler reads DBNAME at call time.)
    mongocrawler.DBNAME = dbname


@cli.command()
def createdb():
    """ Create the database """
    mongocrawler.createdb()


@cli.command()
def dropdb():
    """ Drop the database """
    mongocrawler.dropdb()


@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse the document at LINK, for debugging """
    mongocrawler.parseurl(link)


@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from LINK """
    mongocrawler.externaldomains(link)


@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ Classify links starting from START_LINK, for debugging """
    mongocrawler.classify(start_link)


@cli.command()
@click.argument("hostname")
@click.option("--filter_content/--no-filter_content", default=True, help="Filter content")
def visit(hostname, filter_content):
    """ Crawl the given HOSTNAME """
    mongocrawler.visit(hostname, filter_content=filter_content)


@cli.command()
def summary():
    """ Print a crawl summary """
    mongocrawler.crawl_summary()


@cli.command()
def sampledomains():
    """ Sample domains from the database """
    mongocrawler.sample_domains()


@cli.command(help="Enqueue a list of links from stdin into the redis queue for crawling")
def enqueue():
    # TODO: select one of the QUEUES instead of the default queue
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        link = line.strip()
        print(link)
        job = q.enqueue(mongocrawler.visit, link)
        print(job)


@cli.command()
def importhtml():
    """ Import HTML documents """
    mongocrawler.import_html()


if __name__ == "__main__":
    cli()
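
# Example usage (a sketch; assumes this file is saved as cli.py, the
# mongocrawler package is importable, and MongoDB/Redis are reachable):
#
#   python cli.py createdb
#   python cli.py visit example.com --no-filter_content
#   echo "https://example.com/" | python cli.py enqueue
#
# Jobs added by `enqueue` are executed by a separate rq worker process,
# started with the queue names from QUEUES, e.g.:
#
#   rq worker high default low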