# Command-line interface for mongocrawler; most commands delegate directly to mongocrawler functions.
import os
import sys

import click
import redis
import rq

import mongocrawler

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
# Comma-separated queue names; currently unused (see TODO in enqueue).
QUEUES = os.getenv("QUEUES", "high,default,low")


@click.group()
def cli():
    pass


@cli.command()
def createdb():
    mongocrawler.createdb()


@cli.command()
@click.argument("link")
def parseurl(link):
    mongocrawler.parseurl(link)


@cli.command()
@click.argument("link")
def externaldomains(link):
    mongocrawler.externaldomains(link)


@cli.command()
@click.argument("start_link")
def classify(start_link):
    mongocrawler.classify(start_link)


@cli.command()
@click.argument("hostname")
def visit(hostname):
    mongocrawler.visit(hostname)


@cli.command()
def summary():
    mongocrawler.crawl_summary()


@cli.command()
def sampledomains():
    mongocrawler.sample_domains()


@cli.command(help="Enqueue a list of links (one per line on stdin) into the Redis queue for crawling")
def enqueue():
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        link = line.strip()
        print(link)
        job = q.enqueue(mongocrawler.visit, link)
        print(job)


@cli.command()
def importhtml():
    mongocrawler.import_html()


if __name__ == "__main__":
    cli()
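# A minimal usage sketch. It assumes this file is saved as cli.py, that the
# mongocrawler package is importable, and that an RQ worker is listening on
# the configured Redis instance; the file name and links.txt are assumptions
# for illustration, not part of this script:
#
#   python cli.py createdb
#   python cli.py visit example.com
#   cat links.txt | python cli.py enqueue
#   rq worker default --url "$REDIS_URL"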