"""Command-line interface for the mongocrawler project.

Thin click wrappers around :mod:`mongocrawler` operations plus a few
direct MongoDB / redis-queue helpers. Configuration constants
(``CONNECTION``, ``BATCH_SIZE`` and a default ``DBNAME``) come from
``config`` via the star import below.
"""

import click
import mongocrawler
import rq
import redis
import sys
import os
import pymongo
import courlan
from config import *


@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    """Entry point group; records the database name for subcommands."""
    # BUG FIX: the original did `DBNAME = dbname`, which only created a
    # local that was thrown away, so --dbname never affected the
    # module-level DBNAME read by linksummary/sample/fetchlinks.
    global DBNAME
    DBNAME = dbname


@cli.command()
def createdb():
    """Create the crawler database."""
    mongocrawler.createdb()


@cli.command()
def dropdb():
    """Drop the crawler database."""
    mongocrawler.dropdb()


@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse document on link for debug """
    mongocrawler.parseurl(link)


@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from link """
    mongocrawler.externaldomains(link)


@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ domain to to classify links for debug """
    mongocrawler.classify(start_link)


@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, help="Filter content")
def visit(hostname, filter_content=True):
    """ Hostname to crawl """
    mongocrawler.visit(hostname, filter_content=filter_content)


@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """Print a link summary for HOSTNAME from the database."""
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    mongocrawler.link_summary(db, hostname)


@cli.command()
def summary():
    """Print an overall crawl summary."""
    mongocrawler.crawl_summary()


@cli.command()
def sampledomains():
    """Sample domains from the database."""
    mongocrawler.sample_domains()


@cli.command()
@click.argument("domain")
def sample(domain):
    """Sample up to BATCH_SIZE front links for DOMAIN and print them."""
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
    print(links)


@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    """Fetch front links for START_LINK (respecting robots.txt) and index them."""
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    # courlan.check_url normalizes the URL and extracts its host part.
    start_link, hostname = courlan.check_url(start_link)
    rules = mongocrawler.fetch_robot(hostname)
    front_links = mongocrawler.fetch_front_links(start_link, rules)
    print(front_links)
    mongocrawler.index_links(db, front_links)


@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # TODO: select queues
    # NOTE(review): CONNECTION comes from config and is also used as the
    # MongoDB URI above — confirm it is a valid redis URL here.
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for line in sys.stdin:
        link = line.strip()
        print(link)
        r = q.enqueue(mongocrawler.visit, link)
        print(r)


@cli.command()
def importhtml():
    """Import HTML documents into the database."""
    mongocrawler.import_html()


if __name__ == "__main__":
    cli()