import json
import os
import sys
import urllib.parse

import click
import courlan
import pymongo
import redis
import rq

import mongocrawler
from config import *


@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    # Override the configured database name for the commands below.
    global DBNAME
    DBNAME = dbname


@cli.command()
def createdb():
    """ Create the crawler database """
    mongocrawler.createdb()


@cli.command()
def dropdb():
    """ Drop the crawler database """
    mongocrawler.dropdb()


@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse the document at LINK (for debugging) """
    mongocrawler.parseurl(link)


@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from LINK """
    mongocrawler.externaldomains(link)


@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ Classify the links of the domain of START_LINK (for debugging) """
    mongocrawler.classify(start_link)


@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, type=bool, help="Filter content")
def visit(hostname, filter_content):
    """ Crawl HOSTNAME """
    mongocrawler.visit(hostname, filter_content=filter_content)


@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Print a link summary for HOSTNAME """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    mongocrawler.link_summary(db, hostname)


@cli.command()
def summary():
    """ Print a crawl summary """
    mongocrawler.crawl_summary()


@cli.command()
def sampledomains():
    """ Print a sample of domains """
    mongocrawler.sample_domains()


@cli.command()
@click.argument("domain")
def sample(domain):
    """ Print a sample of front links for DOMAIN """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
    for link in links:
        print(link)


@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    """ Fetch front links starting from START_LINK, print them and index them """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    start_link, hostname = courlan.check_url(start_link)
    rules = mongocrawler.fetch_robot(hostname)
    links = mongocrawler.fetch_front_links(start_link, rules)
    for link in links:
        print(link[0])
    mongocrawler.index_links(db, links)


@cli.command()
@click.argument("hostname")
def processlinks(hostname):
    """ Read links from stdin, fetch and extract them, save results under data/ """
    rules = mongocrawler.fetch_robot(hostname)
    dname = "data"
    outfile = dname + "/data.jsonl"
    loutfile = dname + "/extracted.links"
    htmldir = dname + "/html/"
    links = []
    os.mkdir(dname)
    os.mkdir(htmldir)
    for line in sys.stdin:
        links.append(line.rstrip())
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    # save extracted text, one JSON document per line; raw HTML goes to data/html/
    with open(outfile, "w") as of:
        for page in extracted_pages:
            url, html, doc = page
            if "url" in doc and doc["url"] != url:
                doc["original_url"] = url
            else:
                doc["url"] = url
            hname = htmldir + urllib.parse.quote(url, safe="")
            doc["html_filename"] = hname
            with open(hname, "w") as hf:
                print(html, file=hf)
            print(json.dumps(doc), file=of)
    # save links extracted from the fetched pages
    with open(loutfile, "w") as of:
        for link in extracted_links:
            print(link, file=of)


@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for l in sys.stdin:
        print(l.strip())
        r = q.enqueue(mongocrawler.visit, l.strip())
        print(r)


@cli.command()
def importhtml():
    """ Import HTML documents """
    mongocrawler.import_html()


if __name__ == "__main__":
    cli()
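
# A minimal usage sketch, assuming this script is saved as cli.py and that
# config.py defines CONNECTION, DBNAME and BATCH_SIZE. The file name, the
# database name "crawler", the hostname example.com and links.txt are
# illustrative assumptions, not part of the project; the command names come
# from the commands defined above.
#
#   python cli.py createdb
#   python cli.py --dbname crawler visit example.com
#   python cli.py fetchlinks https://example.com/ | python cli.py processlinks example.com
#   python cli.py enqueue < links.txt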