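"""Command line interface for the mongocrawler web crawler.

Thin click wrapper around functions from the mongocrawler module.
"""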
import json
import os
import sys
import urllib.parse

import click
import courlan
import pymongo
import redis
import rq

import mongocrawler
from config import *  # expected to provide at least CONNECTION and BATCH_SIZE


@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    # Make the selected database name visible to the subcommands below.
    global DBNAME
    DBNAME = dbname


@cli.command()
def createdb():
    """ Create the database """
    mongocrawler.createdb()


@cli.command()
def dropdb():
    """ Drop the database """
    mongocrawler.dropdb()


@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse the document at the given link (for debugging) """
    mongocrawler.parseurl(link)


@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from link """
    mongocrawler.externaldomains(link)


@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ Classify links for the domain of start_link (for debugging) """
    mongocrawler.classify(start_link)


@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, help="Filter content")
def visit(hostname, filter_content=True):
    """ Crawl the given hostname """
    mongocrawler.visit(hostname, filter_content=filter_content)


@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Summarize stored links for the given hostname """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    mongocrawler.link_summary(db, hostname)


@cli.command()
def summary():
    """ Overall crawl summary """
    mongocrawler.crawl_summary()


@cli.command()
def sampledomains():
    mongocrawler.sample_domains()


@cli.command()
@click.argument("domain")
def sample(domain):
    """ Print a sample of front links for the given domain """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
    for link in links:
        print(link)


@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    """ Fetch front links from start_link and index them in the database """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    start_link, hostname = courlan.check_url(start_link)
    rules = mongocrawler.fetch_robot(hostname)
    links = mongocrawler.fetch_front_links(start_link, rules)
    for link in links:
        print(link[0])
    mongocrawler.index_links(db, links)


@cli.command()
@click.argument("hostname")
def processlinks(hostname):
    """ Read links from stdin, fetch and extract them, and write the results
    into data/data.jsonl, data/extracted.links and data/html/ """
    rules = mongocrawler.fetch_robot(hostname)
    dname = "data"
    outfile = dname + "/data.jsonl"
    loutfile = dname + "/extracted.links"
    htmldir = dname + "/html/"
    links = []
    os.mkdir(dname)
    os.mkdir(htmldir)
    for line in sys.stdin:
        links.append(line.rstrip())
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    # save extracted text
    with open(outfile, "w") as of:
        for page in extracted_pages:
            url, html, doc = page
            # keep the original URL if extraction reports a different canonical one
            if "url" in doc and doc["url"] != url:
                doc["original_url"] = url
            else:
                doc["url"] = url
            # store the raw HTML next to the extracted document
            hname = htmldir + urllib.parse.quote(url, safe="")
            doc["html_filename"] = hname
            with open(hname, "w") as hf:
                print(html, file=hf)
            ddoc = json.dumps(doc)
            print(ddoc, file=of)
    # save extracted links
    with open(loutfile, "w") as of:
        for link in extracted_links:
            print(link, file=of)


@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for l in sys.stdin:
        print(l.strip())
        r = q.enqueue(mongocrawler.visit, l.strip())
        print(r)
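
# The jobs enqueued above are consumed by a separate rq worker process; a
# typical invocation (assuming CONNECTION is a redis:// URL and the default
# queue name) would be:
#   rq worker --url "<redis url>"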


@cli.command()
def importhtml():
    mongocrawler.import_html()


if __name__ == "__main__":
    cli()
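
# Example invocations (assuming this file is saved as cli.py):
#   python cli.py createdb
#   python cli.py --dbname mydb visit example.com
#   cat links.txt | python cli.py processlinks example.com
#   cat links.txt | python cli.py enqueue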