64 lines
1.3 KiB
Python
64 lines
1.3 KiB
Python
import click
|
|
import mongocrawler
|
|
import rq
|
|
import redis
|
|
import sys
|
|
import os
|
|
|
|
# Redis connection URL used by the `enqueue` command; taken from the
# REDIS_URL environment variable, falling back to a local default.
REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379/")
# Comma-separated queue names taken from the QUEUES environment variable.
# NOTE(review): not referenced in the visible part of this file.
QUEUES = os.environ.get("QUEUES", "high,default,low")
|
|
|
|
# Root click command group; the subcommands in this module attach to it
# through @cli.command(). Documented with a comment rather than a docstring
# so the help text click generates stays unchanged.
@click.group()
def cli():
    pass
|
|
|
|
# `createdb` subcommand: takes no arguments and delegates directly to
# mongocrawler.createdb().
@cli.command()
def createdb():
    mongocrawler.createdb()
|
|
|
|
# `parseurl` subcommand: forwards the single positional LINK argument to
# mongocrawler.parseurl().
@cli.command()
@click.argument("link")
def parseurl(link):
    mongocrawler.parseurl(link)
|
|
|
|
# `externaldomains` subcommand: forwards the single positional LINK argument
# to mongocrawler.externaldomains().
@cli.command()
@click.argument("link")
def externaldomains(link):
    mongocrawler.externaldomains(link)
|
|
|
|
# `classify` subcommand: forwards the single positional START_LINK argument
# to mongocrawler.classify().
@cli.command()
@click.argument("start_link")
def classify(start_link):
    mongocrawler.classify(start_link)
|
|
|
|
@cli.command()
# click.argument() does not accept a ``help`` keyword (passing one raises
# TypeError while this module is being imported), so the argument is
# described in the docstring, which click displays as the command help.
@click.argument("hostname")
@click.option("--filter_content", default=True, help="Filter content")
def visit(hostname, filter_content=True):
    """Crawl HOSTNAME by calling mongocrawler.visit.

    HOSTNAME is the hostname to crawl. The --filter_content value is
    forwarded unchanged as the ``filter_content`` keyword argument.
    """
    mongocrawler.visit(hostname, filter_content=filter_content)
|
|
|
|
# `summary` subcommand: takes no arguments and delegates directly to
# mongocrawler.crawl_summary().
@cli.command()
def summary():
    mongocrawler.crawl_summary()
|
|
|
|
# `sampledomains` subcommand: takes no arguments and delegates directly to
# mongocrawler.sample_domains().
@cli.command()
def sampledomains():
    mongocrawler.sample_domains()
|
|
|
|
|
|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Read links from standard input and enqueue one crawl job per link.

    Each non-blank input line is stripped of surrounding whitespace, echoed
    to standard output, and submitted to an rq queue (connected through
    REDIS_URL) as a call to ``mongocrawler.visit``. The object returned by
    ``Queue.enqueue`` is printed after each submission.
    """
    # TODO: select queues
    queue = rq.Queue(connection=redis.from_url(REDIS_URL))
    for raw_line in sys.stdin:
        link = raw_line.strip()
        if not link:
            # A blank line would otherwise enqueue a job with an empty
            # hostname, so it is skipped.
            continue
        print(link)
        job = queue.enqueue(mongocrawler.visit, link)
        print(job)
|
|
|
|
# `importhtml` subcommand: takes no arguments and delegates directly to
# mongocrawler.import_html().
@cli.command()
def importhtml():
    mongocrawler.import_html()
|
|
|
|
# Script entry point: dispatch to the click command group only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    cli()
|