63 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			63 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import click
 | 
						|
import mongocrawler
 | 
						|
import rq
 | 
						|
import redis
 | 
						|
import sys
 | 
						|
import os
 | 
						|
 | 
						|
# Connection settings, overridable through environment variables of the same name.
REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379/")
# Comma-separated queue names; read from the environment with a built-in default.
QUEUES = os.environ.get("QUEUES", "high,default,low")
 | 
						|
 | 
						|
@click.group()
def cli():
    # Root click group: it has no behavior of its own and only serves as the
    # attachment point for the subcommands registered with @cli.command().
    # (Kept as a comment rather than a docstring so the --help text is unchanged.)
    pass
 | 
						|
 | 
						|
@cli.command()
def createdb():
    # Subcommand `createdb`: takes no arguments and delegates directly to
    # mongocrawler.createdb().
    mongocrawler.createdb()
 | 
						|
 | 
						|
@cli.command()
@click.argument("link")
def parseurl(link):
    # Subcommand `parseurl LINK`: forwards the positional LINK argument,
    # unchanged, to mongocrawler.parseurl().
    mongocrawler.parseurl(link)
 | 
						|
 | 
						|
@cli.command()
@click.argument("link")
def externaldomains(link):
    # Subcommand `externaldomains LINK`: forwards the positional LINK argument,
    # unchanged, to mongocrawler.externaldomains().
    mongocrawler.externaldomains(link)
 | 
						|
 | 
						|
@cli.command()
@click.argument("start_link")
def classify(start_link):
    # Subcommand `classify START_LINK`: forwards the positional START_LINK
    # argument, unchanged, to mongocrawler.classify().
    mongocrawler.classify(start_link)
 | 
						|
 | 
						|
@cli.command()
@click.argument("hostname")
def visit(hostname):
    # Subcommand `visit HOSTNAME`: forwards the positional HOSTNAME argument,
    # unchanged, to mongocrawler.visit().
    mongocrawler.visit(hostname)
 | 
						|
 | 
						|
@cli.command()
def summary():
    # Subcommand `summary`: takes no arguments and delegates to
    # mongocrawler.crawl_summary() (note the differing function name).
    mongocrawler.crawl_summary()
 | 
						|
 | 
						|
@cli.command()
def sampledomains():
    # Subcommand `sampledomains`: takes no arguments and delegates to
    # mongocrawler.sample_domains().
    mongocrawler.sample_domains()
 | 
						|
 | 
						|
 | 
						|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Read links from standard input and enqueue a crawl job for each.

    Every non-blank input line is stripped of surrounding whitespace and
    scheduled as ``mongocrawler.visit(link)`` on an RQ queue whose Redis
    connection is built from ``REDIS_URL``. The link and the object returned
    by ``Queue.enqueue`` are printed for each scheduled job.

    The command's ``--help`` text comes from the explicit ``help=`` argument
    above, not from this docstring.
    """
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # Blank or whitespace-only lines (e.g. a trailing newline in the
            # input) would otherwise enqueue a job for an empty link.
            continue
        print(link)
        job = q.enqueue(mongocrawler.visit, link)
        print(job)
 | 
						|
 | 
						|
@cli.command()
def importhtml():
    # Subcommand `importhtml`: takes no arguments and delegates to
    # mongocrawler.import_html().
    mongocrawler.import_html()
 | 
						|
 | 
						|
# Script entry point: dispatch to the click group only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    cli()
 |