websucker-pip/mongo/cli.py

63 lines
1.2 KiB
Python
Raw Normal View History

2023-04-09 07:13:15 +00:00
import click
import mongocrawler
2023-04-13 07:10:03 +00:00
import rq
2023-04-13 14:11:19 +00:00
import redis
import sys
2023-04-13 07:10:03 +00:00
import os
# Connection settings are read from the environment once, at import time;
# each falls back to the literal default when the variable is unset.
REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379/")
# Comma-separated queue names. NOTE(review): nothing visible in this module
# reads QUEUES yet -- presumably reserved for queue selection; confirm.
QUEUES = os.environ.get("QUEUES", "high,default,low")
2023-04-09 07:13:15 +00:00
@click.group()
def cli():
    # Root command group: it does no work itself and only serves as the
    # attachment point for the @cli.command() subcommands in this module.
    # (Kept as a comment, not a docstring, so the --help text is unchanged.)
    pass
@cli.command()
def createdb():
    # Takes no CLI arguments; delegates directly to mongocrawler.createdb().
    mongocrawler.createdb()
@cli.command()
@click.argument("link")
def parseurl(link):
    # The positional LINK argument is passed through unchanged to
    # mongocrawler.parseurl(); nothing is validated or printed here.
    mongocrawler.parseurl(link)
@cli.command()
@click.argument("link")
def externaldomains(link):
    # The positional LINK argument is passed through unchanged to
    # mongocrawler.externaldomains().
    mongocrawler.externaldomains(link)
@cli.command()
@click.argument("start_link")
def classify(start_link):
    # The positional START_LINK argument is passed through unchanged to
    # mongocrawler.classify().
    mongocrawler.classify(start_link)
@cli.command()
@click.argument("hostname")
def visit(hostname):
    # Runs mongocrawler.visit() synchronously for the given HOSTNAME.
    # The `enqueue` command schedules this same function through RQ instead.
    mongocrawler.visit(hostname)
@cli.command()
def summary():
    # Takes no CLI arguments. Note the differing name on the crawler side:
    # this command maps to mongocrawler.crawl_summary().
    mongocrawler.crawl_summary()
2023-04-09 07:13:15 +00:00
2023-04-12 14:39:44 +00:00
@cli.command()
def sampledomains():
    # Takes no CLI arguments; maps to mongocrawler.sample_domains().
    mongocrawler.sample_domains()
2023-04-13 07:10:03 +00:00
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Schedule one ``mongocrawler.visit`` job per line of standard input.

    Each line is stripped of surrounding whitespace and passed as the single
    argument of ``mongocrawler.visit``. Jobs are placed on the RQ queue
    reached through ``REDIS_URL``; no queue name is given, so RQ's own
    default queue is used. The stripped value and the object returned by
    ``Queue.enqueue`` are printed for every scheduled job.

    Blank or whitespace-only lines are skipped rather than enqueued.

    The explicit ``help=`` above is what ``--help`` shows, so this docstring
    does not alter the CLI output.
    """
    # TODO: select queues
    queue = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        target = line.strip()
        if not target:
            # An empty target would otherwise be queued as visit("").
            continue
        print(target)
        job = queue.enqueue(mongocrawler.visit, target)
        print(job)
2023-04-13 14:37:32 +00:00
@cli.command()
def importhtml():
    # Takes no CLI arguments; maps to mongocrawler.import_html().
    mongocrawler.import_html()
2023-04-09 07:13:15 +00:00
if __name__ == "__main__":
    # Dispatch to the click group only when run as a script, so importing
    # this module does not trigger command-line parsing.
    cli()