2023-04-09 07:13:15 +00:00
|
|
|
import click
|
|
|
|
import mongocrawler
|
2023-04-13 07:10:03 +00:00
|
|
|
import rq
|
2023-04-13 14:11:19 +00:00
|
|
|
import redis
|
|
|
|
import sys
|
2023-04-13 07:10:03 +00:00
|
|
|
import os
|
2024-03-19 11:03:33 +00:00
|
|
|
import pymongo
|
|
|
|
from config import *
|
2023-04-09 07:13:15 +00:00
|
|
|
|
|
|
|
@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    # Root command group; runs before every subcommand with the --dbname value.
    #
    # Rebind the module-level DBNAME. Without the `global` declaration the
    # assignment only creates a function local that is thrown away on return,
    # so --dbname would have no effect on commands that read DBNAME.
    global DBNAME
    DBNAME = dbname
    # The option's default is taken from mongocrawler.DBNAME, so mirror the
    # selected value there too. With the default value this is a no-op.
    # NOTE(review): assumes mongocrawler's functions look up their module-level
    # DBNAME at call time -- confirm against mongocrawler.
    mongocrawler.DBNAME = dbname
|
|
|
|
|
|
|
|
@cli.command()
def createdb():
    # Thin CLI wrapper: delegates to mongocrawler.createdb() with no arguments.
    mongocrawler.createdb()
|
|
|
|
|
2023-04-30 11:54:36 +00:00
|
|
|
@cli.command()
def dropdb():
    # Thin CLI wrapper: delegates to mongocrawler.dropdb() with no arguments.
    mongocrawler.dropdb()
|
|
|
|
|
2023-04-09 07:13:15 +00:00
|
|
|
@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse document on link for debug """
    # The positional LINK argument is forwarded unchanged to mongocrawler.
    mongocrawler.parseurl(link)
|
|
|
|
|
|
|
|
@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from link """
    # The positional LINK argument is forwarded unchanged to mongocrawler.
    mongocrawler.externaldomains(link)
|
|
|
|
|
|
|
|
@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ domain to classify links for debug """
    # The docstring above is what click prints as this command's help text.
    # START_LINK is forwarded unchanged to mongocrawler.classify.
    mongocrawler.classify(start_link)
|
|
|
|
|
|
|
|
@cli.command()
@click.argument("hostname")
@click.option(
    "--filter_content",
    default=True,
    help="Filter content",
)
def visit(hostname, filter_content=True):
    """ Hostname to crawl """
    # Both the positional HOSTNAME and the --filter_content value are passed
    # straight through to mongocrawler.visit.
    mongocrawler.visit(hostname, filter_content=filter_content)
|
2023-04-12 12:35:35 +00:00
|
|
|
|
|
|
|
@cli.command()
def summary():
    # Thin CLI wrapper: delegates to mongocrawler.crawl_summary().
    mongocrawler.crawl_summary()
|
2023-04-09 07:13:15 +00:00
|
|
|
|
2023-04-12 14:39:44 +00:00
|
|
|
@cli.command()
def sampledomains():
    # Thin CLI wrapper: delegates to mongocrawler.sample_domains().
    mongocrawler.sample_domains()
|
|
|
|
|
2024-03-19 11:03:33 +00:00
|
|
|
@cli.command()
@click.argument("domain")
def sample(domain):
    # Calls mongocrawler.sample_links for DOMAIN with the "frontlink" argument
    # and BATCH_SIZE, then prints whatever it returns.
    #
    # The client is used as a context manager so the connection is closed when
    # the command finishes, including when sample_links raises.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
        # Print while the client is still open in case the result is lazy.
        print(links)
|
|
|
|
|
2023-04-13 07:10:03 +00:00
|
|
|
|
|
|
|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # Reads one link per line from stdin and enqueues a mongocrawler.visit job
    # for each, echoing the link and the value returned by Queue.enqueue.
    # TODO: select queues
    # NOTE(review): CONNECTION is also passed to pymongo.MongoClient in the
    # `sample` command; confirm it really holds a redis URL here.
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # A blank line would otherwise enqueue a visit of an empty string.
            continue
        print(link)
        r = q.enqueue(mongocrawler.visit, link)
        print(r)
|
|
|
|
|
2023-04-13 14:37:32 +00:00
|
|
|
@cli.command()
def importhtml():
    # Thin CLI wrapper: delegates to mongocrawler.import_html().
    mongocrawler.import_html()
|
|
|
|
|
2023-04-09 07:13:15 +00:00
|
|
|
if __name__ == "__main__":
    # Dispatch to the click command group only when run as a script.
    cli()
|