# websucker-pip/mongo/cli.py
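"""Command-line interface for the websucker MongoDB crawler.

Each command wraps a function from the mongocrawler module; CONNECTION and
BATCH_SIZE are assumed to be defined in config.py (pulled in by the star import).
"""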
import json
import os
import sys
import urllib.parse

import click
import courlan
import pymongo
import redis
import rq

import mongocrawler
from config import *

@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    """ Websucker crawler CLI """
    # Make the chosen database name visible to all subcommands.
    global DBNAME
    DBNAME = dbname

@cli.command()
def createdb():
    """ Create the crawler database """
    mongocrawler.createdb()

@cli.command()
def dropdb():
    """ Drop the crawler database """
    mongocrawler.dropdb()

@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse the document at the link, for debugging """
    mongocrawler.parseurl(link)

@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from the link """
    mongocrawler.externaldomains(link)

@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ Classify links on the given domain, for debugging """
    mongocrawler.classify(start_link)

@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, help="Filter content")
def visit(hostname, filter_content):
    """ Crawl the given hostname """
    mongocrawler.visit(hostname, filter_content=filter_content)
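
# Example invocation (example.com is a placeholder):
#   python cli.py visit example.com --filter_content true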

@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Print a link summary for the given hostname """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    mongocrawler.link_summary(db, hostname)

@cli.command()
def summary():
    """ Print a summary of the crawl """
    mongocrawler.crawl_summary()

@cli.command()
def sampledomains():
    """ Print a sample of domains """
    mongocrawler.sample_domains()

@cli.command()
@click.argument("domain")
def sample(domain):
    """ Print a sample of stored front links for the given domain """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
    for link in links:
        print(link)
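
# The sampled links can be piped into processlinks, e.g. (example.com is a placeholder):
#   python cli.py sample example.com | python cli.py processlinks example.com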

@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    """ Fetch front links starting from the given link and index them """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    start_link, hostname = courlan.check_url(start_link)
    rules = mongocrawler.fetch_robot(hostname)
    links = mongocrawler.fetch_front_links(start_link, rules)
    for link in links:
        print(link[0])
    mongocrawler.index_links(db, links)

@cli.command()
@click.argument("hostname")
def processlinks(hostname):
    """ Fetch links read from stdin, extract their text and store the results on disk """
    rules = mongocrawler.fetch_robot(hostname)
    dname = "data"
    outfile = dname + "/data.jsonl"
    loutfile = dname + "/extracted.links"
    htmldir = dname + "/html/"
    # Creates the data directory and its html subdirectory in one call.
    os.makedirs(htmldir, exist_ok=True)
    links = []
    for line in sys.stdin:
        links.append(line.rstrip())
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    # Save extracted text, one JSON document per line.
    with open(outfile, "w") as of:
        for page in extracted_pages:
            url, html, doc = page
            # Keep the fetched URL when extraction reports a different one.
            if "url" in doc and doc["url"] != url:
                doc["original_url"] = url
            else:
                doc["url"] = url
            # Store the raw HTML under a URL-encoded file name.
            hname = htmldir + urllib.parse.quote(url, safe="")
            doc["html_filename"] = hname
            with open(hname, "w") as hf:
                print(html, file=hf)
            print(json.dumps(doc), file=of)
    # Save the extracted links.
    with open(loutfile, "w") as of:
        for link in extracted_links:
            print(link, file=of)
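
# processlinks leaves three artifacts under ./data: data.jsonl with the
# extracted documents, html/ with the raw pages, and extracted.links.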

@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # TODO: select queues
    # NOTE: CONNECTION from config is used as the redis URL here.
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for l in sys.stdin:
        print(l.strip())
        r = q.enqueue(mongocrawler.visit, l.strip())
        print(r)
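
# Links can be queued from a file, e.g.: python cli.py enqueue < links.txt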

@cli.command()
def importhtml():
    """ Import HTML """
    mongocrawler.import_html()

if __name__ == "__main__":
    cli()