websucker-pip/mongo/cli.py

import click
import mongocrawler
import rq
import redis
import json
import sys
import os
import pymongo
import courlan
from config import *

@click.group()
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
def cli(dbname):
    # Root command group; runs before every subcommand.
    #
    # The original body assigned `DBNAME = dbname` without a `global`
    # declaration, which only created a local variable, so the --dbname
    # option was silently ignored. Rebind the module-level name that the
    # subcommands in this file index the Mongo client with.
    global DBNAME
    DBNAME = dbname
    # NOTE(review): subcommands that delegate straight to mongocrawler do not
    # receive a database name; this assumes mongocrawler reads its own
    # module-level DBNAME at call time — confirm against mongocrawler.
    mongocrawler.DBNAME = dbname

@cli.command()
def createdb():
    # Thin CLI wrapper: calls mongocrawler.createdb() with no arguments.
    # NOTE(review): presumably sets up the crawler database — confirm in mongocrawler.
    mongocrawler.createdb()

@cli.command()
def dropdb():
    # Thin CLI wrapper: calls mongocrawler.dropdb() with no arguments.
    # NOTE(review): presumably destructive (drops the crawler database) and
    # runs without any confirmation prompt — confirm in mongocrawler.
    mongocrawler.dropdb()

@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse document on link for debug """
    # The docstring above doubles as the command's --help text.
    # The LINK argument is forwarded unchanged to mongocrawler.parseurl().
    mongocrawler.parseurl(link)

@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from link """
    # The docstring above doubles as the command's --help text.
    # The LINK argument is forwarded unchanged to mongocrawler.externaldomains().
    mongocrawler.externaldomains(link)

@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ Domain to classify links for debug """
    # The docstring above is shown verbatim by `--help`; the duplicated word
    # ("to  to") in the original help text has been removed.
    # The START_LINK argument is forwarded unchanged to mongocrawler.classify().
    mongocrawler.classify(start_link)

@cli.command()
@click.argument("hostname")
@click.option("--filter_content",default=True,help="Filter content")
def visit(hostname,filter_content=True):
    """ Hostname to crawl """
    # The docstring above doubles as the command's --help text.
    # Forwards HOSTNAME and the --filter_content option (default True) to
    # mongocrawler.visit(); the option is passed through as a keyword argument.
    mongocrawler.visit(hostname,filter_content=filter_content)

@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    # Open the database named by the module-level DBNAME on the server at
    # CONNECTION, then hand it together with HOSTNAME to
    # mongocrawler.link_summary().
    database = pymongo.MongoClient(CONNECTION)[DBNAME]
    mongocrawler.link_summary(database, hostname)


@cli.command()
def summary():
    # Thin CLI wrapper: calls mongocrawler.crawl_summary() with no arguments.
    mongocrawler.crawl_summary()

@cli.command()
def sampledomains():
    # Thin CLI wrapper: calls mongocrawler.sample_domains() with no arguments.
    mongocrawler.sample_domains()

@cli.command()
@click.argument("domain")
def sample(domain):
    # Ask mongocrawler.sample_links() for up to BATCH_SIZE links of status
    # "frontlink" for DOMAIN and print each returned item on its own line.
    # NOTE(review): the meaning of "frontlink" and BATCH_SIZE is defined in
    # mongocrawler/config, not here — confirm there.
    database = pymongo.MongoClient(CONNECTION)[DBNAME]
    sampled = mongocrawler.sample_links(database, domain, "frontlink", BATCH_SIZE)
    for item in sampled:
        print(item)

@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    # Validate START_LINK, fetch the robots rules for its host, collect the
    # front links reachable from it, print them and index them in the database.
    #
    # courlan.check_url() returns None when it rejects the URL; unpacking that
    # directly raised an opaque TypeError. Report it as a normal CLI usage
    # error instead.
    checked = courlan.check_url(start_link)
    if checked is None:
        raise click.BadParameter(
            "not a valid URL: {!r}".format(start_link), param_hint="start_link"
        )
    start_link, hostname = checked

    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    rules = mongocrawler.fetch_robot(hostname)
    links = mongocrawler.fetch_front_links(start_link, rules)
    for link in links:
        # Each entry is indexable; element 0 is what gets printed.
        # NOTE(review): presumably element 0 is the URL — confirm in mongocrawler.
        print(link[0])
    mongocrawler.index_links(db, links)


@cli.command()
@click.argument("hostname")
def processlinks(hostname):
    # Read links from stdin (one per line), fetch and extract them using the
    # robots rules of HOSTNAME, and write every extracted page as one JSON
    # document per line to "data.jsonl" in the current working directory.
    rules = mongocrawler.fetch_robot(hostname)
    outfile = "data.jsonl"
    links = [line.rstrip() for line in sys.stdin]
    # The second element of the result (extracted links) is not used here.
    extracted_pages, _extracted_links = mongocrawler.fetch_and_extract(links, rules)
    with open(outfile, "w") as of:
        for page in extracted_pages:
            # Write the serialized JSON text. The original serialized the page
            # but then printed the raw object, so the file held Python reprs
            # rather than valid JSON Lines.
            print(json.dumps(page), file=of)


@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # Read one target per line from stdin and enqueue a mongocrawler.visit job
    # for each on the default rq queue, echoing the target and the job object.
    # TODO: select queues
    # NOTE(review): CONNECTION is also what this file passes to
    # pymongo.MongoClient; confirm it is really meant to be a redis URL here.
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for line in sys.stdin:
        target = line.strip()
        if not target:
            # Blank lines (e.g. a trailing newline in piped input) would
            # otherwise enqueue a visit job for an empty hostname.
            continue
        print(target)
        job = q.enqueue(mongocrawler.visit, target)
        print(job)

@cli.command()
def importhtml():
    # Thin CLI wrapper: calls mongocrawler.import_html() with no arguments.
    # NOTE(review): the input source is decided inside mongocrawler — confirm there.
    mongocrawler.import_html()

# Invoke the click command group only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()
zz 2023-04-09 07:13:15 +00:00			`import click`
			`import mongocrawler`
zz 2023-04-13 07:10:03 +00:00			`import rq`
zz 2023-04-13 14:11:19 +00:00			`import redis`
zz 2024-03-21 16:31:48 +00:00			`import json`
zz 2023-04-13 14:11:19 +00:00			`import sys`
zz 2023-04-13 07:10:03 +00:00			`import os`
zz 2024-03-19 11:03:33 +00:00			`import pymongo`
zz 2024-03-21 08:00:31 +00:00			`import courlan`
zz 2024-03-19 11:03:33 +00:00			`from config import *`
zz 2023-04-09 07:13:15 +00:00
			`@click.group()`
zz 2023-04-30 11:54:36 +00:00			`@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")`
			`def cli(dbname):`
			`DBNAME=dbname`
zz 2023-04-09 07:13:15 +00:00			`pass`

			`@cli.command()`
			`def createdb():`
			`mongocrawler.createdb()`

zz 2023-04-30 11:54:36 +00:00			`@cli.command()`
			`def dropdb():`
			`mongocrawler.dropdb()`

zz 2023-04-09 07:13:15 +00:00			`@cli.command()`
			`@click.argument("link")`
			`def parseurl(link):`
zz 2023-04-30 11:54:36 +00:00			`""" Parse document on link for debug """`
zz 2023-04-09 07:13:15 +00:00			`mongocrawler.parseurl(link)`

			`@cli.command()`
			`@click.argument("link")`
			`def externaldomains(link):`
zz 2023-04-30 11:54:36 +00:00			`""" Extract external domains from link """`
zz 2023-04-09 07:13:15 +00:00			`mongocrawler.externaldomains(link)`

			`@cli.command()`
			`@click.argument("start_link")`
			`def classify(start_link):`
zz 2023-04-30 11:54:36 +00:00			`""" domain to to classify links for debug """`
zz 2023-04-09 07:13:15 +00:00			`mongocrawler.classify(start_link)`

			`@cli.command()`
zz 2023-04-23 08:02:52 +00:00			`@click.argument("hostname")`
zz 2023-04-17 13:07:58 +00:00			`@click.option("--filter_content",default=True,help="Filter content")`
			`def visit(hostname,filter_content=True):`
zz 2023-04-23 08:02:52 +00:00			`""" Hostname to crawl """`
zz 2023-04-17 13:07:58 +00:00			`mongocrawler.visit(hostname,filter_content=filter_content)`
zz 2023-04-12 12:35:35 +00:00
zz 2024-03-21 11:58:42 +00:00			`@cli.command()`
			`@click.argument("hostname")`
			`def linksummary(hostname):`
			`myclient = pymongo.MongoClient(CONNECTION)`
			`db=myclient[DBNAME]`
			`mongocrawler.link_summary(db,hostname)`


zz 2023-04-12 12:35:35 +00:00			`@cli.command()`
			`def summary():`
			`mongocrawler.crawl_summary()`
zz 2023-04-09 07:13:15 +00:00
zz 2023-04-12 14:39:44 +00:00			`@cli.command()`
			`def sampledomains():`
			`mongocrawler.sample_domains()`

zz 2024-03-19 11:03:33 +00:00			`@cli.command()`
			`@click.argument("domain")`
			`def sample(domain):`
			`myclient = pymongo.MongoClient(CONNECTION)`
			`db=myclient[DBNAME]`
			`links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)`
zz 2024-03-21 16:31:48 +00:00			`for link in links:`
			`print(link)`
zz 2024-03-19 11:03:33 +00:00
zz 2024-03-21 08:00:31 +00:00			`@cli.command()`
			`@click.argument("start_link")`
			`def fetchlinks(start_link):`
			`myclient = pymongo.MongoClient(CONNECTION)`
			`db=myclient[DBNAME]`
			`start_link,hostname = courlan.check_url(start_link)`
			`rules = mongocrawler.fetch_robot(hostname)`
zz 2024-03-21 12:21:43 +00:00			`links = mongocrawler.fetch_front_links(start_link,rules)`
			`for link in links:`
			`print(link[0])`
			`#print(front_links)`
			`mongocrawler.index_links(db,links)`
zz 2024-03-21 08:00:31 +00:00

zz 2024-03-21 12:21:43 +00:00			`@cli.command()`
zz 2024-03-21 16:31:48 +00:00			`@click.argument("hostname")`
			`def processlinks(hostname):`
zz 2024-03-21 12:21:43 +00:00			`rules = mongocrawler.fetch_robot(hostname)`
			`outfile = "data.jsonl"`
			`links = []`
			`for line in sys.stdin:`
			`links.append(line.rstrip())`
zz 2024-03-21 16:31:48 +00:00			`extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)`
			`with open(outfile,"w") as of:`
			`for page in extracted_pages:`
			`doc = json.dumps(page)`
			`print(page,file=of)`
zz 2024-03-21 12:21:43 +00:00
zz 2023-04-13 07:10:03 +00:00
			`@cli.command(help="Enqueue a list of links into redis queue for crawling")`
			`def enqueue():`
zz 2023-04-13 14:37:32 +00:00			`# TODO: select queues`
zz 2024-03-19 11:03:33 +00:00			`q = rq.Queue(connection=redis.from_url(CONNECTION))`
zz 2023-04-13 07:10:03 +00:00			`for l in sys.stdin:`
			`print(l.strip())`
			`r = q.enqueue(mongocrawler.visit, l.strip())`
			`print(r)`

zz 2023-04-13 14:37:32 +00:00			`@cli.command()`
			`def importhtml():`
			`mongocrawler.import_html()`

zz 2023-04-09 07:13:15 +00:00			`if __name__ == "__main__":`
			`cli()`