zz
This commit is contained in:
parent
ed1d4701b8
commit
5d45569651
17
mongo/cli.py
17
mongo/cli.py
@ -2,6 +2,7 @@ import click
|
||||
import mongocrawler
|
||||
import rq
|
||||
import redis
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import pymongo
|
||||
@ -69,7 +70,8 @@ def sample(domain):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
|
||||
print(links)
|
||||
for link in links:
|
||||
print(link)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("start_link")
|
||||
@ -86,17 +88,18 @@ def fetchlinks(start_link):
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument(hostname)
|
||||
def process_links():
|
||||
@click.argument("hostname")
|
||||
def processlinks(hostname):
|
||||
rules = mongocrawler.fetch_robot(hostname)
|
||||
outfile = "data.jsonl"
|
||||
links = []
|
||||
for line in sys.stdin:
|
||||
links.append(line.rstrip())
|
||||
extracted_pages, extracted_links = fetch_and_extract(links,rules)
|
||||
for page in extracted_pages:
|
||||
print(page)
|
||||
pass
|
||||
extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
|
||||
with open(outfile,"w") as of:
|
||||
for page in extracted_pages:
|
||||
doc = json.dumps(page)
|
||||
print(page,file=of)
|
||||
|
||||
|
||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
||||
|
@ -530,7 +530,6 @@ def link_summary(db,hostname):
|
||||
print(res)
|
||||
|
||||
def sample_links(db,hostname,status,batch_size):
|
||||
print("Sampling links")
|
||||
linkcol = db["links"]
|
||||
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
|
||||
cl = LinkClassifier()
|
||||
|
Loading…
Reference in New Issue
Block a user