Compare commits
5 Commits
2f8ead3964
...
5d45569651
Author | SHA1 | Date | |
---|---|---|---|
5d45569651 | |||
ed1d4701b8 | |||
bd32d32557 | |||
3bdac3642b | |||
87f84b8eb8 |
15
mongo/cli.py
15
mongo/cli.py
@ -2,6 +2,7 @@ import click
|
|||||||
import mongocrawler
|
import mongocrawler
|
||||||
import rq
|
import rq
|
||||||
import redis
|
import redis
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import pymongo
|
import pymongo
|
||||||
@ -69,7 +70,8 @@ def sample(domain):
|
|||||||
myclient = pymongo.MongoClient(CONNECTION)
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
db=myclient[DBNAME]
|
db=myclient[DBNAME]
|
||||||
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
|
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
|
||||||
print(links)
|
for link in links:
|
||||||
|
print(link)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("start_link")
|
@click.argument("start_link")
|
||||||
@ -86,17 +88,18 @@ def fetchlinks(start_link):
|
|||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument(hostname)
|
@click.argument("hostname")
|
||||||
def process_links():
|
def processlinks(hostname):
|
||||||
rules = mongocrawler.fetch_robot(hostname)
|
rules = mongocrawler.fetch_robot(hostname)
|
||||||
outfile = "data.jsonl"
|
outfile = "data.jsonl"
|
||||||
links = []
|
links = []
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
links.append(line.rstrip())
|
links.append(line.rstrip())
|
||||||
extracted_pages, extracted_links = fetch_and_extract(links,rules)
|
extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
|
||||||
|
with open(outfile,"w") as of:
|
||||||
for page in extracted_pages:
|
for page in extracted_pages:
|
||||||
print(page)
|
doc = json.dumps(page)
|
||||||
pass
|
print(page,file=of)
|
||||||
|
|
||||||
|
|
||||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
||||||
|
@ -530,7 +530,6 @@ def link_summary(db,hostname):
|
|||||||
print(res)
|
print(res)
|
||||||
|
|
||||||
def sample_links(db,hostname,status,batch_size):
|
def sample_links(db,hostname,status,batch_size):
|
||||||
print("Sampling links")
|
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
|
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
|
||||||
cl = LinkClassifier()
|
cl = LinkClassifier()
|
||||||
@ -663,6 +662,7 @@ def classify(start_link):
|
|||||||
cl.train(trainset)
|
cl.train(trainset)
|
||||||
cl.test(testset)
|
cl.test(testset)
|
||||||
|
|
||||||
|
|
||||||
def index_pages(db,hostname,extracted_pages,filter_content):
|
def index_pages(db,hostname,extracted_pages,filter_content):
|
||||||
final_states = []
|
final_states = []
|
||||||
docs = []
|
docs = []
|
||||||
@ -707,6 +707,16 @@ def visit(hostname,filter_content=True):
|
|||||||
print("Processing links")
|
print("Processing links")
|
||||||
extracted_pages, extracted_links = fetch_and_extract(links,rules)
|
extracted_pages, extracted_links = fetch_and_extract(links,rules)
|
||||||
|
|
||||||
|
extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
|
||||||
|
|
||||||
|
index_links(db,extracted_links)
|
||||||
|
final_states = []
|
||||||
|
docs = []
|
||||||
|
for original_link,final_link,html,doc in extracted_pages:
|
||||||
|
status = index_page(db,original_link,final_link,html,doc,filter_content)
|
||||||
|
final_states.append(status)
|
||||||
|
docs.append(doc)
|
||||||
|
save_batch_info(db,hostname,final_states,docs)
|
||||||
index_pages(db,hostname,extracted_pages,filter_content)
|
index_pages(db,hostname,extracted_pages,filter_content)
|
||||||
index_links(db, extracted_links)
|
index_links(db, extracted_links)
|
||||||
link_summary(db,hostname)
|
link_summary(db,hostname)
|
||||||
|
Loading…
Reference in New Issue
Block a user