Compare commits

5d45569651b426da014b4445bebcb799c9bdb80d..2f8ead396455ed6deabbc269fb4ac9c2377013f6

No commits in common. "5d45569651b426da014b4445bebcb799c9bdb80d" and "2f8ead396455ed6deabbc269fb4ac9c2377013f6" have entirely different histories.

2 changed files with 8 additions and 21 deletions

View File

@@ -2,7 +2,6 @@ import click
 import mongocrawler
 import rq
 import redis
-import json
 import sys
 import os
 import pymongo
@@ -70,8 +69,7 @@ def sample(domain):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
-    for link in links:
-        print(link)
+    print(links)

 @cli.command()
 @click.argument("start_link")
@@ -88,18 +86,17 @@ def fetchlinks(start_link):
 @cli.command()
-@click.argument("hostname")
-def processlinks(hostname):
+@click.argument(hostname)
+def process_links():
     rules = mongocrawler.fetch_robot(hostname)
     outfile = "data.jsonl"
     links = []
     for line in sys.stdin:
         links.append(line.rstrip())
-    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
-    with open(outfile,"w") as of:
-        for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+        pass

 @cli.command(help="Enqueue a list of links into redis queue for crawling")
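As committed, the head revision of this command would not run: `@click.argument(hostname)` references an undefined name, `process_links()` takes no parameter yet uses `hostname`, `fetch_and_extract` is called without its `mongocrawler.` prefix, and `outfile` is left unused. A minimal sketch of a working version, keeping the names from the diff and assuming `mongocrawler.fetch_robot` and `mongocrawler.fetch_and_extract` behave as in the base revision:

import sys

import click
import mongocrawler  # assumed importable, as in the base revision


@click.group()
def cli():
    pass


@cli.command()
@click.argument("hostname")   # the argument name must be a string literal
def processlinks(hostname):   # the parameter must match the declared argument
    """Read links from stdin, fetch them, and print the extracted pages."""
    rules = mongocrawler.fetch_robot(hostname)
    links = [line.rstrip() for line in sys.stdin]
    # fetch_and_extract lives in the mongocrawler module, so keep the prefix
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    for page in extracted_pages:
        print(page)


if __name__ == "__main__":
    cli()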

View File

@@ -530,6 +530,7 @@ def link_summary(db,hostname):
     print(res)

 def sample_links(db,hostname,status,batch_size):
+    print("Sampling links")
     linkcol = db["links"]
     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
@@ -662,7 +663,6 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)

-
 def index_pages(db,hostname,extracted_pages,filter_content):
     final_states = []
     docs = []
@@ -707,16 +707,6 @@ def visit(hostname,filter_content=True):
     print("Processing links")
     extracted_pages, extracted_links = fetch_and_extract(links,rules)
-    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
-    index_links(db,extracted_links)
-    final_states = []
-    docs = []
-    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
-        final_states.append(status)
-        docs.append(doc)
-    save_batch_info(db,hostname,final_states,docs)
-
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)
     link_summary(db,hostname)
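The block removed from `visit` duplicated work that the surrounding context shows is already done by `index_pages` (whose body, per the previous hunk, also begins with `final_states = []` and `docs = []`). Based on the deleted lines, `index_pages` plausibly consolidates the loop as sketched below; this is reconstructed from the removed code, not taken from the head commit itself:

# Reconstructed from the deleted inline block; index_page and save_batch_info
# are helpers from the base revision and are assumed unchanged.
def index_pages(db, hostname, extracted_pages, filter_content):
    final_states = []
    docs = []
    for original_link, final_link, html, doc in extracted_pages:
        status = index_page(db, original_link, final_link, html, doc, filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db, hostname, final_states, docs)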