This commit is contained in:
Daniel Hládek 2024-03-21 17:31:48 +01:00
parent ed1d4701b8
commit 5d45569651
2 changed files with 10 additions and 8 deletions

View File

@ -2,6 +2,7 @@ import click
import mongocrawler import mongocrawler
import rq import rq
import redis import redis
import json
import sys import sys
import os import os
import pymongo import pymongo
@ -69,7 +70,8 @@ def sample(domain):
myclient = pymongo.MongoClient(CONNECTION) myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME] db=myclient[DBNAME]
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE) links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
print(links) for link in links:
print(link)
@cli.command() @cli.command()
@click.argument("start_link") @click.argument("start_link")
@ -86,17 +88,18 @@ def fetchlinks(start_link):
@cli.command() @cli.command()
@click.argument(hostname) @click.argument("hostname")
def process_links(): def processlinks(hostname):
rules = mongocrawler.fetch_robot(hostname) rules = mongocrawler.fetch_robot(hostname)
outfile = "data.jsonl" outfile = "data.jsonl"
links = [] links = []
for line in sys.stdin: for line in sys.stdin:
links.append(line.rstrip()) links.append(line.rstrip())
extracted_pages, extracted_links = fetch_and_extract(links,rules) extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
with open(outfile,"w") as of:
for page in extracted_pages: for page in extracted_pages:
print(page) doc = json.dumps(page)
pass print(page,file=of)
@cli.command(help="Enqueue a list of links into redis queue for crawling") @cli.command(help="Enqueue a list of links into redis queue for crawling")

View File

@ -530,7 +530,6 @@ def link_summary(db,hostname):
print(res) print(res)
def sample_links(db,hostname,status,batch_size): def sample_links(db,hostname,status,batch_size):
print("Sampling links")
linkcol = db["links"] linkcol = db["links"]
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}}) res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
cl = LinkClassifier() cl = LinkClassifier()