This commit is contained in:
Daniel Hládek 2024-03-21 17:31:48 +01:00
parent ed1d4701b8
commit 5d45569651
2 changed files with 10 additions and 8 deletions

View File

@ -2,6 +2,7 @@ import click
import mongocrawler
import rq
import redis
import json
import sys
import os
import pymongo
@ -69,7 +70,8 @@ def sample(domain):
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
print(links)
for link in links:
print(link)
@cli.command()
@click.argument("start_link")
@ -86,17 +88,18 @@ def fetchlinks(start_link):
@cli.command()
@click.argument("hostname")
def processlinks(hostname):
    """Process links read from standard input and write the extracted pages to data.jsonl, one JSON document per line."""
    # Rules for the host are fetched once and passed to the extractor.
    # NOTE(review): presumably these are the host's robots.txt rules - confirm
    # against mongocrawler.fetch_robot.
    rules = mongocrawler.fetch_robot(hostname)
    outfile = "data.jsonl"
    # One link per input line; trailing whitespace/newline is stripped.
    links = [line.rstrip() for line in sys.stdin]
    # The second returned value (extracted links) is not used by this command.
    extracted_pages, _ = mongocrawler.fetch_and_extract(links, rules)
    with open(outfile, "w") as of:
        for page in extracted_pages:
            # Write the serialized JSON document, not the Python repr of the
            # page object, so that every line of the output file is valid JSON.
            doc = json.dumps(page)
            print(doc, file=of)
@cli.command(help="Enqueue a list of links into redis queue for crawling")

View File

@ -530,7 +530,6 @@ def link_summary(db,hostname):
print(res)
def sample_links(db,hostname,status,batch_size):
print("Sampling links")
linkcol = db["links"]
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
cl = LinkClassifier()