From b838a9bbd63b74f6896840e8d5863ef9185c903a Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 21 Mar 2024 09:00:31 +0100
Subject: [PATCH] zz

---
Reviewer note: amended so that fetchlinks reports a usage error when
courlan.check_url rejects the URL instead of failing on tuple unpacking.
The blob ids on the index line are those of the original revision, and
the second hunk carries only the trailing context lines available here.

 mongo/cli.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/mongo/cli.py b/mongo/cli.py
index c0adb3b..ccec053 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -5,6 +5,7 @@ import redis
 import sys
 import os
 import pymongo
+import courlan
 from config import *
 
 @click.group()
@@ -62,5 +63,23 @@ def sample(domain):
     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
     print(links)
 
+@cli.command()
+@click.argument("start_link")
+def fetchlinks(start_link):
+    """Fetch front links starting at START_LINK and index them."""
+    # courlan.check_url returns None for a URL it rejects; unpacking that
+    # would raise TypeError, so report a usage error instead.
+    checked = courlan.check_url(start_link)
+    if checked is None:
+        raise click.BadParameter(f"not a valid URL: {start_link}")
+    start_link,hostname = checked
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    rules = mongocrawler.fetch_robot(hostname)
+    front_links = mongocrawler.fetch_front_links(start_link,rules)
+    print(front_links)
+    mongocrawler.index_links(db,front_links)
+
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():