This commit is contained in:
Daniel Hládek 2024-03-21 09:00:31 +01:00
parent b6d9260882
commit b838a9bbd6

View File

@ -5,6 +5,7 @@ import redis
import sys import sys
import os import os
import pymongo import pymongo
import courlan
from config import * from config import *
@click.group() @click.group()
@ -62,6 +63,18 @@ def sample(domain):
links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE) links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
print(links) print(links)
@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    """Fetch the front links of START_LINK and index them into the database."""
    # courlan.check_url returns None when it rejects the URL; unpacking that
    # directly would die with an opaque TypeError, so report a proper CLI
    # usage error instead.
    checked = courlan.check_url(start_link)
    if checked is None:
        raise click.BadParameter(
            f"not a valid or acceptable URL: {start_link!r}",
            param_hint="start_link",
        )
    # From here on start_link is the URL as returned by courlan.
    start_link, hostname = checked

    # The database handle is created only after the URL is known to be usable.
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]

    # NOTE(review): presumably these are the robots.txt rules for the host;
    # confirm against mongocrawler.fetch_robot.
    rules = mongocrawler.fetch_robot(hostname)
    front_links = mongocrawler.fetch_front_links(start_link, rules)
    print(front_links)
    mongocrawler.index_links(db, front_links)
@cli.command(help="Enqueue a list of links into redis queue for crawling") @cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue(): def enqueue():