Daniel Hladek 2024-03-19 12:03:33 +01:00
parent c06348080a
commit b6d9260882
3 changed files with 36 additions and 23 deletions


@@ -4,9 +4,8 @@ import rq
 import redis
 import sys
 import os
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+from config import *
 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -55,11 +54,19 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
 
+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    # Print a batch of sampled front links for one domain.
+    myclient = pymongo.MongoClient(CONNECTION)
+    db = myclient[DBNAME]
+    links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
+    print(links)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
-    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    # REDIS_URL must be used here, not the MongoDB CONNECTION string:
+    # redis.from_url() only accepts redis:// style URLs.
+    q = rq.Queue(connection=redis.from_url(REDIS_URL))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
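
The enqueue command only fills the Redis queues; a separate rq worker process has to consume them and run mongocrawler.visit on each link. A minimal worker-side sketch, not part of this commit, assuming the config.py settings introduced below:

import redis
from rq import Worker
from config import REDIS_URL, QUEUES

if __name__ == "__main__":
    # QUEUES is a comma-separated priority list such as "high,default,low".
    Worker(QUEUES.split(","), connection=redis.from_url(REDIS_URL)).work()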

mongo/config.py Normal file

@@ -0,0 +1,24 @@
+import os
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# redis queue, used by the enqueue command in cli.py
+REDIS_URL=os.getenv("REDIS_URL","redis://localhost:6379/")
+QUEUES=os.getenv("QUEUES","high,default,low")
+# retrieving filter
+BATCH_SIZE=int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO=float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE=int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE=int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE=os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN=os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
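
Every setting above is read from the environment once, at import time, with the second argument to os.getenv as the fallback. Deployments can therefore override individual values without editing the file; a small illustration with made-up values:

import os

# The environment must be set before config is first imported.
os.environ["SUCKER_DBNAME"] = "crawler_test"
os.environ["SUCKER_BATCH_SIZE"] = "50"  # parsed with int()

import config
print(config.DBNAME, config.BATCH_SIZE)  # crawler_test 50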


@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json
-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+from config import *
 def get_bs_links(link,html):
     # Extract links from the page
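
get_bs_links appears above only as a hunk boundary; its body is not part of this diff. As a rough illustration of what a BeautifulSoup-based extractor with this signature typically does (a sketch, not the project's actual implementation):

from urllib.parse import urljoin
import bs4

def get_bs_links_sketch(link, html):
    # Resolve every <a href> on the page against the page's own URL.
    bs = bs4.BeautifulSoup(html, "html.parser")
    links = set()
    for anchor in bs.find_all("a", href=True):
        links.add(urljoin(link, anchor["href"]))
    return links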