diff --git a/mongo/cli.py b/mongo/cli.py
index 8dc2d7f..c0adb3b 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -4,9 +4,8 @@ import rq
 import redis
 import sys
 import os
-
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+from config import *
 
 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -55,11 +54,19 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
 
+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db = myclient[DBNAME]
+    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
+    print(links)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
     q = rq.Queue(connection=redis.from_url(REDIS_URL))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
 
diff --git a/mongo/config.py b/mongo/config.py
new file mode 100644
index 0000000..7f959c8
--- /dev/null
+++ b/mongo/config.py
@@ -0,0 +1,24 @@
+import os
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# redis queue options
+REDIS_URL=os.getenv("REDIS_URL","redis://localhost:6379/")
+QUEUES=os.getenv("QUEUES","high,default,low")
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 19a7353..6bae2b2 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json
 
-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+from config import *
 
 def get_bs_links(link,html):
     # Extrakcia linkov zo stranky
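
For reference, a quick smoke test of the code path behind the new sample command, not part of the patch. A minimal sketch, assuming a MongoDB instance reachable at the default CONNECTION; the domain example.sk and the SUCKER_BATCH_SIZE override are illustrative assumptions, while sample_links and its argument order are taken from the diff above:

# Sketch: exercise config.py and mongocrawler.sample_links the same way
# the new `sample` CLI command does. "example.sk" is a hypothetical domain.
import os
os.environ.setdefault("SUCKER_BATCH_SIZE", "5")  # overrides must be set before config is imported

import pymongo
import mongocrawler
from config import CONNECTION, DBNAME, BATCH_SIZE

client = pymongo.MongoClient(CONNECTION)  # connects lazily; no I/O until the first operation
db = client[DBNAME]
links = mongocrawler.sample_links(db, "example.sk", "frontlink", BATCH_SIZE)
print(links)

The same path is reachable from the command line as the sample subcommand, which takes a single DOMAIN argument.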