zz
This commit is contained in:
parent
c06348080a
commit
b6d9260882
15
mongo/cli.py
15
mongo/cli.py
@ -4,9 +4,8 @@ import rq
|
||||
import redis
|
||||
import sys
|
||||
import os
|
||||
|
||||
REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
|
||||
QUEUES=os.getenv("QUEUES","high,default,low")
|
||||
import pymongo
|
||||
from config import *
|
||||
|
||||
@click.group()
|
||||
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
|
||||
@ -55,11 +54,19 @@ def summary():
|
||||
def sampledomains():
|
||||
mongocrawler.sample_domains()
|
||||
|
||||
@cli.command()
@click.argument("domain")
def sample(domain):
    """Print a batch of sampled "frontlink" links for DOMAIN."""
    # Use the client as a context manager so the MongoDB connection is
    # closed when the command finishes, even if sampling raises.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        # BATCH_SIZE is passed through as the last argument of sample_links;
        # presumably it caps the number of links returned -- confirm in
        # mongocrawler.sample_links.
        links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
        print(links)
|
||||
|
||||
|
||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Read links from standard input, one per line, and enqueue each one.

    Every non-empty line is stripped, echoed to standard output and queued
    as a ``mongocrawler.visit`` job on the default RQ queue.
    """
    # TODO: select queues
    # The queue must be backed by the Redis URL. CONNECTION holds the MongoDB
    # connection string and is not a valid Redis URL, so it is not used here.
    redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/")
    q = rq.Queue(connection=redis.from_url(redis_url))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # Blank input lines would otherwise enqueue a visit to "".
            continue
        print(link)
        q.enqueue(mongocrawler.visit, link)
|
||||
|
24
mongo/config.py
Normal file
24
mongo/config.py
Normal file
@ -0,0 +1,24 @@
|
||||
import os

# Crawler settings. Every value is read once from the environment when this
# module is imported; the second argument of each os.getenv call is the
# default used when the variable is not set.

# database options
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")

# Kept as the raw comma-separated string; it is not split here.
QUEUES = os.getenv("QUEUES", "high,default,low")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")

# retrieving filter
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))
MIN_FILE_SIZE = int(os.getenv("SUCKER_MIN_FILE_SIZE", "300"))
MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE", "10000000"))

# document originality filter
MIN_TEXT_SIZE = int(os.getenv("SUCKER_MIN_TEXT_SIZE", "200"))
CHECK_PARAGRAPH_SIZE = int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE", "150"))
TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))

# link and domain sampling
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO", "0.3"))
# Each size has its own variable. Both used to read
# SUCKER_DISCOVER_LINK_RATIO, so setting that ratio (e.g. "0.3") made int()
# raise ValueError on import and the sizes could not be set independently.
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE", "10000"))
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE", "200"))

# link filter
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
# Split into a list of path fragments.
STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")
|
||||
|
||||
|
@ -27,25 +27,7 @@ import os.path
|
||||
import binascii
|
||||
import json
|
||||
|
||||
# database options
|
||||
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
|
||||
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
||||
# retrieving filter
|
||||
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
|
||||
MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
|
||||
MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
|
||||
# document originality filter
|
||||
MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
|
||||
CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
|
||||
TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
|
||||
# link and domain sampling
|
||||
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
|
||||
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
|
||||
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
|
||||
# link filter
|
||||
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
|
||||
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
|
||||
STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
|
||||
from config import *
|
||||
|
||||
def get_bs_links(link,html):
|
||||
# Extract links from the page
|
||||
|
Loading…
Reference in New Issue
Block a user