25 lines
1.0 KiB
Python
25 lines
1.0 KiB
Python
import os
|
|
# database options

# MongoDB connection URI. The fallback points at localhost:27017 and embeds
# a root/example login, so override it outside local development.
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")

# Comma-separated names, kept as one string (it is not split here).
# NOTE(review): this is the only setting read without the SUCKER_ prefix;
# confirm that is intentional before renaming it.
QUEUES = os.getenv("QUEUES", "high,default,low")

# Name of the database to use.
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")
|
|
# retrieving filter

# All three values are parsed with int(); a non-integer value in the
# environment raises ValueError when this module is imported.

# Batch size (default 10).
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))

# Lower and upper file-size limits.
# NOTE(review): units are presumably bytes — confirm against the code that
# applies these limits.
MIN_FILE_SIZE = int(os.getenv("SUCKER_MIN_FILE_SIZE", "300"))
MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE", "10000000"))
|
|
# document originality filter

# The two sizes are parsed with int() and the ratio with float(); a value
# that does not parse raises ValueError when this module is imported.

# Minimum text size (default 200).
# NOTE(review): presumably counted in characters — confirm at the use site.
MIN_TEXT_SIZE = int(os.getenv("SUCKER_MIN_TEXT_SIZE", "200"))

# Paragraph size used by the originality check (default 150).
CHECK_PARAGRAPH_SIZE = int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE", "150"))

# Trash ratio threshold (default 0.6).
# NOTE(review): which side of the threshold rejects a document is decided
# by the consumer, not here.
TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))
|
|
# link and domain sampling

# Link discovery ratio, parsed as a float (default 0.3).
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO", "0.3"))

# Each size below reads its own variable. Both previously read
# SUCKER_DISCOVER_LINK_RATIO, so they could not be set independently and a
# fractional ratio such as "0.5" made int() raise ValueError at import time.

# Sample set size (default 10000).
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE", "10000"))

# Classifier set size (default 200).
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE", "200"))
|
|
# link filter

# Language code (default "sk").
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")

# Domain filter value (default "sk").
# NOTE(review): presumably a top-level domain suffix — confirm at the use site.
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")

# Comma-separated stop paths, exposed as a list of strings. Entries are
# stripped and empty ones dropped, so "xml, rss" or a trailing comma no
# longer yields padded or empty items, and an empty variable gives [].
# NOTE(review): an empty-string entry would match every path if the consumer
# does a substring test — verify how STOP_PATHS is applied.
STOP_PATHS = [
    part.strip()
    for part in os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")
    if part.strip()
]
|
|
|
|
|