"""Module-level settings read from environment variables.

Every constant is resolved once, at import time, via ``os.getenv`` with a
string default that is then converted to the target type. A malformed
numeric value in the environment raises ``ValueError`` on import.
"""
import os

# database options
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
# NOTE(review): unlike the other settings this variable has no SUCKER_ prefix
# and the value is kept as a raw comma-separated string - confirm with callers.
QUEUES = os.getenv("QUEUES", "high,default,low")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")

# retrieving filter
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))
MIN_FILE_SIZE = int(os.getenv("SUCKER_MIN_FILE_SIZE", "300"))
MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE", "10000000"))

# document originality filter
MIN_TEXT_SIZE = int(os.getenv("SUCKER_MIN_TEXT_SIZE", "200"))
CHECK_PARAGRAPH_SIZE = int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE", "150"))
TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))

# link and domain sampling
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO", "0.3"))
# Each size has its own variable; both previously read
# SUCKER_DISCOVER_LINK_RATIO, so setting the ratio (e.g. "0.5") made int()
# fail and the sizes could not be configured independently.
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE", "10000"))
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE", "200"))

# link filter
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
# Comma-separated in the environment, exposed as a list of strings.
STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")