zz
This commit is contained in:
parent
c06348080a
commit
b6d9260882
15
mongo/cli.py
15
mongo/cli.py
@ -4,9 +4,8 @@ import rq
|
|||||||
import redis
|
import redis
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import pymongo
|
||||||
REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
|
from config import *
|
||||||
QUEUES=os.getenv("QUEUES","high,default,low")
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
|
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
|
||||||
@ -55,11 +54,19 @@ def summary():
|
|||||||
def sampledomains():
|
def sampledomains():
|
||||||
mongocrawler.sample_domains()
|
mongocrawler.sample_domains()
|
||||||
|
|
||||||
|
@cli.command()
@click.argument("domain")
def sample(domain):
    """Print one batch of links sampled for DOMAIN."""
    # NOTE(review): this uses DBNAME from config rather than the group's
    # --dbname option -- confirm that is intended.
    # MongoClient is a context manager; leaving the block closes the
    # connection even when sampling raises.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        # "frontlink" is handed to sample_links unchanged; presumably a link
        # status/category understood by mongocrawler -- confirm there.
        links = mongocrawler.sample_links(db, domain, "frontlink", BATCH_SIZE)
        # Print while the client is still open in case the result is lazy.
        print(links)
|
||||||
|
|
||||||
|
|
||||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Read links from stdin, one per line, and enqueue a visit job for each.

    Each non-empty line is stripped, echoed to stdout and queued as a call to
    ``mongocrawler.visit`` with the link as its only argument.
    """
    # TODO: select queues
    # CONNECTION holds the MongoDB URL, so it must not be handed to redis;
    # the Redis location comes from REDIS_URL with a local default.
    redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/")
    q = rq.Queue(connection=redis.from_url(redis_url))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # A blank line carries no link to visit; skip it.
            continue
        print(link)
        q.enqueue(mongocrawler.visit, link)
|
||||||
|
24
mongo/config.py
Normal file
24
mongo/config.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
"""Crawler settings; every value can be overridden by an environment variable."""
import os

# database options
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
# queue names as one comma-separated string (not split here)
QUEUES = os.getenv("QUEUES", "high,default,low")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")

# retrieving filter
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))
MIN_FILE_SIZE = int(os.getenv("SUCKER_MIN_FILE_SIZE", "300"))
MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE", "10000000"))

# document originality filter
MIN_TEXT_SIZE = int(os.getenv("SUCKER_MIN_TEXT_SIZE", "200"))
CHECK_PARAGRAPH_SIZE = int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE", "150"))
TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))

# link and domain sampling
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO", "0.3"))
# Each size has its own variable: reading SUCKER_DISCOVER_LINK_RATIO here
# would make int() fail as soon as that ratio is set to a value like "0.5".
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE", "10000"))
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE", "200"))

# link filter
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
# comma-separated list, split into individual entries
STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")
|
||||||
|
|
||||||
|
|
@ -27,25 +27,7 @@ import os.path
|
|||||||
import binascii
|
import binascii
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# database options
|
from config import *
|
||||||
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
|
|
||||||
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
|
||||||
# retrieving filter
|
|
||||||
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
|
|
||||||
MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
|
|
||||||
MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
|
|
||||||
# document originality filter
|
|
||||||
MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
|
|
||||||
CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
|
|
||||||
TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
|
|
||||||
# link and domain sampling
|
|
||||||
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
|
|
||||||
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
|
|
||||||
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
|
|
||||||
# link filter
|
|
||||||
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
|
|
||||||
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
|
|
||||||
STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
|
|
||||||
|
|
||||||
def get_bs_links(link,html):
|
def get_bs_links(link,html):
|
||||||
# Extrakcia linkov zo stranky
|
# Extrakcia linkov zo stranky
|
||||||
|
Loading…
Reference in New Issue
Block a user