Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip
commit ed1d4701b8

mongo/cli.py | 51
@@ -4,9 +4,9 @@ import rq
import redis
import sys
import os

REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
QUEUES=os.getenv("QUEUES","high,default,low")
import pymongo
import courlan
from config import *

@click.group()
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
    """ Hostname to crawl """
    mongocrawler.visit(hostname,filter_content=filter_content)

@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    mongocrawler.link_summary(db,hostname)


@cli.command()
def summary():
    mongocrawler.crawl_summary()
@@ -55,11 +63,46 @@ def summary():
def sampledomains():
    mongocrawler.sample_domains()

@cli.command()
@click.argument("domain")
def sample(domain):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
    print(links)

@cli.command()
@click.argument("start_link")
def fetchlinks(start_link):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    start_link,hostname = courlan.check_url(start_link)
    rules = mongocrawler.fetch_robot(hostname)
    links = mongocrawler.fetch_front_links(start_link,rules)
    for link in links:
        print(link[0])
    #print(front_links)
    mongocrawler.index_links(db,links)


@cli.command()
@click.argument("hostname")
def process_links(hostname):
    rules = mongocrawler.fetch_robot(hostname)
    outfile = "data.jsonl"
    links = []
    for line in sys.stdin:
        links.append(line.rstrip())
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
    for page in extracted_pages:
        print(page)
    pass


@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    q = rq.Queue(connection=redis.from_url(CONNECTION))
    for l in sys.stdin:
        print(l.strip())
        r = q.enqueue(mongocrawler.visit, l.strip())
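The new enqueue command only pushes mongocrawler.visit jobs onto Redis; a separate rq worker process has to consume them. Below is a minimal sketch of such a worker, assuming the REDIS_URL and QUEUES defaults introduced above (the worker script itself is an illustration, not part of this commit):

import os
import redis
import rq

# Connect to the same Redis instance the CLI enqueues into.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
QUEUE_NAMES = os.getenv("QUEUES", "high,default,low").split(",")

connection = redis.from_url(REDIS_URL)
queues = [rq.Queue(name, connection=connection) for name in QUEUE_NAMES]
# Blocks and executes the enqueued mongocrawler.visit(hostname) calls.
rq.Worker(queues, connection=connection).work()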
mongo/config.py | 24 (new file)
@@ -0,0 +1,24 @@
import os
# database options
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")

QUEUES=os.getenv("QUEUES","high,default,low")
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
# retrieving filter
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
# document originality filter
MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
# link and domain sampling
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
# link filter
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
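Every constant in the new config module reads from the environment and falls back to a default, so a deployment can override individual settings without editing the file. A small sketch of that pattern, assuming config.py is on the import path (the override values are only illustrative):

import os

# Overrides must be set before config is imported for the first time.
os.environ["SUCKER_DBNAME"] = "crawler_test"
os.environ["SUCKER_BATCH_SIZE"] = "50"

from config import BATCH_SIZE, DBNAME
print(DBNAME, BATCH_SIZE)  # -> crawler_test 50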
mongo/mongocrawler.py

@@ -27,25 +27,7 @@ import os.path
import binascii
import json

# database options
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
# retrieving filter
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
# document originality filter
MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
# link and domain sampling
DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
# link filter
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
from config import *

def get_bs_links(link,html):
    # Extract links from the page
@@ -691,6 +673,20 @@ def index_pages(db,hostname,extracted_pages,filter_content):
        docs.append(doc)
    save_batch_info(db,hostname,final_states,docs)

def fetch_and_extract(links,rules):
    print("Processing links")
    responses = []
    for link in links:
        responses.append(fetch_page(link))
    extracted_pages = []
    for original_link,(final_link,html) in zip(links,responses):
        doc = None
        assert original_link is not None
        doc = extract_page(final_link,html)
        extracted_pages.append((original_link,html,doc))
    extracted_links = extract_links(links,responses,rules,"frontlink")
    return extracted_pages, extracted_links


def visit(hostname,filter_content=True):
    myclient = pymongo.MongoClient(CONNECTION)
@@ -710,15 +706,7 @@ def visit(hostname,filter_content=True):
    print(links)
    # index results
    print("Processing links")
    responses = []
    for link in links:
        responses.append(fetch_page(link))
    extracted_pages = []
    for original_link,(final_link,html) in zip(links,responses):
        doc = None
        assert original_link is not None
        doc = extract_page(final_link,html)
        extracted_pages.append((original_link,html,doc))
    extracted_pages, extracted_links = fetch_and_extract(links,rules)

    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
@@ -731,7 +719,6 @@ def visit(hostname,filter_content=True):
        docs.append(doc)
    save_batch_info(db,hostname,final_states,docs)
    index_pages(db,hostname,extracted_pages,filter_content)
    extracted_links = extract_links(links,responses,rules,"frontlink")
    index_links(db, extracted_links)
    link_summary(db,hostname)
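The mongocrawler changes pull the fetch/extract loop out of visit() into the reusable fetch_and_extract(links, rules) helper, which both visit() and the new process_links CLI command call. A minimal sketch of the intended call pattern, assuming mongocrawler is importable; the hostname and links are illustrative and error handling is omitted:

import mongocrawler

rules = mongocrawler.fetch_robot("example.com")
links = ["https://example.com/", "https://example.com/about"]
extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
for original_link, html, doc in extracted_pages:
    # each item is (original link, fetched html, extracted document)
    print(original_link, doc is not None)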
mongo/start-docker-devstack.sh | 5 (new executable file)
@@ -0,0 +1,5 @@
#!/usr/bin/sh
docker pull redis
docker pull mongo
docker pull mongo-express
docker stack deploy -c ./docker-compose.yaml websucker
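The helper script only pulls images and deploys the stack described in docker-compose.yaml, which is not part of this diff. A quick way to check that the services came up, assuming the default connection strings from mongo/config.py (ports and credentials are assumptions, since the compose file is not shown):

import os
import pymongo
import redis

# Both calls raise if the corresponding service is not reachable.
redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379/")).ping()
client = pymongo.MongoClient(os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/"))
client.admin.command("ping")
print("redis and mongo are reachable")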