Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip
commit ed1d4701b8

mongo/cli.py (51 changed lines)
@@ -4,9 +4,9 @@ import rq
 import redis
 import sys
 import os
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+import courlan
+from config import *

 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
     """ Hostname to crawl """
     mongocrawler.visit(hostname,filter_content=filter_content)

+@cli.command()
+@click.argument("hostname")
+def linksummary(hostname):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    mongocrawler.link_summary(db,hostname)
+
+
 @cli.command()
 def summary():
     mongocrawler.crawl_summary()
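A hedged usage sketch for the new linksummary command, driven through click's test runner: the group object cli and the command name come from the diff above, while the hostname value and the assumption that the script is run from the mongo/ directory are illustrative only.

# sketch: exercise the new "linksummary" command with click's CliRunner
# (run from the mongo/ directory so that "import cli" resolves; hostname is an example)
from click.testing import CliRunner
import cli

runner = CliRunner()
result = runner.invoke(cli.cli, ["linksummary", "example.com"])
print(result.output)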
@@ -55,11 +63,46 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()

+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
+    print(links)
+
+@cli.command()
+@click.argument("start_link")
+def fetchlinks(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    rules = mongocrawler.fetch_robot(hostname)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    #print(front_links)
+    mongocrawler.index_links(db,links)
+
+
+@cli.command()
+@click.argument(hostname)
+def process_links():
+    rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+    pass
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
-    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    q = rq.Queue(connection=redis.from_url(CONNECTION))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())

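As committed, process_links passes a bare hostname name to @click.argument, takes no parameter even though its body reads hostname, and calls fetch_and_extract without the mongocrawler. prefix used elsewhere in this file. A minimal sketch of what the command presumably intends, placed inside mongo/cli.py (which already imports click, sys and mongocrawler); the corrections are assumptions, not part of the commit.

# sketch of process_links with the hostname wired through click
# (the corrections below are assumptions; helper names come from the diff)
@cli.command(help="Read links from stdin, fetch and extract them for HOSTNAME")
@click.argument("hostname")
def process_links(hostname):
    rules = mongocrawler.fetch_robot(hostname)
    links = [line.rstrip() for line in sys.stdin]
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    for page in extracted_pages:
        print(page)

The enqueue change likewise swaps REDIS_URL for CONNECTION, whose default below is a MongoDB URL, so redis.from_url may still need a redis:// URL in practice; whether that is intended is not clear from this commit.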
mongo/config.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+import os
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+
+QUEUES=os.getenv("QUEUES","high,default,low")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+

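Every setting in the new config.py is read from the environment with a fallback default, so a deployment can be tuned without editing code; note that SAMPLE_SET_SIZE and CLASSIFIER_SET_SIZE also read SUCKER_DISCOVER_LINK_RATIO, so only their defaults distinguish them from the ratio above. A minimal usage sketch follows; the override values are illustrative, not from the commit.

# sketch: override crawler settings through the environment before importing config
import os
os.environ["SUCKER_DBNAME"] = "crawler_test"   # illustrative values, not defaults
os.environ["SUCKER_BATCH_SIZE"] = "50"
import config
print(config.DBNAME, config.BATCH_SIZE)        # -> crawler_test 50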
mongo/mongocrawler.py

@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json

-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+from config import *

 def get_bs_links(link,html):
     # Extrakcia linkov zo stranky (extraction of links from the page)
@@ -691,6 +673,20 @@ def index_pages(db,hostname,extracted_pages,filter_content):
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)

+def fetch_and_extract(links,rules):
+    print("Processing links")
+    responses = []
+    for link in links:
+        responses.append(fetch_page(link))
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,html,doc))
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    return extracted_pages, extracted_links
+

 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
@@ -710,15 +706,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)

     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")

@@ -731,7 +719,6 @@ def visit(hostname,filter_content=True):
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
     index_pages(db,hostname,extracted_pages,filter_content)
-    extracted_links = extract_links(links,responses,rules,"frontlink")
     index_links(db, extracted_links)
     link_summary(db,hostname)

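After these hunks, visit() obtains extracted_pages and extracted_links from the new fetch_and_extract() helper, yet the untouched context line above still calls extract_links(links,responses,hostname,rules,"frontlink") with responses, which no longer appears to be assigned in this function. A hedged sketch of how the indexing tail of visit() reads if it relies only on the helper's return values; index_visit_results is a hypothetical name used to keep the fragment self-contained, the other function names mirror the diff, and this is not the committed code verbatim.

# sketch, inside mongo/mongocrawler.py: the indexing tail of visit(),
# assuming fetch_and_extract() fully replaces the inline fetch loop
def index_visit_results(db, hostname, links, rules, filter_content=True):
    extracted_pages, extracted_links = fetch_and_extract(links, rules)
    index_pages(db, hostname, extracted_pages, filter_content)
    index_links(db, extracted_links)
    link_summary(db, hostname)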
mongo/start-docker-devstack.sh (new executable file, 5 lines)

@@ -0,0 +1,5 @@
+#!/usr/bin/sh
+docker pull redis
+docker pull mongo
+docker pull mongo-express
+docker stack deploy -c ./docker-compose.yaml websucker