diff --git a/mongo/cli.py b/mongo/cli.py
index 8dc2d7f..50a24ea 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -4,9 +4,9 @@ import rq
 import redis
 import sys
 import os
-
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+import courlan
+from config import *
 
 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
     """ Hostname to crawl """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
+@cli.command()
+@click.argument("hostname")
+def linksummary(hostname):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    mongocrawler.link_summary(db,hostname)
+
+
 @cli.command()
 def summary():
     mongocrawler.crawl_summary()
@@ -55,11 +63,44 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
 
+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
+    print(links)
+
+@cli.command()
+@click.argument("start_link")
+def fetchlinks(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    rules = mongocrawler.fetch_robot(hostname)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    mongocrawler.index_links(db,links)
+
+
+@cli.command()
+@click.argument("hostname")
+def process_links(hostname):
+    # read links from stdin, fetch and extract them, print the documents
+    rules = mongocrawler.fetch_robot(hostname)
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
     q = rq.Queue(connection=redis.from_url(REDIS_URL))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
diff --git a/mongo/config.py b/mongo/config.py
new file mode 100644
index 0000000..7f959c8
--- /dev/null
+++ b/mongo/config.py
@@ -0,0 +1,23 @@
+import os
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# redis queue options
+REDIS_URL=os.getenv("REDIS_URL","redis://localhost:6379/")
+QUEUES=os.getenv("QUEUES","high,default,low")
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE = os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 61db0f8..4217d51 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json
 
-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+from config import *
 
 def get_bs_links(link,html):
     # Extract links from the page
@@ -690,8 +672,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
-
+def fetch_and_extract(links,rules):
+    # fetch every link, extract a document from each response and collect new links
+    print("Processing links")
+    responses = []
+    for link in links:
+        responses.append(fetch_page(link))
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,html,doc))
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    return extracted_pages, extracted_links
+
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -710,15 +706,5 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
-    print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
 
-    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
 
@@ -731,7 +717,6 @@ def visit(hostname,filter_content=True):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
     index_pages(db,hostname,extracted_pages,filter_content)
-    extracted_links = extract_links(links,responses,rules,"frontlink")
     index_links(db, extracted_links)
     link_summary(db,hostname)
diff --git a/mongo/start-docker-devstack.sh b/mongo/start-docker-devstack.sh
new file mode 100755
index 0000000..3bd2953
--- /dev/null
+++ b/mongo/start-docker-devstack.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+docker pull redis
+docker pull mongo
+docker pull mongo-express
+docker stack deploy -c ./docker-compose.yaml websucker
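---
Usage sketch for the new commands, assuming cli.py is run directly as the
click entry point and the devstack is up; the URL, domain, and links.txt
below are placeholders, not part of the patch:

    # defaults come from config.py and can be overridden per environment
    export SUCKER_CONNECTION="mongodb://root:example@localhost:27017/"
    export SUCKER_DBNAME="crawler"
    python cli.py fetchlinks https://example.com/
    python cli.py sample example.com
    python cli.py linksummary example.com
    # enqueue reads links from stdin; rq workers run separately,
    # e.g. "rq worker high default low"
    cat links.txt | python cli.py enqueue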
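start-docker-devstack.sh uses "docker stack deploy", which only works against
a swarm manager and expects a docker-compose.yaml next to the script (the
compose file is not included in this diff). First-time setup is roughly:

    docker swarm init
    ./start-docker-devstack.sh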