diff --git a/mongo/cli.py b/mongo/cli.py index ccec053..84ed122 100644 --- a/mongo/cli.py +++ b/mongo/cli.py @@ -47,6 +47,14 @@ def visit(hostname,filter_content=True): """ Hostname to crawl """ mongocrawler.visit(hostname,filter_content=filter_content) +@cli.command() +@click.argument("hostname") +def linksummary(hostname): + myclient = pymongo.MongoClient(CONNECTION) + db=myclient[DBNAME] + mongocrawler.link_summary(db,hostname) + + @cli.command() def summary(): mongocrawler.crawl_summary() diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py index 6bae2b2..1c4b91e 100644 --- a/mongo/mongocrawler.py +++ b/mongo/mongocrawler.py @@ -671,8 +671,22 @@ def index_pages(db,hostname,extracted_pages,filter_content): final_states.append(status) docs.append(doc) save_batch_info(db,hostname,final_states,docs) - +def fetch_and_extract(links,rules): + print("Processing links") + responses = [] + for link in links: + responses.append(fetch_page(link)) + extracted_pages = [] + for original_link,(final_link,html) in zip(links,responses): + doc = None + assert original_link is not None + doc = extract_page(final_link,html) + extracted_pages.append((original_link,html,doc)) + extracted_links = extract_links(links,responses,rules,"frontlink") + return extracted_pages, extracted_links + + def visit(hostname,filter_content=True): myclient = pymongo.MongoClient(CONNECTION) db=myclient[DBNAME] @@ -700,9 +714,9 @@ def visit(hostname,filter_content=True): assert original_link is not None doc = extract_page(final_link,html) extracted_pages.append((original_link,html,doc)) + extracted_links = extract_links(links,responses,rules,"frontlink") index_pages(db,hostname,extracted_pages,filter_content) - extracted_links = extract_links(links,responses,rules,"frontlink") index_links(db, extracted_links) link_summary(db,hostname) diff --git a/mongo/start-docker-devstack.sh b/mongo/start-docker-devstack.sh new file mode 100755 index 0000000..3bd2953 --- /dev/null +++ b/mongo/start-docker-devstack.sh @@ -0,0 +1,5 @@ +#!/usr/bin/sh +docker pull redis +docker pull mongo +docker pull mongo-express +docker stack deploy -c ./docker-compose.yaml websucker