diff --git a/mongo/cli.py b/mongo/cli.py
index 52089ad..d3ebcaa 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -32,9 +32,10 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
 @click.argument("hostname")
-def visit(hostname):
-    mongocrawler.visit(hostname)
+@click.option("--filter_content",default=True,help="Filter content")
+def visit(hostname,filter_content=True):
+    mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
 def summary():
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 6677429..7a6d33c 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -24,6 +24,8 @@ import hashlib
 from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
+import binascii
+import json
 
 # database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
@@ -166,6 +168,7 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
+        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@@ -227,7 +230,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc):
+def index_page(db,original_link,final_link,html,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -246,7 +249,7 @@ def index_page(db,original_link,final_link,html,doc):
             set_content_checksums(doc)
             tsz = doc["text_size"]
             psz = doc["paragraph_sizes_sum"]
-            if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+            if filter_content and (tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO):
                 state = "small"
             # check copy
             if state == "good":
@@ -258,7 +261,7 @@ def index_page(db,original_link,final_link,html,doc):
                     origsz += paragraph_size
                 doc["original_text_size"] = origsz
 
-                if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+                if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
             if state == "good":
                 htdoc = get_link_doc(link,state)
@@ -673,7 +676,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(hostname):
+def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     batch_size = BATCH_SIZE
@@ -707,7 +710,7 @@ def visit(hostname):
     final_states = []
     docs = []
     for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc)
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
@@ -737,8 +740,6 @@ def crawl_summary():
        values = [str(item[x]) for x in headers]
        print("\t".join(values))
 
-import binascii
-import json
 
 def import_html():
    myclient= pymongo.MongoClient(CONNECTION)
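
A minimal usage sketch of the new flag from Python, bypassing the CLI. The import path `mongo.mongocrawler` and the example hostname are assumptions, not taken from the diff; the connection string is read from SUCKER_CONNECTION as in the patched module.

    # assuming the repository root is on PYTHONPATH and MongoDB is reachable;
    # filter_content=False skips the text-size and copy-ratio checks in
    # index_page, so short or duplicated pages are stored as well
    from mongo import mongocrawler

    mongocrawler.visit("example.com", filter_content=False)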