diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 36ff4e0..0fbecbe 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -175,7 +175,7 @@ def fetch_page(link:str)->(str,str):
     good = True
     if response.status != 200:
         good = False
-        LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+        LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
     elif response.data is None or len(response.data) < MIN_FILE_SIZE:
         LOGGER.error('too small/incorrect for URL %s', link)
         good = False
@@ -278,7 +278,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
     htdoc = get_link_doc(link,state)
     htdoc["html"] = html
     htdoc["html_size"] = len(html)
-    htdoc["html_md5"]= hashlib.md5(html).hexdigest()
+    htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
     # can be revisited - upsert
     del htdoc["url"]
     htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -682,6 +682,16 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
+def index_pages(hostname,extracted_pages):
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -712,6 +722,7 @@ def visit(hostname,filter_content=True):
         extracted_pages.append((original_link,final_link,html,doc))
 
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
+    index_links(db,extracted_links)
     final_states = []
     docs = []
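
Follow-up note, outside the patch itself: as added here, index_pages reads db and filter_content, which are only defined inside visit(), so calling it on its own would raise NameError. A minimal self-contained sketch, assuming index_page and save_batch_info keep the signatures used elsewhere in this file (passing db and filter_content explicitly is my assumption, not the author's code):

    def index_pages(db, hostname, extracted_pages, filter_content=True):
        # Index each fetched page and record the state it ended up in.
        final_states = []
        docs = []
        for original_link, final_link, html, doc in extracted_pages:
            status = index_page(db, original_link, final_link, html, doc, filter_content)
            final_states.append(status)
            docs.append(doc)
        # Persist the per-host batch summary in one call.
        save_batch_info(db, hostname, final_states, docs)

visit() would then call index_pages(db, hostname, extracted_pages, filter_content) once the pages have been extracted.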