diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 0c66a4b..92c0966 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -167,14 +167,14 @@ def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
-    response = trafilatura.fetch_url(link,decode=False)
+    response = trafilatura.fetch_response(link,decode=False)
     time.sleep(2)
     html = None
     if response is not None :
         good = True
         if response.status != 200:
             good = False
-            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
         elif response.data is None or len(response.data) < MIN_FILE_SIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
@@ -183,7 +183,7 @@ def fetch_page(link:str)->(str,str):
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
-            html = trafilatura.utils.decode_response(response)
+            html = trafilatura.utils.decode_file(response.data)
             if html is not None:
                 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                 # is there a meta-refresh on the page?
@@ -241,21 +241,21 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
     state = "good"
     link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
     if html is None:
         state = "html_error"
     elif doc is None:
         state = "content_error"
     if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
@@ -277,7 +277,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
-        htdoc["html_md5"]= hashlib.md5(html).hexdigest()
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -317,7 +317,7 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
 
 
-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
@@ -482,7 +482,7 @@ def fetch_sitemap_links(start_link):
 
 def fetch_front_links(start_link,rules):
     start_link,hostname = courlan.check_url(start_link)
     response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
     print("Fetched {} frontlinks".format(len(extracted_links)))
     return extracted_links
@@ -682,6 +682,16 @@ def classify(start_link):
     cl.test(testset)
 
 
+def index_pages(db,hostname,extracted_pages,filter_content):
+    final_states = []
+    docs = []
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -703,14 +713,13 @@ def visit(hostname,filter_content=True):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
-
     extracted_pages = []
     for original_link,(final_link,html) in zip(links,responses):
         doc = None
         assert original_link is not None
         doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
+        extracted_pages.append((original_link,html,doc))
 
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
 
@@ -721,6 +730,9 @@ def visit(hostname,filter_content=True):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
     link_summary(db,hostname)
 
 def crawl_summary():
@@ -793,7 +805,7 @@ def import_html():
             if doc is None:
                 print("bad html" + hdoc["url"])
                 continue
-            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+            status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
             counter += 1
             print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
             del buffer[:]