From c06348080a91a48a4c9f8257e0698b5475322ded Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Wed, 6 Mar 2024 16:01:22 +0100
Subject: [PATCH] Derive redirects from doc["url"] and deduplicate page
 indexing
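
The extracted document already carries the final (possibly redirected)
URL in doc["url"], so index_page() can detect redirects from the document
itself rather than taking a separate final_link argument. extract_links()
drops its unused hostname parameter, and visit() now calls index_pages()
instead of repeating the same indexing loop inline.

A minimal sketch of the resulting flow in visit(), simplified for
illustration (db, links, rules, hostname and filter_content come from the
surrounding function):

    # Fetch every selected link; fetch_page() returns (final_link, html).
    responses = [fetch_page(link) for link in links]

    # Extract one document per page; the final URL is available from the
    # extractor as doc["url"], so the tuple no longer carries final_link.
    extracted_pages = []
    for original_link, (final_link, html) in zip(links, responses):
        doc = extract_page(final_link, html)
        extracted_pages.append((original_link, html, doc))

    # Index the documents, then harvest and index outgoing links.
    index_pages(db, hostname, extracted_pages, filter_content)
    extracted_links = extract_links(links, responses, rules, "frontlink")
    index_links(db, extracted_links)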
---
 mongo/mongocrawler.py | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 0fbecbe..19a7353 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -168,7 +168,6 @@ def fetch_page(link:str)->(str,str):
     print(link)
     final_link = link
     response = trafilatura.fetch_response(link,decode=False)
-    print(response)
     time.sleep(2)
     html = None
     if response is not None :
@@ -242,21 +241,21 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
     state = "good"
     link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
     if html is None:
         state = "html_error"
     elif doc is None:
         state = "content_error"
     if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
@@ -318,7 +317,7 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
 
 
-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
@@ -483,7 +482,7 @@ def fetch_sitemap_links(start_link):
 def fetch_front_links(start_link,rules):
     start_link,hostname = courlan.check_url(start_link)
     response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
     print("Fetched {} frontlinks".format(len(extracted_links)))
     return extracted_links
 
@@ -682,11 +681,11 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def index_pages(hostname,extracted_pages):
+def index_pages(db,hostname,extracted_pages,filter_content):
     final_states = []
     docs = []
-    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
@@ -713,24 +712,16 @@ def visit(hostname,filter_content=True):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
-
     extracted_pages = []
     for original_link,(final_link,html) in zip(links,responses):
         doc = None
         assert original_link is not None
         doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
+        extracted_pages.append((original_link,html,doc))
 
-    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
-
-    index_links(db,extracted_links)
-    final_states = []
-    docs = []
-    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
-        final_states.append(status)
-        docs.append(doc)
-    save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
     link_summary(db,hostname)
 
 def crawl_summary():
@@ -803,7 +794,7 @@ def import_html():
             if doc is None:
                 print("bad html" + hdoc["url"])
                 continue
-            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+            status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
             counter += 1
             print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
             del buffer[:]