From a1638db2c5a65780f663ae79ad60d13e27ba184c Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sat, 11 Mar 2023 11:30:30 +0100
Subject: [PATCH] Refactor link filtering and crawling into reusable helpers

---
 mongo/mongocwarler.py | 135 +++++++++++++++++++++++++-----------------
 1 file changed, 82 insertions(+), 53 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 6bff0d3..85285ca 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -39,23 +39,35 @@ def calculate_checksums(text):
         sizes.append(sz)
     return checksums, sizes
 
-def filter_links(links,language=LANGUAGE,rules=None):
+def is_robot_good(link,rules):
+    # check robots.txt rules
+    if rules is not None and not rules.can_fetch("*", link):
+        return False
+    return True
+
+def is_link_good(link):
+    r = courlan.check_url(link,strict=True,language=LANGUAGE)
+    if r is None:
+        return None
+    llink,ldomain = r
+    print(llink,ldomain)
+    # domain rules
+    if not ldomain.endswith("sk"):
+        print("bad domain")
+        return None
+    if courlan.is_not_crawlable(llink):
+        print("not crawlable")
+        return None
+    return llink
+
+def filter_links(links,rules=None):
     out = set()
     for link in links:
-        r = courlan.check_url(link,strict=True,language=language)
+        r = is_link_good(link)
         if r is None:
             continue
-        llink,ldomain = r
-        print(llink,ldomain)
-        # domain rules
-        if not ldomain.endswith("sk"):
-            print("bad domain")
-            continue
-        if courlan.is_not_crawlable(llink):
-            print("not crawlable")
-            continue
         # check robots.txt rules
-        if rules is not None and not rules.can_fetch("*", llink):
+        if rules is not None and not rules.can_fetch("*", r):
             continue
         out.add(llink)
     return out
@@ -93,7 +105,12 @@ def fetch_pages(link_batch):
     for link in link_batch:
         print("fetching:::::")
         print(link)
-        htmls.append(trafilatura.fetch_url(link,decode=False))
+        response = trafilatura.fetch_url(link,decode=False)
+        # is there a meta-refresh on the page?
+        htmlstring, homepage = trafilatura.spider.refresh_detection(response.data, link)
+        if homepage is None: # malformed or malicious content
+            response = None
+        htmls.append(response)
     return htmls
 
 def fetch_rules(base_url):
@@ -107,43 +124,24 @@ def fetch_rules(base_url):
         rules = None
     return rules
 
-def fetch_front_links(start_links,rules):
-    responses = fetch_pages(start_links)
-
-    # decode response
-    htmlstring = trafilatura.utils.decode_response(response.data)
-    # is there a meta-refresh on the page?
-    htmlstring, homepage = trafilatura.spider.refresh_detection(htmlstring, homepage)
-    if homepage is None: # malformed or malicious content
-        return None, None, None
-    visit_links = courlan.extract_links(htmlstring)
-    visit_links,known_links = trafilatura.spider.focused_crawler(start_link,lang=LANGUAGE)
-    print(visit_links,known_links)
-    filtered_links = filter_links(visit_links,LANGUAGE,rules=rules)
-    return filtered_links
 
 def extract_pages(link_batch,responses):
     out = []
+    extracted_links = set()
     for link,response in zip(link_batch,responses):
         doc = None
         assert link is not None
         html = None
         response_link = None
         if response is not None:
-            # is reponse link good?
-            # filter and normalize
-            rl = list(filter_links([response.url]))
-            if len(rl) == 1:
-                response_link = rl[0]
-                html = trafilatura.utils.decode_response(response)
+            html = trafilatura.utils.decode_response(response)
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
         out.append((link,response_link,html,doc))
     return out
-
-def index_pages(db,domain,extracted_pages,rules):
+def index_pages(db,domain,extracted_pages):
     extracted_links = set()
     linkcol = db["links"]
     htmlcol = db["html"]
@@ -163,25 +161,46 @@ def index_pages(db,domain,extracted_pages,rules):
         # todo extract links
         print(doc)
         contentcol.insert_one(doc)
-    # todo extract links
-    doc = get_link_doc(original_link,state)
-    linkcol.replace_one({"url":final_link},doc,upsert=True)
-    if original_link != final_link:
-        rl = list(filter_links([final_link],rules))
-        if len(rl) == 1:
-            doc = get_link_doc(rl[0],"redirect")
-            linkcol.replace_one({"url":rl[0]},doc,upsert=True)
-def extract_links(extracted_pages,domain,rules,status="frontlink"):
-    filtered_links = filter_links(extracted_links,rules)
-    for original_link,final_link,html,doc in extracted_pages:
-        doc = get_link_doc(link,status)
+
+def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
+    links = {}
+    for original_link,response in zip(link_batch,responses):
+        final_link = response.url
+        status = default_status
+        link = original_link
+        if original_link != final_link:
+            links[original_link] = "redirect"
+            link = final_link
         if courlan.is_external(link,domain):
-            doc["status"]= "frontlink"
+            status = "frontlink"
         elif courlan.is_navigation(link):
-            doc["status"] = "navigation"
+            status = "navigation"
+        links[link] = status
+        extracted_links = courlan.extract_links(response.data)
+        for link in extracted_links:
+            if courlan.is_external(link,domain):
+                status = "frontlink"
+            elif courlan.is_navigation(link):
+                status = "navigation"
+            links[link] = status
+    outlinks = []
+    for link,status in links.items():
+        if not is_robot_good(link,rules):
+            continue
+        link = is_link_good(link)
+        if link is None:
+            continue
+        outlinks.append((link,status))
+    return outlinks
+
+def index_links(db,extracted_links):
+    linkcol=db["links"]
+    for link,status in extracted_links:
+        doc = get_link_doc(link,status)
         linkcol.insert_one(doc)
+
 def get_links(db,domain,status,batch_size=BATCHSIZE):
     linkcol = db["links"]
     res = linkcol.find({"status":status,"domain":domain},{"url":1},limit=batch_size)
@@ -190,7 +209,7 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
     for doc in res:
         print(doc)
         front_links.append(doc["url"])
-    return filter_links(front_links)
+    return front_links
 
 
 def index_front_links(db,filtered_links):
@@ -207,13 +226,25 @@ def index_front_links(db,filtered_links):
 
 # document = trafilatura.bare_extraction(content)
 #        print(content)
 
+def process_links(db,domain,status,links=[],rules=None):
+    links = links + get_links(db,domain,status)
+    responses = fetch_pages(links)
+    extracted_pages = extract_pages(links,responses)
+    extracted_links = extract_links(links,responses,domain,rules,status)
+    index_links(db,extracted_links)
+    index_pages(db,domain,extracted_pages)
+
 def simple_visit(start_link):
     start_link,domain = courlan.check_url(start_link)
     myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
     db=myclient["crawler"]
+    rules = fetch_rules(domain)
     navigation_links =[start_link]
-    navigation_links += get_links(db,domain,"navigation")
     print(navigation_links)
+    process_links(db,domain,"navigation",navigation_links,rules)
+    process_links(db,domain,"frontlink",rules=rules)
+    back_links = get_links(db,domain,"backlink")
+    process_links(db,domain,"backlink",back_links,rules)
     #new_front_links = fetch_front_links(navigation_links)
     print("NEW FRONT LINKS")
     #print(new_front_links)
@@ -223,7 +254,5 @@ def simple_visit(start_link):
     visit_links = front_links
     print(visit_links)
     responses = fetch_pages(visit_links)
-    extracted_pages = extract_pages(visit_links,responses)
-    index_pages(db,domain,extracted_pages)
 
 simple_visit(sys.argv[1])
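
Note (not part of the patch): a minimal driver sketch of the pipeline this patch introduces, assuming the corrected signature process_links(db, domain, status, links=[], rules=None) used above and that the helpers are importable from mongocwarler; the crawl_site wrapper name and the import form are hypothetical, the MongoDB URI and the "navigation"/"frontlink"/"backlink" statuses are the ones hard-coded in the patch.

    # hypothetical usage sketch, mirroring simple_visit
    import sys
    import courlan
    import pymongo
    from mongocwarler import fetch_rules, process_links

    def crawl_site(start_link):
        # normalize the seed URL and derive its domain, as simple_visit does
        start_link, domain = courlan.check_url(start_link)
        db = pymongo.MongoClient("mongodb://root:example@localhost:27017/")["crawler"]
        rules = fetch_rules(domain)  # robots.txt parser, may be None
        # seed the queue with the start page, then drain each stored link status;
        # process_links pulls the queued links for a status from MongoDB itself
        process_links(db, domain, "navigation", [start_link], rules)
        for status in ("frontlink", "backlink"):
            process_links(db, domain, status, rules=rules)

    if __name__ == "__main__":
        crawl_site(sys.argv[1])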