From 6567de421c9f43aa0ea11d2ae80d0e13a602d7f1 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sat, 1 Apr 2023 20:44:37 +0200
Subject: [PATCH] Sample link batches randomly, dedupe paragraph checksums,
 seed frontlinks from sitemaps

Select link batches with a $sample aggregation instead of find() with a
limit, drop the separate "navigation" link status, top up a short
frontlink queue from the sitemap, and tolerate duplicate paragraph
checksums on insert.
---
 mongo/mongocwarler.py | 58 +++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 6fc7464..62b0409 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -1,4 +1,5 @@
 import pymongo
+import pymongo.errors
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
@@ -71,7 +72,6 @@ def calculate_checksums(text):
 
 def is_robot_good(link,rules):
     # check robots.txt rules
     if rules is not None and not rules.can_fetch("*", link):
-        print("bad>>>" + link)
         return False
     return True
@@ -203,7 +203,7 @@ def index_pages(db,hostname,extracted_pages):
             nd = checkcol.find_one({"_id":chs})
             if nd is not None:
                 copysz += paragraph_size
-        if copysz / len(text) > TEXT_TRASH_RATIO:
+        if (copysz / len(text)) > TEXT_TRASH_RATIO:
             state = "copy"
             print(copysz)
         if state == "good":
@@ -219,7 +219,10 @@
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
-                checkcol.insert_one({"_id":chs})
+                try:
+                    checkcol.insert_one({"_id":chs})
+                except pymongo.errors.DuplicateKeyError:
+                    pass
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 
@@ -238,8 +241,6 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
             badrobot += 1
             continue
         status = str(default_status)
-        if courlan.is_navigation_page(link):
-            status = "navigation"
         #print(link,status)
         links[link] = status
     outlinks = []
@@ -264,19 +265,27 @@ def index_links(db,extracted_links):
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    links = []
+    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    res = linkcol.aggregate([
+        { "$match": { "status": status,"host":hostname } },
+        { "$sample": { "size": batch_size } }
+    ])
+    links = set()
     for i,doc in enumerate(res):
         #print(">>>>>" + status)
         #print(doc);
-        print(">>>>links")
-        print(doc)
-        links.append(doc["url"])
+        links.add(doc["url"])
         if i >= batch_size:
             break
-    return links
+    return list(links)
 
+def fetch_sitemap_links(start_link):
+    out = []
+    navigation_links = trafilatura.sitemaps.sitemap_search(start_link,target_lang=LANGUAGE)
+    for link in navigation_links:
+        out.append((link,"frontlink"))
+    return out
 
 
 def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
     #print(links)
@@ -372,23 +381,24 @@ def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
-    rules = fetch_robot(hostname)
-    print(rules)
     batch_size = BATCHSIZE
-    navigation_links = get_links(db,hostname,"navigation",batch_size)
-    if start_link is not None:
-        navigation_links.append(start_link)
-    print(f"Navigation links {len(navigation_links)}")
-    process_links(db,hostname,"frontlink",navigation_links,rules)
+
+    print("Getting frontlinks")
     links = get_links(db,hostname,"frontlink",batch_size)
-    bl = len(links) - batch_size
     print(f"Got {len(links)} frontlinks")
-    if bl > 0:
-        print("Getting backlinks")
-        front_links = get_links(db,hostname,"backlink",bl)
-        links += front_links
+    if len(links) < batch_size:
+        print("Fetching sitemap links")
+        sitemap_links = fetch_sitemap_links(start_link)
+        index_links(db,sitemap_links)
+    links.append(start_link)
+
+    print("Processing frontlinks")
+    rules = fetch_robot(hostname)
+    process_links(db,hostname,"frontlink",links,rules)
+    print("Getting backlinks")
+    back_links = get_links(db,hostname,"backlink",batch_size)
     print("Processing backlinks")
-    process_links(db,hostname,"backlink",links,rules=rules)
+    process_links(db,hostname,"backlink",back_links,rules=rules)
     link_summary(db,hostname)
 
 if __name__ == "__main__":
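
For reference, a minimal standalone sketch of the $sample-based batch
selection that get_links now uses in place of find() with a limit. The
"links" collection layout (url/host/status fields) follows the patch; the
connection URI, database name, and batch size below are assumptions:

import pymongo

CONNECTION = "mongodb://localhost:27017"  # assumed local default
DBNAME = "crawler"                        # assumed; the module defines its own
BATCHSIZE = 10                            # assumed; the module defines its own

def sample_links(db, hostname, status, batch_size):
    linkcol = db["links"]
    # $match narrows to one host and one status; $sample then draws a
    # random batch, so repeated calls stop returning the same first N
    # documents the way find(..., limit=N) does.
    res = linkcol.aggregate([
        {"$match": {"status": status, "host": hostname}},
        {"$sample": {"size": batch_size}},
    ])
    # deduplicate, since $sample may emit the same document more than once
    return list({doc["url"] for doc in res})

if __name__ == "__main__":
    db = pymongo.MongoClient(CONNECTION)[DBNAME]
    print(sample_links(db, "example.com", "frontlink", BATCHSIZE))

That possible repetition from $sample is also why the patched get_links
collects URLs into a set before returning a list.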
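
The DuplicateKeyError handling works because each paragraph checksum is
stored as a document _id, which MongoDB always indexes uniquely. A sketch
of the whole idea, assuming an md5-per-paragraph checksum (the module's
calculate_checksums is not shown in this diff) and a checksum collection
reached via db["check"] (the real collection name is likewise outside the
diff):

import hashlib

import pymongo
import pymongo.errors

TEXT_TRASH_RATIO = 0.9  # assumed value; defined elsewhere in the module

def calculate_checksums(text):
    # md5 per non-empty paragraph is an assumption, standing in for the
    # module's real calculate_checksums
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    checksums = [hashlib.md5(p.encode("utf8")).hexdigest() for p in paragraphs]
    sizes = [len(p) for p in paragraphs]
    return checksums, sizes

def copy_ratio(checkcol, text):
    # fraction of the text whose paragraphs were already seen elsewhere
    checksums, sizes = calculate_checksums(text)
    copysz = sum(size for chs, size in zip(checksums, sizes)
                 if checkcol.find_one({"_id": chs}) is not None)
    return copysz / len(text)

def record_checksums(checkcol, text):
    checksums, _ = calculate_checksums(text)
    for chs in checksums:
        try:
            # _id is unique, so re-inserting a known paragraph raises
            # DuplicateKeyError; swallowing it makes indexing idempotent
            checkcol.insert_one({"_id": chs})
        except pymongo.errors.DuplicateKeyError:
            pass

A page is then marked "copy" when copy_ratio exceeds TEXT_TRASH_RATIO,
mirroring the comparison the patch parenthesizes, and record_checksums can
be re-run safely because the duplicate insert is swallowed.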
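
The sitemap fallback in visit() leans on trafilatura's sitemap crawler. A
sketch of the call, assuming LANGUAGE is a two-letter code such as "sk"
(the module's actual value is not shown in this diff):

import trafilatura.sitemaps

LANGUAGE = "sk"  # assumed; defined elsewhere in the module

def fetch_sitemap_links(start_link):
    # sitemap_search discovers the site's sitemaps (robots.txt /
    # sitemap.xml) and returns page URLs, filtered toward the target
    # language where that is detectable
    out = []
    for link in trafilatura.sitemaps.sitemap_search(start_link, target_lang=LANGUAGE):
        out.append((link, "frontlink"))
    return out

if __name__ == "__main__":
    for url, status in fetch_sitemap_links("https://example.com"):
        print(status, url)

Returning (link, "frontlink") pairs matches what index_links expects, so a
short frontlink queue can be topped up directly from the sitemap.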