commit 6567de421c
parent efe2872777
Daniel Hládek 2023-04-01 20:44:37 +02:00
@@ -1,4 +1,5 @@
 import pymongo
+import pymongo.errors
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
@@ -71,7 +72,6 @@ def calculate_checksums(text):

 def is_robot_good(link,rules):
     # check robots.txt rules
     if rules is not None and not rules.can_fetch("*", link):
-        print("bad>>>" + link)
         return False
     return True
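
is_robot_good() only requires that `rules` answer can_fetch("*", link); fetch_robot() itself is outside this diff. A minimal sketch of what such a helper could look like, assuming it wraps the standard library's robots.txt parser (the name fetch_robot_sketch and the https scheme are assumptions):

import urllib.robotparser

def fetch_robot_sketch(hostname):
    # Hypothetical stand-in for fetch_robot(), which this diff does not show.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://" + hostname + "/robots.txt")
    try:
        rp.read()  # download and parse robots.txt
    except Exception:
        return None  # is_robot_good() treats None as "no restrictions"
    return rp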
@@ -203,7 +203,7 @@ def index_pages(db,hostname,extracted_pages):
             nd = checkcol.find_one({"_id":chs})
             if nd is not None:
                 copysz += paragraph_size
-        if copysz / len(text) > TEXT_TRASH_RATIO:
+        if (copysz / len(text)) > TEXT_TRASH_RATIO:
             state = "copy"
             print(copysz)
         if state == "good":
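
The parenthesized condition above marks a page as "copy" when paragraphs already recorded in checkcol account for more than TEXT_TRASH_RATIO of its text. calculate_checksums() appears here only by name; a sketch of the assumed per-paragraph hashing (MD5 and the 0.6 threshold are illustrative, not taken from this diff):

import hashlib

TEXT_TRASH_RATIO = 0.6  # assumed value; the real constant lives elsewhere in the module

def calculate_checksums_sketch(text):
    # One (checksum, size) pair per non-empty paragraph.
    checksums, sizes = [], []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if paragraph:
            checksums.append(hashlib.md5(paragraph.encode("utf8")).hexdigest())
            sizes.append(len(paragraph))
    return checksums, sizes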
@@ -219,7 +219,10 @@ def index_pages(db,hostname,extracted_pages):
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
-                checkcol.insert_one({"_id":chs})
+                try:
+                    checkcol.insert_one({"_id":chs})
+                except pymongo.errors.DuplicateKeyError as err:
+                    pass
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
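
The new try/except makes checksum registration idempotent: _id carries a unique index, so re-inserting a known checksum raises DuplicateKeyError, which is deliberately ignored. An equivalent batched form (a sketch of an alternative, not what this commit does) uses insert_many with ordered=False so a duplicate does not abort the remaining writes:

import pymongo.errors

def insert_checksums_sketch(checkcol, checksums):
    try:
        checkcol.insert_many([{"_id": c} for c in checksums], ordered=False)
    except pymongo.errors.BulkWriteError:
        pass  # duplicate-key errors are expected; non-duplicates are still inserted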
@@ -238,8 +241,6 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
             badrobot += 1
             continue
         status = str(default_status)
-        if courlan.is_navigation_page(link):
-            status = "navigation"
         #print(link,status)
         links[link] = status
     outlinks = []
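
For context, the two deleted lines diverted paging and category URLs into a separate "navigation" status using courlan's heuristics; after this commit every accepted link keeps default_status. The removed check in isolation (the URL is hypothetical):

import courlan

link = "https://example.com/category/page/2"  # hypothetical URL
if courlan.is_navigation_page(link):
    print("navigation page:", link)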
@@ -264,19 +265,27 @@ def index_links(db,extracted_links):

 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    links = []
+    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    res = linkcol.aggregate([
+        { "$match": { "status": status,"host":hostname } },
+        { "$sample": { "size": batch_size } }
+    ])
+    links = set()
     for i,doc in enumerate(res):
         #print(">>>>>" + status)
         #print(doc);
-        links.append(doc["url"])
+        print(">>>>links")
+        print(doc)
+        links.add(doc["url"])
         if i >= batch_size:
             break
-    return links
+    return list(links)
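
get_links() now draws a pseudo-random batch instead of the first batch_size matches: $match filters by host and status, $sample picks the batch. MongoDB's $sample may emit the same document more than once, which is why URLs are collected into a set before being returned as a list. A standalone sketch (the connection string, database and collection names are assumptions):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")  # assumed URI
linkcol = client["crawler"]["links"]                        # assumed names
res = linkcol.aggregate([
    {"$match": {"status": "frontlink", "host": "example.com"}},
    {"$sample": {"size": 100}},
])
urls = list({doc["url"] for doc in res})  # the set drops possible repeats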
+
+def fetch_sitemap_links(start_link):
+    out = []
+    navigation_links = trafilatura.sitemaps.sitemap_search(start_link,target_lang=LANGUAGE)
+    for link in navigation_links:
+        out.append((link,"frontlink"))
+    return out

 def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
     #print(links)
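
The new fetch_sitemap_links() delegates discovery to trafilatura, which probes the host's robots.txt and common sitemap locations and can filter entries by language. A minimal usage sketch (LANGUAGE is a module-level constant outside this diff; "sk" is an assumed value):

import trafilatura.sitemaps

urls = trafilatura.sitemaps.sitemap_search("https://example.com", target_lang="sk")
frontlinks = [(u, "frontlink") for u in urls]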
@@ -372,23 +381,24 @@ def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
-    rules = fetch_robot(hostname)
-    print(rules)
     batch_size = BATCHSIZE
     navigation_links = get_links(db,hostname,"navigation",batch_size)
     if start_link is not None:
         navigation_links.append(start_link)
     print(f"Navigation links {len(navigation_links)}")
-    process_links(db,hostname,"frontlink",navigation_links,rules)
     print("Getting frontlinks")
     links = get_links(db,hostname,"frontlink",batch_size)
-    bl = len(links) - batch_size
     print(f"Got {len(links)} frontlinks")
-    if bl > 0:
+    if len(links) < batch_size:
         print("Fetching sitemap links")
         sitemap_links = fetch_sitemap_links(start_link)
         index_links(db,sitemap_links)
+    links.append(start_link)
     print("Processing frontlinks")
+    rules = fetch_robot(hostname)
     process_links(db,hostname,"frontlink",links,rules)
     print("Getting backlinks")
-    front_links = get_links(db,hostname,"backlink",bl)
-    links += front_links
+    back_links = get_links(db,hostname,"backlink",batch_size)
     print("Processing backlinks")
-    process_links(db,hostname,"backlink",links,rules=rules)
+    process_links(db,hostname,"backlink",back_links,rules=rules)
     link_summary(db,hostname)

 if __name__ == "__main__":
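
Read as a whole, the rewritten visit() samples stored navigation and front links, falls back to sitemap discovery only when fewer than batch_size frontlinks are available, fetches robots.txt rules once just before processing, and then processes a same-sized backlink batch. The __main__ body is truncated in this diff; a hypothetical invocation, assuming CONNECTION, DBNAME, BATCHSIZE and LANGUAGE are configured at module level:

visit("https://example.com")  # crawl one host starting from its front page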