zz

2023-04-01 06:47:12 +02:00 · 2023-04-01 06:47:12 +02:00 · 813ac195f2
commit 813ac195f2
parent 713481c095
1 changed files with 15 additions and 3 deletions
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
+CHECK_PARAGRAPH_SIZE=150
+TEXT_TRASH_SIZE=200
+TEXT_TRASH_RATIO=0.6

 def put_queue(db,channel,message):
    queuecol = db["queue"]
@@ -54,12 +57,12 @@ def calculate_checksums(text):
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
-            if hsz > 100:
+            if hsz > CHECK_PARAGRAPH_SIZE:
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
-    if hsz > 100:
+    if hsz > CHECK_PARAGRAPH_SIZE:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes
@@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
+    checkcol = db["check"]
    links = []
    for original_link,final_link,html,doc in extracted_pages:
        state = "good"
@@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
            doc["paragraph_sizes"] = sizes
            goodsz = sum(sizes)
            doc["paragraph_sizes_sum"] = goodsz
-            if len(text) < 200 or goodsz/len(text) < 0.6:
+            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
                state = "trash"
        if state == "good":
            htdoc = get_link_doc(link,state)
@@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
            print(doc)
            del doc["url"]
            contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+            copysz = len(text) - goodsz
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
+                paragraph_count = nd["count"]
+                print(paragraph_count)
+                if paragraph_count > 1:
+                    copysz += paragraph_size
+            print(copysz)
        linkcol.update_one({"url":original_link},{"$set":{"status":state}})