diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index a524540..df1830b 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
+CHECK_PARAGRAPH_SIZE=150
+TEXT_TRASH_SIZE=200
+TEXT_TRASH_RATIO=0.6
 
 def put_queue(db,channel,message):
     queuecol = db["queue"]
@@ -54,12 +57,12 @@ def calculate_checksums(text):
             hval += zv
             hsz += 1
         if c == "\n" and hsz > 0:
-            if hsz > 100:
+            if hsz > CHECK_PARAGRAPH_SIZE:
                 checksums.append(hval)
                 sizes.append(sz)
             sz = 0
             hsz = 0
-    if hsz > 100:
+    if hsz > CHECK_PARAGRAPH_SIZE:
         checksums.append(hval)
         sizes.append(sz)
     return checksums, sizes
@@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
+    checkcol = db["check"]
     links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
@@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
             doc["paragraph_sizes"] = sizes
             goodsz = sum(sizes)
             doc["paragraph_sizes_sum"] = goodsz
-            if len(text) < 200 or goodsz/len(text) < 0.6:
+            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
                 state = "trash"
         if state == "good":
             htdoc = get_link_doc(link,state)
@@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
             print(doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+            copysz = len(text) - goodsz
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
+                paragraph_count = nd["count"]
+                print(paragraph_count)
+                if paragraph_count > 1:
+                    copysz += paragraph_size
+            print(copysz)
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
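
For context, the constants this patch introduces parametrize a simple quality filter: a page whose extracted text is shorter than TEXT_TRASH_SIZE characters, or whose long paragraphs cover less than TEXT_TRASH_RATIO of the text, is marked "trash". A minimal sketch of that rule in isolation (the classify_page helper is illustrative, not part of the patch):

    # Hedged sketch, not part of the patch: the "trash" rule that
    # TEXT_TRASH_SIZE and TEXT_TRASH_RATIO parametrize in index_pages.
    TEXT_TRASH_SIZE = 200
    TEXT_TRASH_RATIO = 0.6

    def classify_page(text, paragraph_sizes):
        # paragraph_sizes are the per-paragraph sizes produced by
        # calculate_checksums(); goodsz counts characters that fall
        # inside paragraphs longer than CHECK_PARAGRAPH_SIZE.
        goodsz = sum(paragraph_sizes)
        if len(text) < TEXT_TRASH_SIZE or goodsz / len(text) < TEXT_TRASH_RATIO:
            return "trash"
        return "good"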
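The new copysz bookkeeping relies on an atomic find_one_and_update upsert: each paragraph checksum gets a counter document in the "check" collection, and a count above 1 means that paragraph was already indexed on this or another page, so its size is attributed to copied text. A standalone sketch of the same idea (the client setup and copied_size helper are assumptions for the demo; the patch itself reuses the db handle passed to index_pages):

    # Hedged sketch, not part of the patch: the duplicate-paragraph
    # accounting added to index_pages, isolated for illustration.
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")
    checkcol = client["crawler"]["check"]

    def copied_size(text, checksums, sizes):
        # Characters outside the counted paragraphs start out as "copied".
        copysz = len(text) - sum(sizes)
        for chs, paragraph_size in zip(checksums, sizes):
            # Upsert the checksum and atomically increment its counter;
            # ReturnDocument.AFTER makes nd hold the incremented count.
            nd = checkcol.find_one_and_update(
                {"_id": chs},
                {"$inc": {"count": 1}},
                upsert=True,
                return_document=pymongo.ReturnDocument.AFTER,
            )
            if nd["count"] > 1:
                # Seen before, so count this paragraph as copied text.
                copysz += paragraph_size
        return copysz

Using the checksum itself as _id keeps the collection compact and makes the upsert a single indexed lookup; the counter survives across crawls, so repeated boilerplate paragraphs are detected site-wide.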