commit 813ac195f2
parent 713481c095

    zz
@@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
+CHECK_PARAGRAPH_SIZE=150
+TEXT_TRASH_SIZE=200
+TEXT_TRASH_RATIO=0.6
 
 def put_queue(db,channel,message):
     queuecol = db["queue"]
@@ -54,12 +57,12 @@ def calculate_checksums(text):
             hval += zv
             hsz += 1
         if c == "\n" and hsz > 0:
-            if hsz > 100:
+            if hsz > CHECK_PARAGRAPH_SIZE:
                 checksums.append(hval)
                 sizes.append(sz)
             sz = 0
             hsz = 0
-    if hsz > 100:
+    if hsz > CHECK_PARAGRAPH_SIZE:
         checksums.append(hval)
         sizes.append(sz)
     return checksums, sizes
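Note: this hunk only parameterizes the paragraph-size threshold; the rest of calculate_checksums sits outside the visible context. As a rough, non-authoritative sketch of the idea (assuming hval accumulates character values such as ord(c), that whitespace is not hashed, and that the accumulator resets per paragraph), the thresholded per-paragraph checksum works like this:

def paragraph_checksums(text, min_size=150):
    # Sketch only: mirrors the visible hval/hsz/sz bookkeeping from the
    # diff; min_size plays the role of CHECK_PARAGRAPH_SIZE.
    checksums, sizes = [], []
    hval = 0   # additive hash of the current paragraph (assumed ord-based)
    hsz = 0    # count of hashed (non-space) characters
    sz = 0     # total characters in the paragraph, spaces included
    for c in text:
        sz += 1
        if not c.isspace():        # assumption: only non-space chars hash
            hval += ord(c)
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > min_size:     # keep only big-enough paragraphs
                checksums.append(hval)
                sizes.append(sz)
            hval = hsz = sz = 0    # assumed per-paragraph reset
    if hsz > min_size:             # flush a trailing unterminated paragraph
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes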
@@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
+    checkcol = db["check"]
     links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
@@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
             doc["paragraph_sizes"] = sizes
             goodsz = sum(sizes)
             doc["paragraph_sizes_sum"] = goodsz
-            if len(text) < 200 or goodsz/len(text) < 0.6:
+            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
                 state = "trash"
         if state == "good":
             htdoc = get_link_doc(link,state)
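Note: the new constants make the trash rule explicit: a page is kept only when it has at least TEXT_TRASH_SIZE characters of text and at least TEXT_TRASH_RATIO of those characters sit inside accepted paragraphs. A minimal standalone illustration (threshold values copied from the diff, inputs made up):

TEXT_TRASH_SIZE = 200
TEXT_TRASH_RATIO = 0.6

def is_trash(text, paragraph_sizes):
    # goodsz is the number of characters covered by accepted paragraphs,
    # as computed by calculate_checksums in the patched module.
    goodsz = sum(paragraph_sizes)
    return len(text) < TEXT_TRASH_SIZE or goodsz / len(text) < TEXT_TRASH_RATIO

print(is_trash("x" * 300, [110, 120]))  # False: 230/300 ~ 0.77 coverage
print(is_trash("x" * 300, [110]))       # True: 110/300 ~ 0.37 coverage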
@@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
             print(doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+            copysz = len(text) - goodsz
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
+                paragraph_count = nd["count"]
+                print(paragraph_count)
+                if paragraph_count > 1:
+                    copysz += paragraph_size
+            print(copysz)
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 