zz
This commit is contained in:
		
							parent
							
								
									6eca731e42
								
							
						
					
					
						commit
						a05a3372af
					
				| @ -59,10 +59,10 @@ def is_robot_good(link,rules): | ||||
| def is_link_good(link): | ||||
|     r = courlan.check_url(link,strict=True,language=LANGUAGE) | ||||
|     if r is None: | ||||
|         print(link) | ||||
|         #print(link) | ||||
|         return None | ||||
|     llink,ldomain = r | ||||
|     print(llink,ldomain) | ||||
|     #print(llink,ldomain) | ||||
|     # domain rules | ||||
|     if not ldomain.endswith(DOMAIN): | ||||
|         LOGGER.debug("bad domain") | ||||
| @ -187,17 +187,20 @@ def index_pages(db,domain,extracted_pages): | ||||
|             doc["paragraph_checksums"] = checksums | ||||
|             doc["paragraph_sizes"] = sizes | ||||
|             goodsz = sum(sizes) | ||||
|             if len(text) < 200 or goodsz/len(text) < 0.3: | ||||
|             if len(text) < 200 or goodsz/len(text) < 0.4: | ||||
|                 state = "trash" | ||||
|         if state == "good": | ||||
|             htdoc = get_link_doc(link,state) | ||||
|             htdoc["html"] = html | ||||
|             htdoc["html_size"] = len(html) | ||||
|             htmlcol.insert_one(htdoc) | ||||
|             # can be revisited - upsert | ||||
|             del htdoc["url"] | ||||
|             htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True) | ||||
|             doc.update(get_link_doc(link,"good")) | ||||
|             # todo extract links | ||||
|             print(doc) | ||||
|             contentcol.insert_one(doc) | ||||
|             del doc["url"] | ||||
|             contentcol.update_one({"url":link},{"$set":doc},upsert=True) | ||||
|         linkcol.update_one({"url":original_link},{"$set":{"status":state}}) | ||||
| 
 | ||||
| 
 | ||||
| @ -246,8 +249,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE): | ||||
|     res  = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size) | ||||
|     links = [] | ||||
|     for doc in res: | ||||
|         print(">>>>>" + status) | ||||
|         print(doc) | ||||
|         #print(">>>>>" + status) | ||||
|         #print(doc) | ||||
|         links.append(doc["url"]) | ||||
|     return links | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user