zz
This commit is contained in:
		
							parent
							
								
									3413b1a190
								
							
						
					
					
						commit
						b8850819b9
					
				| @ -175,7 +175,7 @@ def fetch_page(link:str)->(str,str): | |||||||
|         good = True |         good = True | ||||||
|         if response.status != 200: |         if response.status != 200: | ||||||
|             good = False |             good = False | ||||||
|             LOGGER.error('not a 200 response: %s for URL %s', response.status, url) |             LOGGER.error('not a 200 response: %s for URL %s', response.status, link) | ||||||
|         elif response.data is None or len(response.data) < MIN_FILE_SIZE: |         elif response.data is None or len(response.data) < MIN_FILE_SIZE: | ||||||
|             LOGGER.error('too small/incorrect for URL %s', link) |             LOGGER.error('too small/incorrect for URL %s', link) | ||||||
|             good = False |             good = False | ||||||
| @ -278,7 +278,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content | |||||||
|         htdoc = get_link_doc(link,state) |         htdoc = get_link_doc(link,state) | ||||||
|         htdoc["html"] = html |         htdoc["html"] = html | ||||||
|         htdoc["html_size"] = len(html) |         htdoc["html_size"] = len(html) | ||||||
|         htdoc["html_md5"]= hashlib.md5(html).hexdigest() |         htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest() | ||||||
|         # can be revisited - upsert |         # can be revisited - upsert | ||||||
|         del htdoc["url"] |         del htdoc["url"] | ||||||
|         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True) |         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True) | ||||||
| @ -682,6 +682,16 @@ def classify(start_link): | |||||||
|     cl.train(trainset) |     cl.train(trainset) | ||||||
|     cl.test(testset) |     cl.test(testset) | ||||||
| 
 | 
 | ||||||
|  | def index_pages(hostname,extracted_pages): | ||||||
|  |     final_states = [] | ||||||
|  |     docs = [] | ||||||
|  |     for original_link,final_link,html,doc in extracted_pages: | ||||||
|  |         status = index_page(db,original_link,final_link,html,doc,filter_content) | ||||||
|  |         final_states.append(status) | ||||||
|  |         docs.append(doc) | ||||||
|  |     save_batch_info(db,hostname,final_states,docs) | ||||||
|  |   | ||||||
|  | 
 | ||||||
| def visit(hostname,filter_content=True): | def visit(hostname,filter_content=True): | ||||||
|     myclient = pymongo.MongoClient(CONNECTION) |     myclient = pymongo.MongoClient(CONNECTION) | ||||||
|     db=myclient[DBNAME] |     db=myclient[DBNAME] | ||||||
| @ -712,6 +722,7 @@ def visit(hostname,filter_content=True): | |||||||
|         extracted_pages.append((original_link,final_link,html,doc)) |         extracted_pages.append((original_link,final_link,html,doc)) | ||||||
| 
 | 
 | ||||||
|     extracted_links = extract_links(links,responses,hostname,rules,"frontlink") |     extracted_links = extract_links(links,responses,hostname,rules,"frontlink") | ||||||
|  | 
 | ||||||
|     index_links(db,extracted_links) |     index_links(db,extracted_links) | ||||||
|     final_states = [] |     final_states = [] | ||||||
|     docs = [] |     docs = [] | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user