zz
This commit is contained in:
		
							parent
							
								
									c8a18fd8c7
								
							
						
					
					
						commit
						3413b1a190
					
				@ -167,7 +167,8 @@ def fetch_page(link:str)->(str,str):
 | 
				
			|||||||
    print("fetching:::::")
 | 
					    print("fetching:::::")
 | 
				
			||||||
    print(link)
 | 
					    print(link)
 | 
				
			||||||
    final_link = link
 | 
					    final_link = link
 | 
				
			||||||
    response = trafilatura.fetch_url(link,decode=False)
 | 
					    response = trafilatura.fetch_response(link,decode=False)
 | 
				
			||||||
 | 
					    print(response)
 | 
				
			||||||
    time.sleep(2)
 | 
					    time.sleep(2)
 | 
				
			||||||
    html = None
 | 
					    html = None
 | 
				
			||||||
    if response is not None :
 | 
					    if response is not None :
 | 
				
			||||||
@ -183,7 +184,7 @@ def fetch_page(link:str)->(str,str):
 | 
				
			|||||||
            good = False
 | 
					            good = False
 | 
				
			||||||
            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
 | 
					            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
 | 
				
			||||||
        if good:
 | 
					        if good:
 | 
				
			||||||
            html = trafilatura.utils.decode_response(response) 
 | 
					            html = trafilatura.utils.decode_file(response.data) 
 | 
				
			||||||
        if html is not None:
 | 
					        if html is not None:
 | 
				
			||||||
            html, final_link = trafilatura.spider.refresh_detection(html, final_link)
 | 
					            html, final_link = trafilatura.spider.refresh_detection(html, final_link)
 | 
				
			||||||
            # is there a meta-refresh on the page?
 | 
					            # is there a meta-refresh on the page?
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user