This commit is contained in:
Dnaiel Hladek 2024-03-06 08:59:17 +01:00
parent c8a18fd8c7
commit 3413b1a190

View File

@@ -167,7 +167,8 @@ def fetch_page(link:str)->(str,str):
print("fetching:::::") print("fetching:::::")
print(link) print(link)
final_link = link final_link = link
response = trafilatura.fetch_url(link,decode=False) response = trafilatura.fetch_response(link,decode=False)
print(response)
time.sleep(2) time.sleep(2)
html = None html = None
if response is not None : if response is not None :
@@ -183,7 +184,7 @@ def fetch_page(link:str)->(str,str):
good = False good = False
LOGGER.error('too large: length %s for URL %s', len(response.data), link) LOGGER.error('too large: length %s for URL %s', len(response.data), link)
if good: if good:
html = trafilatura.utils.decode_response(response) html = trafilatura.utils.decode_file(response.data)
if html is not None: if html is not None:
html, final_link = trafilatura.spider.refresh_detection(html, final_link) html, final_link = trafilatura.spider.refresh_detection(html, final_link)
# is there a meta-refresh on the page? # is there a meta-refresh on the page?