zz
This commit is contained in:
parent
c8a18fd8c7
commit
3413b1a190
@@ -167,7 +167,8 @@ def fetch_page(link:str)->(str,str):
 print("fetching:::::")
 print(link)
 final_link = link
-response = trafilatura.fetch_url(link,decode=False)
+response = trafilatura.fetch_response(link,decode=False)
+print(response)
 time.sleep(2)
 html = None
 if response is not None :
@@ -183,7 +184,7 @@ def fetch_page(link:str)->(str,str):
 good = False
 LOGGER.error('too large: length %s for URL %s', len(response.data), link)
 if good:
-html = trafilatura.utils.decode_response(response)
+html = trafilatura.utils.decode_file(response.data)
 if html is not None:
 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
 # is there a meta-refresh on the page?
Loading…
Reference in New Issue
Block a user