diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py index f2242c5..36ff4e0 100644 --- a/mongo/mongocrawler.py +++ b/mongo/mongocrawler.py @@ -167,7 +167,8 @@ def fetch_page(link:str)->(str,str): print("fetching:::::") print(link) final_link = link - response = trafilatura.fetch_url(link,decode=False) + response = trafilatura.fetch_response(link,decode=False) + print(response) time.sleep(2) html = None if response is not None : @@ -183,7 +184,7 @@ def fetch_page(link:str)->(str,str): good = False LOGGER.error('too large: length %s for URL %s', len(response.data), link) if good: - html = trafilatura.utils.decode_response(response) + html = trafilatura.utils.decode_file(response.data) if html is not None: html, final_link = trafilatura.spider.refresh_detection(html, final_link) # is there a meta-refresh on the page?