From f78a64d4e80b68ec78acc8609129cb28e164fde3 Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Thu, 27 Apr 2023 07:29:15 +0200 Subject: [PATCH] zz --- mongo/mongocrawler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py index 2180ad2..3ab4478 100644 --- a/mongo/mongocrawler.py +++ b/mongo/mongocrawler.py @@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser: def extract_page(final_link,html): doc = None if html is not None: - doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True) + doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True) if doc is not None: if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE: # text too small @@ -619,7 +619,7 @@ def createdb(): htmlcol.create_index("html_md5",unique=True) domaincol = db["domains"] domaincol.create_index("host",unique=True) - domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING)) + domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)]) batchcol = db["batches"] batchcol.create_index("host") batchcol.create_index("created_at") @@ -744,6 +744,7 @@ def import_html(): html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify() doc = extract_page(url,html) if doc is not None: + print("------=====-") print(doc) status = index_page(db,url,url,html,doc) print(status)