From f78a64d4e80b68ec78acc8609129cb28e164fde3 Mon Sep 17 00:00:00 2001
From: Daniel Hladek <daniel.hladek@tuke.sk>
Date: Thu, 27 Apr 2023 07:29:15 +0200
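Subject: [PATCH] Keep formatting in extraction; fix domains index spec

- extract_page: call trafilatura.bare_extraction with
  include_formatting=True instead of False.
- createdb: pass the "average_fetch_characters" index to create_index as a
  list of (key, direction) pairs instead of a bare tuple.
- import_html: print a separator line before each extracted document.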

---
 mongo/mongocrawler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 2180ad2..3ab4478 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
 def extract_page(final_link,html):
     doc = None
     if html is not None:
-        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
             if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 # text too small
@@ -619,7 +619,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -744,6 +744,7 @@ def import_html():
         html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
+            print("------=====-")
             print(doc)
             status = index_page(db,url,url,html,doc)
             print(status)