zz

2023-04-27 07:29:15 +02:00 · 2023-04-27 07:29:15 +02:00 · f78a64d4e8
commit f78a64d4e8
parent 8a91f88d73
1 changed file with 3 additions and 2 deletions
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
 def extract_page(final_link,html):
    doc = None
    if html is not None:
-        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
        if doc is not None:
            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                # text too small
@@ -619,7 +619,7 @@ def createdb():
    htmlcol.create_index("html_md5",unique=True)
    domaincol = db["domains"]
    domaincol.create_index("host",unique=True)
-    domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
    batchcol = db["batches"]
    batchcol.create_index("host")
    batchcol.create_index("created_at")
@@ -744,6 +744,7 @@ def import_html():
        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
        doc = extract_page(url,html)
        if doc is not None:
+            print("------=====-")
            print(doc)
            status = index_page(db,url,url,html,doc)
            print(status)