This commit is contained in:
Daniel Hládek 2023-04-27 07:29:15 +02:00
parent 8a91f88d73
commit f78a64d4e8

View File

@@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
def extract_page(final_link,html): def extract_page(final_link,html):
doc = None doc = None
if html is not None: if html is not None:
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True) doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
if doc is not None: if doc is not None:
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE: if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
# text too small # text too small
@@ -619,7 +619,7 @@ def createdb():
htmlcol.create_index("html_md5",unique=True) htmlcol.create_index("html_md5",unique=True)
domaincol = db["domains"] domaincol = db["domains"]
domaincol.create_index("host",unique=True) domaincol.create_index("host",unique=True)
domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING)) domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
batchcol = db["batches"] batchcol = db["batches"]
batchcol.create_index("host") batchcol.create_index("host")
batchcol.create_index("created_at") batchcol.create_index("created_at")
@@ -744,6 +744,7 @@ def import_html():
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify() html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
doc = extract_page(url,html) doc = extract_page(url,html)
if doc is not None: if doc is not None:
print("------=====-")
print(doc) print(doc)
status = index_page(db,url,url,html,doc) status = index_page(db,url,url,html,doc)
print(status) print(status)