zz
This commit is contained in:
parent
8a91f88d73
commit
f78a64d4e8
@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
|
||||
def extract_page(final_link,html):
|
||||
doc = None
|
||||
if html is not None:
|
||||
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
|
||||
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
|
||||
if doc is not None:
|
||||
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
|
||||
# text too small
|
||||
@ -619,7 +619,7 @@ def createdb():
|
||||
htmlcol.create_index("html_md5",unique=True)
|
||||
domaincol = db["domains"]
|
||||
domaincol.create_index("host",unique=True)
|
||||
domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
|
||||
domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
|
||||
batchcol = db["batches"]
|
||||
batchcol.create_index("host")
|
||||
batchcol.create_index("created_at")
|
||||
@ -744,6 +744,7 @@ def import_html():
|
||||
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
|
||||
doc = extract_page(url,html)
|
||||
if doc is not None:
|
||||
print("------=====-")
|
||||
print(doc)
|
||||
status = index_page(db,url,url,html,doc)
|
||||
print(status)
|
||||
|
Loading…
Reference in New Issue
Block a user