zz
This commit is contained in:
parent
8a91f88d73
commit
f78a64d4e8
@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
|
|||||||
def extract_page(final_link,html):
|
def extract_page(final_link,html):
|
||||||
doc = None
|
doc = None
|
||||||
if html is not None:
|
if html is not None:
|
||||||
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
|
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
|
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
|
||||||
# text too small
|
# text too small
|
||||||
@ -619,7 +619,7 @@ def createdb():
|
|||||||
htmlcol.create_index("html_md5",unique=True)
|
htmlcol.create_index("html_md5",unique=True)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
domaincol.create_index("host",unique=True)
|
domaincol.create_index("host",unique=True)
|
||||||
domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
|
domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
|
||||||
batchcol = db["batches"]
|
batchcol = db["batches"]
|
||||||
batchcol.create_index("host")
|
batchcol.create_index("host")
|
||||||
batchcol.create_index("created_at")
|
batchcol.create_index("created_at")
|
||||||
@ -744,6 +744,7 @@ def import_html():
|
|||||||
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
|
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
|
||||||
doc = extract_page(url,html)
|
doc = extract_page(url,html)
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
|
print("------=====-")
|
||||||
print(doc)
|
print(doc)
|
||||||
status = index_page(db,url,url,html,doc)
|
status = index_page(db,url,url,html,doc)
|
||||||
print(status)
|
print(status)
|
||||||
|
Loading…
Reference in New Issue
Block a user