This commit is contained in:
Daniel Hládek 2023-03-29 11:04:29 +02:00
parent 1bd088a38a
commit 713481c095

View File

@@ -178,10 +178,10 @@ def index_pages(db,hostname,extracted_pages):
doc["text_size"] = len(text) doc["text_size"] = len(text)
doc["paragraph_checksums"] = checksums doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes doc["paragraph_sizes"] = sizes
doc["paragraph_sizes_sum"] = sum(sizes)
goodsz = sum(sizes) goodsz = sum(sizes)
if len(text) < 200 or goodsz/len(text) < 0.4: doc["paragraph_sizes_sum"] = goodsz
stat = "trash" if len(text) < 200 or goodsz/len(text) < 0.6:
state = "trash"
if state == "good": if state == "good":
htdoc = get_link_doc(link,state) htdoc = get_link_doc(link,state)
htdoc["html"] = html htdoc["html"] = html