This commit is contained in:
Daniel Hládek 2023-03-29 11:04:29 +02:00
parent 1bd088a38a
commit 713481c095

View File

@@ -178,10 +178,10 @@ def index_pages(db,hostname,extracted_pages):
 doc["text_size"] = len(text)
 doc["paragraph_checksums"] = checksums
 doc["paragraph_sizes"] = sizes
-doc["paragraph_sizes_sum"] = sum(sizes)
 goodsz = sum(sizes)
-if len(text) < 200 or goodsz/len(text) < 0.4:
-stat = "trash"
+doc["paragraph_sizes_sum"] = goodsz
+if len(text) < 200 or goodsz/len(text) < 0.6:
+state = "trash"
 if state == "good":
 htdoc = get_link_doc(link,state)
 htdoc["html"] = html