zz
This commit is contained in:
parent
1bd088a38a
commit
713481c095
@ -178,10 +178,10 @@ def index_pages(db,hostname,extracted_pages):
|
||||
doc["text_size"] = len(text)
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
doc["paragraph_sizes_sum"] = sum(sizes)
|
||||
goodsz = sum(sizes)
|
||||
if len(text) < 200 or goodsz/len(text) < 0.4:
|
||||
stat = "trash"
|
||||
doc["paragraph_sizes_sum"] = goodsz
|
||||
if len(text) < 200 or goodsz/len(text) < 0.6:
|
||||
state = "trash"
|
||||
if state == "good":
|
||||
htdoc = get_link_doc(link,state)
|
||||
htdoc["html"] = html
|
||||
|
Loading…
Reference in New Issue
Block a user