zz
This commit is contained in:
parent
1bd088a38a
commit
713481c095
@ -178,10 +178,10 @@ def index_pages(db,hostname,extracted_pages):
|
|||||||
doc["text_size"] = len(text)
|
doc["text_size"] = len(text)
|
||||||
doc["paragraph_checksums"] = checksums
|
doc["paragraph_checksums"] = checksums
|
||||||
doc["paragraph_sizes"] = sizes
|
doc["paragraph_sizes"] = sizes
|
||||||
doc["paragraph_sizes_sum"] = sum(sizes)
|
|
||||||
goodsz = sum(sizes)
|
goodsz = sum(sizes)
|
||||||
if len(text) < 200 or goodsz/len(text) < 0.4:
|
doc["paragraph_sizes_sum"] = goodsz
|
||||||
stat = "trash"
|
if len(text) < 200 or goodsz/len(text) < 0.6:
|
||||||
|
state = "trash"
|
||||||
if state == "good":
|
if state == "good":
|
||||||
htdoc = get_link_doc(link,state)
|
htdoc = get_link_doc(link,state)
|
||||||
htdoc["html"] = html
|
htdoc["html"] = html
|
||||||
|
Loading…
Reference in New Issue
Block a user