zz
This commit is contained in:
parent 813ac195f2
commit efe2872777
@@ -13,6 +13,7 @@ import click
 import logging as LOGGER
 import os
 import pprint
+import re
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -184,8 +185,27 @@ def index_pages(db,hostname,extracted_pages):
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
         doc["paragraph_sizes_sum"] = goodsz
+        # Not enough larger paragraphs
         if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
             state = "trash"
+        end_sentence_marker = re.compile("\w[\.]")
+        sentences = 0
+        for item in re.finditer(end_sentence_marker,text):
+            t = item.group(0)
+            if t[0].islower():
+                sentences += 1
+        doc["sentences"] = sentences
+        # check copy
+        if state == "good":
+            copysz = len(text) - goodsz
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                # index paragraph checksums
+                nd = checkcol.find_one({"_id":chs})
+                if nd is not None:
+                    copysz += paragraph_size
+            if copysz / len(text) > TEXT_TRASH_RATIO:
+                state = "copy"
+            print(copysz)
         if state == "good":
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
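Note on the sentence heuristic added above: end_sentence_marker matches a word character followed by a period, and the islower() check skips initials and abbreviations such as "U. S. A.". A standalone sketch of the same logic (the sample text is made up, not from this repo; the pattern is written as a raw string to avoid the invalid-escape warning the committed literal produces):

import re

end_sentence_marker = re.compile(r"\w[.]")  # same pattern as the commit, as a raw string

text = "One sentence. Another one. U. S. A. ends here."
sentences = 0
for item in re.finditer(end_sentence_marker, text):
    t = item.group(0)       # two characters: word char + "."
    if t[0].islower():      # uppercase before "." suggests an initial or abbreviation
        sentences += 1

print(sentences)  # 3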
@@ -198,14 +218,8 @@ def index_pages(db,hostname,extracted_pages):
             print(doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
-            copysz = len(text) - goodsz
-            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
-                nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
-                paragraph_count = nd["count"]
-                print(paragraph_count)
-                if paragraph_count > 1:
-                    copysz += paragraph_size
-            print(copysz)
+            for chs in doc["paragraph_checksums"]:
+                checkcol.insert_one({"_id":chs})
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 
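Note on the reworked checksum flow: paragraphs whose checksum already exists in checkcol count toward copysz while a page is scored, and a page's own checksums are only inserted after the page is stored, so duplicates are detected across pages rather than within one. A minimal sketch of both halves (the client setup, database name, and DuplicateKeyError handling are assumptions, not part of this commit):

import pymongo

client = pymongo.MongoClient()
checkcol = client["sucker"]["check"]  # assumed names, for illustration only

def copied_size(checksums, sizes):
    # Sum the sizes of paragraphs already seen on earlier pages.
    return sum(size for chs, size in zip(checksums, sizes)
               if checkcol.find_one({"_id": chs}) is not None)

def index_checksums(checksums):
    # insert_one on _id raises DuplicateKeyError for a repeated paragraph;
    # the committed code calls it bare, so re-indexing a page would need this guard.
    for chs in checksums:
        try:
            checkcol.insert_one({"_id": chs})
        except pymongo.errors.DuplicateKeyError:
            pass

The removed find_one_and_update variant kept a per-paragraph count; the new insert_one only records existence, which is all the find_one check during scoring needs.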