This commit is contained in:
Daniel Hládek 2023-04-09 09:37:52 +02:00
parent 3dc4aa6290
commit 000490bf73

View File

@ -169,6 +169,21 @@ def extract_pages(link_batch:list,responses:list)->list:
out.append((original_link,final_link,html,doc))
return out
def set_content_checksum(doc: dict) -> None:
    """Annotate ``doc`` in place with size, checksum and sentence statistics.

    Reads ``doc["text"]`` and stores the following keys on ``doc``:

    - ``text_size``: length of the text in characters.
    - ``text_md5``: hex MD5 digest of the UTF-8 encoded text.
    - ``paragraph_checksums`` / ``paragraph_sizes``: the two values returned
      by ``calculate_checksums(text)``.
    - ``paragraph_sizes_sum``: sum of ``paragraph_sizes``.
    - ``sentences_count``: number of places where a lowercase word
      character is immediately followed by a period.

    Raises:
        KeyError: if ``doc`` has no ``"text"`` key.
    """
    text = doc["text"]
    checksums, sizes = calculate_checksums(text)

    doc["text_size"] = len(text)
    doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
    doc["paragraph_checksums"] = checksums
    doc["paragraph_sizes"] = sizes
    doc["paragraph_sizes_sum"] = sum(sizes)

    # Raw string: "\w" and "\." are invalid escapes in a normal string
    # literal and trigger a warning on recent Python versions.
    end_sentence_marker = re.compile(r"\w[\.]")
    # Only periods preceded by a lowercase character are counted, which
    # leaves out matches such as "A." or "3.".
    doc["sentences_count"] = sum(
        1
        for match in end_sentence_marker.finditer(text)
        if match.group(0)[0].islower()
    )


# index_pages calls this helper under the plural name; keep both spellings
# resolvable so that call does not fail with NameError.
set_content_checksums = set_content_checksum
def index_pages(db,hostname,extracted_pages):
linkcol = db["links"]
@ -188,32 +203,21 @@ def index_pages(db,hostname,extracted_pages):
elif doc is None:
state = "content_error"
if doc is not None:
text = doc["text"]
checksums,sizes = calculate_checksums(text)
doc["text_size"] = len(text)
doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes
goodsz = sum(sizes)
# Not enough larger paragraphs
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
set_content_checksums(doc)
tsz = doc["text_size"]
psz = doc["paragraph_sizes_sum"]
if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
state = "trash"
end_sentence_marker = re.compile("\w[\.]")
sentences = 0
for item in re.finditer(end_sentence_marker,text):
t = item.group(0)
if t[0].islower():
sentences += 1
doc["sentences"] = sentences
# check copy
if state == "good":
copysz = len(text) - goodsz
origsz = 0
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
# index paragraph checksums
nd = checkcol.find_one({"_id":chs})
if nd is not None:
copysz += paragraph_size
if (copysz / len(text)) > TEXT_TRASH_RATIO:
if nd is None:
origsz += paragraph_size
if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
state = "copy"
print(copysz)
if state == "good":