This commit is contained in:
Daniel Hládek 2023-04-09 09:37:52 +02:00
parent 3dc4aa6290
commit 000490bf73

View File

@ -169,6 +169,21 @@ def extract_pages(link_batch:list,responses:list)->list:
out.append((original_link,final_link,html,doc)) out.append((original_link,final_link,html,doc))
return out return out
def set_content_checksum(doc):
    """Compute and store checksum/size statistics for *doc*'s text, in place.

    Sets the keys ``text_size``, ``text_md5``, ``paragraph_checksums``,
    ``paragraph_sizes``, ``paragraph_sizes_sum`` and ``sentences_count``
    on the given document dict.

    NOTE(review): the call site added in this same commit invokes
    ``set_content_checksums`` (plural) — name mismatch, will raise
    NameError at runtime; confirm which spelling is intended.
    """
    text = doc["text"]
    checksums, sizes = calculate_checksums(text)
    doc["text_size"] = len(text)
    doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
    doc["paragraph_checksums"] = checksums
    doc["paragraph_sizes"] = sizes
    doc["paragraph_sizes_sum"] = sum(sizes)
    # BUG FIX: raw string for the regex — "\w" in a plain string literal is an
    # invalid escape sequence (SyntaxWarning since Python 3.12, future error).
    # Pattern: a word character immediately followed by a period.
    end_sentence_marker = re.compile(r"\w[.]")
    # Heuristic sentence count: only count periods preceded by a lowercase
    # word character (skips abbreviations like "U.S." and numbered lists).
    sentences = 0
    for item in re.finditer(end_sentence_marker, text):
        if item.group(0)[0].islower():
            sentences += 1
    doc["sentences_count"] = sentences
def index_pages(db,hostname,extracted_pages): def index_pages(db,hostname,extracted_pages):
linkcol = db["links"] linkcol = db["links"]
@ -188,32 +203,21 @@ def index_pages(db,hostname,extracted_pages):
elif doc is None: elif doc is None:
state = "content_error" state = "content_error"
if doc is not None: if doc is not None:
text = doc["text"] set_content_checksums(doc)
checksums,sizes = calculate_checksums(text) tsz = doc["text_size"]
doc["text_size"] = len(text) psz = doc["paragraph_sizes_sum"]
doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest() if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes
goodsz = sum(sizes)
# Not enough larger paragraphs
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
state = "trash" state = "trash"
end_sentence_marker = re.compile("\w[\.]")
sentences = 0
for item in re.finditer(end_sentence_marker,text):
t = item.group(0)
if t[0].islower():
sentences += 1
doc["sentences"] = sentences
# check copy # check copy
if state == "good": if state == "good":
copysz = len(text) - goodsz origsz = 0
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]): for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
# index paragraph checksums # index paragraph checksums
nd = checkcol.find_one({"_id":chs}) nd = checkcol.find_one({"_id":chs})
if nd is not None: if nd is None:
copysz += paragraph_size origsz += paragraph_size
if (copysz / len(text)) > TEXT_TRASH_RATIO:
if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
state = "copy" state = "copy"
print(copysz) print(copysz)
if state == "good": if state == "good":