zz
This commit is contained in:
parent
3dc4aa6290
commit
000490bf73
@ -169,6 +169,21 @@ def extract_pages(link_batch:list,responses:list)->list:
|
||||
out.append((original_link,final_link,html,doc))
|
||||
return out
|
||||
|
||||
def set_content_checksum(doc):
    """Annotate *doc* in place with size, checksum and sentence statistics.

    Reads ``doc["text"]`` and stores the following keys on ``doc``:

    * ``text_size`` -- ``len(text)``.
    * ``text_md5`` -- hex MD5 digest of the UTF-8 encoded text.
    * ``paragraph_checksums`` / ``paragraph_sizes`` -- the two values
      returned by ``calculate_checksums(text)`` (helper defined elsewhere
      in this module; presumably one entry per paragraph -- confirm there).
    * ``paragraph_sizes_sum`` -- ``sum()`` of ``paragraph_sizes``.
    * ``sentences_count`` -- number of full stops that directly follow a
      lowercase word character.

    Raises ``KeyError`` if ``doc`` has no ``"text"`` entry. Returns ``None``.
    """
    text = doc["text"]
    checksums, sizes = calculate_checksums(text)

    doc["text_size"] = len(text)
    doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
    doc["paragraph_checksums"] = checksums
    doc["paragraph_sizes"] = sizes
    doc["paragraph_sizes_sum"] = sum(sizes)

    # Raw string: "\w" and "\." in a plain literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). The pattern is unchanged:
    # one word character followed by a literal dot.
    end_sentence_marker = re.compile(r"\w\.")
    # Only a dot after a lowercase character is counted, so that dots after
    # digits, capitals or underscores (e.g. "3.", "A.") are skipped.
    doc["sentences_count"] = sum(
        1
        for match in end_sentence_marker.finditer(text)
        if match.group(0)[0].islower()
    )


# index_pages() calls this helper under the plural name; keep both spellings
# resolvable so that call does not fail with NameError.
set_content_checksums = set_content_checksum
|
||||
|
||||
def index_pages(db,hostname,extracted_pages):
|
||||
linkcol = db["links"]
|
||||
@ -188,32 +203,21 @@ def index_pages(db,hostname,extracted_pages):
|
||||
elif doc is None:
|
||||
state = "content_error"
|
||||
if doc is not None:
|
||||
text = doc["text"]
|
||||
checksums,sizes = calculate_checksums(text)
|
||||
doc["text_size"] = len(text)
|
||||
doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
goodsz = sum(sizes)
|
||||
# Not enough larger paragraphs
|
||||
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
|
||||
set_content_checksums(doc)
|
||||
tsz = doc["text_size"]
|
||||
psz = doc["paragraph_sizes_sum"]
|
||||
if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
|
||||
state = "trash"
|
||||
end_sentence_marker = re.compile("\w[\.]")
|
||||
sentences = 0
|
||||
for item in re.finditer(end_sentence_marker,text):
|
||||
t = item.group(0)
|
||||
if t[0].islower():
|
||||
sentences += 1
|
||||
doc["sentences"] = sentences
|
||||
# check copy
|
||||
if state == "good":
|
||||
copysz = len(text) - goodsz
|
||||
origsz = 0
|
||||
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
|
||||
# index paragraph checksums
|
||||
nd = checkcol.find_one({"_id":chs})
|
||||
if nd is not None:
|
||||
copysz += paragraph_size
|
||||
if (copysz / len(text)) > TEXT_TRASH_RATIO:
|
||||
if nd is None:
|
||||
origsz += paragraph_size
|
||||
|
||||
if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
|
||||
state = "copy"
|
||||
print(copysz)
|
||||
if state == "good":
|
||||
|
Loading…
Reference in New Issue
Block a user