zz
parent 3dc4aa6290
commit 000490bf73
@@ -169,6 +169,21 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out
 
+def set_content_checksum(doc):
+    text = doc["text"]
+    checksums,sizes = calculate_checksums(text)
+    doc["text_size"] = len(text)
+    doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
+    doc["paragraph_checksums"] = checksums
+    doc["paragraph_sizes"] = sizes
+    doc["paragraph_sizes_sum"] = sum(sizes)
+    end_sentence_marker = re.compile("\w[\.]")
+    sentences = 0
+    for item in re.finditer(end_sentence_marker,text):
+        t = item.group(0)
+        if t[0].islower():
+            sentences += 1
+    doc["sentences_count"] = sentences
 
 def index_pages(db,hostname,extracted_pages):
     linkcol = db["links"]
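The new set_content_checksum helper gathers the per-document statistics (text size, MD5, paragraph checksums and sizes, sentence count) in one place instead of computing them inline in index_pages. A minimal sketch of just its sentence-counting heuristic, with an invented sample string (illustration only, not part of the commit):

import re

# A sentence end is counted wherever a lowercase word character is directly
# followed by a period; uppercase matches such as "U.S." are skipped.
end_sentence_marker = re.compile(r"\w[\.]")

def count_sentences(text):
    sentences = 0
    for item in re.finditer(end_sentence_marker, text):
        t = item.group(0)
        if t[0].islower():
            sentences += 1
    return sentences

print(count_sentences("The U.S. office opened today. It closes at five."))  # 2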
@@ -188,32 +203,21 @@ def index_pages(db,hostname,extracted_pages):
         elif doc is None:
             state = "content_error"
         if doc is not None:
-            text = doc["text"]
-            checksums,sizes = calculate_checksums(text)
-            doc["text_size"] = len(text)
-            doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
-            doc["paragraph_checksums"] = checksums
-            doc["paragraph_sizes"] = sizes
-            goodsz = sum(sizes)
-            # Not enough larger paragraphs
-            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
+            set_content_checksums(doc)
+            tsz = doc["text_size"]
+            psz = doc["paragraph_sizes_sum"]
+            if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
                 state = "trash"
-            end_sentence_marker = re.compile("\w[\.]")
-            sentences = 0
-            for item in re.finditer(end_sentence_marker,text):
-                t = item.group(0)
-                if t[0].islower():
-                    sentences += 1
-            doc["sentences"] = sentences
         # check copy
         if state == "good":
-            copysz = len(text) - goodsz
+            origsz = 0
             for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
                 # index paragraph checksums
                 nd = checkcol.find_one({"_id":chs})
-                if nd is not None:
-                    copysz += paragraph_size
-            if (copysz / len(text)) > TEXT_TRASH_RATIO:
+                if nd is None:
+                    origsz += paragraph_size
+
+            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                 state = "copy"
             print(copysz)
         if state == "good":
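The reworked copy check accumulates origsz, the total size of paragraphs whose checksum is not yet in checkcol, and marks the page as a copy when the already-seen share 1 - origsz/tsz exceeds TEXT_TRASH_RATIO. A rough worked example with invented numbers (the real TEXT_TRASH_RATIO is defined elsewhere in the module and is only assumed here):

TEXT_TRASH_RATIO = 0.6   # assumed value for illustration

tsz = 5000      # doc["text_size"]: total text size of the page
origsz = 1500   # size of paragraphs whose checksums were not found in checkcol

duplicate_share = 1 - (origsz / tsz)   # 0.7 -> 70 % of the text is already indexed
state = "copy" if duplicate_share > TEXT_TRASH_RATIO else "good"
print(duplicate_share, state)          # 0.7 copy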