This commit is contained in:
Daniel Hládek 2023-04-01 10:49:28 +02:00
parent 813ac195f2
commit efe2872777

View File

@ -13,6 +13,7 @@ import click
import logging as LOGGER
import os
import pprint
import re
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@ -184,8 +185,27 @@ def index_pages(db,hostname,extracted_pages):
# Store the per-paragraph sizes and their sum on the document.
doc["paragraph_sizes"] = sizes
goodsz = sum(sizes)
doc["paragraph_sizes_sum"] = goodsz
# Not enough larger paragraphs: the text is shorter than TEXT_TRASH_SIZE, or
# the summed paragraph sizes make up less than TEXT_TRASH_RATIO of the text.
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
state = "trash"
# Count sentence endings: a period directly preceded by a lowercase character.
# The pattern is a raw string so that "\w" and "\." reach the regex engine
# instead of being parsed as (invalid) Python string escape sequences.
end_sentence_marker = re.compile(r"\w[\.]")
sentences = sum(
    1
    for item in end_sentence_marker.finditer(text)
    if item.group(0)[0].islower()
)
doc["sentences"] = sentences
# Copy check: performed only while the document is still considered "good".
if state == "good":
# Start from the part of the text not covered by the counted paragraphs.
copysz = len(text) - goodsz
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
# Look up the paragraph checksum in checkcol (a lookup only; nothing is
# inserted here). NOTE(review): checkcol is presumably the collection of
# checksums of already indexed paragraphs - confirm against its definition.
nd = checkcol.find_one({"_id":chs})
if nd is not None:
# Checksum already present: count this paragraph's size as copied text.
copysz += paragraph_size
# Mark the document as a copy when the copied share exceeds TEXT_TRASH_RATIO.
if copysz / len(text) > TEXT_TRASH_RATIO:
state = "copy"
print(copysz)
if state == "good":
htdoc = get_link_doc(link,state)
htdoc["html"] = html
@ -198,14 +218,8 @@ def index_pages(db,hostname,extracted_pages):
print(doc)
del doc["url"]
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
copysz = len(text) - goodsz
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
paragraph_count = nd["count"]
print(paragraph_count)
if paragraph_count > 1:
copysz += paragraph_size
print(copysz)
for chs in doc["paragraph_checksums"]:
checkcol.insert_one({"_id":chs})
linkcol.update_one({"url":original_link},{"$set":{"status":state}})