From 000490bf73f4fc5cb8ad2023b260793cc30a576b Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sun, 9 Apr 2023 09:37:52 +0200
Subject: [PATCH] zz

---
 mongo/mongocrawler.py | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index a95cfbf..9e3ec20 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -169,6 +169,21 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out
 
+def set_content_checksums(doc):
+    text = doc["text"]
+    checksums,sizes = calculate_checksums(text)
+    doc["text_size"] = len(text)
+    doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
+    doc["paragraph_checksums"] = checksums
+    doc["paragraph_sizes"] = sizes
+    doc["paragraph_sizes_sum"] = sum(sizes)
+    end_sentence_marker = re.compile(r"\w[\.]")
+    sentences = 0
+    for item in re.finditer(end_sentence_marker,text):
+        t = item.group(0)
+        if t[0].islower():
+            sentences += 1
+    doc["sentences_count"] = sentences
 
 def index_pages(db,hostname,extracted_pages):
     linkcol = db["links"]
@@ -188,32 +203,21 @@ def index_pages(db,hostname,extracted_pages):
         elif doc is None:
             state = "content_error"
         if doc is not None:
-            text = doc["text"]
-            checksums,sizes = calculate_checksums(text)
-            doc["text_size"] = len(text)
-            doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
-            doc["paragraph_checksums"] = checksums
-            doc["paragraph_sizes"] = sizes
-            goodsz = sum(sizes)
-            # Not enough larger paragraphs
-            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
+            set_content_checksums(doc)
+            tsz = doc["text_size"]
+            psz = doc["paragraph_sizes_sum"]
+            if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
                 state = "trash"
-            end_sentence_marker = re.compile("\w[\.]")
-            sentences = 0
-            for item in re.finditer(end_sentence_marker,text):
-                t = item.group(0)
-                if t[0].islower():
-                    sentences += 1
-            doc["sentences"] = sentences
         # check copy
         if state == "good":
-            copysz = len(text) - goodsz
+            origsz = 0
             for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
                 # index paragraph checksums
                 nd = checkcol.find_one({"_id":chs})
-                if nd is not None:
-                    copysz += paragraph_size
-            if (copysz / len(text)) > TEXT_TRASH_RATIO:
+                if nd is None:
+                    origsz += paragraph_size
+
+            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                 state = "copy"
-            print(copysz)
+            print(origsz)
         if state == "good":