diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index df1830b..6fc7464 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -13,6 +13,7 @@ import click
 import logging as LOGGER
 import os
 import pprint
+import re
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -184,8 +185,27 @@ def index_pages(db,hostname,extracted_pages):
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
         doc["paragraph_sizes_sum"] = goodsz
+        # Not enough text in large paragraphs
         if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
             state = "trash"
+        end_sentence_marker = re.compile(r"\w\.")  # a word character followed by a period
+        sentences = 0
+        for item in re.finditer(end_sentence_marker,text):
+            t = item.group(0)
+            if t[0].islower():  # lowercase before the period: a sentence end, not an abbreviation
+                sentences += 1
+        doc["sentences"] = sentences
+        # Check for copied (already indexed) paragraphs
+        if state == "good":
+            copysz = len(text) - goodsz
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                # A hit means an earlier page already indexed this paragraph.
+                nd = checkcol.find_one({"_id":chs})
+                if nd is not None:
+                    copysz += paragraph_size
+            if copysz / len(text) > TEXT_TRASH_RATIO:
+                state = "copy"
+            print(copysz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
@@ -198,14 +218,8 @@ def index_pages(db,hostname,extracted_pages):
             print(doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
-            copysz = len(text) - goodsz
-            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
-                nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
-                paragraph_count = nd["count"]
-                print(paragraph_count)
-                if paragraph_count > 1:
-                    copysz += paragraph_size
-            print(copysz)
+            for chs in doc["paragraph_checksums"]:
+                checkcol.insert_one({"_id":chs})
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 
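
Taken together, the hunks move duplicate detection from write time (counting occurrences with find_one_and_update) to read time: a page's paragraph checksums are first looked up in checkcol, and they are inserted into the index only if the page comes out "good". Below is a minimal standalone sketch of that flow for reviewers; the checksum helper, the collection name "check", the connection URL, and the TEXT_TRASH_RATIO value are illustrative stand-ins, not the module's actual definitions.

    import hashlib

    from pymongo import MongoClient
    from pymongo.errors import DuplicateKeyError

    TEXT_TRASH_RATIO = 0.6  # illustrative threshold, not the module's value


    def checksum(paragraph):
        # Stand-in for the crawler's paragraph checksum; any stable hash works.
        return hashlib.sha1(paragraph.encode("utf8")).hexdigest()


    def classify_page(checkcol, text, paragraphs):
        # paragraphs are the kept ("good") paragraphs; text is the whole page.
        if not text:
            return "trash", []
        goodsz = sum(len(p) for p in paragraphs)
        # Text outside the kept paragraphs counts toward copysz from the start,
        # mirroring copysz = len(text) - goodsz in the diff.
        copysz = len(text) - goodsz
        checksums = [checksum(p) for p in paragraphs]
        for chs, p in zip(checksums, paragraphs):
            if checkcol.find_one({"_id": chs}) is not None:
                copysz += len(p)
        state = "copy" if copysz / len(text) > TEXT_TRASH_RATIO else "good"
        return state, checksums


    def register_page(checkcol, checksums):
        # Called only for "good" pages; later pages then see these as copies.
        for chs in checksums:
            try:
                checkcol.insert_one({"_id": chs})
            except DuplicateKeyError:
                # A good page can still share a few paragraphs with earlier pages.
                pass


    if __name__ == "__main__":
        db = MongoClient("mongodb://localhost:27017")["sketch"]
        paragraphs = ["A long enough paragraph about something.", "Footer text."]
        text = "\n".join(paragraphs)
        state, checksums = classify_page(db["check"], text, paragraphs)
        if state == "good":
            register_page(db["check"], checksums)
        print(state)

One point worth flagging on the last hunk: because checkcol keys on _id, the bare checkcol.insert_one({"_id":chs}) raises DuplicateKeyError whenever a "good" page repeats a paragraph that is already indexed, which can happen any time the copy ratio stays under the threshold. The sketch above guards that case explicitly.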