This commit is contained in:
Daniel Hládek 2023-04-01 06:47:12 +02:00
parent 713481c095
commit 813ac195f2

View File

@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
# Tuning thresholds for the crawler.
# NOTE(review): MINFILESIZE, MAXFILESIZE and MINTEXTSIZE are not referenced in
# the visible part of the file; presumably they bound the accepted download
# size and the minimum extracted text length -- confirm against their users.
MINFILESIZE=300
MAXFILESIZE=10000000
MINTEXTSIZE=200
# calculate_checksums() records a paragraph's checksum and size only when the
# paragraph's `hsz` counter is greater than this value.
CHECK_PARAGRAPH_SIZE=150
# index_pages() marks a page as "trash" when len(text) is below this value ...
TEXT_TRASH_SIZE=200
# ... or when sum(paragraph sizes) / len(text) is below this ratio.
TEXT_TRASH_RATIO=0.6
def put_queue(db,channel,message):
queuecol = db["queue"]
@ -54,12 +57,12 @@ def calculate_checksums(text):
hval += zv
hsz += 1
if c == "\n" and hsz > 0:
if hsz > 100:
if hsz > CHECK_PARAGRAPH_SIZE:
checksums.append(hval)
sizes.append(sz)
sz = 0
hsz = 0
if hsz > 100:
if hsz > CHECK_PARAGRAPH_SIZE:
checksums.append(hval)
sizes.append(sz)
return checksums, sizes
@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
linkcol = db["links"]
htmlcol = db["html"]
contentcol = db["content"]
checkcol = db["check"]
links = []
for original_link,final_link,html,doc in extracted_pages:
state = "good"
@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
doc["paragraph_sizes"] = sizes
goodsz = sum(sizes)
doc["paragraph_sizes_sum"] = goodsz
if len(text) < 200 or goodsz/len(text) < 0.6:
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
state = "trash"
if state == "good":
htdoc = get_link_doc(link,state)
@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
print(doc)
del doc["url"]
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
copysz = len(text) - goodsz
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
paragraph_count = nd["count"]
print(paragraph_count)
if paragraph_count > 1:
copysz += paragraph_size
print(copysz)
linkcol.update_one({"url":original_link},{"$set":{"status":state}})