zz
This commit is contained in:
parent
713481c095
commit
813ac195f2
@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
||||
MINFILESIZE=300
|
||||
MAXFILESIZE=10000000
|
||||
MINTEXTSIZE=200
|
||||
CHECK_PARAGRAPH_SIZE=150
|
||||
TEXT_TRASH_SIZE=200
|
||||
TEXT_TRASH_RATIO=0.6
|
||||
|
||||
def put_queue(db,channel,message):
|
||||
queuecol = db["queue"]
|
||||
@ -54,12 +57,12 @@ def calculate_checksums(text):
|
||||
hval += zv
|
||||
hsz += 1
|
||||
if c == "\n" and hsz > 0:
|
||||
if hsz > 100:
|
||||
if hsz > CHECK_PARAGRAPH_SIZE:
|
||||
checksums.append(hval)
|
||||
sizes.append(sz)
|
||||
sz = 0
|
||||
hsz = 0
|
||||
if hsz > 100:
|
||||
if hsz > CHECK_PARAGRAPH_SIZE:
|
||||
checksums.append(hval)
|
||||
sizes.append(sz)
|
||||
return checksums, sizes
|
||||
@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
|
||||
linkcol = db["links"]
|
||||
htmlcol = db["html"]
|
||||
contentcol = db["content"]
|
||||
checkcol = db["check"]
|
||||
links = []
|
||||
for original_link,final_link,html,doc in extracted_pages:
|
||||
state = "good"
|
||||
@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
|
||||
doc["paragraph_sizes"] = sizes
|
||||
goodsz = sum(sizes)
|
||||
doc["paragraph_sizes_sum"] = goodsz
|
||||
if len(text) < 200 or goodsz/len(text) < 0.6:
|
||||
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
|
||||
state = "trash"
|
||||
if state == "good":
|
||||
htdoc = get_link_doc(link,state)
|
||||
@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
|
||||
print(doc)
|
||||
del doc["url"]
|
||||
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
|
||||
copysz = len(text) - goodsz
|
||||
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
|
||||
nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
|
||||
paragraph_count = nd["count"]
|
||||
print(paragraph_count)
|
||||
if paragraph_count > 1:
|
||||
copysz += paragraph_size
|
||||
print(copysz)
|
||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user