zz
This commit is contained in:
parent
713481c095
commit
813ac195f2
@ -22,6 +22,9 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
|||||||
MINFILESIZE=300
|
MINFILESIZE=300
|
||||||
MAXFILESIZE=10000000
|
MAXFILESIZE=10000000
|
||||||
MINTEXTSIZE=200
|
MINTEXTSIZE=200
|
||||||
|
CHECK_PARAGRAPH_SIZE=150
|
||||||
|
TEXT_TRASH_SIZE=200
|
||||||
|
TEXT_TRASH_RATIO=0.6
|
||||||
|
|
||||||
def put_queue(db,channel,message):
|
def put_queue(db,channel,message):
|
||||||
queuecol = db["queue"]
|
queuecol = db["queue"]
|
||||||
@ -54,12 +57,12 @@ def calculate_checksums(text):
|
|||||||
hval += zv
|
hval += zv
|
||||||
hsz += 1
|
hsz += 1
|
||||||
if c == "\n" and hsz > 0:
|
if c == "\n" and hsz > 0:
|
||||||
if hsz > 100:
|
if hsz > CHECK_PARAGRAPH_SIZE:
|
||||||
checksums.append(hval)
|
checksums.append(hval)
|
||||||
sizes.append(sz)
|
sizes.append(sz)
|
||||||
sz = 0
|
sz = 0
|
||||||
hsz = 0
|
hsz = 0
|
||||||
if hsz > 100:
|
if hsz > CHECK_PARAGRAPH_SIZE:
|
||||||
checksums.append(hval)
|
checksums.append(hval)
|
||||||
sizes.append(sz)
|
sizes.append(sz)
|
||||||
return checksums, sizes
|
return checksums, sizes
|
||||||
@ -161,6 +164,7 @@ def index_pages(db,hostname,extracted_pages):
|
|||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
contentcol = db["content"]
|
contentcol = db["content"]
|
||||||
|
checkcol = db["check"]
|
||||||
links = []
|
links = []
|
||||||
for original_link,final_link,html,doc in extracted_pages:
|
for original_link,final_link,html,doc in extracted_pages:
|
||||||
state = "good"
|
state = "good"
|
||||||
@ -180,7 +184,7 @@ def index_pages(db,hostname,extracted_pages):
|
|||||||
doc["paragraph_sizes"] = sizes
|
doc["paragraph_sizes"] = sizes
|
||||||
goodsz = sum(sizes)
|
goodsz = sum(sizes)
|
||||||
doc["paragraph_sizes_sum"] = goodsz
|
doc["paragraph_sizes_sum"] = goodsz
|
||||||
if len(text) < 200 or goodsz/len(text) < 0.6:
|
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
|
||||||
state = "trash"
|
state = "trash"
|
||||||
if state == "good":
|
if state == "good":
|
||||||
htdoc = get_link_doc(link,state)
|
htdoc = get_link_doc(link,state)
|
||||||
@ -194,6 +198,14 @@ def index_pages(db,hostname,extracted_pages):
|
|||||||
print(doc)
|
print(doc)
|
||||||
del doc["url"]
|
del doc["url"]
|
||||||
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
|
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
|
||||||
|
copysz = len(text) - goodsz
|
||||||
|
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
|
||||||
|
nd = checkcol.find_one_and_update({"_id":chs},{"$inc":{"count":1}},upsert=True,return_document=pymongo.ReturnDocument.AFTER)
|
||||||
|
paragraph_count = nd["count"]
|
||||||
|
print(paragraph_count)
|
||||||
|
if paragraph_count > 1:
|
||||||
|
copysz += paragraph_size
|
||||||
|
print(copysz)
|
||||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user