This commit is contained in:
Daniel Hládek 2023-04-12 14:50:18 +02:00
parent ad52af705b
commit f5dc1f42cf

View File

@ -711,10 +711,21 @@ def crawl_summary():
batchcol = db["batches"] batchcol = db["batches"]
yesterday = datetime.datetime.today() - datetime.timedelta(days=1) yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
print(yesterday) print(yesterday)
res = batchcol.find({"created_at":{"$gt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters") res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
# Per-host roll-up of the batches collection: add up the per-batch counters
# and count how many batch documents each host contributed.
# NOTE(review): `yesterday.utcnow()` invokes the classmethod
# datetime.utcnow(), so it evaluates to the current UTC time and ignores the
# value held in `yesterday`. Confirm the intended cutoff before changing it.
res = batchcol.aggregate([
    {"$match": {"created_at": {"$lt": yesterday.utcnow()}}},
    {"$group": {
        "_id": "$host",
        # "$sum" must be given a "$field" path to add up stored values;
        # a {"field": 1} operand is a literal sub-document, not a field
        # reference, so nothing would be accumulated.
        "document_count": {"$sum": "$document_count"},
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        # Literal 1 per input document -> number of batches for the host.
        "count": {"$sum": 1},
    }},
])
print(">>>> Batches") print(">>>> Batches")
for item in res: for item in res:
print(item["url"],item["average_fetch_characters"]) print(item)
#print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
domaincol = db["domains"] domaincol = db["domains"]
print(">>>> Best domains") print(">>>> Best domains")
res = domaincol.find({},limit=100).sort("average_fetch_characters") res = domaincol.find({},limit=100).sort("average_fetch_characters")