diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 7be8809..93c9e78 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -711,10 +711,27 @@ def crawl_summary():
     batchcol = db["batches"]
     yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
     print(yesterday)
-    res = batchcol.find({"created_at":{"$gt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
+    # Per-host totals over the matched batches.
+    # NOTE(review): utcnow() is a classmethod, so yesterday.utcnow() is the
+    # current UTC time, not one day ago; this $match keeps every batch created
+    # before now. Confirm the intended cutoff and comparison direction.
+    res = batchcol.aggregate([
+        {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
+        {"$group":{"_id":"$host",
+            # $sum needs a "$field" path to add up stored values; an embedded
+            # document operand is non-numeric, which $sum ignores.
+            "document_count":{"$sum":"$document_count"},
+            "good_document_count":{"$sum":"$good_document_count"},
+            "batch_size":{"$sum":"$batch_size"},
+            # Number of batches folded into this host's group.
+            "count":{"$sum":1},
+            }
+        },
+    ])
     print(">>>> Batches")
     for item in res:
-        print(item["url"],item["average_fetch_characters"])
+        print(item)
+        #print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
     domaincol = db["domains"]
     print(">>>> Best domains")
     res = domaincol.find({},limit=100).sort("average_fetch_characters")