diff --git a/mongo/cli.py b/mongo/cli.py
index 6f27215..1e7dcfe 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -33,5 +33,9 @@ def visit(hostname):
 def summary():
     mongocrawler.crawl_summary()
 
+@cli.command()
+def sampledomains():
+    mongocrawler.sample_domains()
+
 if __name__ == "__main__":
     cli()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 93c9e78..a2fc796 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -715,19 +715,33 @@ def crawl_summary():
     res = batchcol.aggregate([
         {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
         {"$group":{"_id":"$host",
-            "document_count":{"$sum":{"document_count":1}},
-            "good_document_count":{"$sum":{"good_document_count":1}},
-            "batch_size":{"$sum":{"batch_size":1}},
+            "document_count":{"$sum":"$document_count"},
+            "good_document_count":{"$sum":"$good_document_count"},
+            "batch_count":{"$sum":"$batch_size"},
+            "text_size":{"$sum":"$text_size"},
+            "original_text_size":{"$sum":"$original_text_size"},
             "count":{"$sum":1},
             }
         },
     ])
     print(">>>> Batches")
+    headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
+    print("\t".join(headers))
     for item in res:
-        print(item)
-        #print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
+        values = [str(item[x]) for x in headers]
+        print("\t".join(values))
+
+def sample_domains():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    linkscol = db["links"]
+    # discover domains
+    domains = linkscol.distinct("host",filter={"status":"frontlink"})
+    for domain in domains:
+        print(domain)
     domaincol = db["domains"]
+    # exploit domains
     print(">>>> Best domains")
-    res = domaincol.find({},limit=100).sort("average_fetch_characters")
+    res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
     for item in res:
         print(item)