This commit is contained in:
Daniel Hládek 2023-04-12 16:39:44 +02:00
parent f5dc1f42cf
commit 4d62cd38fd
2 changed files with 24 additions and 6 deletions

View File

@ -33,5 +33,9 @@ def visit(hostname):
def summary(): def summary():
mongocrawler.crawl_summary() mongocrawler.crawl_summary()
@cli.command()
def sampledomains():
    """Print hosts with front links and the best known domains."""
    # Thin CLI wrapper: all the work happens in mongocrawler.sample_domains().
    mongocrawler.sample_domains()
if __name__ == "__main__": if __name__ == "__main__":
cli() cli()

View File

@ -715,19 +715,33 @@ def crawl_summary():
res = batchcol.aggregate([ res = batchcol.aggregate([
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}}, {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
{"$group":{"_id":"$host", {"$group":{"_id":"$host",
"document_count":{"$sum":{"document_count":1}}, "document_count":{"$sum":"$document_count"},
"good_document_count":{"$sum":{"good_document_count":1}}, "good_document_count":{"$sum":"$good_document_count"},
"batch_size":{"$sum":{"batch_size":1}}, "batch_count":{"$sum":"$batch_size"},
"text_size":{"$sum":"$text_size"},
"original_text_size":{"$sum":"$original_text_size"},
"count":{"$sum":1}, "count":{"$sum":1},
} }
}, },
]) ])
print(">>>> Batches") print(">>>> Batches")
headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
print("\t".join(headers))
for item in res: for item in res:
print(item) values = [str(item[x]) for x in headers]
#print(item["host"],item["document_count"],item["good_document_count"],item["created_at"]) print("\t".join(values))
def sample_domains():
    """Print hosts that have front links, then the best known domains.

    Two listings are written to standard output:

    1. every distinct ``host`` value found in the ``links`` collection
       among documents whose ``status`` is ``"frontlink"``;
    2. under the ``>>>> Best domains`` header, up to 100 documents from the
       ``domains`` collection ordered by ``average_fetch_characters``,
       highest first.

    The database connection is closed before returning, even if a query
    raises.
    """
    myclient = pymongo.MongoClient(CONNECTION)
    try:
        db = myclient[DBNAME]

        # discover domains
        linkscol = db["links"]
        for domain in linkscol.distinct("host", filter={"status": "frontlink"}):
            print(domain)

        # exploit domains
        domaincol = db["domains"]
        print(">>>> Best domains")
        best = domaincol.find({}, limit=100).sort(
            "average_fetch_characters", pymongo.DESCENDING
        )
        for item in best:
            print(item)
    finally:
        # Release the client's sockets instead of leaking them on every call.
        myclient.close()