zz
This commit is contained in:
parent
f5dc1f42cf
commit
4d62cd38fd
@ -33,5 +33,9 @@ def visit(hostname):
|
||||
def summary():
    # CLI entry point: all of the work is delegated to the crawler module.
    # (A comment is used instead of a docstring so the command's help text
    # is left exactly as it was.)
    mongocrawler.crawl_summary()
|
||||
|
||||
@cli.command()
def sampledomains():
    # CLI entry point: hands over to mongocrawler.sample_domains().
    # (A comment is used instead of a docstring so the command's help text
    # is left exactly as it was.)
    mongocrawler.sample_domains()
|
||||
|
||||
# Run the command group only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()
|
||||
|
@ -715,19 +715,33 @@ def crawl_summary():
|
||||
res = batchcol.aggregate([
|
||||
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
||||
{"$group":{"_id":"$host",
|
||||
"document_count":{"$sum":{"document_count":1}},
|
||||
"good_document_count":{"$sum":{"good_document_count":1}},
|
||||
"batch_size":{"$sum":{"batch_size":1}},
|
||||
"document_count":{"$sum":"$document_count"},
|
||||
"good_document_count":{"$sum":"$good_document_count"},
|
||||
"batch_count":{"$sum":"$batch_size"},
|
||||
"text_size":{"$sum":"$text_size"},
|
||||
"original_text_size":{"$sum":"$original_text_size"},
|
||||
"count":{"$sum":1},
|
||||
}
|
||||
},
|
||||
])
|
||||
print(">>>> Batches")
|
||||
headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
|
||||
print("\t".join(headers))
|
||||
for item in res:
|
||||
print(item)
|
||||
#print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
|
||||
values = [str(item[x]) for x in headers]
|
||||
print("\t".join(values))
|
||||
|
||||
def sample_domains():
    """Print candidate hosts from the links collection, then the top domains.

    Two listings are written to stdout:

    1. Every distinct ``host`` value among documents in the ``links``
       collection whose ``status`` is ``"frontlink"``, one per line.
    2. After a ``>>>> Best domains`` header, up to 100 documents from the
       ``domains`` collection, ordered by ``average_fetch_characters``
       from highest to lowest.

    Returns:
        None. The function only prints.
    """
    # Use the client as a context manager so its connections are released
    # when the function returns, including when a query raises.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]

        # discover domains
        linkscol = db["links"]
        for domain in linkscol.distinct("host", filter={"status": "frontlink"}):
            print(domain)

        # exploit domains
        domaincol = db["domains"]
        print(">>>> Best domains")
        # Only the descending query is kept; the earlier ascending query was
        # overwritten before it was ever iterated.
        best = domaincol.find({}, limit=100).sort(
            "average_fetch_characters", pymongo.DESCENDING
        )
        for item in best:
            print(item)
|
||||
|
Loading…
Reference in New Issue
Block a user