zz
This commit is contained in:
parent
f5dc1f42cf
commit
4d62cd38fd
@ -33,5 +33,9 @@ def visit(hostname):
|
|||||||
def summary():
|
def summary():
|
||||||
mongocrawler.crawl_summary()
|
mongocrawler.crawl_summary()
|
||||||
|
|
||||||
|
@cli.command()
|
||||||
|
def sampledomains():
|
||||||
|
mongocrawler.sample_domains()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cli()
|
cli()
|
||||||
|
@ -715,19 +715,33 @@ def crawl_summary():
|
|||||||
res = batchcol.aggregate([
|
res = batchcol.aggregate([
|
||||||
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
||||||
{"$group":{"_id":"$host",
|
{"$group":{"_id":"$host",
|
||||||
"document_count":{"$sum":{"document_count":1}},
|
"document_count":{"$sum":"$document_count"},
|
||||||
"good_document_count":{"$sum":{"good_document_count":1}},
|
"good_document_count":{"$sum":"$good_document_count"},
|
||||||
"batch_size":{"$sum":{"batch_size":1}},
|
"batch_count":{"$sum":"$batch_size"},
|
||||||
|
"text_size":{"$sum":"$text_size"},
|
||||||
|
"original_text_size":{"$sum":"$original_text_size"},
|
||||||
"count":{"$sum":1},
|
"count":{"$sum":1},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
print(">>>> Batches")
|
print(">>>> Batches")
|
||||||
|
headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
|
||||||
|
print("\t".join(headers))
|
||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
values = [str(item[x]) for x in headers]
|
||||||
#print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
|
print("\t".join(values))
|
||||||
|
|
||||||
|
def sample_domains():
|
||||||
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
|
db=myclient[DBNAME]
|
||||||
|
linkscol = db["links"]
|
||||||
|
# discover domains
|
||||||
|
domains = linkscol.distinct("host",filter={"status":"frontlink"})
|
||||||
|
for domain in domains:
|
||||||
|
print(domain)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
|
# exploit domains
|
||||||
print(">>>> Best domains")
|
print(">>>> Best domains")
|
||||||
res = domaincol.find({},limit=100).sort("average_fetch_characters")
|
res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
|
||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
print(item)
|
||||||
|
Loading…
Reference in New Issue
Block a user