diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index ba14123..64e6402 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -730,6 +730,14 @@ def crawl_summary():
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
+    # Sum text_size over all documents in "content" (_id=None groups everything into one row).
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$group": {"_id": None, "total_text_size": {"$sum": "$text_size"}}}
+    ])
+    print(">>>>> Total text size in content")
+    for item in res:
+        print(item)
 
 
 def import_html():
diff --git a/mongo/submitdomains.sh b/mongo/submitdomains.sh
new file mode 100644
index 0000000..282a968
--- /dev/null
+++ b/mongo/submitdomains.sh
@@ -0,0 +1,4 @@
+DOMAINS=`cut -f 1 -d ";" domains.txt | shuf`
+for DOM in $DOMAINS ; do
+    echo rq enqueue mongocrawler.visit hostname=$DOM
+done