From 93717bed14850172014f49bd516ceb9e460104b9 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Fri, 12 May 2023 08:11:15 +0200
Subject: [PATCH] zz

---
Review notes (text in this section is ignored by `git am`):
 * crawl_summary(): the "$group" accumulator key must be the string
   "total_text_size"; the bare identifier raised NameError at call time.
 * crawl_summary(): print each aggregated document (`item`), not the
   cursor object (`res`).
 * Line breaks and indentation were reconstructed from the hunk headers;
   verify context whitespace against the target tree (or apply with
   `git apply --ignore-whitespace`). The `index` post-image hash below is
   from the original commit and no longer matches the corrected content.

 mongo/mongocrawler.py  | 7 +++++++
 mongo/submitdomains.sh | 4 ++++
 2 files changed, 11 insertions(+)
 create mode 100644 mongo/submitdomains.sh

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index ba14123..64e6402 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -730,6 +730,13 @@ def crawl_summary():
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
+    ])
+    print(">>>>> Total text size in content")
+    for item in res:
+        print(item)
 
 
 def import_html():
diff --git a/mongo/submitdomains.sh b/mongo/submitdomains.sh
new file mode 100644
index 0000000..282a968
--- /dev/null
+++ b/mongo/submitdomains.sh
@@ -0,0 +1,4 @@
+DOMAINS=`cut -f 1 -d ";" domains.txt | shuf`
+for DOM in $DOMAINS ; do
+    echo rq enqueue mongocrawler.visit hostname=$DOM
+done