This commit is contained in:
Daniel Hládek 2023-05-12 08:11:15 +02:00
parent 01645b8862
commit 93717bed14
2 changed files with 11 additions and 0 deletions

View File

@ -730,6 +730,13 @@ def crawl_summary():
for item in res: for item in res:
values = [str(item[x]) for x in headers] values = [str(item[x]) for x in headers]
print("\t".join(values)) print("\t".join(values))
# Report the summed "text_size" of all documents in the "content" collection.
contentcol = db["content"]
res = contentcol.aggregate([
    # "_id": None collapses every document into a single group. The output
    # field name must be a string key; a bare identifier raises NameError.
    {"$group": {"_id": None, "total_text_size": {"$sum": "$text_size"}}},
])
print(">>>>> Total text size in content")
for item in res:
    # Print the result document, not the cursor object being iterated.
    print(item)
def import_html(): def import_html():

4
mongo/submitdomains.sh Normal file
View File

@ -0,0 +1,4 @@
# Print one "rq enqueue mongocrawler.visit" command per domain, in random order.
#
# Reads domains.txt from the current directory as ';'-separated records and
# uses only the first field of each line. The commands are echoed, not run.
#
# Lines are streamed through `read -r` instead of a `for` loop over an
# unquoted command substitution, so entries are never glob-expanded or
# split on inner whitespace.
cut -f 1 -d ";" domains.txt | shuf | while read -r DOM ; do
    # Skip blank records so no command with an empty hostname is emitted.
    [ -n "$DOM" ] || continue
    echo rq enqueue mongocrawler.visit hostname="$DOM"
done