From 33d7bc9a218d020a73b0446041ba0bd89f500b01 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sat, 11 Mar 2023 14:43:20 +0100
Subject: [PATCH] Add domain constant, batch size parameter, index creation
 and link summary helpers

---
 mongo/mongocwarler.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 4bd3fb0..86ca152 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -9,6 +9,7 @@
 import courlan
 import urllib
 LANGUAGE="sk"
+DOMAIN = "sk"
 BATCHSIZE=10
 MINFILESIZE=300
 MAXFILESIZE=1000000
@@ -51,13 +52,12 @@ def is_robot_good(link,rules):
 def is_link_good(link):
     r = courlan.check_url(link,strict=True,language=LANGUAGE)
     if r is None:
-        print("BBBBBBB")
         print(link)
         return None
     llink,ldomain = r
     print(llink,ldomain)
     # domain rules
-    if not ldomain.endswith("sk"):
+    if not ldomain.endswith(DOMAIN):
         print("bad domain")
         return None
     if courlan.is_not_crawlable(llink):
@@ -220,8 +220,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
 
 
 
-def process_links(db,domain,status,links=[],rules=None):
-    links += get_links(db,domain,status)
+def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
+    links += get_links(db,domain,status,batch_size)
     #print(links)
     responses = fetch_pages(links)
     #print(responses)
@@ -243,4 +243,23 @@ def simple_visit(start_link):
     process_links(db,domain,"frontlink",rules=rules)
     process_links(db,domain,"backlink",rules=rules)
 
+def create_indices(db):
+    linkcol = db["links"]
+    linkcol.create_index("url",name="url")
+    linkcol.create_index([("hostname",1),("status",1)],name="hostname_status")
+    contentcol = db["content"]
+    contentcol.create_index("url")
+    contentcol.create_index("paragraph_checksums")
+    contentcol.create_index("domain")
+    htmlcol = db["html"]
+    htmlcol.create_index("url")
+
+def link_summary(db,domain):
+    linkcol = db["links"]
+    res = linkcol.aggregate([
+        {"$match":{"hostname":domain}},
+        {"$group":{"_id":"$status","count":{"$sum":1}}},
+    ])
+    print(list(res))
+
 simple_visit(sys.argv[1])
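
A minimal usage sketch for the two new helpers added in mongo/mongocwarler.py. The connection string, database name and hostname below are assumptions for illustration only; they are not part of the patch.

    # Sketch only: "mongodb://localhost:27017", "crawler" and the hostname are
    # assumed values, not taken from the patch. create_indices() and
    # link_summary() are the helpers introduced above.
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client["crawler"]

    # Build the link/content/html indices once, before crawling starts.
    create_indices(db)

    # Print per-status link counts for a single hostname.
    link_summary(db, "www.example.sk")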