This commit is contained in:
Daniel Hládek 2023-03-11 14:43:20 +01:00
parent 08ccc93977
commit 33d7bc9a21

View File

@@ -9,6 +9,7 @@ import courlan
import urllib
LANGUAGE="sk"
DOMAIN = "sk"
BATCHSIZE=10
MINFILESIZE=300
MAXFILESIZE=1000000
@@ -51,13 +52,12 @@ def is_robot_good(link,rules):
def is_link_good(link):
r = courlan.check_url(link,strict=True,language=LANGUAGE)
if r is None:
print("BBBBBBB")
print(link)
return None
llink,ldomain = r
print(llink,ldomain)
# domain rules
if not ldomain.endswith("sk"):
if not ldomain.endswith(DOMAIN):
print("bad domain")
return None
if courlan.is_not_crawlable(llink):
@@ -220,8 +220,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
def process_links(db,domain,status,links=[],rules=None):
links += get_links(db,domain,status)
def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
links += get_links(db,domain,status,batch_size)
#print(links)
responses = fetch_pages(links)
#print(responses)
@@ -243,4 +243,23 @@ def simple_visit(start_link):
process_links(db,domain,"frontlink",rules=rules)
process_links(db,domain,"backlink",rules=rules)
def create_indices(db):
    """Create the indexes on the ``links``, ``content`` and ``html`` collections.

    Args:
        db: Database handle; ``db[name]`` must return a collection object
            exposing a PyMongo-style ``create_index`` method.

    Index options such as ``name`` are passed as keyword arguments. In
    PyMongo the second positional parameter of ``create_index`` is the
    session, not an options document as in the mongo shell, so passing a
    dict there does not name the index.
    """
    linkcol = db["links"]
    linkcol.create_index([("url", 1)], name="url")
    # Compound index over hostname and status, both ascending.
    linkcol.create_index(
        [("hostname", 1), ("status", 1)],
        name="hostname_status",
    )

    contentcol = db["content"]
    contentcol.create_index([("url", 1)])
    contentcol.create_index([("paragraph_checksums", 1)])
    contentcol.create_index([("domain", 1)])

    htmlcol = db["html"]
    htmlcol.create_index([("url", 1)])
def link_summary(db, domain):
    """Print the number of stored links for ``domain``, grouped by status.

    Args:
        db: Database handle; ``db["links"]`` must expose a PyMongo-style
            ``aggregate`` method.
        domain: Value matched against the ``hostname`` field of the links.

    One result document of the form ``{"_id": <status>, "count": <n>}``
    is printed per distinct status.
    """
    linkcol = db["links"]
    res = linkcol.aggregate([
        {"$match": {"hostname": domain}},
        # "$status" groups on the value of the status field; {"$sum": 1}
        # counts the documents that fall into each group.
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ])
    # Iterate the aggregation result so the rows themselves are printed,
    # rather than the repr of the cursor object.
    for item in res:
        print(item)
# Script entry point: run simple_visit on the first command-line argument,
# which is passed as its start_link parameter.
simple_visit(sys.argv[1])