zz
This commit is contained in:
parent
08ccc93977
commit
33d7bc9a21
@ -9,6 +9,7 @@ import courlan
|
|||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
LANGUAGE="sk"
|
LANGUAGE="sk"
|
||||||
|
DOMAIN = "sk"
|
||||||
BATCHSIZE=10
|
BATCHSIZE=10
|
||||||
MINFILESIZE=300
|
MINFILESIZE=300
|
||||||
MAXFILESIZE=1000000
|
MAXFILESIZE=1000000
|
||||||
@ -51,13 +52,12 @@ def is_robot_good(link,rules):
|
|||||||
def is_link_good(link):
|
def is_link_good(link):
|
||||||
r = courlan.check_url(link,strict=True,language=LANGUAGE)
|
r = courlan.check_url(link,strict=True,language=LANGUAGE)
|
||||||
if r is None:
|
if r is None:
|
||||||
print("BBBBBBB")
|
|
||||||
print(link)
|
print(link)
|
||||||
return None
|
return None
|
||||||
llink,ldomain = r
|
llink,ldomain = r
|
||||||
print(llink,ldomain)
|
print(llink,ldomain)
|
||||||
# domain rules
|
# domain rules
|
||||||
if not ldomain.endswith("sk"):
|
if not ldomain.endswith(DOMAIN):
|
||||||
print("bad domain")
|
print("bad domain")
|
||||||
return None
|
return None
|
||||||
if courlan.is_not_crawlable(llink):
|
if courlan.is_not_crawlable(llink):
|
||||||
@ -220,8 +220,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def process_links(db,domain,status,links=[],rules=None):
|
def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
||||||
links += get_links(db,domain,status)
|
links += get_links(db,domain,status,batch_size)
|
||||||
#print(links)
|
#print(links)
|
||||||
responses = fetch_pages(links)
|
responses = fetch_pages(links)
|
||||||
#print(responses)
|
#print(responses)
|
||||||
@ -243,4 +243,23 @@ def simple_visit(start_link):
|
|||||||
process_links(db,domain,"frontlink",rules=rules)
|
process_links(db,domain,"frontlink",rules=rules)
|
||||||
process_links(db,domain,"backlink",rules=rules)
|
process_links(db,domain,"backlink",rules=rules)
|
||||||
|
|
||||||
|
def create_indices(db):
    """Create the indexes on the ``links``, ``content`` and ``html`` collections.

    ``db`` is indexed by collection name and each collection must provide
    ``create_index`` (presumably a PyMongo ``Database`` -- confirm with caller).

    Index keys are given as ``(field, direction)`` pairs, with ``1`` meaning
    ascending, and the index name is passed as the ``name`` keyword.
    Options must not be passed as a second positional dict (Mongo shell
    style): in PyMongo the second positional parameter of ``create_index``
    is ``session``, not an options document.
    """
    linkcol = db["links"]
    linkcol.create_index([("url", 1)], name="url")
    linkcol.create_index(
        [("hostname", 1), ("status", 1)],
        name="hostname_status",
    )

    contentcol = db["content"]
    contentcol.create_index([("url", 1)])
    contentcol.create_index([("paragraph_checksums", 1)])
    contentcol.create_index([("domain", 1)])

    htmlcol = db["html"]
    htmlcol.create_index([("url", 1)])
|
||||||
|
|
||||||
|
def link_summary(db, domain):
    """Print and return the number of stored links per status for one host.

    Runs an aggregation on the ``links`` collection: documents whose
    ``hostname`` equals ``domain`` are grouped by their ``status`` field
    and counted.

    Args:
        db: Object indexed by collection name whose ``links`` collection
            provides ``aggregate`` (presumably a PyMongo ``Database`` --
            confirm with caller).
        domain: Hostname to summarize.

    Returns:
        A list of ``{"_id": <status>, "count": <n>}`` documents, as
        produced by the aggregation.
    """
    linkcol = db["links"]
    pipeline = [
        {"$match": {"hostname": domain}},
        # "$status" refers to the document field; {"$sum": 1} counts one per
        # document and is supported by every MongoDB version.
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ]
    # aggregate() returns a cursor; materialize it so the rows, not the
    # cursor object, are printed.
    summary = list(linkcol.aggregate(pipeline))
    for row in summary:
        print(row)
    return summary
|
||||||
|
|
||||||
simple_visit(sys.argv[1])
|
simple_visit(sys.argv[1])
|
||||||
|
Loading…
Reference in New Issue
Block a user