This commit is contained in:
Daniel Hládek 2023-03-11 18:41:20 +01:00
parent 33d7bc9a21
commit 39f66bc98a

View File

@ -221,7 +221,6 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE): def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
links += get_links(db,domain,status,batch_size)
#print(links) #print(links)
responses = fetch_pages(links) responses = fetch_pages(links)
#print(responses) #print(responses)
@ -232,16 +231,32 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
index_links(db,extracted_links) index_links(db,extracted_links)
index_pages(db,domain,extracted_pages) index_pages(db,domain,extracted_pages)
def simple_visit(start_link): def simple_visit(start_link=None):
start_link,domain = courlan.check_url(start_link) start_link,domain = courlan.check_url(start_link)
myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/") myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
db=myclient["crawler"] db=myclient["crawler"]
rules = fetch_robot(domain) rules = fetch_robot(domain)
navigation_links =[start_link] batch_size = BATCHSIZE
navigation_links = get_links(db,domain,"navigation",batch_size)
if start_link is not None:
navigation_links.append(start_link)
print(navigation_links) print(navigation_links)
process_links(db,domain,"navigation",navigation_links,rules) process_links(db,domain,"frontlink",navigation_links,rules)
process_links(db,domain,"frontlink",rules=rules) links = get_links(db,domain,"frontlink",batch_size)
process_links(db,domain,"backlink",rules=rules) bl = len(links) - batch_size
if bl > 0:
print("Getting backlinks")
front_links = get_links(db,domain,"backlink",bl)
process_links(db,domain,"backlink",links,rules=rules)
link_summary(db,domain)
def link_summary(db,domain):
linkcol = db["links"]
res = linkcol.aggregate([
{"$match":{"hostname":domain}},
{"$group":{"_id":{"status":domain},"count":{"$count":1}}},
])
print(res)
def create_indices(db): def create_indices(db):
linkcol = db["links"] linkcol = db["links"]
@ -254,12 +269,5 @@ def create_indices(db):
htmlcol = db["html"] htmlcol = db["html"]
htmlcol.create_index({"url":1}) htmlcol.create_index({"url":1})
def link_summary(db,domain):
linkcol = db["links"]
res = linkcol.aggregate([
{"$match":{"hostname":domain}},
{"$group":{"_id":"status":domain,"count":{"$count":1}}},
])
print(res)
simple_visit(sys.argv[1]) simple_visit(sys.argv[1])