zz
This commit is contained in:
parent
33d7bc9a21
commit
39f66bc98a
@ -221,7 +221,6 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
|
|||||||
|
|
||||||
|
|
||||||
def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
||||||
links += get_links(db,domain,status,batch_size)
|
|
||||||
#print(links)
|
#print(links)
|
||||||
responses = fetch_pages(links)
|
responses = fetch_pages(links)
|
||||||
#print(responses)
|
#print(responses)
|
||||||
@ -232,16 +231,32 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
|||||||
index_links(db,extracted_links)
|
index_links(db,extracted_links)
|
||||||
index_pages(db,domain,extracted_pages)
|
index_pages(db,domain,extracted_pages)
|
||||||
|
|
||||||
def simple_visit(start_link=None):
    """Run one crawl pass for the domain of *start_link*.

    Opens the local MongoDB "crawler" database, fetches the robots rules for
    the domain, processes the queued "navigation" links together with
    *start_link*, then processes one batch of "frontlink" links topped up
    with "backlink" links, and finally prints the link summary.

    NOTE(review): the result of ``courlan.check_url`` is unpacked directly,
    so a missing or rejected *start_link* fails on that line -- presumably
    check_url returns ``None`` for such input; confirm before relying on the
    ``None`` default.
    """
    start_link, domain = courlan.check_url(start_link)
    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
    db = myclient["crawler"]
    rules = fetch_robot(domain)
    batch_size = BATCHSIZE

    navigation_links = get_links(db, domain, "navigation", batch_size)
    if start_link is not None:
        navigation_links.append(start_link)
    print(navigation_links)
    process_links(db, domain, "frontlink", navigation_links, rules)

    links = get_links(db, domain, "frontlink", batch_size)
    # Top the batch up with backlinks when there are fewer frontlinks than
    # batch_size.  The earlier code computed len(links) - batch_size (never
    # positive, assuming get_links returns at most batch_size items) and
    # stored the fetched backlinks in a variable that was never used.
    missing = batch_size - len(links)
    if missing > 0:
        print("Getting backlinks")
        links.extend(get_links(db, domain, "backlink", missing))
    process_links(db, domain, "backlink", links, rules=rules)
    link_summary(db, domain)
|
||||||
|
|
||||||
|
def link_summary(db,domain):
    """Print how many stored links of *domain* exist for each status.

    Runs an aggregation on the "links" collection of *db*: documents whose
    "hostname" equals *domain* are grouped by their "status" field and
    counted.  One result document ({"_id": <status>, "count": <n>}) is
    printed per status.  Assumes link documents carry a "status" field --
    TODO confirm against the code that writes them.
    """
    linkcol = db["links"]
    res = linkcol.aggregate([
        {"$match":{"hostname":domain}},
        # "$status" refers to the field; the earlier {"status": domain} was a
        # constant key, and {"$count": 1} is not a valid accumulator.
        {"$group":{"_id":"$status","count":{"$sum":1}}},
    ])
    # aggregate() yields result documents; print them, not the cursor object.
    for row in res:
        print(row)
|
||||||
|
|
||||||
def create_indices(db):
|
def create_indices(db):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
@ -254,12 +269,5 @@ def create_indices(db):
|
|||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
htmlcol.create_index({"url":1})
|
htmlcol.create_index({"url":1})
|
||||||
|
|
||||||
def link_summary(db,domain):
    """Print how many stored links of *domain* exist for each status.

    Runs an aggregation on the "links" collection of *db*: documents whose
    "hostname" equals *domain* are grouped by their "status" field and
    counted.  One result document ({"_id": <status>, "count": <n>}) is
    printed per status.  Assumes link documents carry a "status" field --
    TODO confirm against the code that writes them.
    """
    linkcol = db["links"]
    res = linkcol.aggregate([
        {"$match":{"hostname":domain}},
        # "$status" refers to the field; the earlier '"_id":"status":domain'
        # was a syntax error and {"$count": 1} is not a valid accumulator.
        {"$group":{"_id":"$status","count":{"$sum":1}}},
    ])
    # aggregate() yields result documents; print them, not the cursor object.
    for row in res:
        print(row)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Guarded so that importing this module does not start a crawl.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} START_URL")
    simple_visit(sys.argv[1])
|
||||||
|
Loading…
Reference in New Issue
Block a user