zz
This commit is contained in:
parent
39d1057ea1
commit
2fce373d0f
@ -163,7 +163,6 @@ def extract_pages(link_batch,responses):
|
|||||||
|
|
||||||
|
|
||||||
def index_pages(db,domain,extracted_pages):
|
def index_pages(db,domain,extracted_pages):
|
||||||
extracted_links = set()
|
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
contentcol = db["content"]
|
contentcol = db["content"]
|
||||||
@ -201,12 +200,14 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
|
|||||||
links = {}
|
links = {}
|
||||||
for original_link,(final_link,html) in zip(link_batch,responses):
|
for original_link,(final_link,html) in zip(link_batch,responses):
|
||||||
status = default_status
|
status = default_status
|
||||||
extracted_links = courlan.extract_links(html,final_link,False,language=LANGUAGE)
|
external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
|
||||||
|
for link in external_links:
|
||||||
|
links[link] = "frontlink"
|
||||||
|
internal_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
|
||||||
#print(extracted_links)
|
#print(extracted_links)
|
||||||
for link in extracted_links:
|
for link in internal_links:
|
||||||
if courlan.is_external(link,domain):
|
status = str(default_status)
|
||||||
status = "frontlink"
|
if courlan.is_navigation_page(link):
|
||||||
elif courlan.is_navigation(link):
|
|
||||||
status = "navigation"
|
status = "navigation"
|
||||||
#print(link,status)
|
#print(link,status)
|
||||||
links[link] = status
|
links[link] = status
|
||||||
@ -249,7 +250,7 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
|||||||
extracted_pages = extract_pages(links,responses)
|
extracted_pages = extract_pages(links,responses)
|
||||||
#print(extracted_pages)
|
#print(extracted_pages)
|
||||||
extracted_links = extract_links(links,responses,domain,rules,status)
|
extracted_links = extract_links(links,responses,domain,rules,status)
|
||||||
print(extracted_links)
|
#print(extracted_links)
|
||||||
index_links(db,extracted_links)
|
index_links(db,extracted_links)
|
||||||
index_pages(db,domain,extracted_pages)
|
index_pages(db,domain,extracted_pages)
|
||||||
|
|
||||||
@ -273,7 +274,6 @@ def link_summary(db,domain):
|
|||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
print(item)
|
||||||
|
|
||||||
global DB
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
def cli():
|
def cli():
|
||||||
@ -305,6 +305,7 @@ def visit(start_link):
|
|||||||
navigation_links = get_links(db,domain,"navigation",batch_size)
|
navigation_links = get_links(db,domain,"navigation",batch_size)
|
||||||
if start_link is not None:
|
if start_link is not None:
|
||||||
navigation_links.append(start_link)
|
navigation_links.append(start_link)
|
||||||
|
print("Navigtaion links")
|
||||||
print(navigation_links)
|
print(navigation_links)
|
||||||
process_links(db,domain,"frontlink",navigation_links,rules)
|
process_links(db,domain,"frontlink",navigation_links,rules)
|
||||||
links = get_links(db,domain,"frontlink",batch_size)
|
links = get_links(db,domain,"frontlink",batch_size)
|
||||||
@ -312,6 +313,8 @@ def visit(start_link):
|
|||||||
if bl > 0:
|
if bl > 0:
|
||||||
print("Getting backlinks")
|
print("Getting backlinks")
|
||||||
front_links = get_links(db,domain,"backlink",bl)
|
front_links = get_links(db,domain,"backlink",bl)
|
||||||
|
links += front_links
|
||||||
|
print("Pricessing backlinks")
|
||||||
process_links(db,domain,"backlink",links,rules=rules)
|
process_links(db,domain,"backlink",links,rules=rules)
|
||||||
link_summary(db,domain)
|
link_summary(db,domain)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user