zz
This commit is contained in:
parent
39d1057ea1
commit
2fce373d0f
@ -163,7 +163,6 @@ def extract_pages(link_batch,responses):
|
||||
|
||||
|
||||
def index_pages(db,domain,extracted_pages):
|
||||
extracted_links = set()
|
||||
linkcol = db["links"]
|
||||
htmlcol = db["html"]
|
||||
contentcol = db["content"]
|
||||
@ -201,12 +200,14 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
|
||||
links = {}
|
||||
for original_link,(final_link,html) in zip(link_batch,responses):
|
||||
status = default_status
|
||||
extracted_links = courlan.extract_links(html,final_link,False,language=LANGUAGE)
|
||||
external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
|
||||
for link in external_links:
|
||||
links[link] = "frontlink"
|
||||
internal_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
|
||||
#print(extracted_links)
|
||||
for link in extracted_links:
|
||||
if courlan.is_external(link,domain):
|
||||
status = "frontlink"
|
||||
elif courlan.is_navigation(link):
|
||||
for link in internal_links:
|
||||
status = str(default_status)
|
||||
if courlan.is_navigation_page(link):
|
||||
status = "navigation"
|
||||
#print(link,status)
|
||||
links[link] = status
|
||||
@ -249,7 +250,7 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
|
||||
extracted_pages = extract_pages(links,responses)
|
||||
#print(extracted_pages)
|
||||
extracted_links = extract_links(links,responses,domain,rules,status)
|
||||
print(extracted_links)
|
||||
#print(extracted_links)
|
||||
index_links(db,extracted_links)
|
||||
index_pages(db,domain,extracted_pages)
|
||||
|
||||
@ -273,7 +274,6 @@ def link_summary(db,domain):
|
||||
for item in res:
|
||||
print(item)
|
||||
|
||||
global DB
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
@ -305,6 +305,7 @@ def visit(start_link):
|
||||
navigation_links = get_links(db,domain,"navigation",batch_size)
|
||||
if start_link is not None:
|
||||
navigation_links.append(start_link)
|
||||
print("Navigtaion links")
|
||||
print(navigation_links)
|
||||
process_links(db,domain,"frontlink",navigation_links,rules)
|
||||
links = get_links(db,domain,"frontlink",batch_size)
|
||||
@ -312,6 +313,8 @@ def visit(start_link):
|
||||
if bl > 0:
|
||||
print("Getting backlinks")
|
||||
front_links = get_links(db,domain,"backlink",bl)
|
||||
links += front_links
|
||||
print("Pricessing backlinks")
|
||||
process_links(db,domain,"backlink",links,rules=rules)
|
||||
link_summary(db,domain)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user