This commit is contained in:
Daniel Hládek 2023-03-12 13:53:17 +01:00
parent 39d1057ea1
commit 2fce373d0f

View File

@ -163,7 +163,6 @@ def extract_pages(link_batch,responses):
def index_pages(db,domain,extracted_pages):
extracted_links = set()
linkcol = db["links"]
htmlcol = db["html"]
contentcol = db["content"]
@ -201,12 +200,14 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
links = {}
for original_link,(final_link,html) in zip(link_batch,responses):
status = default_status
extracted_links = courlan.extract_links(html,final_link,False,language=LANGUAGE)
external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
for link in external_links:
links[link] = "frontlink"
internal_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
#print(extracted_links)
for link in extracted_links:
if courlan.is_external(link,domain):
status = "frontlink"
elif courlan.is_navigation(link):
for link in internal_links:
status = str(default_status)
if courlan.is_navigation_page(link):
status = "navigation"
#print(link,status)
links[link] = status
@ -249,7 +250,7 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
extracted_pages = extract_pages(links,responses)
#print(extracted_pages)
extracted_links = extract_links(links,responses,domain,rules,status)
print(extracted_links)
#print(extracted_links)
index_links(db,extracted_links)
index_pages(db,domain,extracted_pages)
@ -273,7 +274,6 @@ def link_summary(db,domain):
for item in res:
print(item)
global DB
@click.group()
def cli():
@ -305,6 +305,7 @@ def visit(start_link):
navigation_links = get_links(db,domain,"navigation",batch_size)
if start_link is not None:
navigation_links.append(start_link)
print("Navigtaion links")
print(navigation_links)
process_links(db,domain,"frontlink",navigation_links,rules)
links = get_links(db,domain,"frontlink",batch_size)
@ -312,6 +313,8 @@ def visit(start_link):
if bl > 0:
print("Getting backlinks")
front_links = get_links(db,domain,"backlink",bl)
links += front_links
print("Pricessing backlinks")
process_links(db,domain,"backlink",links,rules=rules)
link_summary(db,domain)