This commit is contained in:
Daniel Hládek 2023-03-12 13:53:17 +01:00
parent 39d1057ea1
commit 2fce373d0f

View File

@ -163,7 +163,6 @@ def extract_pages(link_batch,responses):
def index_pages(db,domain,extracted_pages): def index_pages(db,domain,extracted_pages):
extracted_links = set()
linkcol = db["links"] linkcol = db["links"]
htmlcol = db["html"] htmlcol = db["html"]
contentcol = db["content"] contentcol = db["content"]
@ -201,12 +200,14 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
links = {} links = {}
for original_link,(final_link,html) in zip(link_batch,responses): for original_link,(final_link,html) in zip(link_batch,responses):
status = default_status status = default_status
extracted_links = courlan.extract_links(html,final_link,False,language=LANGUAGE) external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
for link in external_links:
links[link] = "frontlink"
internal_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
#print(extracted_links) #print(extracted_links)
for link in extracted_links: for link in internal_links:
if courlan.is_external(link,domain): status = str(default_status)
status = "frontlink" if courlan.is_navigation_page(link):
elif courlan.is_navigation(link):
status = "navigation" status = "navigation"
#print(link,status) #print(link,status)
links[link] = status links[link] = status
@ -249,7 +250,7 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
extracted_pages = extract_pages(links,responses) extracted_pages = extract_pages(links,responses)
#print(extracted_pages) #print(extracted_pages)
extracted_links = extract_links(links,responses,domain,rules,status) extracted_links = extract_links(links,responses,domain,rules,status)
print(extracted_links) #print(extracted_links)
index_links(db,extracted_links) index_links(db,extracted_links)
index_pages(db,domain,extracted_pages) index_pages(db,domain,extracted_pages)
@ -273,7 +274,6 @@ def link_summary(db,domain):
for item in res: for item in res:
print(item) print(item)
global DB
@click.group() @click.group()
def cli(): def cli():
@ -305,6 +305,7 @@ def visit(start_link):
navigation_links = get_links(db,domain,"navigation",batch_size) navigation_links = get_links(db,domain,"navigation",batch_size)
if start_link is not None: if start_link is not None:
navigation_links.append(start_link) navigation_links.append(start_link)
print("Navigtaion links")
print(navigation_links) print(navigation_links)
process_links(db,domain,"frontlink",navigation_links,rules) process_links(db,domain,"frontlink",navigation_links,rules)
links = get_links(db,domain,"frontlink",batch_size) links = get_links(db,domain,"frontlink",batch_size)
@ -312,6 +313,8 @@ def visit(start_link):
if bl > 0: if bl > 0:
print("Getting backlinks") print("Getting backlinks")
front_links = get_links(db,domain,"backlink",bl) front_links = get_links(db,domain,"backlink",bl)
links += front_links
print("Pricessing backlinks")
process_links(db,domain,"backlink",links,rules=rules) process_links(db,domain,"backlink",links,rules=rules)
link_summary(db,domain) link_summary(db,domain)