This commit is contained in:
Daniel Hládek 2023-03-09 13:29:34 +01:00
parent 781325d2a9
commit 29800903c5

View File

@ -57,8 +57,9 @@ def filter_links(links):
def get_link_doc(link,status="frontlink"): def get_link_doc(link,status="frontlink"):
r = courlan.check_url(link) r = courlan.check_url(link)
assert r is not None assert r is not None
link,domain = r link,host = r
return {"url":link,"domain":domain,"status":status} domain = extract_domain(link)
return {"url":link,"host":host,"domain":domain,"status":status}
def generic_visit(domain): def generic_visit(domain):
known_links = set(get_visited_links(domain)) known_links = set(get_visited_links(domain))
@ -70,7 +71,6 @@ def generic_visit(domain):
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links) visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
def fetch_pages(link_batch): def fetch_pages(link_batch):
htmls = [] htmls = []
print(link_batch) print(link_batch)
@ -78,8 +78,7 @@ def fetch_pages(link_batch):
for link in link_batch: for link in link_batch:
print("fetching:::::") print("fetching:::::")
print(link) print(link)
rr = trafilatura.fetch_url(link,decode=True) htmls.append(trafilatura.fetch_url(link))
htmls.append(rr)
return htmls return htmls
def fetch_front_links(start_link): def fetch_front_links(start_link):
@ -90,12 +89,13 @@ def fetch_front_links(start_link):
def extract_pages(link_batch,htmls): def extract_pages(link_batch,htmls):
out = [] out = []
for link,html in zip(link_batch,htmls): for link,response in zip(link_batch,responses):
doc = None doc = None
assert link is not None assert link is not None
html = trafilatura.util.decode_response(response.data)
if html is not None: if html is not None:
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk") doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
out.append((link,html,doc)) out.append((link,response.url,html,doc))
return out return out
@ -163,7 +163,7 @@ def simple_visit(start_link):
visit_links = get_links(db,"frontlink",domain) visit_links = get_links(db,"frontlink",domain)
print("NEW VISIT LINKS") print("NEW VISIT LINKS")
print(visit_links) print(visit_links)
htmls = fetch_pages(visit_links) responses = fetch_pages(visit_links)
extracted_pages = extract_pages(visit_links,htmls) extracted_pages = extract_pages(visit_links,htmls)
index_pages(db,domain,extracted_pages) index_pages(db,domain,extracted_pages)