zz
This commit is contained in:
parent
781325d2a9
commit
29800903c5
@ -57,8 +57,9 @@ def filter_links(links):
|
||||
def get_link_doc(link,status="frontlink"):
|
||||
r = courlan.check_url(link)
|
||||
assert r is not None
|
||||
link,domain = r
|
||||
return {"url":link,"domain":domain,"status":status}
|
||||
link,host = r
|
||||
domain = extract_domain(link)
|
||||
return {"url":link,"host":host,"domain":domain,"status":status}
|
||||
|
||||
def generic_visit(domain):
|
||||
known_links = set(get_visited_links(domain))
|
||||
@ -70,7 +71,6 @@ def generic_visit(domain):
|
||||
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
||||
|
||||
|
||||
|
||||
def fetch_pages(link_batch):
|
||||
htmls = []
|
||||
print(link_batch)
|
||||
@ -78,8 +78,7 @@ def fetch_pages(link_batch):
|
||||
for link in link_batch:
|
||||
print("fetching:::::")
|
||||
print(link)
|
||||
rr = trafilatura.fetch_url(link,decode=True)
|
||||
htmls.append(rr)
|
||||
htmls.append(trafilatura.fetch_url(link))
|
||||
return htmls
|
||||
|
||||
def fetch_front_links(start_link):
|
||||
@ -90,12 +89,13 @@ def fetch_front_links(start_link):
|
||||
|
||||
def extract_pages(link_batch,htmls):
|
||||
out = []
|
||||
for link,html in zip(link_batch,htmls):
|
||||
for link,response in zip(link_batch,responses):
|
||||
doc = None
|
||||
assert link is not None
|
||||
html = trafilatura.util.decode_response(response.data)
|
||||
if html is not None:
|
||||
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
|
||||
out.append((link,html,doc))
|
||||
out.append((link,response.url,html,doc))
|
||||
return out
|
||||
|
||||
|
||||
@ -163,7 +163,7 @@ def simple_visit(start_link):
|
||||
visit_links = get_links(db,"frontlink",domain)
|
||||
print("NEW VISIT LINKS")
|
||||
print(visit_links)
|
||||
htmls = fetch_pages(visit_links)
|
||||
responses = fetch_pages(visit_links)
|
||||
extracted_pages = extract_pages(visit_links,htmls)
|
||||
index_pages(db,domain,extracted_pages)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user