zz
This commit is contained in:
parent
781325d2a9
commit
29800903c5
@ -57,8 +57,9 @@ def filter_links(links):
|
|||||||
def get_link_doc(link,status="frontlink"):
|
def get_link_doc(link,status="frontlink"):
|
||||||
r = courlan.check_url(link)
|
r = courlan.check_url(link)
|
||||||
assert r is not None
|
assert r is not None
|
||||||
link,domain = r
|
link,host = r
|
||||||
return {"url":link,"domain":domain,"status":status}
|
domain = extract_domain(link)
|
||||||
|
return {"url":link,"host":host,"domain":domain,"status":status}
|
||||||
|
|
||||||
def generic_visit(domain):
|
def generic_visit(domain):
|
||||||
known_links = set(get_visited_links(domain))
|
known_links = set(get_visited_links(domain))
|
||||||
@ -70,7 +71,6 @@ def generic_visit(domain):
|
|||||||
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_pages(link_batch):
|
def fetch_pages(link_batch):
|
||||||
htmls = []
|
htmls = []
|
||||||
print(link_batch)
|
print(link_batch)
|
||||||
@ -78,8 +78,7 @@ def fetch_pages(link_batch):
|
|||||||
for link in link_batch:
|
for link in link_batch:
|
||||||
print("fetching:::::")
|
print("fetching:::::")
|
||||||
print(link)
|
print(link)
|
||||||
rr = trafilatura.fetch_url(link,decode=True)
|
htmls.append(trafilatura.fetch_url(link))
|
||||||
htmls.append(rr)
|
|
||||||
return htmls
|
return htmls
|
||||||
|
|
||||||
def fetch_front_links(start_link):
|
def fetch_front_links(start_link):
|
||||||
@ -90,12 +89,13 @@ def fetch_front_links(start_link):
|
|||||||
|
|
||||||
def extract_pages(link_batch,htmls):
|
def extract_pages(link_batch,htmls):
|
||||||
out = []
|
out = []
|
||||||
for link,html in zip(link_batch,htmls):
|
for link,response in zip(link_batch,responses):
|
||||||
doc = None
|
doc = None
|
||||||
assert link is not None
|
assert link is not None
|
||||||
|
html = trafilatura.util.decode_response(response.data)
|
||||||
if html is not None:
|
if html is not None:
|
||||||
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
|
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
|
||||||
out.append((link,html,doc))
|
out.append((link,response.url,html,doc))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
@ -163,7 +163,7 @@ def simple_visit(start_link):
|
|||||||
visit_links = get_links(db,"frontlink",domain)
|
visit_links = get_links(db,"frontlink",domain)
|
||||||
print("NEW VISIT LINKS")
|
print("NEW VISIT LINKS")
|
||||||
print(visit_links)
|
print(visit_links)
|
||||||
htmls = fetch_pages(visit_links)
|
responses = fetch_pages(visit_links)
|
||||||
extracted_pages = extract_pages(visit_links,htmls)
|
extracted_pages = extract_pages(visit_links,htmls)
|
||||||
index_pages(db,domain,extracted_pages)
|
index_pages(db,domain,extracted_pages)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user