This commit is contained in:
Daniel Hládek 2023-03-07 08:58:28 +01:00
parent cc0d720d1c
commit 6724e964e9

View File

@ -4,6 +4,7 @@ import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import sys
import courlan
def calculate_checksums(self, text):
@ -36,16 +37,6 @@ def calculate_checksums(self, text):
return checksums, sizes
def fetch_pages(link_batch):
    """Download the HTML for every URL in *link_batch*.

    Args:
        link_batch: iterable of URL strings to fetch.

    Returns:
        list of dicts, one per input URL, of the form
        ``{"url": ..., "status": "html_ok" | "html_error"}`` with an
        additional ``"html"`` key present only when the fetch succeeded.
    """
    docs = []
    for link in link_batch:
        # Start pessimistic; upgraded to "html_ok" on a successful fetch.
        link_doc = {"url": link, "status": "html_error"}
        # BUG FIX: original called fetch_url(page, ...) with the undefined
        # name `page`; the current loop variable is `link`.
        response = trafilatura.fetch_url(link, decode=True)
        if response is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = response
        docs.append(link_doc)
    return docs
def extract_pages(link_docs):
content = []
@ -90,6 +81,21 @@ def process_pages(db,fetched_pages):
htmlcol = db["html"]
htmlcol.insertMany(html_pages)
def filter_links(links, domain=None):
    """Normalize and filter candidate URLs, keeping crawlable .sk links.

    Args:
        links: iterable of raw URL strings.
        domain: seed domain of the crawl. Currently unused by the rules
            below; given a default because one caller in this file invokes
            ``filter_links(out_links)`` with a single argument, which would
            otherwise raise TypeError. Existing two-argument callers are
            unaffected.

    Returns:
        set of cleaned, deduplicated URLs that pass all checks.
    """
    out = set()
    for link in links:
        # courlan.check_url returns (cleaned_url, domain) or None for
        # invalid/unwanted URLs.
        r = courlan.check_url(link)
        if r is None:
            continue
        link, ldomain = r
        # Domain rule: keep only Slovak (.sk) hosts.
        # BUG FIX: endswith("sk") also matched any host string merely
        # ending in the letters "sk"; require the .sk TLD explicitly.
        if not (ldomain == "sk" or ldomain.endswith(".sk")):
            continue
        if not courlan.is_crawlable(link):
            continue
        out.add(link)
    return out
def get_visited_links(domain):
@ -104,10 +110,31 @@ def generic_visit(domain):
if visit_links is None:
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
def simple_visit(domain):
def get_new_links(domain):
    """Crawl *domain* and return the filtered set of newly found links.

    Args:
        domain: seed URL/domain handed to the focused crawler.

    Returns:
        set of cleaned URLs as produced by ``filter_links``.
    """
    known_links = []
    # Discover new links starting from the seed domain.
    # NOTE(review): recent trafilatura versions return a
    # (to_visit, known_links) tuple from focused_crawler — confirm the
    # installed version and unpack if needed.
    out_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    # BUG FIX: filter_links was called without its required `domain`
    # argument in the original.
    filtered_links = filter_links(out_links, domain)
    # BUG FIX: original printed the undefined name `res`.
    print(filtered_links)
    return filtered_links
def index_links(db, link_batch):
    """Fetch each URL and build HTML and link-status documents.

    Args:
        db: database handle (presumably a pymongo database, matching
            ``process_pages`` elsewhere in this file) — TODO confirm; the
            original body never used it, so no inserts are performed here.
        link_batch: iterable of URL strings.

    Returns:
        list of link-status docs:
        ``{"url": ..., "status": "visited" | "html_error"}``.
    """
    htmls = []
    for link in link_batch:
        # BUG FIX: original fetched the undefined name `page` instead of
        # the loop variable `link`.
        htmls.append(trafilatura.fetch_url(link, decode=True))
    html_docs = []
    link_docs = []
    # BUG FIX: original zipped against `html` (undefined at this point)
    # instead of the accumulated `htmls` list.
    for link, html in zip(link_batch, htmls):
        status = "visited"
        if html is None:
            status = "html_error"
        html_docs.append({"url": link, "html": html})
        link_docs.append({"url": link, "status": status})
    # BUG FIX: original returned the undefined name `docs`; return the
    # link-status documents.
    return link_docs
#visit_links = trafilatura.feeds.find_feed_urls(domain)
#visit_links = trafilatura.sitemaps.sitemap_search(domain)
#print(visit_links)