zz
This commit is contained in:
parent
cc0d720d1c
commit
6724e964e9
@ -4,6 +4,7 @@ import trafilatura.feeds
|
||||
import trafilatura.sitemaps
|
||||
import trafilatura.spider
|
||||
import sys
|
||||
import courlan
|
||||
|
||||
|
||||
def calculate_checksums(self, text):
|
||||
@ -36,16 +37,6 @@ def calculate_checksums(self, text):
|
||||
return checksums, sizes
|
||||
|
||||
|
||||
def fetch_pages(link_batch):
    """Download every URL in ``link_batch`` and record the outcome per link.

    Args:
        link_batch: Iterable of URLs to download.

    Returns:
        A list with one dict per input link, in input order. Every dict has
        the keys ``"url"`` and ``"status"``. ``"status"`` starts out as
        ``"html_error"``; when ``trafilatura.fetch_url`` returns something
        other than ``None`` it becomes ``"html_ok"`` and the fetched value
        is stored under ``"html"``.
    """
    docs = []
    for link in link_batch:
        # Assume failure until the download proves otherwise.
        link_doc = {"url": link, "status": "html_error"}
        # The URL to fetch is the loop variable; the previous code passed
        # the undefined name ``page`` and raised NameError on first use.
        rr = trafilatura.fetch_url(link, decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = rr
        docs.append(link_doc)
    return docs
|
||||
|
||||
def extract_pages(link_docs):
|
||||
content = []
|
||||
@ -90,6 +81,21 @@ def process_pages(db,fetched_pages):
|
||||
htmlcol = db["html"]
|
||||
htmlcol.insertMany(html_pages)
|
||||
|
||||
def filter_links(links, domain=None):
    """Return the subset of ``links`` that is valid, Slovak and crawlable.

    Args:
        links: Iterable of candidate URLs.
        domain: Not used by the filtering rules at the moment. It is
            optional so that callers passing only ``links`` keep working.

    Returns:
        A set of URLs in the form returned by ``courlan.check_url``. A link
        is dropped when ``check_url`` rejects it, when its domain does not
        end in ``.sk``, or when the crawlability check fails.
    """
    out = set()
    for link in links:
        r = courlan.check_url(link)
        if r is None:
            continue
        # check_url yields the checked URL together with its domain.
        link, ldomain = r
        # Domain rules: match the ".sk" suffix including the dot so that
        # names merely ending in the letters "sk" are not accepted.
        if not ldomain.endswith(".sk"):
            continue
        # NOTE(review): confirm the installed courlan exposes
        # ``is_crawlable``; its filters module may only offer the negated
        # ``is_not_crawlable`` helper.
        if not courlan.is_crawlable(link):
            continue
        out.add(link)
    return out
|
||||
|
||||
|
||||
|
||||
def get_visited_links(domain):
|
||||
@ -104,10 +110,31 @@ def generic_visit(domain):
|
||||
if visit_links is None:
|
||||
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
||||
|
||||
def simple_visit(domain):
|
||||
def get_new_links(domain):
    """Crawl ``domain`` and return the discovered links that pass the filter.

    Args:
        domain: Start URL / homepage handed to the trafilatura crawler.

    Returns:
        The set produced by ``filter_links`` for every link the crawler
        reported, both still-to-visit and already-known ones.
    """
    # No previously stored links are loaded yet, so the crawl starts cold.
    known_links = []
    # focused_crawler returns a pair (links to visit, known links); the
    # previous code passed that pair to the filter as if it were a flat
    # list of URLs.
    to_visit, seen_links = trafilatura.spider.focused_crawler(
        domain, known_links=known_links
    )
    out_links = set(to_visit) | set(seen_links)
    # filter_links takes the domain as its second argument. The stray
    # ``print(res)`` was dropped because ``res`` was never assigned.
    filtered_links = filter_links(out_links, domain)
    return filtered_links
|
||||
|
||||
def index_links(db, link_batch):
    """Fetch every link in ``link_batch`` and build HTML and status records.

    Args:
        db: Database handle. It is not used here yet.
            NOTE(review): persisting the records is presumably intended;
            confirm against the caller before adding writes.
        link_batch: Iterable of URLs to download.

    Returns:
        A tuple ``(html_docs, link_docs)`` of two lists in input order.
        ``html_docs`` holds ``{"url", "html"}`` dicts and ``link_docs``
        holds ``{"url", "status"}`` dicts, where ``"status"`` is
        ``"visited"`` or, when the download returned ``None``,
        ``"html_error"``.
    """
    html_docs = []
    link_docs = []
    for link in link_batch:
        # Fetch the loop variable; ``page`` was never defined.
        html = trafilatura.fetch_url(link, decode=True)
        status = "visited" if html is not None else "html_error"
        html_docs.append({"url": link, "html": html})
        link_docs.append({"url": link, "status": status})
    # The previous ``return docs`` referenced an undefined name; return the
    # two record lists that were actually built.
    return html_docs, link_docs
|
||||
#visit_links = trafilatura.feeds.find_feed_urls(domain)
|
||||
#visit_links = trafilatura.sitemaps.sitemap_search(domain)
|
||||
#print(visit_links)
|
||||
|
Loading…
Reference in New Issue
Block a user