zz
This commit is contained in:
parent
cc0d720d1c
commit
6724e964e9
@ -4,6 +4,7 @@ import trafilatura.feeds
|
|||||||
import trafilatura.sitemaps
|
import trafilatura.sitemaps
|
||||||
import trafilatura.spider
|
import trafilatura.spider
|
||||||
import sys
|
import sys
|
||||||
|
import courlan
|
||||||
|
|
||||||
|
|
||||||
def calculate_checksums(self, text):
|
def calculate_checksums(self, text):
|
||||||
@ -36,16 +37,6 @@ def calculate_checksums(self, text):
|
|||||||
return checksums, sizes
|
return checksums, sizes
|
||||||
|
|
||||||
|
|
||||||
def fetch_pages(link_batch):
    """Fetch the HTML for every URL in *link_batch*.

    Returns a list of link documents, one per input URL, each with keys:
    "url"    — the input URL,
    "status" — "html_ok" on a successful fetch, "html_error" otherwise,
    "html"   — the decoded page content (present only on success).
    """
    docs = []
    for link in link_batch:
        # Start pessimistic; upgraded below when the fetch succeeds.
        link_doc = {"url": link, "status": "html_error"}
        # BUG FIX: the original called trafilatura.fetch_url(page, ...)
        # but `page` is undefined in this scope — the loop variable is
        # `link`, which would raise NameError on first iteration.
        rr = trafilatura.fetch_url(link, decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = rr
        docs.append(link_doc)
    return docs
|
|
||||||
|
|
||||||
def extract_pages(link_docs):
|
def extract_pages(link_docs):
|
||||||
content = []
|
content = []
|
||||||
@ -90,6 +81,21 @@ def process_pages(db,fetched_pages):
|
|||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
htmlcol.insertMany(html_pages)
|
htmlcol.insertMany(html_pages)
|
||||||
|
|
||||||
|
def filter_links(links, domain):
    """Return the subset of *links* that pass URL and domain checks.

    A link is kept when courlan.check_url() accepts it, its domain ends
    with "sk", and courlan reports it as crawlable.

    NOTE(review): the *domain* parameter is currently unused — confirm
    whether results were meant to be restricted to it.
    """
    kept = set()
    for candidate in links:
        checked = courlan.check_url(candidate)
        if checked is None:
            continue
        url, url_domain = checked
        # domain rules
        # NOTE(review): endswith("sk") matches any domain merely ending
        # in the letters "sk"; ".sk" may have been intended — confirm.
        if not url_domain.endswith("sk"):
            continue
        if not courlan.is_crawlable(url):
            continue
        kept.add(url)
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_visited_links(domain):
|
def get_visited_links(domain):
|
||||||
@ -104,10 +110,31 @@ def generic_visit(domain):
|
|||||||
if visit_links is None:
|
if visit_links is None:
|
||||||
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
|
||||||
|
|
||||||
def get_new_links(domain):
    """Crawl *domain* for new links and return the filtered set.

    Runs trafilatura's focused crawler starting from *domain* and passes
    the discovered links through filter_links().
    """
    known_links = []
    # get new links
    # NOTE(review): recent trafilatura versions return a pair
    # (to_visit, known_links) from focused_crawler — confirm the
    # expected return shape before using out_links directly.
    out_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    # BUG FIX: filter_links() takes (links, domain); the original call
    # omitted the required second argument and would raise TypeError.
    filtered_links = filter_links(out_links, domain)
    # BUG FIX: the original printed `res`, an undefined name (NameError);
    # print the value actually computed here.
    print(filtered_links)
    return filtered_links
|
||||||
|
|
||||||
|
def index_links(db, link_batch):
    """Fetch every link in *link_batch* and build html/link documents.

    For each URL a status document {"url", "status"} is produced
    ("visited" on success, "html_error" when the fetch returned None),
    and an html document {"url", "html"} is accumulated alongside it.
    Returns the list of link status documents.

    NOTE(review): `db` is currently unused — presumably html_docs and
    link_docs were meant to be inserted into collections, as
    process_pages() does elsewhere in this file; confirm intent.
    """
    htmls = []
    for link in link_batch:
        # BUG FIX: the original fetched `page`, an undefined name; the
        # loop variable is `link`.
        rr = trafilatura.fetch_url(link, decode=True)
        htmls.append(rr)
    html_docs = []
    link_docs = []
    # BUG FIX: the original zipped against `html` (undefined at this
    # point); the fetched pages are accumulated in `htmls`.
    for link, html in zip(link_batch, htmls):
        status = "visited"
        if html is None:
            status = "html_error"
        html_docs.append({"url": link, "html": html})
        link_docs.append({"url": link, "status": status})
    # BUG FIX: the original returned `docs`, an undefined name in this
    # function; the per-link status documents are the computed result.
    return link_docs
|
||||||
#visit_links = trafilatura.feeds.find_feed_urls(domain)
|
#visit_links = trafilatura.feeds.find_feed_urls(domain)
|
||||||
#visit_links = trafilatura.sitemaps.sitemap_search(domain)
|
#visit_links = trafilatura.sitemaps.sitemap_search(domain)
|
||||||
#print(visit_links)
|
#print(visit_links)
|
||||||
|
Loading…
Reference in New Issue
Block a user