diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index 4b49508..9a3bb6d 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -4,6 +4,7 @@ import trafilatura.feeds import trafilatura.sitemaps import trafilatura.spider import sys +import courlan def calculate_checksums(self, text): @@ -36,16 +37,6 @@ def calculate_checksums(self, text): return checksums, sizes -def fetch_pages(link_batch): - docs = [] - for link in link_batch: - link_doc = {"url":link,"status": "html_error"} - rr = trafilatura.fetch_url(page,decode=True) - if rr is not None: - link_doc["status"] = "html_ok" - link_doc["html"] = rr - docs.append(link_doc) - return docs def extract_pages(link_docs): content = [] @@ -90,6 +81,21 @@ def process_pages(db,fetched_pages): htmlcol = db["html"] htmlcol.insertMany(html_pages) +def filter_links(links,domain): + out = set() + for link in links: + r = courlan.check_url(link) + if r is None: + continue + link,ldomain = r + # domain rules + if not ldomain.endswith("sk"): + continue + if not courlan.is_crawlable(link): + continue + out.add(link) + return out + def get_visited_links(domain): @@ -104,10 +110,31 @@ def generic_visit(domain): if visit_links is None: visit_links = trafilatura.focused_crawler(dommain,known_links=known_links) -def simple_visit(domain): +def get_new_links(domain): known_links = [] - #res = trafilatura.spider.focused_crawler(domain,known_links=known_links) + # get seed links + # get new links + out_links = trafilatura.spider.focused_crawler(domain,known_links=known_links) + filtered_links = filter_links(out_links) print(res) + return filtered_links + +def index_links(db,link_batch): + htmls = [] + for link in link_batch: + rr = trafilatura.fetch_url(page,decode=True) + htmls.append(rr) + html_docs = [] + link_docs = [] + for link,html in zip(link_batch,html): + status = "visited" + if html is None: + status = "html_error" + html_docs.append({"url":link,"html":html}) + link_docs.append({"url":link,"status":status}) + + + return docs #visit_links = trafilatura.feeds.find_feed_urls(domain) #visit_links = trafilatura.sitemaps.sitemap_search(domain) #print(visit_links)