import sys
from datetime import datetime

import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import courlan


def calculate_checksums(text):
    """
    @return fingerprints of paragraphs in text. Paragraphs are separated by a newline.
    """
    checksums = []
    sizes = []
    hval = 0
    hsz = 0
    sz = 0
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            # rolling hash over "word" characters only
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7FFFFFFF
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            # paragraph boundary: keep the fingerprint only for long enough paragraphs
            if hsz > 100:
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes


def extract_pages(link_docs):
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        extracted_doc = trafilatura.bare_extraction(doc["content"], extract_links=True)
        links = extracted_doc["links"]
        extracted_links.update(links)
        del extracted_doc["links"]
        content.append(extracted_doc)
    return content, extracted_links


def index_pages(pagedb, pages_list):
    contentcol = pagedb["content"]
    for page in pages_list:
        # get paragraph checksums
        checksums, sizes = calculate_checksums(page["text"])
        page["paragraph_checksums"] = checksums
        page["paragraph_sizes"] = sizes
    contentcol.insert_many(pages_list)


page_hashes = []


def process_pages(db, fetched_pages):
    content_pages, extracted_links = extract_pages(fetched_pages)
    contentcol = db["content"]
    contentcol.insert_many(content_pages)
    linkcol = db["links"]
    extracted = []
    for link in extracted_links:
        extracted.append({"url": link, "status": "backlink"})
    # should fail if the link already exists
    linkcol.insert_many(extracted)
    html_pages = []
    for page in fetched_pages:
        linkcol.update_one({"url": page["url"]}, {"$set": {"status": "visited"}})
        if "html" in page:
            html_pages.append({"url": page["url"], "html": page["html"], "update_time": datetime.now()})
            del page["html"]
    htmlcol = db["html"]
    htmlcol.insert_many(html_pages)


def filter_links(links, domain):
    out = set()
    for link in links:
        r = courlan.check_url(link)
        if r is None:
            continue
        link, ldomain = r
        # domain rules
        if not ldomain.endswith("sk"):
            continue
        if not courlan.is_crawlable(link):
            continue
        out.add(link)
    return out


def get_visited_links(domain):
    return []


def generic_visit(domain):
    known_links = set(get_visited_links(domain))
    visit_links = []
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if visit_links is None:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if visit_links is None:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)


def get_new_links(domain):
    known_links = []
    # get seed links
    # get new links
    out_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    filtered_links = filter_links(out_links, domain)
    print(filtered_links)
    return filtered_links


def index_links(db, link_batch):
    htmls = []
    for link in link_batch:
        rr = trafilatura.fetch_url(link, decode=True)
        htmls.append(rr)
    html_docs = []
    link_docs = []
    for link, html in zip(link_batch, htmls):
        status = "visited"
        if html is None:
            status = "html_error"
        html_docs.append({"url": link, "html": html})
        link_docs.append({"url": link, "status": status})
    return link_docs, html_docs


#visit_links = trafilatura.feeds.find_feed_urls(domain)
#visit_links = trafilatura.sitemaps.sitemap_search(domain)
#print(visit_links)
#for link in visit_links:
#    content = trafilatura.fetch_url(link,decode=True)
#    document = trafilatura.bare_extraction(content)
#    print(content)

generic_visit(sys.argv[1])
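
# --- Usage sketch (not part of the original script) ---
# A minimal, hedged example of how these functions might be wired together,
# assuming a local MongoDB instance and a database named "crawler"; the
# connection string, database name, and call order below are assumptions,
# not something the script itself defines.
#
# client = pymongo.MongoClient("mongodb://localhost:27017")  # assumed local instance
# db = client["crawler"]                                      # assumed database name
# new_links = get_new_links(sys.argv[1])                      # crawl the seed domain
# link_docs, html_docs = index_links(db, new_links)           # fetch and classify pages
# db["links"].insert_many(link_docs)                          # record link statuses
# db["html"].insert_many(html_docs)                           # store raw HTML for later extraction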