# websucker-pip/mongo/mongocwarler.py

import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import sys

def index_page(db, url, content, extracted_page):
    # Store the raw HTML and the extracted content in separate collections.
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)

def fetch_pages(link_batch):
    # Download each link and record whether the fetch succeeded.
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        response = trafilatura.fetch_url(link, decode=True)
        if response is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = response
        docs.append(link_doc)
    return docs

def extract_pages(link_docs):
    # Run trafilatura extraction on every successfully fetched page and
    # collect the outgoing links discovered along the way.
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        extracted_doc = trafilatura.bare_extraction(doc["html"], include_links=True)
        if extracted_doc is None:
            continue
        # Read the links defensively: not every trafilatura version exposes
        # them as a key of the extraction result.
        links = extracted_doc.get("links", [])
        extracted_links.update(links)
        extracted_doc.pop("links", None)
        content.append(extracted_doc)
    return content, extracted_links
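
# index_pages() below calls get_checksums(), which this file never defines.
# A minimal sketch, assuming one MD5 digest per non-empty paragraph so that
# repeated paragraphs can be detected across pages:
import hashlib

def get_checksums(text):
    # Hash every non-empty paragraph of the extracted text.
    checksums = []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if paragraph:
            checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
    return checksums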

def index_pages(pagedb, pages_list):
    # Attach per-paragraph checksums to every page, then store the batch.
    mycol = pagedb["content"]
    for page in pages_list:
        page["checksums"] = get_checksums(page["text"])
    mycol.insert_many(pages_list)
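
# A minimal sketch of how the pieces above could be wired together; the
# connection string and the database name "websucker" are assumptions, not
# part of the original file:
def crawl_and_index(domain, connection_string="mongodb://localhost:27017/"):
    client = pymongo.MongoClient(connection_string)
    db = client["websucker"]
    # Fetch a batch of start pages, extract their content and outgoing
    # links, and index the extracted documents.
    docs = fetch_pages([domain])
    pages, links = extract_pages(docs)
    if pages:
        index_pages(db, pages)
    return links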

def get_visited_links(domain):
    # Placeholder: a full implementation would query MongoDB for links
    # already crawled under this domain.
    return []

def generic_visit(domain):
    # Try progressively more expensive discovery strategies: feeds first,
    # then sitemaps, then a focused crawl of the domain itself.
    known_links = set(get_visited_links(domain))
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        visit_links, known_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    return visit_links

def simple_visit(domain):
    # Crawl a domain with trafilatura's focused crawler and print the result.
    known_links = []
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
    #visit_links = trafilatura.feeds.find_feed_urls(domain)
    #visit_links = trafilatura.sitemaps.sitemap_search(domain)
    #print(visit_links)
    #for link in visit_links:
    #    content = trafilatura.fetch_url(link, decode=True)
    #    document = trafilatura.bare_extraction(content)
    #    print(content)
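
# Example invocation (the target URL is an assumption):
#   python mongocwarler.py https://example.com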
if __name__ == "__main__":
    simple_visit(sys.argv[1])