76 lines
2.2 KiB
Python
76 lines
2.2 KiB
Python
import pymongo
|
|
|
|
|
|
import trafilatura
|
|
import trafilatura.feeds
|
|
import trafilatura.sitemaps
|
|
import trafilatura.spider
|
|
import sys
|
|
|
|
def index_page(db, url, content, extracted_page):
    """Store one fetched page and its extracted form in the database.

    Args:
        db: Database handle indexable by collection name; ``db["html"]``
            and ``db["content"]`` must each provide ``insert_one``.
        url: Address of the page, stored in the ``html`` collection.
        content: Page content, stored next to ``url`` in the ``html``
            collection.
        extracted_page: Document inserted unchanged into the ``content``
            collection.
    """
    # One record per page in "html": the address plus what was downloaded.
    html_collection = db["html"]
    html_collection.insert_one({"url": url, "content": content})

    # The extracted document goes to its own collection.
    content_collection = db["content"]
    content_collection.insert_one(extracted_page)
|
|
|
|
def fetch_pages(link_batch):
    """Download every link of a batch and report the outcome per link.

    Args:
        link_batch: Iterable of URLs to download.

    Returns:
        A list with one dict per link, in input order. Every dict has the
        keys ``"url"`` and ``"status"``. When ``trafilatura.fetch_url``
        returned something, ``"status"`` is ``"html_ok"`` and ``"html"``
        holds the result; otherwise ``"status"`` stays ``"unvisited"``
        and there is no ``"html"`` key.
    """
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        # Download the link of the current iteration.
        html = trafilatura.fetch_url(link, decode=True)
        if html is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = html
        # NOTE(review): failed downloads are kept in the result (still
        # marked "unvisited") so callers can tell them apart by status --
        # confirm this is the intended placement of the append.
        docs.append(link_doc)
    return docs
|
|
|
|
def extract_pages(link_docs):
    """Extract content and outgoing links from successfully fetched pages.

    Args:
        link_docs: Iterable of dicts with a ``"status"`` key; entries whose
            status is ``"html_ok"`` must also carry the page under
            ``"html"`` (the key written by ``fetch_pages``).

    Returns:
        A tuple ``(content, extracted_links)``: ``content`` is the list of
        extracted documents (without their ``"links"`` entry) and
        ``extracted_links`` is the set of all links found across them.
    """
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        # NOTE(review): upstream trafilatura documents ``include_links``;
        # ``extract_links`` and the "links" result key presumably come from
        # a patched build -- confirm against the installed version.
        extracted_doc = trafilatura.bare_extraction(doc["html"], extract_links=True)
        if extracted_doc is None:
            # Nothing could be extracted from this page; skip it.
            continue
        # Links are collected separately and removed from the stored
        # document. ``update`` is required: a set cannot be extended
        # with ``+=``.
        extracted_links.update(extracted_doc.pop("links", None) or ())
        content.append(extracted_doc)
    return content, extracted_links
|
|
|
|
def index_pages(pagedb, pages_list):
    """Add paragraph checksums to each page and store all pages.

    Args:
        pagedb: Database handle indexable by collection name;
            ``pagedb["content"]`` must provide ``insert_many``.
        pages_list: List of page dicts, each with a ``"text"`` entry. Every
            dict is modified in place: a ``"checksums"`` entry is added.
    """
    if not pages_list:
        # pymongo's insert_many raises on an empty document list, so an
        # empty batch is a no-op.
        return

    for page in pages_list:
        # get paragraph checksums
        # NOTE(review): get_checksums is not defined in the visible part of
        # this file -- it must be provided elsewhere before this runs.
        page["checksums"] = get_checksums(page["text"])

    pagedb["content"].insert_many(pages_list)
|
|
|
|
def get_visited_links(domain):
    """Return the links already visited for ``domain``.

    Placeholder implementation: ``domain`` is not consulted and the result
    is always a new empty list.
    """
    visited = list()
    return visited
|
|
|
|
def generic_visit(domain):
    """Discover links to visit on ``domain``.

    Discovery falls back in this order: feed URLs, then sitemap URLs, then
    a focused crawl that is told about the already visited links.

    Args:
        domain: Homepage/domain URL to explore.

    Returns:
        The discovered links (possibly empty). Earlier revisions computed
        this value but returned ``None``.
    """
    known_links = set(get_visited_links(domain))

    # The finders live in the trafilatura submodules imported at the top of
    # the file, not in the top-level package.
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    # An empty result must trigger the fallback too: the finders report
    # "nothing found" as an empty list, so an ``is None`` test never fires.
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        # focused_crawler returns a pair (links to visit, known links);
        # only the first element is of interest here.
        visit_links, _ = trafilatura.spider.focused_crawler(
            domain, known_links=known_links
        )
    return visit_links
|
|
|
|
def simple_visit(domain):
    """Run a focused crawl of ``domain`` and print what it returns.

    Args:
        domain: Homepage/domain URL to crawl.
    """
    known_links = []
    # The assignment must be active: printing ``res`` without it raises
    # NameError.
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
    # TODO: alternative pipeline sketched by the author and not yet enabled:
    #   visit_links = trafilatura.feeds.find_feed_urls(domain)
    #   visit_links = trafilatura.sitemaps.sitemap_search(domain)
    #   then, per link: trafilatura.fetch_url(link, decode=True) followed by
    #   trafilatura.bare_extraction(content)
|
|
|
|
if __name__ == "__main__":
    # Crawl only when executed as a script, and exit with a usage hint
    # instead of an IndexError when the domain argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} DOMAIN".format(sys.argv[0]))
    simple_visit(sys.argv[1])
|