Daniel Hládek 2023-03-05 15:44:49 +01:00
parent 437d4f9684
commit 1752d5c776
3 changed files with 103 additions and 0 deletions

mongo/docker-compose.yaml Normal file

@@ -0,0 +1,18 @@
version: "3.0"
services:
  mongo:
    image: mongo
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: example
    ports:
      - 27017:27017
  mongo-express:
    image: mongo-express
    ports:
      - 8081:8081
    environment:
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: example
      ME_CONFIG_MONGODB_URL: mongodb://root:example@mongo:27017/
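
With the stack running (mongo-express serves its web UI on port 8081), the root credentials above can be checked from Python with a pymongo ping. This is a sketch for illustration, not part of the commit:

import pymongo

# Connection string matches the credentials and published port from docker-compose.yaml.
client = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
# "ping" is a standard MongoDB admin command; it raises if the server is unreachable.
print(client.admin.command("ping"))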

mongo/mongocwarler.py Normal file

@@ -0,0 +1,75 @@
import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import sys

def index_page(db, url, content, extracted_page):
    # store the raw HTML and the extracted content in separate collections
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)

def fetch_pages(link_batch):
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        # fetch_url returns the decoded page or None on failure
        rr = trafilatura.fetch_url(link, decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = rr
        docs.append(link_doc)
    return docs

def extract_pages(link_docs):
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        # fetch_pages stores the raw page under the "html" key
        extracted_doc = trafilatura.bare_extraction(doc["html"], include_links=True)
        # depending on the trafilatura version, outgoing links may not be exposed as a key
        links = extracted_doc.get("links", [])
        extracted_links.update(links)
        extracted_doc.pop("links", None)
        content.append(extracted_doc)
    return content, extracted_links

def index_pages(pagedb, pages_list):
    mycol = pagedb["content"]
    for page in pages_list:
        # get paragraph checksums
        checksums = get_checksums(page["text"])
        page["checksums"] = checksums
    mycol.insert_many(pages_list)

def get_visited_links(domain):
    # placeholder: visited links are not persisted yet
    return []

def generic_visit(domain):
    known_links = set(get_visited_links(domain))
    # try feeds first, then sitemaps, then fall back to the focused crawler
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        # focused_crawler returns (links to visit, known links)
        visit_links, known_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    return visit_links

def simple_visit(domain):
    known_links = []
    # focused_crawler returns a tuple of (links to visit, known links)
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
    #visit_links = trafilatura.feeds.find_feed_urls(domain)
    #visit_links = trafilatura.sitemaps.sitemap_search(domain)
    #print(visit_links)
    #for link in visit_links:
    #    content = trafilatura.fetch_url(link,decode=True)
    #    document = trafilatura.bare_extraction(content)
    #    print(content)

if __name__ == "__main__":
    simple_visit(sys.argv[1])
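
index_pages() calls a get_checksums() helper that is not defined anywhere in this commit. A minimal sketch of what it could look like, assuming one MD5 digest per non-empty paragraph of the extracted text (the helper body and the paragraph split are assumptions, not the author's implementation):

import hashlib

def get_checksums(text):
    # Hypothetical helper: one checksum per non-empty paragraph,
    # so repeated paragraphs can later be detected across pages.
    checksums = []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if paragraph:
            checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
    return checksums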

mongo/mongoindexer.py Normal file

@@ -0,0 +1,10 @@
import pymongo

# Connect with the root credentials from docker-compose.yaml and insert one test document.
myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
mydb = myclient["mydatabase"]
mycol = mydb["customers"]
mydict = {"text": "ahoj svet"}  # "ahoj svet" is Slovak for "hello world"
x = mycol.insert_one(mydict)
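
For a quick check that the insert landed, the document can be read back in the same session; a short sketch using the collection from above:

# insert_one returns a result object carrying the generated _id
print(x.inserted_id)
print(mycol.find_one({"text": "ahoj svet"}))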