zz
This commit is contained in:
parent
437d4f9684
commit
1752d5c776
18
mongo/docker-compose.yaml
Normal file
18
mongo/docker-compose.yaml
Normal file
@ -0,0 +1,18 @@
# Dev stack: MongoDB plus the mongo-express web admin UI.
# Credentials (root/example) are development-only defaults.
version: "3.0"

services:
  mongo:
    image: mongo
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: example
    ports:
      - 27017:27017

  mongo-express:
    image: mongo-express
    ports:
      - 8081:8081
    environment:
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: example
      ME_CONFIG_MONGODB_URL: mongodb://root:example@mongo:27017/
75
mongo/mongocwarler.py
Normal file
75
mongo/mongocwarler.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import pymongo
|
||||||
|
|
||||||
|
|
||||||
|
import trafilatura
|
||||||
|
import trafilatura.feeds
|
||||||
|
import trafilatura.sitemaps
|
||||||
|
import trafilatura.spider
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def index_page(db, url, content, extracted_page):
    """Store one fetched page in the database.

    The raw HTML goes into the ``html`` collection keyed by URL; the
    trafilatura-extracted document goes into the ``content`` collection.

    db             -- pymongo database (any mapping of name -> collection works)
    url            -- page URL, stored alongside the raw HTML
    content        -- raw HTML string
    extracted_page -- dict produced by trafilatura extraction
    """
    # Bug fixes: the original created `htmlldb` but used `htmldb`, and
    # referenced the undefined name `ulr` instead of `url`.
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)
def fetch_pages(link_batch):
    """Download every URL in *link_batch*.

    Returns a list of link documents, one per input URL:
    ``{"url": ..., "status": "html_ok", "html": ...}`` on success, or
    ``{"url": ..., "status": "unvisited"}`` when the fetch failed.
    """
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        # Bug fix: the original fetched the undefined name `page`
        # instead of the loop variable `link`.
        html = trafilatura.fetch_url(link, decode=True)
        if html is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = html
        docs.append(link_doc)
    return docs
def extract_pages(link_docs):
    """Run trafilatura extraction on fetched link documents.

    Documents whose status is not ``"html_ok"`` are skipped.

    Returns ``(content, extracted_links)`` where *content* is a list of
    extracted documents (each with its ``"links"`` key removed) and
    *extracted_links* is a set of all links discovered in the pages.
    """
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        # Bug fix: fetch_pages stores the raw HTML under "html",
        # not "content".
        extracted_doc = trafilatura.bare_extraction(doc["html"], extract_links=True)
        links = extracted_doc["links"]
        # Bug fix: `set += list` raises TypeError; update() accumulates
        # the links into the set correctly.
        extracted_links.update(links)
        del extracted_doc["links"]
        content.append(extracted_doc)
    return content, extracted_links
def index_pages(pagedb, pages_list):
    """Insert extracted pages into the ``content`` collection.

    Each page is first tagged with per-paragraph checksums (presumably for
    later deduplication — `get_checksums` is defined elsewhere; confirm).
    """
    mycol = pagedb["content"]
    # Bug fix: the original loop iterated over the undefined name
    # `page_list` instead of the parameter `pages_list`.
    for page in pages_list:
        # get paragraph checksums
        page["checksums"] = get_checksums(page["text"])
    # Insert once, after every page has been annotated (in the original the
    # insert_many appeared to run on each loop iteration, duplicating the
    # whole batch). Dead locals `x` and `page_hashes` removed.
    mycol.insert_many(pages_list)
def get_visited_links(domain):
    """Return the links already visited for *domain*.

    Stub: persistence is not wired up yet, so every domain reports an
    empty history.
    """
    visited = []
    return visited
def generic_visit(domain):
    """Discover URLs to crawl for *domain*.

    Strategy order: RSS/Atom feeds first, then sitemaps, then a focused
    crawl as a last resort. Returns the discovered links.
    """
    known_links = set(get_visited_links(domain))
    # Bug fix: these helpers live in trafilatura submodules (feeds,
    # sitemaps, spider), as the commented-out code elsewhere in this file
    # shows — the top-level names used originally do not resolve.
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if visit_links is None:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if visit_links is None:
        # Bug fix: the original passed the undefined name `dommain`.
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    # Return the result so callers can use it (the original dropped it).
    return visit_links
def simple_visit(domain):
    """Crawl *domain* with trafilatura's focused crawler and print the result.

    Exploratory/debug code; alternative discovery strategies are kept
    below, commented out.
    """
    known_links = []
    # Bug fix: `res` was printed but the call producing it was commented
    # out, so this function raised NameError. Re-enable the crawl.
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
    # Alternative strategies kept for reference:
    # visit_links = trafilatura.feeds.find_feed_urls(domain)
    # visit_links = trafilatura.sitemaps.sitemap_search(domain)
    # print(visit_links)
    # for link in visit_links:
    #     content = trafilatura.fetch_url(link, decode=True)
    #     document = trafilatura.bare_extraction(content)
    #     print(content)
if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl,
    # and fail with a clear message instead of IndexError when the domain
    # argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: mongocwarler.py <domain>")
    simple_visit(sys.argv[1])
10
mongo/mongoindexer.py
Normal file
10
mongo/mongoindexer.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
"""Smoke-test script: insert one document into a local MongoDB."""

import pymongo

# Connects to the MongoDB instance started by mongo/docker-compose.yaml;
# root/example are the development credentials from that compose file.
myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")

mydb = myclient["mydatabase"]
mycol = mydb["customers"]

# Insert a single trivial document to verify the connection works.
mydict = {"text": "ahoj svet"}

x = mycol.insert_one(mydict)
Loading…
Reference in New Issue
Block a user