zz
This commit is contained in:
parent
437d4f9684
commit
1752d5c776
18
mongo/docker-compose.yaml
Normal file
18
mongo/docker-compose.yaml
Normal file
# Local MongoDB + mongo-express admin UI for development.
# NOTE(review): credentials are hard-coded for local use only — do not deploy as-is.
version: "3.0"

services:
  mongo:
    image: mongo
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: example
    ports:
      - 27017:27017

  # Web admin UI, reachable on http://localhost:8081
  mongo-express:
    image: mongo-express
    ports:
      - 8081:8081
    environment:
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: example
      ME_CONFIG_MONGODB_URL: mongodb://root:example@mongo:27017/
75
mongo/mongocwarler.py
Normal file
75
mongo/mongocwarler.py
Normal file
@ -0,0 +1,75 @@
|
||||
import pymongo
|
||||
|
||||
|
||||
import trafilatura
|
||||
import trafilatura.feeds
|
||||
import trafilatura.sitemaps
|
||||
import trafilatura.spider
|
||||
import sys
|
||||
|
||||
def index_page(db, url, content, extracted_page):
    """Store one fetched page in the database.

    Writes the raw HTML into the ``html`` collection and the extracted
    document into the ``content`` collection.

    Args:
        db: pymongo database handle (any mapping of collection-like objects
            exposing ``insert_one`` works).
        url: URL the page was fetched from.
        content: raw HTML source of the page.
        extracted_page: dict produced by the extraction step.
    """
    # BUG FIX: original assigned db["html"] to `htmlldb` but inserted via the
    # undefined name `htmldb`, and referenced undefined `ulr` instead of `url`.
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)
||||
|
||||
def fetch_pages(link_batch):
    """Fetch every URL in *link_batch* and return one status doc per link.

    Args:
        link_batch: iterable of URL strings.

    Returns:
        list of dicts with keys ``url``, ``status`` ("unvisited" when the
        fetch failed, "html_ok" on success) and, on success, ``html`` with
        the page source.
    """
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        # BUG FIX: original passed the undefined name `page` instead of `link`.
        response = trafilatura.fetch_url(link, decode=True)
        if response is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = response
        docs.append(link_doc)
    return docs
||||
|
||||
def extract_pages(link_docs):
    """Extract main content and outgoing links from fetched pages.

    Docs whose fetch did not succeed (status != "html_ok") are skipped.

    Args:
        link_docs: list of dicts as produced by ``fetch_pages``.

    Returns:
        tuple of (list of extracted content dicts with their "links" key
        removed, set of all discovered outgoing links).
    """
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        # BUG FIX: fetch_pages stores the page source under "html", not
        # "content"; and bare_extraction's kwarg is include_links, not
        # extract_links.
        extracted_doc = trafilatura.bare_extraction(doc["html"], include_links=True)
        # NOTE(review): assumes the extraction result exposes a "links"
        # entry — confirm against the installed trafilatura version.
        links = extracted_doc["links"]
        # BUG FIX: a set does not support `+=` with a list; use update().
        extracted_links.update(links)
        del extracted_doc["links"]
        content.append(extracted_doc)
    return content, extracted_links
||||
|
||||
def index_pages(pagedb, pages_list):
    """Insert extracted pages, annotated with paragraph checksums, into the db.

    Args:
        pagedb: pymongo database handle.
        pages_list: list of extracted page dicts; each must carry a "text" key.
    """
    # Robustness: pymongo's insert_many raises on an empty document list.
    if not pages_list:
        return
    mycol = pagedb["content"]
    for page in pages_list:
        # BUG FIX: original iterated the undefined name `page_list`
        # (parameter is `pages_list`).
        # Attach per-paragraph checksums for later deduplication.
        # NOTE(review): get_checksums is expected to be defined elsewhere
        # in the project — confirm.
        page["checksums"] = get_checksums(page["text"])
    mycol.insert_many(pages_list)
||||
|
||||
def get_visited_links(domain):
    """Return the links already crawled for *domain*.

    Placeholder: crawl-state persistence is not implemented yet, so the
    result is always an empty list regardless of *domain*.
    """
    return []
||||
|
||||
def generic_visit(domain):
    """Collect candidate URLs for *domain*: feeds first, then sitemaps,
    then a focused crawl as a last resort.

    Args:
        domain: homepage / domain URL to explore.

    Returns:
        the discovered links from the first strategy that produced any
        (the focused-crawler result for the final fallback).
    """
    known_links = set(get_visited_links(domain))
    # BUG FIX: these helpers live in trafilatura submodules (feeds /
    # sitemaps / spider) as imported at the top of this file; the bare
    # trafilatura.* names used originally do not exist. Also fixes the
    # misspelled `dommain` in the crawl fallback.
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    # A failed strategy may yield an empty list rather than None,
    # so test for falsiness instead of identity with None.
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    # Original computed the links and dropped them; returning them is
    # backward compatible and makes the function usable.
    return visit_links
||||
|
||||
def simple_visit(domain):
    """Minimal crawl experiment: run trafilatura's focused crawler on
    *domain* and print the result.

    NOTE(review): exploratory code — alternative discovery strategies are
    intentionally left commented out below.
    """
    known_links = []
    # BUG FIX: `res` was printed while the line producing it was commented
    # out, so every call raised NameError; restore the crawler call.
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
    #visit_links = trafilatura.feeds.find_feed_urls(domain)
    #visit_links = trafilatura.sitemaps.sitemap_search(domain)
    #print(visit_links)
    #for link in visit_links:
    #    content = trafilatura.fetch_url(link,decode=True)
    #    document = trafilatura.bare_extraction(content)
    #    print(content)
|
||||
# Script entry point: crawl the domain given on the command line.
# Guarded so importing this module does not trigger a network crawl.
if __name__ == "__main__":
    simple_visit(sys.argv[1])
10
mongo/mongoindexer.py
Normal file
10
mongo/mongoindexer.py
Normal file
"""Smoke test: insert a single document into a local MongoDB instance.

Expects the MongoDB container from mongo/docker-compose.yaml to be
running on localhost:27017.
"""
import pymongo

# NOTE(review): hard-coded root credentials match the local
# docker-compose setup — development only.
myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")

mydb = myclient["mydatabase"]
mycol = mydb["customers"]

mydict = {"text": "ahoj svet"}

x = mycol.insert_one(mydict)
Loading…
Reference in New Issue
Block a user