NOTE: the added files below were edited by hand after this patch was generated,
so the blob ids on the "index" lines no longer match the file contents.

diff --git a/mongo/docker-compose.yaml b/mongo/docker-compose.yaml
new file mode 100644
index 0000000..b527f0a
--- /dev/null
+++ b/mongo/docker-compose.yaml
@@ -0,0 +1,19 @@
+# Local development stack only: root/example are placeholder credentials.
+version: "3.0"
+services:
+  mongo:
+    image: mongo
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: root
+      MONGO_INITDB_ROOT_PASSWORD: example
+    ports:
+      - 27017:27017
+
+  mongo-express:
+    image: mongo-express
+    ports:
+      - 8081:8081
+    environment:
+      ME_CONFIG_MONGODB_ADMINUSERNAME: root
+      ME_CONFIG_MONGODB_ADMINPASSWORD: example
+      ME_CONFIG_MONGODB_URL: mongodb://root:example@mongo:27017/
diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
new file mode 100644
index 0000000..53c830a
--- /dev/null
+++ b/mongo/mongocwarler.py
@@ -0,0 +1,134 @@
+"""Prototype crawler: fetch pages with trafilatura and store them in MongoDB.
+
+Usage: python mongocwarler.py <start-url>
+"""
+import hashlib
+import sys
+
+import pymongo
+import trafilatura
+import trafilatura.feeds
+import trafilatura.sitemaps
+import trafilatura.spider
+
+
+def index_page(db, url, content, extracted_page):
+    """Store one page: raw HTML in db["html"], extraction in db["content"].
+
+    extracted_page is inserted as a document as-is.
+    """
+    db["html"].insert_one({"url": url, "content": content})
+    db["content"].insert_one(extracted_page)
+
+
+def fetch_pages(link_batch):
+    """Download each link and return one status document per link.
+
+    Every document has "url" and "status". On success status is "html_ok"
+    and the page source is stored under "html"; otherwise it stays
+    "unvisited".
+    """
+    docs = []
+    for link in link_batch:
+        link_doc = {"url": link, "status": "unvisited"}
+        html = trafilatura.fetch_url(link, decode=True)
+        if html is not None:
+            link_doc["status"] = "html_ok"
+            link_doc["html"] = html
+        docs.append(link_doc)
+    return docs
+
+
+def extract_pages(link_docs):
+    """Run content extraction on every successfully fetched document.
+
+    Returns (content, extracted_links): the list of extraction results with
+    their "links" entry removed, and the set of all links that were found.
+    """
+    content = []
+    extracted_links = set()
+    for doc in link_docs:
+        if doc["status"] != "html_ok":
+            continue
+        # fetch_pages stores the page source under "html".
+        # NOTE(review): extract_links= and the "links" result key are kept
+        # from the original code; confirm the installed trafilatura has them.
+        extracted_doc = trafilatura.bare_extraction(doc["html"], extract_links=True)
+        if extracted_doc is None:
+            continue
+        # set.update accepts any iterable; `set += list` raises TypeError.
+        extracted_links.update(extracted_doc.pop("links", None) or ())
+        content.append(extracted_doc)
+    return content, extracted_links
+
+
+def get_checksums(text):
+    """Return one SHA-256 hex digest per non-empty line of text.
+
+    NOTE(review): index_pages called this helper but nothing defined it;
+    treating each non-empty line as a paragraph is an assumption to confirm.
+    """
+    if not text:
+        return []
+    lines = (line.strip() for line in text.splitlines())
+    return [hashlib.sha256(line.encode("utf-8")).hexdigest() for line in lines if line]
+
+
+def index_pages(pagedb, pages_list):
+    """Add paragraph checksums to each page and insert all into pagedb["content"].
+
+    Each page dict is mutated in place: a "checksums" key is added.
+    """
+    pages = list(pages_list)
+    if not pages:
+        # insert_many raises on an empty document list.
+        return
+    for page in pages:
+        page["checksums"] = get_checksums(page["text"])
+    pagedb["content"].insert_many(pages)
+
+
+def get_visited_links(domain):
+    """Return the links already visited for domain (stub: always empty)."""
+    return []
+
+
+def generic_visit(domain):
+    """Collect links to visit for domain: feeds, then sitemaps, then a crawl.
+
+    Returns the list of links found by the first source that yields any.
+    """
+    known_links = set(get_visited_links(domain))
+    # `not` (rather than `is None`) so an empty result also falls through.
+    visit_links = trafilatura.feeds.find_feed_urls(domain)
+    if not visit_links:
+        visit_links = trafilatura.sitemaps.sitemap_search(domain)
+    if not visit_links:
+        # focused_crawler returns a (pages to visit, known links) pair.
+        to_visit, _ = trafilatura.spider.focused_crawler(
+            domain, known_links=known_links
+        )
+        visit_links = list(to_visit)
+    return visit_links
+
+
+def simple_visit(domain):
+    """Run a focused crawl starting at domain and print the raw result."""
+    known_links = []
+    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
+    print(res)
+    # TODO: fetch and extract the discovered links (fetch_pages, extract_pages).
+
+
+def main(argv=None):
+    """Command-line entry point; returns the process exit status."""
+    args = sys.argv[1:] if argv is None else argv
+    if not args:
+        print("usage: mongocwarler.py <start-url>", file=sys.stderr)
+        return 2
+    simple_visit(args[0])
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mongo/mongoindexer.py b/mongo/mongoindexer.py
new file mode 100644
index 0000000..3b5f6e5
--- /dev/null
+++ b/mongo/mongoindexer.py
@@ -0,0 +1,12 @@
+"""Smoke test: insert one sample document into the local MongoDB."""
+import pymongo
+
+# Credentials match the development stack in docker-compose.yaml.
+myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
+
+mydb = myclient["mydatabase"]
+mycol = mydb["customers"]
+
+mydict = {"text":"ahoj svet"}
+
+x = mycol.insert_one(mydict)