zz
This commit is contained in:
		
							parent
							
								
									437d4f9684
								
							
						
					
					
						commit
						1752d5c776
					
				
							
								
								
									
										18
									
								
								mongo/docker-compose.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								mongo/docker-compose.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | ||||
# Local MongoDB stack: the mongo server plus the mongo-express web admin UI.
# Credentials here must match the connection strings used by the crawler
# and indexer scripts in this directory.
version: "3.0"
services:
  mongo:
    image: mongo
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: example
    ports:
      - 27017:27017

  # Web-based admin interface, served at http://localhost:8081
  mongo-express:
    image: mongo-express
    ports:
      - 8081:8081
    environment:
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: example
      ME_CONFIG_MONGODB_URL: mongodb://root:example@mongo:27017/
							
								
								
									
										75
									
								
								mongo/mongocwarler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								mongo/mongocwarler.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,75 @@ | ||||
| import pymongo | ||||
| 
 | ||||
| 
 | ||||
| import trafilatura | ||||
| import trafilatura.feeds | ||||
| import trafilatura.sitemaps | ||||
| import trafilatura.spider | ||||
| import sys | ||||
| 
 | ||||
def index_page(db, url, content, extracted_page):
    """Store one fetched page in MongoDB.

    The raw HTML goes into the "html" collection as {"url", "content"};
    the extractor's output dict goes into the "content" collection.

    db: pymongo database handle
    url: the page's URL
    content: raw HTML string
    extracted_page: dict produced by the extractor
    """
    # Bug fix: original assigned `htmlldb` but used `htmldb` (NameError),
    # and inserted `ulr` instead of `url` (second NameError).
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)
| 
 | ||||
def fetch_pages(link_batch):
    """Download every URL in *link_batch*.

    Returns a list of link documents, one per input URL:
    {"url": ..., "status": "html_ok", "html": ...} on success, or
    {"url": ..., "status": "unvisited"} when the fetch returned None.
    """
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "unvisited"}
        # Bug fix: the original called fetch_url(page, ...) with an
        # undefined name `page`; the loop variable is `link`.
        response = trafilatura.fetch_url(link, decode=True)
        if response is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = response
        docs.append(link_doc)
    return docs
| 
 | ||||
def extract_pages(link_docs):
    """Run trafilatura extraction over fetched link documents.

    Skips documents whose fetch did not succeed. Returns
    (content, extracted_links): the list of extracted page dicts (with
    their "links" entry removed) and the set of all links found in them.
    """
    content = []
    extracted_links = set()
    for doc in link_docs:
        if doc["status"] != "html_ok":
            continue
        # Bug fix: fetch_pages stores the raw HTML under "html";
        # the original read the nonexistent key "content" (KeyError).
        extracted_doc = trafilatura.bare_extraction(doc["html"], extract_links=True)
        links = extracted_doc["links"]
        # Bug fix: `set += list` raises TypeError; update() merges in place.
        extracted_links.update(links)
        del extracted_doc["links"]
        content.append(extracted_doc)
    return content, extracted_links
| 
 | ||||
def index_pages(pagedb, pages_list):
    """Insert extracted pages into the "content" collection, tagging each
    page with its paragraph checksums first.

    pagedb: pymongo database handle
    pages_list: list of extracted page dicts; mutated in place (each gets
        a "checksums" entry).

    NOTE(review): get_checksums is not defined anywhere in this file —
    confirm it exists elsewhere in the project before running.
    """
    mycol = pagedb["content"]
    # Bug fix: the original iterated over the undefined name `page_list`;
    # the parameter is `pages_list`.
    for page in pages_list:
        page["checksums"] = get_checksums(page["text"])
    # Bug fix: removed the dead `page_hashes = []` and the over-indented
    # trailing `pass`, which was a syntax error.
    mycol.insert_many(pages_list)
| 
 | ||||
def get_visited_links(domain):
    """Return the links already crawled for *domain*.

    Placeholder: no persistence is wired up yet, so the result is
    always an empty list.
    """
    return list()
| 
 | ||||
def generic_visit(domain):
    """Discover URLs to visit for *domain*.

    Tries RSS/Atom feeds first, then the sitemap, then falls back to a
    focused crawl that avoids the already-known links. Returns whatever
    link collection the successful strategy produced (the original
    discarded it, which made the function useless to callers; returning
    it is backward-compatible).

    NOTE(review): feed/sitemap helpers may return an empty list rather
    than None on failure — confirm against the installed trafilatura
    version; the fallbacks only trigger on None here, as in the original.
    """
    known_links = set(get_visited_links(domain))
    # Removed the redundant `visit_links = []` that was immediately
    # overwritten in the original.
    visit_links = trafilatura.find_feed_urls(domain)
    if visit_links is None:
        visit_links = trafilatura.sitemap_search(domain)
    if visit_links is None:
        # Bug fix: `dommain` was a typo for `domain` (NameError).
        visit_links = trafilatura.focused_crawler(domain, known_links=known_links)
    return visit_links
| 
 | ||||
def simple_visit(domain):
    """Debug helper: run a focused crawl on *domain* and print the result."""
    known_links = []
    # Bug fix: the original printed `res` while the line defining it was
    # commented out, so every call raised NameError. Re-enabled the call;
    # the other commented-out experiments were deleted as dead code.
    res = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    print(res)
| 
 | ||||
| simple_visit(sys.argv[1]) | ||||
							
								
								
									
										10
									
								
								mongo/mongoindexer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								mongo/mongoindexer.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,10 @@ | ||||
# Smoke-test script: insert a single document into the MongoDB instance
# started by mongo/docker-compose.yaml (root/example credentials must
# match that compose file).
import pymongo

myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")

mydb = myclient["mydatabase"]
mycol = mydb["customers"]

# Sample document ("ahoj svet" is Slovak for "hello world").
mydict = {"text": "ahoj svet"}

# insert_one returns an InsertOneResult; kept in `x` but otherwise unused.
x = mycol.insert_one(mydict)
		Loading…
	
		Reference in New Issue
	
	Block a user