forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			143 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			143 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| //Jan Holp, DP 2021
 | |
| 
 | |
| 
 | |
| //client2 = cassandra
 | |
| //client1  = elasticsearch 
 | |
| //-----------------------------------------------------------------
 | |
| 
 | |
| //require for PageRank 
 | |
| var  Pagerank = require('../lib/pagerank')
 | |
| var fs = require('fs')
 | |
| var path = require('path')
 | |
| 
 | |
//require the Elasticsearch library
 | |
| const elasticsearch = require('elasticsearch');
 | |
// client1 = Elasticsearch client connected to the local node.
const client1 = new elasticsearch.Client({
	hosts: ['localhost:9200'],
});

// Ping the cluster once at start-up (30 s timeout) and report whether it answered.
client1.ping({ requestTimeout: 30000 }, (error) => {
	if (!error) {
		console.log('ELasticSearch is ok');
		return;
	}
	// The ping failed: Elasticsearch is down, check the Elasticsearch service.
	console.error('Elasticsearch cluster is down!');
});
 | |
| 
 | |
// Create the 'skweb2' index; the outcome (error or response) is only logged.
client1.indices.create({ index: 'skweb2' }, (error, response, status) => {
	if (error) {
		console.log(error);
		return;
	}
	console.log("created a new index", response);
});
 | |
| 
 | |
/**
 * Build the flat action/document array sent to the Elasticsearch bulk API:
 * every document is preceded by an `index` action that names its target.
 *
 * @param {string} index - Name of the target index.
 * @param {string} type - Document type written into each action.
 * @param {Object[]} data - Documents; `target_link` of each one is used as its id.
 * @returns {Object[]} Alternating action and document entries.
 */
const buildBulkBody = function buildBulkBody(index, type, data) {
	const bulkBody = [];
	data.forEach(item => {
		bulkBody.push({
			index: {
				_index: index,
				_type: type,
				_id: item.target_link,  // the document id is the page URL
			}
		});
		bulkBody.push(item);
	});
	return bulkBody;
};

/**
 * Index documents into Elasticsearch with a single bulk request on client1.
 *
 * @param {string} index - Name of the target index.
 * @param {string} type - Document type.
 * @param {Object[]} data - Documents to index (each needs a `target_link`).
 * @returns {Promise<number>} Number of documents indexed without an item-level
 *   error; 0 when `data` is empty or the bulk request itself fails.
 */
const bulkIndex = function bulkIndex(index, type, data) {
	// Nothing to send: skip the request instead of posting an empty bulk body.
	if (!Array.isArray(data) || data.length === 0) {
		return Promise.resolve(0);
	}

	const bulkBody = buildBulkBody(index, type, data);
	console.log(bulkBody);

	return client1.bulk({body: bulkBody})
		.then(response => {
			// Must be mutable: it is incremented once per failed item.
			let errorCount = 0;
			response.items.forEach(item => {
				if (item.index && item.index.error) {
					console.log(++errorCount, item.index.error);
				}
			});
			const indexedCount = data.length - errorCount;
			console.log(`Successfully indexed ${indexedCount} out of ${data.length} items`);
			return indexedCount;
		})
		.catch(error => {
			// Log the failure here so the rejection is actually handled.
			console.error(error);
			return 0;
		});
};
 | |
| 
 | |
| 
 | |
| const cassandra = require('cassandra-driver');
 | |
// client2 = Cassandra client for the local node, bound to the 'websucker' keyspace.
const client2 = new cassandra.Client({
	contactPoints: ['localhost:9042'],
	localDataCenter: 'datacenter1',
	keyspace: 'websucker',
});

// Selects the names of all domains that have a good_count above zero.
const query1 = 'SELECT domain_name FROM websucker.domain_quality WHERE good_count > 0 ALLOW FILTERING';
 | |
| //const query2 = 'SELECT * from websucker.content WHERE domain_name = ' + domain_name[i] + 'ALLOW FILTERING'; // body_size > 0
 | |
| 
 | |
| //-------------------------------------------------------------------------
 | |
| 
 | |
// Domain names returned by query1; filled in place, never reassigned.
const domain_name = [];
// Documents of the most recently processed domain; reassigned per domain.
let object_list = [];
// PageRank parameters.
const linkProb = 0.85;     // high numbers are more stable (Pagerank)
const tolerance = 0.0001;  // sensitivity for accuracy of convergence
 | |
| 
 | |
| 
 | |
// Selects the non-empty documents of one domain. The domain name is passed as
// a bound parameter instead of being concatenated into the CQL text.
const contentQuery = 'SELECT * from websucker.content WHERE domain_name = ? and body_size > 0 ALLOW FILTERING';

// Select all domain names, store them in domain_name, then fetch and index
// the non-empty documents of each domain.
client2.execute(query1)
	.then(result => {
		result.rows.forEach(row => {
			domain_name.push(row.domain_name);
		});
		console.log("Vsetky domenove mena : " , domain_name);

		domain_name.forEach(name => {
			client2.execute(contentQuery, [name], { prepare: true })
				.then(res => {
					// Local to this callback, so concurrent per-domain
					// queries cannot overwrite each other's results.
					const documents = res.rows.map(row => ({
						domain_name: row.domain_name,
						title: row.title,
						body: row.body,
						links: row.links,
						target_link: row.target_link,
					}));

					// A domain without matching rows has nothing to index.
					if (documents.length === 0) {
						return;
					}
					bulkIndex('skweb2', 'web_page', documents);
				})
				.catch(error => console.log(error));
		});
	})
	.catch(err => console.log(err));
 | |
| 
 | |
| //volanie funkcie pre vypocet Pageranku a definovane premenne
 | |
| 
 | |
| //Larger numbers (0.85) //var linkProb = 0.85;    
 | |
| //accuracy at which we terminate 
 | |
| //--------------Pagerank 
 | |
| //	const linkProb = 0.85;    // high numbers are more stable (Pagerank)
 | |
| //	const tolerance = 0.0001;  // sensitivity for accuracy of convergence
 | |
|    /*
 | |
| 	var nodeMatrix = [
 | |
|         [object_list.links]
 | |
|     ];
 | |
| */
 | |
| /*
 | |
| 
 | |
| const PR = function PR(nodeMatrix,linkProb,tolerance){
 | |
| 
 | |
| Pagerank(nodeMatrix, linkProb, tolerance, function (err, res) {
 | |
| 
 | |
| 
 | |
|     return res;
 | |
|     //console.log(res);
 | |
|            
 | |
| });
 | |
| }
 | |
| */ |