// Jan Holp, DP 2021
// client1 = Elasticsearch
// client2 = Cassandra
//-----------------------------------------------------------------
// Copies non-empty articles of every "good" domain from the Cassandra
// keyspace `websucker` into the Elasticsearch index `skweb2`.

// Required for PageRank (only referenced by the disabled code below).
const Pagerank = require('../lib/pagerank');
const fs = require('fs');
const path = require('path');

// Require the Elasticsearch library.
const elasticsearch = require('elasticsearch');
const client1 = new elasticsearch.Client({
  hosts: ['localhost:9200'],
});

// Connectivity check; the result is only logged.
client1.ping(
  {
    requestTimeout: 30000,
  },
  function (error) {
    if (error) {
      // At this point Elasticsearch is down; check the Elasticsearch service.
      console.error('Elasticsearch cluster is down!');
    } else {
      console.log('ELasticSearch is ok');
    }
  }
);

// Create the new index - skweb2. A failure is only logged.
client1.indices.create(
  {
    index: 'skweb2',
  },
  function (error, response, status) {
    if (error) {
      console.log(error);
    } else {
      console.log('created a new index', response);
    }
  }
);

/**
 * Indexes a list of documents into Elasticsearch with a single bulk request.
 *
 * Every document is stored under its `target_link` (the page URL) as `_id`.
 *
 * @param {string} index - Name of the target index.
 * @param {string} type - Value sent as `_type` for every document.
 * @param {Array<Object>} data - Documents to index; each needs `target_link`.
 * @returns {Promise<void>} Settles once the bulk request has been processed.
 *   Request failures are logged, not rethrown.
 */
const bulkIndex = function bulkIndex(index, type, data) {
  // Nothing to send; skip the request instead of posting an empty bulk body.
  if (data.length === 0) {
    console.log('No items to index');
    return Promise.resolve();
  }

  // The bulk body alternates an action line with the document it applies to.
  const bulkBody = [];
  for (const item of data) {
    bulkBody.push({
      index: {
        _index: index,
        _type: type,
        _id: item.target_link, // document id is the URL
      },
    });
    bulkBody.push(item);
  }
  console.log(bulkBody);

  return client1
    .bulk({ body: bulkBody })
    .then((response) => {
      // Must be mutable: it is incremented for every failed item.
      let errorCount = 0;
      response.items.forEach((item) => {
        if (item.index && item.index.error) {
          console.log(++errorCount, item.index.error);
        }
      });
      console.log(
        `Successfully indexed ${data.length - errorCount} out of ${data.length} items`
      );
    })
    .catch((error) => console.error(error));
};

const cassandra = require('cassandra-driver');
const client2 = new cassandra.Client({
  contactPoints: ['localhost:9042'],
  localDataCenter: 'datacenter1',
  keyspace: 'websucker',
});

const query1 =
  'SELECT domain_name FROM websucker.domain_quality WHERE good_count > 0 ALLOW FILTERING';
//const query2 = 'SELECT * from websucker.content WHERE domain_name = ' + domain_name[i] + 'ALLOW FILTERING'; // body_size > 0
//-------------------------------------------------------------------------

const domain_name = []; // array of domain names
let object_list = []; // documents of the most recently fetched domain

const linkProb = 0.85; // high numbers are more stable (Pagerank)
const tolerance = 0.0001; // sensitivity for accuracy of convergence

// Select all domain names, then fetch and index the non-empty articles of
// each domain.
client2
  .execute(query1)
  .then((result) => {
    result.rows.forEach((row) => {
      domain_name.push(row.domain_name);
    });
    console.log('Vsetky domenove mena : ', domain_name);

    // The domain name is bound as a parameter instead of being concatenated
    // into the CQL text, so quotes inside a name cannot alter the statement.
    const contentQuery =
      'SELECT * from websucker.content WHERE domain_name = ? and body_size > 0 ALLOW FILTERING';

    domain_name.forEach((name) => {
      client2
        .execute(contentQuery, [name], { prepare: true })
        .then((res) => {
          object_list = res.rows.map((rr) => ({
            domain_name: rr.domain_name,
            title: rr.title,
            body: rr.body,
            links: rr.links,
            target_link: rr.target_link,
            // pagerank: Pagerank(rr.links, linkProb, tolerance, function (err, res) {
            //   return res;
            // })
          }));
          //console.log(object_list);
          return bulkIndex('skweb2', 'web_page', object_list);
        })
        .catch((error) => console.log(error));
    });
  })
  .catch((err) => console.log(err));

// Disabled: call of the PageRank function and its variables.
// Larger numbers (0.85)
//var linkProb = 0.85;
// accuracy at which we terminate
//--------------Pagerank
// const linkProb = 0.85; // high numbers are more stable (Pagerank)
// const tolerance = 0.0001; // sensitivity for accuracy of convergence
/*
var nodeMatrix = [
  [object_list.links]
];
*/
/*
const PR = function PR(nodeMatrix, linkProb, tolerance) {
  Pagerank(nodeMatrix, linkProb, tolerance, function (err, res) {
    return res;
    //console.log(res);
  });
}
*/