// Provenance: reconstructed from git patch 4989384544f5b8d347edbe8cccf98454924e68e5
// (From: Ján Holp, Date: Tue, 5 Jan 2021 08:32:07 +0000), which adds
// "pages/students/2016/jan_holp/dp2021/dp2021 zdrojove subory/cassandra.js".
//
// Jan Holp, DP 2021
//
// Reads crawled documents from Cassandra (keyspace `websucker`) and indexes
// them into the Elasticsearch index `skweb2`.
//
// client1 = Elasticsearch
// client2 = Cassandra
//-----------------------------------------------------------------

// Required for PageRank (not used yet, see the TODO at the end of the file).
const Pagerank = require('../lib/pagerank');
const fs = require('fs');
const path = require('path');

// Elasticsearch and Cassandra client libraries.
const elasticsearch = require('elasticsearch');
const cassandra = require('cassandra-driver');

const client1 = new elasticsearch.Client({
  hosts: ['localhost:9200'],
});

// Report once at start-up whether the Elasticsearch node answers.
client1.ping({ requestTimeout: 30000 }, function (error) {
  if (error) {
    // Elasticsearch did not answer the ping; check the Elasticsearch service.
    console.error('Elasticsearch cluster is down!');
  } else {
    console.log('ELasticSearch is ok');
  }
});

// Create the target index `skweb2`. Any error returned by Elasticsearch is
// only logged, so start-up continues either way.
client1.indices.create({ index: 'skweb2' }, function (error, response) {
  if (error) {
    console.log(error);
  } else {
    console.log("created a new index", response);
  }
});

/**
 * Indexes a batch of documents with a single Elasticsearch bulk request.
 *
 * Every document is sent as an `index` action whose `_id` is the document's
 * `target_link`, followed by the document itself as the action's source.
 *
 * @param {string} index - Name of the Elasticsearch index to write to.
 * @param {string} type - Value sent as `_type` for every document.
 * @param {Array<Object>} data - Documents to index; each needs `target_link`.
 * @returns {Promise<void>} Settles after the bulk response has been logged.
 *   Request failures are logged with `console.error`, not rethrown.
 */
const bulkIndex = function bulkIndex(index, type, data) {
  // Nothing to send: skip the request instead of posting an empty bulk body.
  if (!Array.isArray(data) || data.length === 0) {
    console.log(`No documents to index into ${index}`);
    return Promise.resolve();
  }

  const bulkBody = [];
  data.forEach((item) => {
    bulkBody.push({
      index: {
        _index: index,
        _type: type,
        _id: item.target_link, // the document id is its URL
      },
    });
    bulkBody.push(item);
  });
  console.log(bulkBody);

  return client1
    .bulk({ body: bulkBody })
    .then((response) => {
      // Must be reassignable: it is incremented for every failed item.
      let errorCount = 0;
      response.items.forEach((item) => {
        if (item.index && item.index.error) {
          errorCount += 1;
          console.log(errorCount, item.index.error);
        }
      });
      console.log(
        `Successfully indexed ${data.length - errorCount} out of ${data.length} items`
      );
    })
    .catch((error) => console.error(error));
};

const client2 = new cassandra.Client({
  contactPoints: ['localhost:9042'],
  localDataCenter: 'datacenter1',
  keyspace: 'websucker',
});

// Domains that have at least one good page.
const query1 = 'SELECT domain_name FROM websucker.domain_quality WHERE good_count > 0 ALLOW FILTERING';
// Non-empty documents of one domain. The domain name is bound as a parameter
// so that quotes inside crawled names cannot break or alter the statement.
const query2 = 'SELECT * from websucker.content WHERE domain_name = ? and body_size > 0 ALLOW FILTERING';

//-------------------------------------------------------------------------

var domain_name = []; // all selected domain names
const linkProb = 0.85; // high numbers are more stable (Pagerank)
const tolerance = 0.0001; // sensitivity for accuracy of convergence

// Select all domain names, then fetch and index the documents of each domain.
client2.execute(query1)
  .then((result) => {
    result.rows.forEach((row) => {
      domain_name.push(row.domain_name);
    });
    console.log("Vsetky domenove mena : ", domain_name);

    // One content query per domain; the queries run concurrently, so each
    // callback keeps its own local document list.
    domain_name.forEach((name) => {
      client2.execute(query2, [name], { prepare: true })
        .then((res) => {
          const documents = res.rows.map((row) => ({
            domain_name: row.domain_name,
            title: row.title,
            body: row.body,
            links: row.links,
            target_link: row.target_link,
          }));
          return bulkIndex('skweb2', 'web_page', documents);
        })
        .catch((error) => console.log(error));
    });
  })
  .catch((err) => console.log(err));

// TODO: PageRank is not computed yet. The earlier draft intended to call
// Pagerank(nodeMatrix, linkProb, tolerance, function (err, res) { ... })
// with a node matrix built from each document's `links`, and to store the
// result in a `pagerank` field of the indexed document.