forked from KEMT/zpwiki
143 lines
4.0 KiB
JavaScript
143 lines
4.0 KiB
JavaScript
|
//Jan Holp, DP 2021
|
||
|
|
||
|
|
||
|
//client2 = cassandra
|
||
|
//client1 = elasticsearch
|
||
|
//-----------------------------------------------------------------
|
||
|
|
||
|
//require for PageRank
|
||
|
var Pagerank = require('../lib/pagerank')
|
||
|
var fs = require('fs')
|
||
|
var path = require('path')
|
||
|
|
||
|
//require the Elasticsearch library
|
||
|
const elasticsearch = require('elasticsearch');
|
||
|
// Elasticsearch client (client1) pointed at the local node on port 9200.
const client1 = new elasticsearch.Client({ hosts: ['localhost:9200'] });

// Health check: ping the cluster with a 30000 ms request timeout and report
// whether it answered.
client1.ping({ requestTimeout: 30000 }, (error) => {
  if (!error) {
    console.log('ELasticSearch is ok');
    return;
  }
  // The ping failed: Elasticsearch is down, please check the Elasticsearch service.
  console.error('Elasticsearch cluster is down!');
});
|
||
|
|
||
|
// Create the `skweb2` index. A failure is only logged; on success the
// response returned by Elasticsearch is logged.
client1.indices.create({ index: 'skweb2' }, (error, response) => {
  if (error) {
    console.log(error);
    return;
  }
  console.log("created a new index", response);
});
|
||
|
|
||
|
/**
 * Index a batch of documents into Elasticsearch with a single bulk request.
 *
 * For every item an `index` action line is emitted followed by the document
 * itself; the document id is the item's `target_link` (its URL).
 *
 * @param {string} index - Name of the target index.
 * @param {string} type - Value sent as `_type` for every document.
 * @param {Array<Object>} data - Documents to index; each should carry `target_link`.
 * @returns {Promise<void>} Settles once the bulk response has been processed.
 *   Request failures are logged with `console.error` and not rethrown.
 */
const bulkIndex = function bulkIndex(index, type, data) {
  // Nothing to send: skip the request instead of issuing an empty bulk body.
  if (data.length === 0) {
    return Promise.resolve();
  }

  const bulkBody = [];
  data.forEach((item) => {
    bulkBody.push({
      index: {
        _index: index,
        _type: type,
        _id: item.target_link, // the document id is the URL
      },
    });
    bulkBody.push(item);
  });
  console.log(bulkBody);

  return client1.bulk({ body: bulkBody })
    .then((response) => {
      // Must be `let`: the counter is incremented for every failed item.
      let errorCount = 0;
      response.items.forEach((item) => {
        if (item.index && item.index.error) {
          console.log(++errorCount, item.index.error);
        }
      });
      console.log(
        `Successfully indexed ${data.length - errorCount}
out of ${data.length} items`
      );
    })
    // `console.err` does not exist; log rejections through `console.error`.
    .catch((err) => console.error(err));
};
|
||
|
|
||
|
|
||
|
const cassandra = require('cassandra-driver');
|
||
|
// Cassandra client (client2) connected to the local node, keyspace `websucker`.
const client2 = new cassandra.Client({
  contactPoints: ['localhost:9042'],
  localDataCenter: 'datacenter1',
  keyspace: 'websucker',
});

// Selects the name of every domain whose good_count is greater than 0.
const query1 = 'SELECT domain_name FROM websucker.domain_quality WHERE good_count > 0 ALLOW FILTERING';
// Selects the pages of one domain whose body_size is greater than 0. The
// domain name is bound as a parameter rather than concatenated into the CQL
// text, so a name containing a quote cannot alter the statement.
const contentQuery = 'SELECT * from websucker.content WHERE domain_name = ? and body_size > 0 ALLOW FILTERING';

//-------------------------------------------------------------------------

const domain_name = []; // array of domain names
let object_list = []; // documents of the domain whose rows were fetched most recently
const linkProb = 0.85; // high numbers are more stable (Pagerank)
const tolerance = 0.0001; // sensitivity for accuracy of convergence

// Select all domain names and store them in the array, then, for each domain
// name, fetch its non-empty pages and send them to Elasticsearch.
client2.execute(query1)
  .then((result) => {
    result.rows.forEach((row) => {
      domain_name.push(row.domain_name);
    });
    console.log("Vsetky domenove mena : " , domain_name);

    domain_name.forEach((name) => {
      client2.execute(contentQuery, [name], { prepare: true })
        .then((res) => {
          // Keep only the fields that are indexed. PageRank is not computed
          // here (the Pagerank call was left disabled in the original).
          object_list = res.rows.map((row) => ({
            domain_name: row.domain_name,
            title: row.title,
            body: row.body,
            links: row.links,
            target_link: row.target_link,
          }));
          bulkIndex('skweb2', 'web_page', object_list);
        })
        .catch((error) => console.log(error));
    });
  })
  .catch((err) => console.log(err));
|
||
|
|
||
|
//call of the function that computes PageRank, and the variables it uses
|
||
|
|
||
|
//Larger numbers (0.85) //var linkProb = 0.85;
|
||
|
//accuracy at which we terminate
|
||
|
//--------------Pagerank
|
||
|
// const linkProb = 0.85; // high numbers are more stable (Pagerank)
|
||
|
// const tolerance = 0.0001; // sensitivity for accuracy of convergence
|
||
|
/*
|
||
|
var nodeMatrix = [
|
||
|
[object_list.links]
|
||
|
];
|
||
|
*/
|
||
|
/*
|
||
|
|
||
|
const PR = function PR(nodeMatrix,linkProb,tolerance){
|
||
|
|
||
|
Pagerank(nodeMatrix, linkProb, tolerance, function (err, res) {
|
||
|
|
||
|
|
||
|
return res;
|
||
|
//console.log(res);
|
||
|
|
||
|
});
|
||
|
}
|
||
|
*/
|