# This notebook clusters samples based on their semantic similarity.


In [1]:
# imports 

from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import numpy as np
import torch
import numpy as np
import warnings
import json
import os

warnings.filterwarnings("ignore")

 from .autonotebook import tqdm as notebook_tqdm


### Model init

This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model.

In [2]:
# Instantiate the sentence-embedding model once at module level;
# create_embeddings() reads this global `model` to encode the texts.
model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')

### Data manipulation in file system

In [3]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # `line.strip()` is empty for blank lines, which carry no record.
        return [json.loads(line) for line in file if line.strip()]

## Pipeline functions

### Embedding creation

In [4]:
def create_embeddings(jsonl_file):
    """Encode the ``'text'`` field of every record with the global ``model``.

    Args:
        jsonl_file: Iterable of records, each providing a ``'text'`` key.

    Returns:
        A ``(embeddings, sentences)`` tuple: the value returned by
        ``model.encode`` for the collected texts, and the list of those
        texts in input order.
    """
    sentences = []
    for record in jsonl_file:
        sentences.append(record['text'])

    embeddings = model.encode(sentences)
    return embeddings, sentences

### Clustering algorithm

In [5]:
def cluster_data(embeddings, sentences, similarity_threshold=0.65, min_length=20):
    """Select the sentences that pass the length and similarity filter.

    A sentence is kept when BOTH hold:

    * it has more than ``min_length`` characters, and
    * at least one other sentence with more than ``min_length`` characters
      has a cosine similarity to it strictly below ``similarity_threshold``.

    NOTE(review): this reproduces the original selection rule. It keeps a
    sentence as soon as it is dissimilar to *any one* long sentence, so
    near-duplicates are generally NOT removed. If the intent is spam /
    duplicate removal, the criterion probably needs to be revisited --
    confirm with the author before changing results.

    Args:
        embeddings: Array-like of sentence embeddings, one row per sentence,
            aligned with ``sentences``.
        sentences: Sequence of texts aligned with ``embeddings``.
        similarity_threshold: Cosine-similarity cut-off (default ``0.65``).
        min_length: Sentences need strictly more characters than this to be
            considered (default ``20``).

    Returns:
        A list of the unique kept sentences, in order of first appearance.
        Returns an empty list for empty input.
    """
    if len(sentences) == 0:
        return []

    embeddings_np = np.array(embeddings)
    long_enough_mask = np.array([len(sentence) > min_length for sentence in sentences])

    # Build the tensor once and reuse it for both sides of the comparison.
    embeddings_tensor = torch.tensor(embeddings_np)
    cosine_sim_matrix = util.pytorch_cos_sim(embeddings_tensor, embeddings_tensor).numpy()

    # pair_mask[i, j] is True when j is long enough and dissimilar to i.
    pair_mask = cosine_sim_matrix < similarity_threshold
    pair_mask &= long_enough_mask[np.newaxis, :]

    # Reduce per row instead of enumerating every (i, j) pair in Python:
    # sentence i is kept if it is long enough and has at least one such j.
    keep_mask = long_enough_mask & pair_mask.any(axis=1)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(sentences[i] for i in np.flatnonzero(keep_mask)))

### Prepare data to write it to JSONL

In [12]:
def filter_null_text(json_list):
    """Return the records that have a ``"text"`` key whose value is not None.

    Args:
        json_list: Iterable of records supporting ``in`` and ``[]`` lookups.

    Returns:
        A new list with the qualifying records, in their original order.
    """
    kept_records = []
    for record in json_list:
        if "text" not in record:
            continue
        if record["text"] is None:
            continue
        kept_records.append(record)
    return kept_records

In [6]:
def create_jsonl_format(filtered, jsonl_file):
    """Build the output records for every item whose text was kept.

    Args:
        filtered: Iterable of kept texts (hashable values such as strings).
        jsonl_file: Iterable of records providing ``'id'``, ``'author'`` and
            ``'text'`` keys.

    Returns:
        A list of ``{'id', 'author', 'text'}`` dicts, one per input record
        whose ``'text'`` is contained in ``filtered``, in input order. Any
        other keys of the input records are dropped.
    """
    # A set gives constant-time membership tests; checking against the
    # original list would rescan it for every record.
    kept_texts = set(filtered)

    return [
        {
            'id': item['id'],
            'author': item['author'],
            'text': item['text']
        }
        for item in jsonl_file if item['text'] in kept_texts
    ]

### Write out JSONL file

In [7]:
def write_jsonl(filename, data):
    """Write an iterable of JSON-serialisable items as a JSON Lines file.

    The parent directory of ``filename`` is created when it does not exist,
    so a missing output folder does not fail the write. The file is opened
    as UTF-8, matching how ``load_jsonl`` reads files. An existing file is
    overwritten.

    Args:
        filename: Destination path of the ``.jsonl`` file.
        data: Iterable of JSON-serialisable items; one line is written per item.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If an item is not JSON serialisable.
    """
    # Only path-like targets have a parent directory to create.
    if isinstance(filename, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f)
            f.write('\n')

### Pipeline execution

In [8]:
def execute_pipeline(jsonl_file):
    """Run the embedding, filtering and formatting steps on one dataset.

    Args:
        jsonl_file: The loaded records of a single dataset.

    Returns:
        The value returned by ``create_jsonl_format`` for the texts selected
        by ``cluster_data``.
    """
    vectors, texts = create_embeddings(jsonl_file)
    kept_texts = cluster_data(vectors, texts)
    output_records = create_jsonl_format(kept_texts, jsonl_file)
    return output_records

# Pipeline use case

Prepare the list of dataset files to cluster in a loop.

In [9]:
# Collect the dataset files to process. Only `.jsonl` entries are kept so
# that stray directory entries (e.g. a macOS `.DS_Store`) are never handed
# to the JSONL loader. 'robert_fico_data.jsonl' is excluded explicitly --
# presumably because the fico_chunk_*.jsonl files cover it; confirm.
# Filtering in the comprehension (instead of list.remove) also avoids a
# ValueError when that file is not present.
data_to_cluster = [
    x for x in os.listdir('jsonl_data')
    if x.endswith('.jsonl') and x != 'robert_fico_data.jsonl'
]

data_to_cluster

['aktuality_data.jsonl',
 'denník_n_data.jsonl',
 'televízia_joj_data.jsonl',
 'fakty_data.jsonl',
 'erik_kaliňák_data.jsonl',
 'zomri_data.jsonl',
 'igor_matovic_data.jsonl',
 'peter_marcin_data.jsonl',
 'ján_koleník_data.jsonl',
 'eva_-_hriešne_dobrá_data.jsonl',
 'emefka_data.jsonl',
 'marek_hamsik_data.jsonl',
 'hetrik_data.jsonl',
 'peter_sagan_data.jsonl',
 'marian_čekovský_data.jsonl',
 'zuzana_čaputová_data.jsonl',
 'sajfa_data.jsonl',
 'marian_kotleba_data.jsonl',
 'fico_chunk_3.jsonl',
 'fico_chunk_1.jsonl',
 'šport_v_rtvs_data.jsonl',
 'dominika_cibulkova_data.jsonl',
 'šport24_data.jsonl',
 'niké_liga_data.jsonl',
 'fico_chunk_0.jsonl',
 'ok,ale_ideš_prvý_:d_data.jsonl',
 'fico_chunk_2.jsonl']

Executing the actual pipeline

In [13]:
# For every dataset file: load it from jsonl_data/, drop unusable records,
# run the pipeline, and write the result to clustered_jsonl/ under the same
# file name. tqdm displays progress over the list of files.
for dataset_name in tqdm(data_to_cluster):
 dataset = load_jsonl(f'jsonl_data/{dataset_name}')
 # Remove records whose "text" is missing or None before the pipeline runs.
 dataset = filter_null_text(dataset)
 write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))
 

100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] 


In [None]:
# Ad-hoc inspection: show the raw, unfiltered listing of the input directory.
os.listdir('jsonl_data')

['aktuality_data.jsonl',
 'denník_n_data.jsonl',
 'televízia_joj_data.jsonl',
 '.DS_Store',
 'fakty_data.jsonl',
 'erik_kaliňák_data.jsonl',
 'zomri_data.jsonl',
 'igor_matovic_data.jsonl',
 'peter_marcin_data.jsonl',
 'ján_koleník_data.jsonl',
 'eva_-_hriešne_dobrá_data.jsonl',
 'emefka_data.jsonl',
 'marek_hamsik_data.jsonl',
 'hetrik_data.jsonl',
 'peter_sagan_data.jsonl',
 'marian_čekovský_data.jsonl',
 'zuzana_čaputová_data.jsonl',
 'sajfa_data.jsonl',
 'marian_kotleba_data.jsonl',
 'fico_chunk_3.jsonl',
 'fico_chunk_1.jsonl',
 'šport_v_rtvs_data.jsonl',
 'dominika_cibulkova_data.jsonl',
 'šport24_data.jsonl',
 'niké_liga_data.jsonl',
 'fico_chunk_0.jsonl',
 'ok,ale_ideš_prvý_:d_data.jsonl',
 'fico_chunk_2.jsonl']