BP2024/preprocessing/clustering.ipynb
2024-04-09 15:39:11 +02:00

390 lines
9.4 KiB
Plaintext

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# This notebook clusters samples based on their semantic similarity.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# imports \n",
"\n",
"from sentence_transformers import SentenceTransformer, util\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import torch\n",
"import numpy as np\n",
"import warnings\n",
"import json\n",
"import os\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model init\n",
"\n",
"This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Embedding model used by the pipeline functions defined in this notebook.\n",
"MODEL_NAME = 'TUKE-DeutscheTelekom/slovakbert-skquad-mnlr'\n",
"model = SentenceTransformer(MODEL_NAME)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data manipulation in file system"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
"    \"\"\"Read a JSON Lines file into a list of parsed records.\n",
"\n",
"    Args:\n",
"        file_path: Path of the UTF-8 encoded JSONL file to read.\n",
"\n",
"    Returns:\n",
"        A list with one parsed JSON value per non-blank line, in file order.\n",
"\n",
"    Raises:\n",
"        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
"    \"\"\"\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        # Blank or whitespace-only lines (e.g. a trailing newline at the end\n",
"        # of the file) are skipped instead of crashing json.loads.\n",
"        return [json.loads(line) for line in file if line.strip()]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline functions"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embedding creation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def create_embeddings(jsonl_file):\n",
"    \"\"\"Encode the 'text' field of every record with the notebook-level `model`.\n",
"\n",
"    Args:\n",
"        jsonl_file: Iterable of records, each providing a 'text' key.\n",
"\n",
"    Returns:\n",
"        A tuple `(embeddings, sentences)`: the value returned by\n",
"        `model.encode` and the list of texts passed to it, in input order.\n",
"    \"\"\"\n",
"    sentences = []\n",
"    for record in jsonl_file:\n",
"        sentences.append(record['text'])\n",
"    embeddings = model.encode(sentences)\n",
"    return embeddings, sentences"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering algorithm"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def cluster_data(embeddings, sentences, similarity_threshold=0.65, min_length=20):\n",
"    \"\"\"Select the distinct sentences that have at least one dissimilar partner.\n",
"\n",
"    A sentence is returned when it is longer than `min_length` characters and\n",
"    its cosine similarity to at least one sentence that is also longer than\n",
"    `min_length` is below `similarity_threshold`.\n",
"\n",
"    NOTE(review): this is an 'any pair' rule, so a near-duplicate sentence is\n",
"    still kept as soon as one unrelated long sentence exists, and a dataset\n",
"    with a single sentence yields an empty result. Confirm that this is the\n",
"    intended spam filter.\n",
"\n",
"    Args:\n",
"        embeddings: One embedding vector per sentence, in the same order as\n",
"            `sentences` (assumed to be a 2-D float array-like - TODO confirm).\n",
"        sentences: Sequence of texts aligned with `embeddings`.\n",
"        similarity_threshold: Pairs with cosine similarity strictly below this\n",
"            value count as dissimilar.\n",
"        min_length: Only sentences with more characters than this take part.\n",
"\n",
"    Returns:\n",
"        List of unique selected sentences, ordered by first occurrence.\n",
"    \"\"\"\n",
"    if len(sentences) == 0:\n",
"        return []\n",
"\n",
"    long_enough_mask = np.array([len(sentence) > min_length for sentence in sentences])\n",
"\n",
"    embeddings_tensor = torch.tensor(np.asarray(embeddings))\n",
"    cosine_sim_matrix = util.pytorch_cos_sim(embeddings_tensor, embeddings_tensor).numpy()\n",
"\n",
"    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
"    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
"\n",
"    # One flag per row instead of materialising every (i, j) index pair: the\n",
"    # selected sentences are the same, without an O(n^2) Python-level loop.\n",
"    kept_rows = np.flatnonzero(filtered_mask.any(axis=1))\n",
"\n",
"    # dict.fromkeys removes duplicate texts while keeping a deterministic order.\n",
"    return list(dict.fromkeys(sentences[i] for i in kept_rows))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare data to write it to JSONL"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def filter_null_text(json_list):\n",
"    \"\"\"Keep only the records that have a \"text\" key whose value is not None.\n",
"\n",
"    Args:\n",
"        json_list: Iterable of records supporting `in` and item access.\n",
"\n",
"    Returns:\n",
"        A new list with the retained records, in input order.\n",
"    \"\"\"\n",
"    kept_records = []\n",
"    for record in json_list:\n",
"        if \"text\" not in record or record[\"text\"] is None:\n",
"            continue\n",
"        kept_records.append(record)\n",
"    return kept_records"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl_format(filtered, jsonl_file):\n",
"    \"\"\"Build the output records for every input record whose text was kept.\n",
"\n",
"    Args:\n",
"        filtered: Iterable of kept texts (must be hashable, e.g. strings).\n",
"        jsonl_file: Iterable of records providing 'id', 'author' and 'text'.\n",
"\n",
"    Returns:\n",
"        List of dicts reduced to the 'id', 'author' and 'text' keys, in the\n",
"        order of `jsonl_file`.\n",
"\n",
"    Raises:\n",
"        KeyError: If a record lacks 'text', or a kept record lacks 'id' or\n",
"            'author'.\n",
"    \"\"\"\n",
"    # A set gives constant-time membership tests; scanning a list for every\n",
"    # record made this step quadratic in the dataset size.\n",
"    kept_texts = set(filtered)\n",
"\n",
"    return [\n",
"        {\n",
"            'id': item['id'],\n",
"            'author': item['author'],\n",
"            'text': item['text']\n",
"        }\n",
"        for item in jsonl_file if item['text'] in kept_texts\n",
"    ]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write out JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def write_jsonl(filename, data):\n",
"    \"\"\"Write `data` as a JSON Lines file, one serialised item per line.\n",
"\n",
"    An existing file is overwritten. The parent directory is created when it\n",
"    does not exist yet, so a missing output folder no longer makes the write\n",
"    fail after the data has already been computed.\n",
"\n",
"    Args:\n",
"        filename: Destination path of the JSONL file.\n",
"        data: Iterable of JSON-serialisable items.\n",
"\n",
"    Raises:\n",
"        TypeError: If an item cannot be serialised by `json.dump`.\n",
"    \"\"\"\n",
"    parent_dir = os.path.dirname(filename)\n",
"    if parent_dir:\n",
"        os.makedirs(parent_dir, exist_ok=True)\n",
"\n",
"    # Explicit encoding keeps the writer independent of the platform default.\n",
"    with open(filename, 'w', encoding='utf-8') as f:\n",
"        for item in data:\n",
"            json.dump(item, f)\n",
"            f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pipeline execution"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def execute_pipeline(jsonl_file):\n",
"    \"\"\"Run the full pipeline on one dataset and return its output records.\n",
"\n",
"    The records are embedded with `create_embeddings`, the sentences are\n",
"    selected with `cluster_data`, and the matching records are rebuilt with\n",
"    `create_jsonl_format`.\n",
"\n",
"    Args:\n",
"        jsonl_file: List of records, each providing 'id', 'author' and 'text'.\n",
"\n",
"    Returns:\n",
"        The list produced by `create_jsonl_format` for the selected sentences.\n",
"    \"\"\"\n",
"    # create_embeddings returns (embeddings, sentences), which is exactly the\n",
"    # positional argument order of cluster_data.\n",
"    kept_sentences = cluster_data(*create_embeddings(jsonl_file))\n",
"    return create_jsonl_format(kept_sentences, jsonl_file)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pipeline use case"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Prepare the data for clustering in a loop."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Datasets that must not be processed. NOTE(review): presumably the full\n",
"# 'robert_fico' file is superseded by the fico_chunk_*.jsonl files - confirm.\n",
"EXCLUDED_DATASETS = {'robert_fico_data.jsonl'}\n",
"\n",
"# os.listdir also returns non-dataset entries (such as '.DS_Store'), which\n",
"# cannot be parsed as JSONL, so only '*.jsonl' files are kept. Filtering by\n",
"# name also avoids the ValueError that list.remove raises when the excluded\n",
"# file is not present.\n",
"data_to_cluster = [\n",
"    name for name in os.listdir('jsonl_data')\n",
"    if name.endswith('.jsonl') and name not in EXCLUDED_DATASETS\n",
"]\n",
"\n",
"data_to_cluster"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Executing the actual pipeline"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
]
}
],
"source": [
"for dataset_name in tqdm(data_to_cluster):\n",
"    # Load one source file and drop the records without usable text.\n",
"    raw_records = load_jsonl(f'jsonl_data/{dataset_name}')\n",
"    usable_records = filter_null_text(raw_records)\n",
"\n",
"    # Run the pipeline and store the result under the same file name.\n",
"    clustered_records = execute_pipeline(usable_records)\n",
"    write_jsonl(f'clustered_jsonl/{dataset_name}', clustered_records)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" '.DS_Store',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show every entry of the input directory, unfiltered.\n",
"os.listdir('jsonl_data')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}