{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# This notebook is clustering samples based on their semantic similarity.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "# imports \n", "\n", "from sentence_transformers import SentenceTransformer, util\n", "from tqdm import tqdm\n", "import numpy as np\n", "import torch\n", "import numpy as np\n", "import warnings\n", "import json\n", "import os\n", "\n", "warnings.filterwarnings(\"ignore\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Model init\n", "\n", "In this clustering process will be used TUKE-DeutscheTelekom/slovakbert-skquad-mnlr" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Data manipulation in file system" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def load_jsonl(file_path):\n", " with open(file_path, 'r', encoding='utf-8') as file:\n", " return [json.loads(line) for line in file]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Pipeline functions" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Embedding creation" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def create_embeddings(jsonl_file):\n", " sentences = [item['text'] for item in jsonl_file]\n", " return model.encode(sentences), sentences" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Clustering algorithm" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def cluster_data(embeddings, sentences):\n", " embeddings_np = np.array(embeddings)\n", "\n", " similarity_threshold = 0.65\n", "\n", " long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n", "\n", " cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n", "\n", " below_threshold_mask = cosine_sim_matrix < similarity_threshold\n", "\n", " filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n", "\n", " non_spam_indices = np.where(filtered_mask)\n", "\n", " filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n", "\n", " return filtered_sentences" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare data to write it to JSONL" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def filter_null_text(json_list):\n", " filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n", " return filtered_list" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def create_jsonl_format(filtered, jsonl_file):\n", "\n", " return [\n", " {\n", " 'id': item['id'],\n", " 'author': item['author'],\n", " 'text': item['text']\n", " }\n", " for item in jsonl_file if item['text'] in filtered\n", " ]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Write out JSONL file" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def write_jsonl(filename, data):\n", " with open(filename, 'w') as f:\n", " for item in data:\n", " json.dump(item, f)\n", " f.write('\\n')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Pipeline execution" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def execute_pipeline(jsonl_file):\n", " embeddings, sentences = create_embeddings(jsonl_file)\n", " filtered_data = cluster_data(embeddings, sentences)\n", " return create_jsonl_format(filtered_data, jsonl_file)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Pipeline usecase" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "prepare data for clustering in a loop" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['aktuality_data.jsonl',\n", " 'denník_n_data.jsonl',\n", " 'televízia_joj_data.jsonl',\n", " 'fakty_data.jsonl',\n", " 'erik_kaliňák_data.jsonl',\n", " 'zomri_data.jsonl',\n", " 'igor_matovic_data.jsonl',\n", " 'peter_marcin_data.jsonl',\n", " 'ján_koleník_data.jsonl',\n", " 'eva_-_hriešne_dobrá_data.jsonl',\n", " 'emefka_data.jsonl',\n", " 'marek_hamsik_data.jsonl',\n", " 'hetrik_data.jsonl',\n", " 'peter_sagan_data.jsonl',\n", " 'marian_čekovský_data.jsonl',\n", " 'zuzana_čaputová_data.jsonl',\n", " 'sajfa_data.jsonl',\n", " 'marian_kotleba_data.jsonl',\n", " 'fico_chunk_3.jsonl',\n", " 'fico_chunk_1.jsonl',\n", " 'šport_v_rtvs_data.jsonl',\n", " 'dominika_cibulkova_data.jsonl',\n", " 'šport24_data.jsonl',\n", " 'niké_liga_data.jsonl',\n", " 'fico_chunk_0.jsonl',\n", " 'ok,ale_ideš_prvý_:d_data.jsonl',\n", " 'fico_chunk_2.jsonl']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_to_cluster = [x for x in os.listdir('jsonl_data')]\n", "\n", "data_to_cluster.remove('robert_fico_data.jsonl')\n", "\n", "data_to_cluster" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Executing the actual pipeline" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n" ] } ], "source": [ "for dataset_name in tqdm(data_to_cluster):\n", " dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n", " dataset = filter_null_text(dataset)\n", " write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['aktuality_data.jsonl',\n", " 'denník_n_data.jsonl',\n", " 'televízia_joj_data.jsonl',\n", " '.DS_Store',\n", " 'fakty_data.jsonl',\n", " 'erik_kaliňák_data.jsonl',\n", " 'zomri_data.jsonl',\n", " 'igor_matovic_data.jsonl',\n", " 'peter_marcin_data.jsonl',\n", " 'ján_koleník_data.jsonl',\n", " 'eva_-_hriešne_dobrá_data.jsonl',\n", " 'emefka_data.jsonl',\n", " 'marek_hamsik_data.jsonl',\n", " 'hetrik_data.jsonl',\n", " 'peter_sagan_data.jsonl',\n", " 'marian_čekovský_data.jsonl',\n", " 'zuzana_čaputová_data.jsonl',\n", " 'sajfa_data.jsonl',\n", " 'marian_kotleba_data.jsonl',\n", " 'fico_chunk_3.jsonl',\n", " 'fico_chunk_1.jsonl',\n", " 'šport_v_rtvs_data.jsonl',\n", " 'dominika_cibulkova_data.jsonl',\n", " 'šport24_data.jsonl',\n", " 'niké_liga_data.jsonl',\n", " 'fico_chunk_0.jsonl',\n", " 'ok,ale_ideš_prvý_:d_data.jsonl',\n", " 'fico_chunk_2.jsonl']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.listdir('jsonl_data')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "sentiment", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }