{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# This notebook clusters samples based on their semantic similarity.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# imports\n",
"\n",
"from sentence_transformers import SentenceTransformer, util\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import torch\n",
"import warnings\n",
"import json\n",
"import os\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model init\n",
"\n",
"This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr sentence-embedding model."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data manipulation in the file system"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        return [json.loads(line) for line in file]"
]
},
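{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The loader above expects one JSON object per line. As an illustration only (the file name `example.jsonl` and the records are made up), the sketch below writes a tiny JSONL file with the fields the rest of the pipeline relies on (`id`, `author`, `text`) and reads it back with `load_jsonl`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: a minimal JSONL file with the fields used later in the pipeline.\n",
"sample_records = [\n",
"    {'id': 1, 'author': 'demo', 'text': 'Prvá ukážková veta.'},\n",
"    {'id': 2, 'author': 'demo', 'text': None},  # records like this are dropped later by filter_null_text\n",
"]\n",
"\n",
"with open('example.jsonl', 'w', encoding='utf-8') as f:\n",
"    for record in sample_records:\n",
"        f.write(json.dumps(record, ensure_ascii=False) + '\\n')\n",
"\n",
"load_jsonl('example.jsonl')"
]
},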
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline functions"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embedding creation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def create_embeddings(jsonl_file):\n",
"    sentences = [item['text'] for item in jsonl_file]\n",
"    return model.encode(sentences), sentences"
]
},
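{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the embedding step (the two Slovak sentences below are only illustrative, not taken from any dataset): encode them and look at their pairwise cosine similarities, which is the same quantity the clustering step thresholds."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: one embedding vector per sentence, compared with cosine similarity.\n",
"demo_sentences = ['Dnes je vonku pekné počasie.', 'Dnes svieti slnko a je teplo.']\n",
"demo_embeddings = model.encode(demo_sentences)\n",
"\n",
"print(demo_embeddings.shape)\n",
"print(util.pytorch_cos_sim(torch.tensor(demo_embeddings), torch.tensor(demo_embeddings)))"
]
},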
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering algorithm"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def cluster_data(embeddings, sentences):\n",
"    embeddings_np = np.array(embeddings)\n",
"\n",
"    # Pairs of sentences with cosine similarity below this value count as distinct enough to keep.\n",
"    similarity_threshold = 0.65\n",
"\n",
"    # Only sentences longer than 20 characters are considered at all.\n",
"    long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
"\n",
"    cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
"\n",
"    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
"\n",
"    # A pair survives if both sentences are long enough and their similarity is below the threshold.\n",
"    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
"\n",
"    non_spam_indices = np.where(filtered_mask)\n",
"\n",
"    # Keep every sentence that appears in at least one surviving pair, deduplicated.\n",
"    filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
"\n",
"    return filtered_sentences"
]
},
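{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the masking in `cluster_data` easier to follow, here is a small numpy-only illustration with a hand-made similarity matrix (the sentences, similarity values and the reuse of the 0.65 threshold are purely for the example): a sentence is kept when it is longer than 20 characters and has at least one sufficiently dissimilar long partner."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: how the two masks in cluster_data combine on made-up data.\n",
"demo_sents = ['short', 'a sufficiently long first sentence', 'a sufficiently long second sentence']\n",
"demo_sim = np.array([\n",
"    [1.0, 0.9, 0.3],\n",
"    [0.9, 1.0, 0.4],\n",
"    [0.3, 0.4, 1.0],\n",
"])\n",
"\n",
"long_enough = np.array([len(s) > 20 for s in demo_sents])\n",
"below_threshold = demo_sim < 0.65\n",
"keep_pairs = np.logical_and(below_threshold, np.outer(long_enough, long_enough))\n",
"\n",
"# 'short' is dropped; the two long, mutually dissimilar sentences are kept.\n",
"sorted(set(demo_sents[i] for i in np.where(keep_pairs)[0]))"
]
},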
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare data to write it to JSONL"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def filter_null_text(json_list):\n",
"    filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
"    return filtered_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl_format(filtered, jsonl_file):\n",
"\n",
"    return [\n",
"        {\n",
"            'id': item['id'],\n",
"            'author': item['author'],\n",
"            'text': item['text']\n",
"        }\n",
"        for item in jsonl_file if item['text'] in filtered\n",
"    ]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write out JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def write_jsonl(filename, data):\n",
"    with open(filename, 'w') as f:\n",
"        for item in data:\n",
"            json.dump(item, f)\n",
"            f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pipeline execution"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def execute_pipeline(jsonl_file):\n",
"    embeddings, sentences = create_embeddings(jsonl_file)\n",
"    filtered_data = cluster_data(embeddings, sentences)\n",
"    return create_jsonl_format(filtered_data, jsonl_file)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pipeline use case"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Prepare the list of datasets to cluster in a loop."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_to_cluster = os.listdir('jsonl_data')\n",
"\n",
"data_to_cluster.remove('robert_fico_data.jsonl')\n",
"\n",
"data_to_cluster"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Executing the actual pipeline"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
]
}
],
"source": [
"for dataset_name in tqdm(data_to_cluster):\n",
"    dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
"    dataset = filter_null_text(dataset)\n",
"    write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" '.DS_Store',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir('jsonl_data')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}