BP2024/preprocessing/clustered_processing.ipynb

1743 lines
572 KiB
Plaintext
Raw Normal View History

2024-04-09 13:39:11 +00:00
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# This notebook does preprocessing, and pre-analysing data before the annotation\n",
"The actions done in this notebook are \n",
"- data merging\n",
"- data analysis and visualisation\n",
"- language identification\n",
"- sentiment analysis\n",
"- GDPR protection\n",
"- samples containing swear words detection"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import json\n",
"import numpy as np\n",
"from transformers import pipeline\n",
"from tqdm import tqdm\n",
"import random\n",
"import re\n",
"import pickle\n",
"\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# macros\n",
"\n",
"OPTIMUM_WIDTH = 0.6\n",
"ALIGN = 'center'\n",
"\n",
"# colors\n",
"BLUE = '#4040a1'\n",
"GREEN = '#618685'\n",
"YELLOW = '#feb236'\n",
"ORANGE = '#f18973'\n",
"AQUA = '#d5f4e6'\n",
"BLACK = '#000000'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" return [json.loads(line) for line in file]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def load_json(filename):\n",
" with open(filename) as json_file:\n",
" return json.load(json_file)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merge ficos data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"merged_fico = []\n",
"for ficos_chunk in range(0,4):\n",
" merged_fico += load_jsonl(f'clustered_jsonl/fico_chunk_{ficos_chunk}.jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl(filename, new_dataset):\n",
" with open(f'{filename}l', 'w') as jsonl_file:\n",
" for item in new_dataset:\n",
" jsonl_file.write(json.dumps(item) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"create_jsonl('clustered_jsonl/robert_fico_data.json', merged_fico)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge toppic data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_post_ids(posts):\n",
" return [post['id'] for post in posts]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def open_data(fname: str):\n",
" jsonl_data = load_jsonl(f'clustered_jsonl/{fname}l')\n",
" json_data = load_json(f'json_data_id/{fname}')\n",
"\n",
" post_ids = get_post_ids(json_data)\n",
" n_posts = len([post for post in jsonl_data if post['id'] in post_ids])\n",
"\n",
" dataset_info = {\n",
" 'name': \" \".join(fname.split('_')[: -1]),\n",
" 'samples': len(jsonl_data),\n",
" 'n_posts': n_posts,\n",
" 'post_ids': post_ids\n",
" }\n",
"\n",
" return jsonl_data, dataset_info"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Politicians"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"fico_samples, fico_info = open_data(\"robert_fico_data.json\")\n",
"matovic_samples, matovic_info = open_data(\"igor_matovic_data.json\")\n",
"ekalinak_samples, ekalinak_info = open_data(\"erik_kaliňák_data.json\")\n",
"caputova_samples, caputova_info = open_data(\"zuzana_čaputová_data.json\")\n",
"kotleba_samples, kotleba_info = open_data(\"marian_kotleba_data.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Memes"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"zomri_samples, zomri_info = open_data(\"zomri_data.json\")\n",
"emefka_samples, emefka_info = open_data(\"emefka_data.json\")\n",
"ok_prv_samples, ok_prv_info = open_data(\"ok,ale_ideš_prvý_:d_data.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Media"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"eva_samples, eva_info = open_data(\"eva_-_hriešne_dobrá_data.json\")\n",
"aktuality_samples, aktuality_info = open_data(\"aktuality_data.json\")\n",
"dennikN_samples, dennikN_info = open_data(\"denník_n_data.json\")\n",
"tvJOJ_samples, tvJOJ_info = open_data(\"televízia_joj_data.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Famous people"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"peter_marcin_samples, peter_marcin_info = open_data(\"peter_marcin_data.json\")\n",
"sajfa_samples, sajfa_info = open_data(\"sajfa_data.json\")\n",
"jan_kolenik_samples, jan_kolenik_info = open_data(\"ján_koleník_data.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sports"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"sport24_samples, sport24_info = open_data(\"šport24_data.json\")\n",
"dominika_cibulkova_samples, dominika_cibulkova_info= open_data(\"dominika_cibulkova_data.json\")\n",
"hetrik_samples, hetrik_info = open_data(\"hetrik_data.json\")\n",
"RTVSsport_samples, RTVSsport_info = open_data(\"šport_v_rtvs_data.json\")\n",
"sagan_samples, sagan_info = open_data(\"peter_sagan_data.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data analysis & visualisation"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def show_scraped_data(toppic: str, samples: list, names: list, posts: list):\n",
" fig, ax = plt.subplots(\n",
" nrows = 1,\n",
" ncols = 2,\n",
" figsize = (17, 5)\n",
" )\n",
"\n",
" fig.suptitle(f'Number of extracted records on toppic: {toppic}', fontsize=20)\n",
"\n",
" # first barh chart\n",
" \n",
" for axis in ax:\n",
" axis.set_ylabel('Facebook profiles')\n",
" axis.set_xlabel('Number of samples')\n",
" \n",
" ax[0].title.set_text(\"All samples\")\n",
" ax[0].set_xlim(0, max(samples) * 1.2)\n",
"\n",
" for idx, val in enumerate(samples):\n",
" ax[0].text(val, idx, str(val), color='black', va='center')\n",
"\n",
"\n",
" ax[0].barh(\n",
" y = names,\n",
" width = samples,\n",
" edgecolor = BLACK,\n",
" color = [BLUE, GREEN, AQUA, ORANGE, YELLOW]\n",
" )\n",
"\n",
" # second barh chart\n",
" \n",
" ax[1].title.set_text(\"Posts\")\n",
" ax[1].set_xlim(0, max(posts) * 1.2)\n",
"\n",
" ax[1].barh(\n",
" y = names,\n",
" width = posts,\n",
" edgecolor = BLACK,\n",
" color = [BLUE, GREEN, AQUA, ORANGE, YELLOW]\n",
" )\n",
"\n",
" for idx, val in enumerate(posts):\n",
" ax[1].text(val, idx, str(val), color='black', va='center')\n",
"\n",
" plt.subplots_adjust(wspace=0.5)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def execute_visualisation(toppic: str, info_list: list):\n",
"\n",
" if toppic == None:\n",
" each_sample = np.empty(0, dtype=np.int32)\n",
" each_post = np.empty(0, dtype=np.int32)\n",
" names = np.array([\n",
" 'Politicians',\n",
" 'Memes',\n",
" 'Media',\n",
" 'Famous people',\n",
" 'Sports'\n",
" ]\n",
" )\n",
"\n",
" for toppic in info_list:\n",
" each_sample = np.append(each_sample, sum([profile['samples'] for profile in toppic]))\n",
" each_post = np.append(each_post, sum([profile['n_posts'] for profile in toppic]))\n",
"\n",
" show_scraped_data('By toppic', each_sample, names, each_post)\n",
" else:\n",
" each_sample = np.array(\n",
" [item['samples'] for item in info_list]\n",
" )\n",
"\n",
" names = np.array(\n",
" [item['name'] for item in info_list]\n",
" )\n",
"\n",
" each_post = np.array(\n",
" [item['n_posts'] for item in info_list]\n",
" )\n",
"\n",
" show_scraped_data(toppic, each_sample, names, each_post)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scraped data visualisation\n",
"Those graphs represent how many samples we were able to extract from Facebook, those data are cleaned and clustered based on their semantical similarity."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABc0AAAHyCAYAAAA5sObnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACp5UlEQVR4nOzdd3QU5dvG8WtDCglJCCmUQIDQAwKhWOhBQDoCUkSULtKko8JPKRaKUlVElN5EqnSRjiAllCBK79KVDqEm8/5BMm+W1IUNCeH7OWfPSWaemblndmd37nuffcZiGIYhAAAAAAAAAAAgh5QOAAAAAAAAAACA1IKiOQAAAAAAAAAAUSiaAwAAAAAAAAAQhaI5AAAAAAAAAABRKJoDAAAAAAAAABCFojkAAAAAAAAAAFEomgMAAAAAAAAAEIWiOQAAAAAAAAAAUSiaAwAAAAAAAAAQhaI5AACwO4vFIovFooEDB6Z0KKlWRESExowZo5deekmenp7mMatfv35Kh4ZUjvMr5Q0cONB8HoC4nDhxwnyNTJkyJaXDSbXWr19vHqf169fHmm/Pcy137tyyWCxq1arVE68LAJD2UTQHAMCOYiZ/FotFTZs2TXSZVq1aUXx5DjVr1kzdu3dXaGiobty4kdLhAACeQyEhIVbXLdGPdOnSydvbWy+++KL69OmjI0eOpHSoAAA8VRTNAQBIRnPnztXevXtTOgykMn/88Yfmzp0rSapdu7ZWrVqlP//8U3v37tXXX3+dwtHZJi33uKVXIvBk6G397IqMjNSVK1e0Y8cODR8+XEWKFNF3332X0mHFEl30DwkJSelQAABpjGNKBwAAQFpmGIYGDBigBQsWpHQoSEVWr14tSUqXLp1mzZolT0/PFI4IAGBPuXPnlmEYKR2GTWJ+yR8REaHTp09r7ty5mjp1qu7du6cuXbooV65cql279lOLaeDAgXYbiurEiRN2WQ8A4PlAT3MAAJKJr6+vJGnhwoXavXt3CkeD1OTMmTOSpCxZslAwBwCkCi+88IL5KF68uGrXrq0pU6aYv4AyDEP9+/dP4SgBAHg6KJoDAJBMunbtKhcXF0kiyYSVu3fvSpKcnJxSOBIAABLWuXNn5cqVS5K0a9cuXbx4MYUjAgAg+VE0BwAgmQQEBKh9+/aSpKVLl2r79u2PtZ6kjqscfUPR3Llzx5oX17iyCxYs0GuvvabMmTMrQ4YMKl68uL755hvdv3/fXM4wDM2aNUshISHKnDmz3NzcVLJkSX3//fc2/ex89erVqlevnrJly6b06dMrT5486tKli9njOjG7du1Shw4dVLBgQbm7uytDhgwqWLCgOnbsqEOHDsW73JQpU8z9PnHihO7evavRo0frlVdeka+vrywWy2P/7Hvv3r1q37698ufPLzc3N3l4eKhIkSLq0aNHvD8Bj45l6tSpkqSTJ0/Guvna41q3bp1atmypPHnyyM3NTZ6enipatKj69Omjs2fPxmp/9+5dFStWTBaLRR4eHjp27Fi86+7Zs6cZ3+TJkyX9/7EdNGhQrP2L+Yh5LB4de/bw4cPq0qWLeQwfbX/u3Dl99913atSokfLnz68MGTLIxcVF2bNn1+uvv66ff/5ZkZGRSTo+J06c0IcffqhSpUrJx8dHTk5O8vX1VYUKFTRw4ECr/Y+O8+TJk5KkqVOnxtqv+MbPPX/+vP73v/+pdOnS8vb2louLiwICAtSkSRNzWJ7ERJ9zmTJlkru7u1544QUNGDBAV69eTdLyiXn0veLcuXP68MMPVaRIEXl4eMhisWj9+vVWy0RERGjq1KmqU6eO/P395eLiIh8fH5UvX14jR47U7du3E91uZGSkfvrpJ73xxhvKmTOnXF1d5erqqgIFCqh58+aaN2+e1ftPTPfu3dN3332nypUry8/PT87OzsqaNatq1aqlGTNmJPg6eJz9PX36tDp37qw8efIoffr08vf3V7169ZL8HEoPf2VUv3595c
iRQy4uLvLw8FCePHlUoUIFffLJJ4/9mRDt5s2bGjp0qMqUKWO+1nLkyKFGjRpp6dKlCS776Ll45swZ9ezZU/ny5ZOrq6t8fHxUvXp1rVix4rHjs1gsCgwMNP9v3bp1rPMorvffyMhIzZgxQ7Vq1VLWrFnl7OwsPz8/Va5cWd99953u3bsX7zYfvcfC1atXNWDAABUpUkTu7u7y9vZW5cqV9dNPPyUY+6Ofu6GhoWrWrJkCAgKUPn16BQQEqHXr1jpw4EC867BlPPfNmzerXbt2KliwoDw9PeXs7KwcOXKoTp06Gjt2rN3O/cfl4OCg0qVLm/+fOnUqVpslS5aoUaNG5uvdx8dHZcqU0dChQ3Xz5s3H3nZ8982IPq83bNggSdqwYUOs19ej10NJvZ6y5fMipuQ+5wEAT5kBAADsZt26dYYkQ5IxefJk4+zZs4arq6shyXjttdfiXKZly5bmMnHJlSuXIclo2bJlgtuOXk+uXLlizTt+/LhVXB07djT/f/TRsGFD48GDB8adO3eMRo0axdvu3XffjTeW6DYDBgwwBg4cGO86MmbMaGzcuDHe9URERBg9evQwLBZLvOtwdHQ0xo8fH+fykydPNtuFhoYawcHBsZYfMGBAgsc1LoMHDzYcHBzijcnFxcWYOnVqvMcloYetbt++bbz55psJrjNDhgzG4sWLYy37559/Gi4uLoYko0yZMsaDBw9itVm1apV5/N944w1zesxjm9Dj+PHj5jKVKlUyJBmVKlUyfvnlFyNDhgzxtn/w4EGCxzj6Ua1aNePGjRsJHqOvvvrKcHJySnA9lSpVihVnUttHmzFjRpz7FPPRtm1b4/79+3HGef/+faNx48bxLpsnTx7j2LFjT/TaNQzr94otW7YYvr6+sba1bt06s/3JkyeN4sWLJ7hf+fLlMw4ePBjvNo8fPx7n+ZfQdmMuW6hQoQSXK1++vHHp0iW77O/GjRsNT0/PeLc1cOBAY8CAAfGesw8ePEjweYx+lCpVKsnP2aN27dpl+Pv7J7j+hg0bGrdv345z+Zjn4qZNm+I8JtGPr7766rFiTMr7w6Ov4UuXLhnlypVLcJmgoCDjxIkTcW4z5vNy7NgxI2/evPGup0mTJvGeizE/dydOnGg4OjrGuQ4XFxdjzpw5ca7j0c/duISHhxvNmjWz+Tg9uq/xrT8pYr7fJSTm58zWrVvN6bdv3zYaNGiQYPz+/v7G7t2741xvzOumuM7/+M61mNdO8T0evR5KyvWUrZ8XhvF0znkAwNNH0RwAADt6tGhuGIbRs2dPc9rvv/8ea5mnXTR/+eWXDUlGrVq1jAULFhg7d+40fvnlF3O6JOPHH3803n//fUOS8dZbbxlLly41du7cacyePduqeLVixYo4Y4meX7p0aUOSUbBgQWPixIlGaGiosXr1auO9994zC6Kenp7GqVOn4lxPp06dzHVVrFjRmDRpkrF+/Xpj+/btxo8//mgUKVLEnL9o0aJYy8cs7BYrVsywWCxGixYtjGXLlhk7d+40Fi5caCxfvjzB4/qosWPHmuv08/Mzhg8fbmzZssXYtGmTMXDgQLNoarFYjGXLllktu3fvXmPv3r3G66+/bhYSoqdFP2wRGRlp1K5d24ynbt26xvTp043NmzcbW7ZsMcaMGWPkzJnTkGQ4OzsboaGhsdYxatQoc/mBAwdazbt06ZJZmPP39zf+++8/c96VK1eMvXv3Wn0B8+i+7N2717h37565THRxJjAw0HB3dzf8/PyMoUOHGps3bza2bt1qfPPNN8a///5rGMbDArKDg4Px6quvGl999ZXx66+/Gjt37jTWr19vTJo0yShTpoy53RYtWsR7jD799FOznZeXl9GvXz9j1apVxq5du4y1a9caw4cPN8qWLWuEhISYyxw7dszYu3evue+vv/56rP06duyY1XZ+/vln88uFPHnyGCNHjjRjnj9/vlGrVi0zjh49esQZa7du3cw28Z0zL774YoKFtKSIfq
/w8fEx/P39DXd3d+N///ufeW5NnDjROHDggGEYhvHff/8ZAQEBZoGwS5cuxty5c43Q0FBj3bp1Rt++fQ03Nzdzv69
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Politics\n",
"politics_info_list = [fico_info, matovic_info, ekalinak_info, caputova_info, kotleba_info]\n",
"execute_visualisation('Politics', politics_info_list)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdYAAAHyCAYAAADm/pz4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACSvklEQVR4nOzdd3gU1fv38c+GhBSSUBJaJIQSegcBASEUkd4VRBQQEFERKaLCVyEoTVBEUVAR6UgVBQUpQlA6AYIISC9KlV5CTc7zB0/ml5A6kBCSvF/XtRfszJkz98zuTua+d/aMwxhjBAAAAAAAAAAAksQptQMAAAAAAAAAACAtobAOAAAAAAAAAIANFNYBAAAAAAAAALCBwjoAAAAAAAAAADZQWAcAAAAAAAAAwAYK6wAAAAAAAAAA2EBhHQAAAAAAAAAAGyisAwAAAAAAAABgA4V1AAAAAAAAAABsoLAOAAAeOofDIYfDoeDg4NQO5ZEVERGhzz77TFWqVJG3t7e1z1q2bJnaoeERx+cr9QUHB1uvAxCXI0eOWO+RKVOmpHY4AADgPlBYBwDgIQoJCbESaYfDoXbt2iW6TOfOnSnQZEDt27dX7969tWXLFl25ciW1wwEAZEC1a9eOcd7ywQcfJGm5YcOGxViudu3aKRsoAACpgMI6AACpaN68edq5c2dqh4FHzPr16zVv3jxJUpMmTbRixQr9+eef2rlzpz7//PNUjs6e9HzlboECBeRwONS5c+fUDgVIk7hqO+2ZOXNmktpNnz49hSMBACD1Oad2AAAAZGTGGA0ePFg//PBDaoeCR8jKlSslSZkyZdKsWbPk7e2dyhEBAJJTgQIFZIxJ7TCSzM3NTTdu3NC+ffu0adMmVa1aNd62W7Zs0d69e2MsBwBAesQV6wAApBJfX19J0sKFC7V9+/ZUjgaPkuPHj0uScufOTVEdAJDqcufOrSpVqkhK/Gr0qPlVq1ZV7ty5Uzw2AABSC4V1AABSSa9eveTq6ipJGjRoUCpHg0fJzZs3JUkuLi6pHAkAAHd17NhRkjRnzhzdvn07zjZ37tzR7NmzY7QHACC9orAOAEAq8ff3V/fu3SVJP//8szZv3nxf/SR1nOeom6AWKFAg1ry4xrn94Ycf9PTTTytXrlzKkiWLypUrp3HjxsVIpo0xmjVrlmrXrq1cuXLJw8NDFStW1FdffWXrJ+4rV65U8+bNlTdvXrm5ualQoULq2bOndeV2YrZt26YePXqoWLFi8vT0VJYsWVSsWDG9+uqr2rdvX7zLTZkyxdruI0eO6ObNmxo7dqyeeOIJ+fr6yuFwKDg4OMnbEd3OnTvVvXt3FSlSRB4eHvLy8lKpUqXUp08fHTlyJM5lomKZOnWqJOno0aMxbv72IOOUr169Wp06dVKhQoXk4eEhb29vlSlTRv3799eJEyditb9586bKli0rh8MhLy8vHTp0KN6++/bta8U3efJkSf+3b4cMGRJr+6I/ou+LqJvkRd3kbv/+/erZs6e1D+9tf/LkSY0fP17PPPOMihQpoixZssjV1VWPPfaYWrRooTlz5igyMjJJ++fIkSN65513VKlSJfn4+MjFxUW+vr6qWbOmgoODY2x/VJxHjx6VJE2dOjXWdsV3o75Tp07pf//7nx5//HHlyJFDrq6u8vf3V9u2ba0hgBIT9ZnLnj27PD09Vbp0aQ0ePFgXL15M0vKJufdYcfLkSb3zzjsqVaqUvLy85HA4FBISEmOZiIgITZ06VU2bNpWfn59cXV3l4+OjJ598UmPGjNH169cTXW9kZKS+//57tWnTRvnz55e7u7vc3d1VtGhRdejQQfPnz4+3mHfr1i2NHz9ederUUc6cOZU5c2blyZNHjRs31owZMxJ8H9zP9v777796/fXXVahQIbm5ucnPz0/NmzdP8mso3f21UsuWLZUvXz65urrKy8tLhQoVUs2aNfX+++/f99+EKFevXtXIkSNVrVo1672WL1
8+PfPMM/r5558TXPbez+Lx48fVt29fBQYGyt3dXT4+PmrQoIGWLl163/E5HA4VLFjQev7SSy/F+hzFdfyNjIzUjBkz1LhxY+XJk0eZM2dWzpw5VadOHY0fP163bt2Kd5333vPh4sWLGjx4sEqVKiVPT0/lyJFDderU0ffff59g7Pf+3d2yZYvat28vf39/ubm5yd/fXy+99JL+/vvvePuwM778unXr1K1bNxUrVkze3t7KnDmz8uXLp6ZNm+rLL79Mts9+Ytq1aycXFxedPXs23td+6dKl+u+//+Ti4pKkG7RHuXTpkkaMGKEaNWpYn+G8efOqWbNmmj9/foLnFPe+X1avXq2WLVvKz89P7u7uKlGihD788ENdu3YtxnJLlixR48aNrXYlS5bUiBEjEnwPRblx44a++OIL1atXz3of5sqVS0899ZQmTZqkO3fuJLj8qlWr1L59exUsWFDu7u7y8PBQQECAnnjiCb311ltatWpV4jsNAJD6DAAAeGhWr15tJBlJZvLkyebEiRPG3d3dSDJPP/10nMt06tTJWiYuAQEBRpLp1KlTguuO6icgICDWvMOHD8eI69VXX7We3/to3bq1uXPnjrlx44Z55pln4m338ssvxxtLVJvBgweb4ODgePvImjWr+f333+PtJyIiwvTp08c4HI54+3B2djZff/11nMtPnjzZardlyxZTvnz5WMsPHjw4wf0al+HDhxsnJ6d4Y3J1dTVTp06Nd78k9LDr+vXr5rnnnkuwzyxZsphFixbFWvbPP/80rq6uRpKpVq2auXPnTqw2K1assPZ/mzZtrOnR921Cj8OHD1vLBAUFGUkmKCjI/PjjjyZLlizxtr9z506C+zjqUb9+fXPlypUE99Ho0aONi4tLgv0EBQXFijOp7aPMmDEjzm2K/ujatau5fft2nHHevn3bPPvss/EuW6hQIXPo0KEHeu8aE/NYsWHDBuPr6xtrXatXr7baHz161JQrVy7B7QoMDDR79+6Nd52HDx+O8/OX0HqjL1u8ePEEl3vyySfNuXPnkmV7f//9d+Pt7R3vuoKDg83gwYPj/czeuXMnwdcx6lGpUqUkv2b32rZtm/Hz80uw/9atW5vr16/HuXz0z+LatWvj3CdRj9GjR99XjEk5Ptz7Hj537pypUaNGgsuUKFHCHDlyJM51Rn9dDh06ZAoXLhxvP23bto33sxj97+6kSZOMs7NznH24urqauXPnxtnHvX934xIeHm7at29vez/du63x9Z8UUe+FqHOH5s2bG0nmmWeeibN91Hu7RYsWxpj/21dxHROjrFy50vj4+CS4jY0bN473WB59P4wYMSLec4Lq1aubq1evmsjISNOrV69419WwYcM4/95FCQsLs7YrvkflypXNqVOn4ly+d+/eib6mPj4+8a4fAPDooLAOAMBDdG9h3Rhj+vbta037448/Yi3zsAvrVatWtZLYH374wWzdutX8+OOP1nRJZuLEieaNN94wkszzzz9vfv75Z7N161Yze/bsGAWupUuXxhlL1PzHH3/cSDLFihUzkyZNMlu2bDErV640r7zyilU09fb2NseOHYuzn9dee83qq1atWua7774zISEhZvPmzWbixImmVKlS1vyffvop1vLRi79ly5Y1DofDdOzY0fzyyy9m69atZuHChWbJkiUJ7td7ffnll1afOXPmNB9//LHZsGGDWbt2rQkODrYKqw6Hw/zyyy8xlt25c6fZuXOnadGihZFk/Pz8rGlRDzsiIyNNkyZNrHiaNWtmpk+fbtatW2c2bNhgPvvsM5M/f34jyWTOnNls2bIlVh+ffvqptXxwcHCMeefOnbOKd35+fubs2bPWvAsXLpidO3fG+JLm3m3ZuXOnuXXrlrVMVAGnYMGCxtPT0+TMmdOMHDnSrFu3zmzcuNGMGzfO/Pfff8aYu0VmJycnU7duXTN69Gjz66+/mq1bt5qQkBDz3XffmWrVqlnr7dixY7z76IMPPrDaZcuWzQ
wcONCsWLHCbNu2zaxatcp8/PHHpnr16qZ27drWMocOHTI7d+60tr1FixaxtuvQoUMx1jNnzhyr2FOoUCEzZswYK+Y
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Memes\n",
"memes_info_list = [zomri_info, emefka_info, ok_prv_info]\n",
"execute_visualisation('Memes', memes_info_list)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABeIAAAHyCAYAAACH5r66AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACb7UlEQVR4nOzdd3gU5ff38c+GFBLSIAklEAi9VxEEhARB6VWqKE1UUKSrgF8kqIgiAgqCokhHRAQFRDD0IjUBpPfee4u0ZJ4/eDK/LMkm2ZAlkLxf17UXZOaee87M7s7OOTt7j8UwDEMAAAAAAAAAAMAhnNI6AAAAAAAAAAAA0jMK8QAAAAAAAAAAOBCFeAAAAAAAAAAAHIhCPAAAAAAAAAAADkQhHgAAAAAAAAAAB6IQDwAAAAAAAACAA1GIBwAAAAAAAADAgSjEAwAAAAAAAADgQBTiAQAAAAAAAABwIArxAADgiWOxWGSxWBQWFpbWoTyxoqOj9fXXX6tSpUry9vY291nTpk3TOjQ84Xh/pb2wsDDzeQAScvToUfM1Mnny5LQOJ0MIDg6WxWJRx44d481buXKl+XysXLnysccGAEgfKMQDAPAEiZvoWSwWtW7dOsllOnbsSEEnA2rbtq169eqlzZs368aNG2kdDgAgAwoNDbU6b/n444+TtdzQoUOtlgsNDXVsoAAAPAEoxAMA8AT79ddftWPHjrQOA0+Yf/75R7/++qskqUGDBgoPD9e///6rHTt26Jtvvknj6OyTnq8MTuzqSgBJ46rwp8+MGTOS1W7atGkOjgQAgCePc1oHAAAAbDMMQ4MHD9bcuXPTOhQ8QZYuXSpJypQpk2bOnClvb+80jggAkJqCg4NlGEZah5FsmTNn1u3bt7V//35t3LhRlStXttl28+bN2rdvn9VyT7rQ0NCn6vkAADyZuCIeAIAnlL+/vyRp3rx52rp1axpHgyfJqVOnJEk5cuSgCA8ASHM5cuRQpUqVJCV9tXvs/MqVKytHjhwOjw0AgCcFhXgAAJ5QPXr0kJubmyTpo48+SuNo8CS5c+eOJMnFxSWNIwEA4IH27dtLkn755Rfdu3cvwTb379/XrFmzrNoDAJBRUIgHAOAJFRQUpDfffFOStHDhQm3atClF/SR3nOrYm74GBwfHm5fQOL1z587VSy+9pOzZsytLliwqW7asxowZY5V8G4ahmTNnKjQ0VNmzZ5eHh4cqVKig7777zq6feC9dulSNGzdWrly5lDlzZhUoUEDdu3c3rwxPSmRkpLp27aqiRYvK09NTWbJkUdGiRdWtWzft37/f5nKTJ082t/vo0aO6c+eORo8ereeee07+/v6yWCwKCwtL9nbEtWPHDr355psqXLiwPDw85OXlpZIlS6p37946evRogsvExjJlyhRJ0rFjx6xudvco46yvWLFCHTp0UIECBeTh4SFvb2+VLl1a7733nk6fPh2v/Z07d1SmTBlZLBZ5eXnp8OHDNvvu06ePGd+kSZMk/d++HTJkSLzti/uIuy9ibwoYe1O/AwcOqHv37uY+fLj9mTNnNG7cOLVo0UKFCxdWlixZ5Obmpty5c6tJkyb65ZdfFBMTk6z9c/ToUX3wwQd65pln5OfnJxcXF/n7+6t69eoKCwuz2v7YOI8dOyZJmjJlSrztsnVjwrNnz+rDDz9UxYoVlS1bNrm5uSkoKEitWrUyhyRKSux7LmvWrPL09FSpUqU0ePBgXb16NVnLJ+XhY8WZM2f0wQcfqGTJkvLy8pLFYtHKlSutlomOjtaUKVPUsGFDBQYGys3NTX5+fnr++ec1cuRI/ffff0muNyYmRj///LNefvll5c2bV+7u7nJ3d1eRIkXUrl07zZkzx2bx7+7duxo3bpxq1qypgIAAubq6KmfOnKpfv76mT5+e6OsgJdt78uRJvfPOOypQoIAyZ86swMBANW7cONnPofTg11BNmzZVnjx55ObmJi8vLxUoUEDVq1fXoE
GDUvyZEOvmzZv6/PPPVaVKFfO1lidPHrVo0UILFy5MdNmH34unTp1Snz59VKhQIbm7u8vPz0916tTRX3/9leL4LBaL8ufPb/7dqVOneO+jhI6/MTExmj59uurXr6+cOXPK1dVVAQEBqlmzpsaNG6e7d+/aXOfD96y4evWqBg8erJIlS8rT01PZsmVTzZo19fPPPyca+8Ofu5s3b1bbtm0VFBSkzJkzKygoSJ06ddLevXtt9mHP+Pjr1q1Tly5dVLRoUXl7e8vV1VV58uRRw4YN9e2336baez8prVu3louLiy5evGjzuf/rr7904cIFubi4JOuG9LGuXbumYcOGqVq1auZ7OFeuXGrUqJHmzJmTrHOKv/76S/Xr11dAQIA8PDxUpEgR9enTJ1nnEitXrjSfj4ff77E2bNig//3vfwoNDTVfe97e3ipRooS6deum3bt3J7me/fv3691331WpUqXk5eUlV1dXBQYGqly5curcubN++eUX88t4AMBTyAAAAE+MFStWGJIMScakSZOM06dPG+7u7oYk46WXXkpwmQ4dOpjLJCRfvnyGJKNDhw6Jrju2n3z58sWbd+TIEau4unXrZv798KN58+bG/fv3jdu3bxstWrSw2e6NN96wGUtsm8GDBxthYWE2+/Dx8TFWr15ts5/o6Gijd+/ehsVisdmHs7Oz8f333ye4/KRJk8x2mzdvNsqVKxdv+cGDBye6XxPy2WefGU5OTjZjcnNzM6ZMmWJzvyT2sNd///1ntGnTJtE+s2TJYsyfPz/esv/++6/h5uZmSDKqVKli3L9/P16b8PBwc/+//PLL5vS4+zaxx5EjR8xlQkJCDElGSEiI8fvvvxtZsmSx2f7+/fuJ7uPYx4svvmjcuHEj0X305ZdfGi4uLon2ExISEi/O5LaPNX369AS3Ke7j9ddfN+7du5dgnPfu3TNatmxpc9kCBQoYhw8ffqTXrmFYHyvWr19v+Pv7x1vXihUrzPbHjh0zypYtm+h2FSpUyNi3b5/NdR45ciTB919i6427bLFixRJd7vnnnzcuXbqUKtu7evVqw9vb2+a6wsLCjMGDB9t8z96/fz/R5zH28cwzzyT7OXtYZGSkERgYmGj/zZs3N/77778El4/7Xly7dm2C+yT28eWXX6YoxuQcHx5+DV+6dMmoVq1aossUL17cOHr0aILrjPu8HD582ChYsKDNflq1amXzvRj3c3fixImGs7Nzgn24ubkZs2fPTrCPhz93ExIVFWW0bdvW7v308Lba6j85Yl8LsecOjRs3NiQZLVq0SLB97Gu7SZMmhmH8375K6JgYa+nSpYafn1+i21i/fv1Ej+W9e/e2uWxAQICxefPmRM+X4p6fJXScSc5nWqZMmYxvv/3WZoyzZ882XF1dk+xnx44dNvsAADzZKMQDAPAEebgQbxiG0adPH3PamjVr4i3zuAvxlStXNpPeuXPnGhEREcbvv/9uTpdk/PDDD8a7775rSDJeeeUVY+HChUZERIQxa9Ysq4LYX3/9lWAssfMrVqxoSDKKFi1qTJw40di8ebOxdOlS46233jKLrN7e3sbx48cT7Oftt982+6pRo4bx008/GStXrjQ2bdpk/PDDD0bJkiXN+X/88Ue85eMm1mXKlDEsFovRvn17488//zQiIiKMefPmGYsWLUp0vz7s22+/tUr+R4wYYaxfv95Yu3atERYWZhZiLRaL8eeff1otu2PHDmPHjh1GkyZNDElGYGCgOS32YY+YmBijQYMGZjyNGjUypk2bZqxbt85Yv3698fXXXxt58+Y1JBmurq7G5s2b4/UxatQoc/mwsDCreZcuXTKLfYGBgcbFixfNeVeuXDF27Nhh9aXOw9uyY8cO4+7du+YysQWf/PnzG56enkZAQIDx+eefG+vWrTM2bNhgjBkzxrhw4YJhGA+K0k5OTsYLL7xgfPnll8bixYuNiIgIY+XKlcZPP/1kVKlSxVxv+/btbe6jjz/+2Gzn6+trDB
w40AgPDzciIyON5cuXGyNGjDCqVq1qhIaGmsscPnzY2LFjh7ntTZo0ibddhw8ftlrPL7/8Yn5hUaBAAWPkyJFmzL/
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Media\n",
"media_info_list = [eva_info, aktuality_info, dennikN_info, tvJOJ_info]\n",
"execute_visualisation('Medias', media_info_list)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABbAAAAHyCAYAAAA6B+jPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACU10lEQVR4nOzdeZxO9f//8ec1hlnMxsxYJmPsu+yEZKSF7EJUiFRECH1CxaiUkqVIpWSXLSQp2bfsY6SUyFiyZl8m28z794ffnO9cZr3GNeYy87jfbtftNnPO+7zP61xnuc7rdZ3rHJsxxggAAAAAAAAAABfjltkBAAAAAAAAAACQFArYAAAAAAAAAACXRAEbAAAAAAAAAOCSKGADAAAAAAAAAFwSBWwAAAAAAAAAgEuigA0AAAAAAAAAcEkUsAEAAAAAAAAALokCNgAAAAAAAADAJVHABgAAAAAAAAC4JArYAIBsz2azyWazKSIiIrNDcVmxsbH6+OOPVbNmTfn5+VnvWcuWLTM7NLg49q/MFxERYa0HICkHDx60tpEpU6ZkdjhAtrRmzRprP1yzZk1mhwMALoUCNgBkYwlPlG02m5566qlUp3nuuecohGRDHTp0UN++fbVt2zZdunQps8MBAGRD4eHhductKb2KFCmS2eECAAAnoYANALDMmzdPu3fvzuww4GJ++eUXzZs3T5LUpEkTLV++XL/++qt2796tTz75JJOjc0xWvhK1SJEistlseu655zI7FOCexFXIAAAArsk9swMAALgOY4yGDh2qBQsWZHYocCErVqyQJOXIkUOzZs2Sn59fJkcEAHCmIkWKyBiT2WE4JLUv3HPlynWXIgEAABmNAjYAQJIUFBSk06dPa+HChdq5c6eqVKmS2SHBRRw9elSSlD9/forXAACXUKFChcwOAQAA3CXcQgQAIEnq3bu3PDw8JElDhgzJ5GjgSq5duyZJypkzZyZHAgAAAADIbihgAwAkSaGhoXrxxRclSUuWLNHWrVvT1U9a78Mb/zDIpB6ylNR9SBcsWKDHHntM+fLlU+7cuVWpUiWNGzdON27csKYzxmjWrFkKDw9Xvnz55O3trapVq+rzzz936KfRK1asUPPmzVWwYEF5enqqWLFi6tWrl3UlcmoiIyPVvXt3lS5dWj4+PsqdO7dKly6tHj166K+//kp2uilTpljLffDgQV27dk1jx47VAw88oKCgINlsNkVERKR5ORLavXu3XnzxRZUsWVLe3t7y9fVV+fLl9eqrr+rgwYNJThMfy9SpUyVJhw4dSvSQrPRavXq1OnfurGLFisnb21t+fn6qWLGiXnvtNR07dixR+2vXrun++++XzWaTr6+vDhw4kGzf/fr1s+KbPHmypP97b4cNG5Zo+RK+Er4X8Q8LCw8PlyTt27dPvXr1st7D29sfP35cEyZMUJs2bVSyZEnlzp1bHh4euu+++9SiRQvNmTNHcXFxaXp/Dh48qNdff13VqlVTYGCgcubMqaCgINWrV08RERF2yx8f56FDhyRJU6dOTbRc8ctwuxMnTuiNN95Q9erVlTdvXnl4eCg0NFTt2rWzbh2Tmvh9Lk+ePPLx8VGFChU0dOhQnT9/Pk3Tp+b2Y8Xx48f1+uuvq3z58vL19ZXNZtOaNWvspomNjdXUqVPVtGlThYSEyMPDQ4GBgXrwwQc1evRo/ffff6nONy4uTt98842efPJJFS5cWF5eXvLy8lKpUqX0zDPPaP78+XbHn4SuX7+uCRMmqEGDBgoODlauXLlUoEABPfHEE5oxY0aK20F6lveff/5Rz549VaxYMXl6eiokJETNmzdP8zqUpIULF6ply5YqVKiQPDw85Ovrq2LFiqlevXp666230v2ZEO/y5csaMWKEateubW1rhQoVUps2bbRkyZIUp719Xzx69Kj69eunEiVKyMvLS4GBgXr88cf1448/pjs+m82mokWLWv936dIl0X6U1PE3Li5OM2bM0B
NPPKECBQooV65cCg4OVoMGDTRhwgRdv3492Xnefk/+8+fPa+jQoSpfvrx8fHyUN29eNWjQQN98802Ksd/+ubtt2zZ16NBBoaGh8vT0VGhoqLp06aI///wz2T4cuf/3xo0b1a1bN5UuXVp+fn7KlSuXChUqpKZNm+rTTz912r7vTL/99pveffddPf7449Y27uPjo5IlS6pz587avHlzitPfvq4uXryoiIgIVaxYUT4+PsqXL5+eeOIJ/fLLL3bTnTp1Sm+++abKly+v3LlzKzAwUC1atNDOnTtTjflOtq2UzrESuv28IymrVq1Shw4dVLRoUXl5ecnb21thYWF64IEHNGDAAK1atSrVZUmKs7b/eFevXtX48ePVsGFD6/3Kly+fHnnkEU2aNEk3b95MtY+DBw/q1VdftY633t7eKlmypF566aVUb1lz+3HiTs8lU7No0SK1bdtWhQsXlqenpwICAlS9enUNGzZM586dc8o8AMClGABAtrV69WojyUgykydPNseOHTNeXl5GknnssceSnKZz587WNEkJCwszkkznzp1TnHd8P2FhYYnGRUdH28XVo0cP6//bX61btzY3b940V69eNW3atEm23QsvvJBsLPFthg4daiIiIpLtw9/f36xbty7ZfmJjY82rr75qbDZbsn24u7ubL774IsnpJ0+ebLXbtm2bqVy5cqLphw4dmuL7mpT33nvPuLm5JRuTh4eHmTp1arLvS0ovR/3333+mffv2KfaZO3dus3jx4kTT/vrrr8bDw8NIMrVr1zY3b95M1Gb58uXW+//kk09awxO+tym9oqOjrWnq169vJJn69eubRYsWmdy5cyfb/ubNmym+x/GvRx991Fy6dCnF92jkyJEmZ86cKfZTv379RHGmtX28GTNmJLlMCV/PP/+8uXHjRpJx3rhxw7Rt2zbZaYsVK2YOHDhwR9uuMfbHik2bNpmgoKBE81q9erXV/tChQ6ZSpUopLleJEiXM3r17k51ndHR0kvtfSvNNOG2ZMmVSnO7BBx80Z86cccryrlu3zvj5+SU7r4iICDN06NBk99mbN2+muB7jX9WqVUvzOrtdZGSkCQkJSbH/1q1bm//++y/J6RPuixs2bEjyPYl/jRw5Ml0xpuX4cPs2fObMGVO3bt0Upylbtqw5ePBgkvNMuF4OHDhgihcvnmw/7dq1S3ZfTPi5O2nSJOPu7p5kHx4eHmbu3LlJ9nH7525SYmJiTIcOHRx+n25f1uT6T4uEx7u0Sniuk9Jr4MCByfaRMP7Dhw+bUqVKJdlHjhw5rPd4165d5r777kt2XaxatSrZ+d3ptpXSOVZCCT8bE37+xevbt2+q71tgYGCK80iOs7Z/Y4yJioqy9oPkXjVq1DAnTpxIto+pU6da5xjJrdv33nsv2ekTbv/pPZdMuK0m9flijDFnz541Dz/8cIrLmi9fPrNp06ZkYwWAexEFbADIxm4vYBtjTL9+/axh69evTzTN3S5g16pVy0gyTzzxhFmwYIHZsWOHWbRokTVckvnyyy/NK6+8YiSZp59+2ixZssTs2LHDzJ49266Q9OOPPyYZS/z46tWrG0mmdOnSZtKkSWbbtm1mxYoV5qWXXrKKk35+fubw4cNJ9vPyyy9bfT300EPm66+/NmvWrDFbt241X375pSlfvrw1/rvvvks0fcJE8v777zc2m8106tTJ/PDDD2bHjh1m4cKFZunSpSm+r7f79NNPrT6Dg4PNRx99ZDZt2mQ2bNhgIiIirAKmzWYzP/zwg920u3fvNrt37zYtWrQwkkxISIg1LP7liLi4ONOkSRMrnmbNmpnp06ebjRs3mk2bNpmPP/7YFC5c2EgyuXLlMtu2bUvUx5gxY6zpIyIi7MadOXPGKpKFhISY06dPW+POnTtndu/ebfdlyO3Lsnv3bnP9+nVrmvhCSdGiRY2Pj48JDg42I0aMMBs3bjSbN28248aNM//++68x5lYx183NzTz88MNm5MiR5qeffj
I7duwwa9asMV9//bWpXbu2Nd9OnTol+x69/fbbVruAgAAzePBgs3z5chMZGWlWrVplPvroI1OnTh0THh5uTXPgwAG
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Famous people\n",
"fame_info_list = [peter_marcin_info, sajfa_info, jan_kolenik_info]\n",
"execute_visualisation('Famous people', fame_info_list)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABd0AAAHyCAYAAAAeHmcPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACtN0lEQVR4nOzdd3QU5fv38c+mN5JAAoQSCCX0XqWHIr13iIIoIigiVQUFgooFEcHGVwWpAVQEpAkCEqSXxAACIiUUAQHpvSTz/MGT+WVJhw0Jy/t1zp6TzNxzzzUzW+a6dvYei2EYhgAAAAAAAAAAwENzyOwAAAAAAAAAAACwFxTdAQAAAAAAAACwEYruAAAAAAAAAADYCEV3AAAAAAAAAABshKI7AAAAAAAAAAA2QtEdAAAAAAAAAAAboegOAAAAAAAAAICNUHQHAAAAAAAAAMBGKLoDAAAAAAAAAGAjFN0BAECWZ7FYZLFYFBYWltmhZFmxsbGaNGmSqlWrJm9vb3OftW3bNrNDQxbH6yvzhYWFmccBSMqRI0fM58j06dMzOxwAAJAKiu4AAGRhERERZpJtsVjUpUuXVJd57rnnKN48gbp166aBAwdq+/btunLlSmaHAwB4gkVGRuqVV15RhQoV5OvrKycnJ3l5ealo0aJq1qyZRo4cqbVr1+rOnTuZHSoAABmCojsAAI+RH3/8Ubt3787sMJDFbNq0ST/++KMkqUWLFlq1apV27dql3bt367PPPsvk6NLHnq/4DQoKksVi0XPPPZfZoQCPJa72zvru3r2rfv36qUqVKvrqq6+0c+dOXbp0SbGxsbp27ZoOHTqkFStW6L333lODBg00derUzA7ZZhJeKBEREZHZ4QAAMplTZgcAAADSzjAMjR49WgsWLMjsUJCFrF69WpLk6OioOXPmyNvbO5MjAgDYUlBQkAzDyOwwUtW/f399/fXXkqQ8efLopZdeUs2aNZUzZ07duHFDR44c0ebNm/Xzzz/r2LFjmRwtAAAZh6I7AACPCX9/f/33339auHCh/vjjD1WsWDGzQ0IWceLECUlS7ty5KbgDADLFn3/+qW+++UaSVKFCBa1du1a+vr5WbWrUqKFu3brps88+06pVq+Th4ZEJkQIAkPEYXgYAgMfEgAED5OrqKkkaNWpUJkeDrOTWrVuSJGdn50yOBADwpFq8eLF5Nf57772XqOB+v6efflq1atV6BJEBAPDoUXQHAOAxERgYqD59+kiSli5dqm3btj1QP2kdVzr+hqxBQUGJ5iU1ru6CBQvUuHFj5cqVS56enipfvrw+//xzq5ukGYahOXPmKCQkRLly5ZKHh4cqVaqk//3vf+n62fzq1avVunVr5cmTR25ubipcuLD69+9vXvGdmqioKPXt21fFixeXl5eXPD09Vbx4cfXr109///13sstNnz7d3O4jR47o1q1bmjhxop566in5+/vLYrEoLCwszduR0O7du9WnTx8FBwfLw8ND2bJlU+nSpTVo0CAdOXIkyWXiY5kxY4Yk6ejRo1Y33n2YcdHXrl2rnj17qnDhwvLw8JC3t7fKli2rYcOG6eTJk4na37p1S+XKlZPFYlG2bNl0+PDhZPsePHiwGd+0adMk/d++HTNmTKLtS/hIuC9CQkJksVgUEhIiSTpw4ID69+9v7sP72586dUpfffWVOnbsqODgYHl6esrV1VX58uVTmzZt9P333ysuLi5N++fIkSN64403VLlyZfn5+cnZ2Vn+/v6qU6eOwsLCrLY/Ps6jR49KkmbMmJFou+K34X7//vuv3nrrLVWpUkU5cuSQq6urAgMD1blzZ3NYodTEv+ayZ88uLy8vlSlTRqNHj9bFixfTtHxq7n+vOHXqlN544w2VLl1a2bJlS3J849jYWM2YMUMtW7ZU3rx55erqKj8/P9WuXVsTJkzQjRs3Ul1vXFyc5s6dqw4dOqhAgQJyd3eXu7u7ihUrptDQUM2fPz/ZmzTevn
1bX331lerXr6+cOXPKxcVFAQEBat68uWbPnp3i8+BBtveff/7RK6+8osKFC8vNzU158+ZV69at03wMJWnhwoVq27at8ufPL1dXV2XLlk2FCxdWnTp1NHLkyAf+TIh39epVffjhh6pRo4b5XMufP786duyopUuXprjs/a/FEydOaPDgwSpatKjc3d3l5+enJk2a6Jdffnng+CwWiwoVKmT+36tXr0Svo6Tef+Pi4jR79mw1b95cAQEBcnFxUc6cOVW/fn199dVXun37drLrvP8eExcvXtTo0aNVunRpeXl5KUeOHKpfv77mzp2bYuz3f+5u375d3bp1U2BgoNzc3BQYGKhevXrpr7/+SraP9Ixnv3HjRvXu3VvFixeXt7e3XFxclD9/frVs2VJffvmlzV7790s4XEzRokUfuJ+kPmvHjx+vSpUqycfHR97e3qpevbq++uorxcbGptrf2bNn9fbbb6tixYry9fWVm5ubgoKC9Oyzz2rDhg0pLnv/sYuMjNRzzz2nQoUKydXV1YzRYrGofv365nL169dP9Py8/7idPHlSb775prldzs7Oyp07t8qWLatu3bpp+vTpunz5crr3HwAgizAAAECWtXbtWkOSIcmYNm2acfLkScPd3d2QZDRu3DjJZXr27Gkuk5SCBQsakoyePXumuO74fgoWLJhoXkxMjFVc/fr1M/+//9G+fXvj7t27xs2bN42OHTsm2+7FF19MNpb4NqNHjzbCwsKS7cPHx8f4/fffk+0nNjbWGDRokGGxWJLtw8nJyfj666+TXH7atGlmu+3btxsVKlRItPzo0aNT3K9Jef/99w0HB4dkY3J1dTVmzJiR7H5J6ZFeN27cMLp27Zpin56ensbixYsTLbtr1y7D1dXVkGTUqFHDuHv3bqI2q1atMvd/hw4dzOkJ921Kj5iYGHOZevXqGZKMevXqGYsWLTI8PT2TbX/37t0U93H84+mnnzauXLmS4j76+OOPDWdn5xT7qVevXqI409o+3uzZs5PcpoSPF154wbhz506Scd65c8fo1KlTsssWLlzYOHz48EM9dw3D+r1i8+bNhr+/f6J1rV271mx/9OhRo3z58iluV9GiRY39+/cnu86YmJgkX38prTfhsiVKlEhxudq1axvnzp2zyfb+/vvvhre3d7LrCgsLM0aPHp3sa/bu3bspHsf4R+XKldN8zO4XFRVl5M2bN8X+27dvb9y4cSPJ5RO+Fjds2JDkPol/fPzxxw8UY1reH+5/Dp87d86oVatWisuULFnSOHLkSJLrTHhcDh8+bBQpUiTZfjp37pzsazHh5+7UqVMNJyenJPtwdXU1fvjhhyT7uP9zNynXr183unXrlu79dP+2Jtd/al599VWzj4ULFz5QH4Zh/XkQFRVlVK5cOdltqVu3borv2StXrkzx9SfJeOWVV4zY2Ngkl0947CZPnpzksUt4bFJ6JNyvqb0vxD+WLFnywPsRAJC5KLoDAJCF3V90NwzDGDx4sDlt/fr1iZZ51EX36tWrG5KM5s2bGwsWLDAiIyONRYsWmdMlGd9++62ZjHfv3t1YunSpERkZacybN8+q+PXLL78kGUv8/CpVqhiSjOLFixtTp041tm/fbqxevdp46aWXzIKqt7e3cezYsST7efnll60S9e+++86IiIgwtm3bZnz77bdG6dKlzfk///xzouUTFgLKlStnWCwWo0ePHsayZcuMyMhIY+HChcby5ctT3K/3+/LLL80+c+bMaYwfP97YvHmzsWHDBiMsLMwsulosFmPZsmVWy+7evdvYvXu30aZNG0OSkTdvXnNa/CM94uLijBYtWpjxtGrVypg1a5axceNGY/PmzcakSZOMAgUKGJIMFxcXY/v27Yn6+PTTT83lw8LCrOadO3fOLOzlzZvX+O+//8x5Fy5cMHbv3m31Bc7927J7927j9u3b5jLxhb5ChQoZXl5eRs6cOY0PP/zQ2Lhxo7Flyxbj888/N86ePWsYxr0CtI
ODg9GgQQPj448/NlasWGFERkYaERERxnfffWfUqFHDXG+PHj2S3UfvvPOO2c7X19cYMWKEsWrVKiMqKsr47bffjPH
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sports\n",
"sport_info_list = [sport24_info, dominika_cibulkova_info, hetrik_info, RTVSsport_info, sagan_info]\n",
"execute_visualisation('Sports', sport_info_list)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABb8AAAHyCAYAAADLDLNCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACsJklEQVR4nOzdd3QU5fv38c+GkJBKIAklEAgQSuhF6V3pvYgiSmgiCFJFga9SRIogoCAgKhIQMBRBEYFIC01KKEGkd0Sa1FATSOb5gyfzy5JCAgmB5f06Z8/JztxzzzWzs5u5rp29x2IYhiEAAAAAAAAAAGyIXXoHAAAAAAAAAABAaqP4DQAAAAAAAACwORS/AQAAAAAAAAA2h+I3AAAAAAAAAMDmUPwGAAAAAAAAANgcit8AAAAAAAAAAJtD8RsAAAAAAAAAYHMofgMAAAAAAAAAbA7FbwAAAAAAAACAzaH4DQCADbNYLLJYLBo2bFh6h/LMio6O1ldffaXy5cvL3d3d3GfNmzdP79DwjOP9lf6GDRtmvg5AQk6ePGkeI0FBQekdDmwAnzsA8Hyh+A0AsEmhoaFmYmKxWPT6668/cpkOHTqQzLyA2rZtqz59+igsLEw3btxI73AAAC+gmjVrWp23xH1kzJhR3t7eql69usaMGaMrV66kd7gAADw3KH4DAF4ICxcu1N69e9M7DDxj/vzzTy1cuFCS1KhRI61atUp//fWX9u7dq0mTJqVzdCljy1ei+fn5yWKxqEOHDukdCvBc4urn59v9+/d16dIlbdy4UYMGDVJAQIA2b96crjHZ8v8cAIBtsU/vAAAAeBoMw9DQoUO1ePHi9A4Fz5DVq1dLkjJkyKB58+bJ3d09nSMCAKQmPz8/GYaR3mGkyMNf1kdFRen48eP68ccftXTpUl28eFFNmjTRoUOH5O3tnU5RvriGDRvGcFcA8Bzhym8AgM3z8vKSJC1ZskS7d+9O52jwLPn3338lSdmzZ6fwDQB4JhQvXtzqUbZsWbVu3Vq//vqr2rdvL0m6evWqvv/++3SOFACAZx/FbwCAzevVq5ccHR0lSUOGDEnnaPAsiYyMlCRlzJgxnSMBAODRBgwYYP4dFhaWjpEAAPB8oPgNALB5vr6+6tq1qyRp2bJl2r59+2P1k9xxh2NvnOnn5xdvXkLjri5evFh169ZVtmzZ5OLiolKlSmny5Mm6d++euZxhGJo3b55q1qypbNmyydnZWWXLltU333yTop9zr169Wk2bNlXOnDmVKVMm5c+fXz179jSvgH6UXbt2qVu3bipcuLBcXV3l4uKiwoULq3v37jp8+HCiywUFBZnbffLkSUVGRurLL79UxYoV5eXlJYvF8tg/Id67d6+6du2qggULytnZWW5ubipWrJj69u2rkydPJrhMbCyzZs2SJJ06dSreDcYe17p16xQYGKj8+fPL2dlZ7u7uKlGihAYMGKCzZ8/Gax8ZGamSJUvKYrHIzc1Nx48fT7Tvfv36mfHNnDlT0v/t2+HDh8fbvriPuPsi9sZqNWvWlCQdOXJEPXv2NPfhw+3PnTunqVOnqnXr1ipYsKBcXFzk6OioXLlyqVmzZpo/f75iYmKStX9Onjypjz76SOXKlZOnp6cyZswoLy8vVatWTcOGDbPa/tg4T506JUmaNWtWvO2K3YaHnT9/Xv/73//00ksvKWvWrHJ0dJSvr6/atGljDnfzKLHvuSxZssjV1VXFixfX0KFDde3atWQt/ygPf1acO3dOH330kYoVKyY3NzdZLBaFhoZaLRMdHa1Zs2apcePG8vHxkaOjozw9PVW1alVNmDBBd+7ceeR6Y2Ji9NNPP6lVq1bKkyePnJyc5OTkpEKFCqldu3ZatGiR1edPXFFRUZo6dapq1aolb29vOTg4KEeOHGrYsKHmzJmT5HHwONt75swZ9ejRQ/nz51emTJnk4+Ojpk2bJvs1lB786qd58+bKnTu3HB0d5ebmpvz586
tatWr65JNPHvt/QqybN29qzJgxqlSpknms5c6dW61bt9ayZcuSXPbh9+K///6rfv36yd/fX05OTvL09FS9evW0YsWKx47PYrEoX7585vOOHTvGex8l9PkbExOjOXPmqGHDhsqRI4ccHBzk7e2tWrVqaerUqYqKikp0nQ+PB33t2jUNHTpUxYoVk6urq7JmzapatWrpp59+SjL2h//vhoWFqW3btvL19VWmTJnk6+urjh076uDBg4n2kZLxzjdv3qwuXbqocOHCcnd3l4ODg3Lnzq3GjRtrypQpqfbefxJxX8vYL3BjLV261NzW4ODgR/bVv39/WSwW2dvbJ/j/KSGP8z8n1n///aePP/5YZcqUkYeHhzJlyiQ/Pz+9/fbb2rRpU5LrTY1jIe7N0ENDQxUTE6PvvvtOlStXVtasWc1zsNGjR+vu3buJ9pPc8c4jIyP17bffqlGjRsqVK5ccHR3l4uKiYsWKqUuXLgoJCXnuhuQBgOeSAQCADVq3bp0hyZBkzJw50zh79qzh5ORkSDLq1q2b4DKBgYHmMgnJmzevIckIDAxMct2x/eTNmzfevBMnTljF1b17d/P5w4+WLVsa9+/fN+7evWu0bt060XbvvPNOorHEthk6dKgxbNiwRPvInDmzsWHDhkT7iY6ONvr27WtYLJZE+7C3tzemT5+e4PIzZ84024WFhRmlS5eOt/zQoUOT3K8JGTVqlGFnZ5doTI6OjsasWbMS3S9JPVLqzp07xhtvvJFkny4uLsbSpUvjLfvXX38Zjo6OhiSjUqVKxv379+O1WbVqlbn/W7VqZU6Pu2+Tepw4ccJcpkaNGoYko0aNGsYvv/xiuLi4JNr+/v37Se7j2EedOnWMGzduJLmPxo0bZ2TMmDHJfmrUqBEvzuS2jzVnzpwEtynuo3Pnzsa9e/cSjPPevXvGa6+9luiy+fPnN44fP/5Ex65hWH9WbNmyxfDy8oq3rnXr1pntT506ZZQqVSrJ7fL39zcOHTqU6DpPnDiR4PsvqfXGXbZIkSJJLle1alXj8uXLqbK9GzZsMNzd3RNd17Bhw4yhQ4cm+p69f/9+kq9j7KNcuXLJfs0etmvXLsPHxyfJ/lu2bGncuXMnweXjvhc3bdqU4D6JfYwbN+6xYkzO58PDx/Dly5eNKlWqJLlMQECAcfLkyQTXGfd1OX78uFGgQIFE+2nTpk2i78W4/3dnzJhh2NvbJ9iHo6OjsWDBggT7ePj/bkJu375ttG3bNsX76eFtTaz/5Ij7eZeUffv2me26detmNe/+/ftGzpw5DUlGvXr1kuzn3r17RrZs2QxJRqNGjZId5+P8zzEMwwgJCUny/SzJ6NGjhxEdHZ3gelPjWIh7bhgSEmLUr18/0ViKFi1qnDt3LsF+kvrcibV7924jX758Kd5PAIDUR/EbAGCTHi5+G4Zh9OvXz5y2cePGeMs87eJ3hQoVDElGw4YNjcWLFxs7d+40fvnlF3O6JOO7774z3n//fUOS8eabbxrLli0zdu7caQQHB1sVoVasWJFgLLHzX3rpJUOSUbhwYWPGjBlGWFiYsXr1auPdd981C5vu7u7G6dOnE+znvffeM/uqXr268cMPPxihoaHG9u3bje+++84oVqyYOf/XX3+Nt3zcZLlkyZKGxWIx2rdvb/z+++/Gzp07jSVLlhjLly9Pcr8+bMqUKWaf3t7exhdffGFs2bLF2LRpkzFs2DCz+GmxWIzff//datm9e/cae/fuNZo1a2ZIMnx8fMxpsY+UiImJMRo1amTG06RJE+PHH380Nm/ebGzZssX46quvjDx58hiSDAcHByMsLCxeHxMnTjSXHzZsmNW8y5cvmwU2Hx8f49KlS+a8q1evGnv37rX6IuXhbdm7d68RFRVlLhNbZMmXL5/h6upqeHt7G2PGjDE2b95sbN261Zg8ebLx33//GYbxoEBiZ2dn1K5d2xg3bpyxcuVKY+fOnUZoaKjxww8/GJUqVTLX2759+0T30a
effmq28/DwMAYPHmysWrXK2LVrl7F27Vrjiy++MCpXrmzUrFnTXOb48ePG3r17zW1v1qxZvO06fvy41Xrmz59vfkm
"text/plain": [
"<Figure size 1700x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# By toppic\n",
"execute_visualisation(None, [politics_info_list, memes_info_list, media_info_list, fame_info_list, sport_info_list])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Language identification"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"lang_identification_pipeline = pipeline(\"text-classification\", model=\"papluca/xlm-roberta-base-language-detection\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▍ | 333/8050 [00:17<05:52, 21.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (535) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 535]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 26%|██▌ | 2113/8050 [01:53<04:38, 21.31it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (524) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 524]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 77%|███████▋ | 6215/8050 [05:33<01:25, 21.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (554) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 554]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 8050/8050 [07:14<00:00, 18.53it/s]\n"
]
}
],
"source": [
"cibulkova_idxs = np.empty(0, dtype=np.int32)\n",
"\n",
"for idx, text in enumerate(tqdm(dominika_cibulkova_samples)):\n",
" try:\n",
" identified_text = lang_identification_pipeline(text['text'])[0]\n",
" if identified_text['label'] in ['en', 'es'] and identified_text['score'] > 0.70:\n",
" cibulkova_idxs = np.append(cibulkova_idxs, idx)\n",
" except Exception as err:\n",
" print(err)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 33%|███▎ | 6049/18515 [05:07<09:39, 21.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (1268) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 1268]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 51%|█████ | 9469/18515 [07:58<06:49, 22.10it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (1764) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 1764]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 66%|██████▌ | 12208/18515 [10:15<05:12, 20.16it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (1036) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 1036]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 71%|███████ | 13139/18515 [11:02<04:08, 21.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (959) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 959]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 72%|███████▏ | 13298/18515 [11:10<03:59, 21.81it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The expanded size of the tensor (893) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 893]. Tensor sizes: [1, 514]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 18515/18515 [15:34<00:00, 19.81it/s]\n"
]
}
],
"source": [
"sagan_idxs = np.empty(0, dtype=np.int32)\n",
"\n",
"for idx, text in enumerate(tqdm(sagan_samples)):\n",
" try:\n",
" identified_text = lang_identification_pipeline(text['text'])[0]\n",
" if identified_text['label'] in ['en', 'es'] and identified_text['score'] > 0.70:\n",
" sagan_idxs = np.append(sagan_idxs, idx)\n",
" except Exception as err:\n",
" print(err)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dropping english and spannish data from famous slovak sportsmen"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"cibulkova_idxs_reversed = cibulkova_idxs[::-1]\n",
"for idx in cibulkova_idxs_reversed:\n",
" dominika_cibulkova_samples.pop(idx)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"sagan_idxs_reversed = sagan_idxs[::-1]\n",
"for idx in sagan_idxs_reversed:\n",
" sagan_samples.pop(idx)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"with open('clustered_jsonl/dominika_slovak.jsonl', 'w') as f:\n",
" for item in dominika_cibulkova_samples:\n",
" json.dump(item, f)\n",
" f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get number of posts in the data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_num_of_posts(data: list, info: dict):\n",
" return len([item for item in data if item['id'] in info['post_ids']])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data merging"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Meme data merging"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"meme_samples_final = []\n",
"\n",
"meme_samples_final += zomri_samples\n",
"meme_samples_final += emefka_samples\n",
"\n",
"# ok ale ides prvy dont consist any type of criticism or hate\n",
"post_meme = get_num_of_posts(zomri_samples, zomri_info) + get_num_of_posts(emefka_samples, emefka_info)\n",
"meme_share = len(meme_samples_final)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Media data merging"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"media_samples_final = []\n",
"\n",
"eva_hate = eva_samples[250: 387]\n",
"\n",
"eva_hate_1 = eva_samples[700:]\n",
"\n",
"media_samples_final += eva_hate\n",
"media_samples_final += eva_hate_1\n",
"media_samples_final += aktuality_samples\n",
"media_samples_final += dennikN_samples\n",
"media_samples_final += tvJOJ_samples\n",
"\n",
"post_media = get_num_of_posts(eva_hate, eva_info) + get_num_of_posts(eva_hate_1, eva_info) + get_num_of_posts(aktuality_samples, aktuality_info) + get_num_of_posts(dennikN_samples, dennikN_info) + get_num_of_posts(tvJOJ_samples, tvJOJ_info)\n",
"media_share = len(media_samples_final)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Famous people data merging"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"famous_samples_final = []\n",
"\n",
"# jan_kolenik_samples he has just possitive conversations on his profile\n",
"famous_samples_final += sajfa_samples[:240]\n",
"# peter_marcin_samples has also positive conversations\n",
"\n",
"post_fame = get_num_of_posts(sajfa_samples[:240], sajfa_info)\n",
"fame_share = len(famous_samples_final)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sports"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"sport_samples_final = []\n",
"\n",
"sport_samples_final += sport24_samples\n",
"sport_samples_final += RTVSsport_samples\n",
"\n",
"post_sport = get_num_of_posts(sport24_samples, sport24_info) + get_num_of_posts(RTVSsport_samples, RTVSsport_info)\n",
"sport_share = len(sport_samples_final)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Politician data merging"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"politican_samples_final = []\n",
"\n",
"# fico_samples\n",
"politican_samples_final += ekalinak_samples\n",
"politican_samples_final += kotleba_samples\n",
"politican_samples_final += fico_samples[:50000]\n",
"\n",
"post_politics = get_num_of_posts(ekalinak_samples, ekalinak_info) + get_num_of_posts(kotleba_samples, kotleba_info) + get_num_of_posts(fico_samples[:50000], fico_info)\n",
"politics_share = len(politican_samples_final)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pie chart of data ratio in dataset"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAcoAAAGbCAYAAABETtCOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB/M0lEQVR4nO3dd1xW5f/H8dd9s/ceCigioCgu3HtvTcsyV0qmpmlmZUP7llqZlT/LtNSsXA2tzJVp7j1yb0AEkSkCsjfc5/eHSpKICziMz/Px4CGc+9znfM7NLe/7Ouc616VRFEVBCCGEEEXSql2AEEIIUZ5JUAohhBDFkKAUQgghiiFBKYQQQhRDglIIIYQohgSlEEIIUQwJSiGEEKIYEpRCCCFEMSQohRBCiGJIUArVubu74+/vr8q+586di4eHB3p6ejRu3FiVGp6ERqNh5syZZbKvvXv3otFo2Lt3b4ltMy8vj7fffhs3Nze0Wi0DBw4Eyua4OnXqRKdOnR77+WX52gt1SVCKMnH48GFmzpxJUlKS2qUU2L59O2+//TZt27Zl+fLlfPLJJ/dd95dffmH+/PllV1wVsWzZMubOncuzzz7LypUref3119UuqdRdunSJmTNnEhYWpnYpgLy3H4a+2gWIquHw4cPMmjULf39/rK2tCz0WFBSEVlv2n9l2796NVqvlhx9+wNDQsNh1f/nlFy5cuMCUKVPKpriHlJmZib5+2fw37tChA5mZmQ98rR7F7t27cXFx4csvvyy0vCyPq6xdunSJWbNm0alTJ9zd3dUup9y+t8uTyvlOFKUuPT0dMzOzEtmWkZFRiWznUd24cQMTE5MS/cNf1oyNjctsX1qttsT3d+PGjXs+OEHZHpcQD6QI8QAzZsxQAOXixYvK0KFDFWtra6Vx48aKoijK2bNnlVGjRim1atVSjIyMFCcnJ+XFF19U4uPj73n+f7+uXr2qKIqi1KxZUxk1alShfYaEhCjPPvusYmNjo5iYmCgtW7ZUNm/e/FD15ubmKh9++KHi4eGhGBoaKjVr1lSmTZumZGVlFaxTVD3Lly8vcnsdO3a8Z92aNWsWPB4bG6uMHj1acXR0VIyMjJSGDRsqK1asKLSNq1evKoAyd+5c5YsvvlBq1KihGBsbKx06dFDOnz9faN1Ro0YpZmZmSkhIiNKjRw/F1NRUqVatmjJr1ixFp9MVWhdQZsyYUWhZZGSkMnr0aKVatWqKoaGh4u7urowfP17Jzs5WFEVRcnJylJkzZyqenp6KkZGRYmtrq7Rt21bZvn17sa/rnj17FEDZs2dPodemfv36ysWLF5VOnTopJiYmSvXq1ZXPPvus2G3deT3++3Vn2/89rjvvoeDgYGXUqFGKlZWVYmlpqfj7+yvp6emFtr1s2TKlc+fOioODg2JoaKj4+PgoixYtuqeGjh07Kh07diy2TkVRlKysLGXKlCmKvb29Ym5urvTv31+JiIi4p8awsDBlwoQJire3t2JsbKzY2toqzz77bMH7XFEUZfny5cUe94YNG5Q+ffoU/O48PDyUDz/8UMnLyytU0+XLl5VnnnlGcXJyUoyMjBQXFxfl+eefV5KSkgqt9+OPPyp+fn6KsbGxYmNjozz//PNKeHh4odeguPe2uEValOKhPffcc3h5efHJJ5+g3J6dbceOHYSGhvLiiy/i7OzMxYsXWbp0KRcvXuTo0aNoNBqeeeYZLl++zOrVq/nyyy+xt7cHwMHBocj9xMbG0qZNGzIyMpg8eTJ2dnasXLmSp556irVr1/L0008XW+eYMWNYuXIlzz77LG+++Sb//PMPc+bMISAggPXr1wPw448/snTpUo4dO8b3338PQJs2bYrc3nvvvUdycjKRkZEFpwjNzc2BW6cIO3XqxJUrV5g0aRK1atXi999/x9/fn6SkJF577bVC21q1ahWpqalMnDiRrKwsvvrqK7p06cL58+dxcnIqWC8/P59evXrRqlUrPv/8c/
7++29mzJhBXl4eH3744X2PPTo6mhYtWpCUlMS4ceOoW7cuUVFRrF27loyMDAwNDZk5cyZz5sxhzJgxtGjRgpSUFE6cOMGpU6fo3r17sa9tURITE+nVqxfPPPMMgwcPZu3atbzzzjs0aNCA3r17F/kcBwcHfvzxR2bPnk1aWhpz5swBwMfHp9h9DR48mFq1ajFnzhxOnTrF999/j6OjI5999lnBOosXL6Z+/fo89dRT6Ovr8+eff/LKK6+g0+mYOHHiIx/fmDFj+Omnnxg2bBht2rRh9+7d9O3b9571jh8/zuHDhxkyZAiurq6EhYWxePFiOnXqxKVLlzA1NaVDhw5MnjyZBQsWMH369ILjvfPvihUrMDc354033sDc3Jzdu3fzwQcfkJKSwty5cwHIycmhZ8+eZGdn8+qrr+Ls7ExUVBSbN28mKSkJKysrAGbPns3777/P4MGDGTNmDHFxcSxcuJAOHTpw+vRprK2ti31vi7uondSi/LvzaX7o0KH3PJaRkXHPstWrVyuAsn///oJlc+fOLdSKvNt/W5RTpkxRAOXAgQMFy1JTU5VatWop7u7uSn5+/n1rPXPmjAIoY8aMKbR86tSpCqDs3r27YNmdltvD6Nu3b5GftOfPn68Ayk8//VSwLCcnR2ndurVibm6upKSkKIrybwvKxMREiYyMLFj3n3/+UQDl9ddfL1QXoLz66qsFy3Q6ndK3b1/F0NBQiYuLK1jOf1o1I0eOVLRarXL8+PF7ar3TGm3UqJHSt2/fhzruu92vRQkoq1atKliWnZ2tODs7K4MGDXrgNu+0SP/rv8d15z04evToQus9/fTTip2dXaFlRb0ne/bsqXh4eNyz7we1KO+8n1555ZVCy4cNG3ZPjUXt98iRI/e8Pr///vs9r2Nx23j55ZcVU1PTgjMip0+fVgDl999/v2/dYWFhip6enjJ79uxCy8+fP6/o6+sXWn6/97b4l/R6FQ9t/Pjx9ywzMTEp+D4rK4v4+HhatWoFwKlTpx5rP1u2bKFFixa0a9euYJm5uTnjxo0jLCyMS5cuFftcgDfeeKPQ8jfffBOAv/7667FqKm5/zs7ODB06tGCZgYEBkydPJi0tjX379hVaf+DAgbi4uBT83KJFC1q2bFlQ990mTZpU8L1Go2HSpEnk5OSwc+fOImvR6XRs2LCB/v3706xZs3se12g0AFhbW3Px4kWCg4Mf7WDvw9zcnBEjRhT8bGhoSIsWLQgNDS2R7d/tv+/B9u3bk5CQQEpKSsGyu9+TycnJxMfH07FjR0JDQ0lOTn6k/d35vUyePLnQ8qI6vty939zcXBISEvD09MTa2vqh/y/cvY3U1FTi4+Np3749GRkZBAYGAhS0GLdt20ZGRkaR21m3bh06nY7BgwcTHx9f8OXs7IyXlxd79ux5qHrELRKU4qHVqlXrnmU3b97ktddew8nJCRMTExwcHArWe9Q/Sndcu3aNOnXq3LP8zumpa9euFftcrVaLp6dnoeXOzs5YW1sX+9zHrdXLy+ueXrv3q9XLy+uebXh7e99zq4BWq8XDw+Oe9YD73lYQFxdHSkoKvr6+xdb84YcfkpSUhLe3Nw0aNOCtt97i3LlzxT6nOK6urgUhfIeNjQ2JiYmPvc37qVGjxj37AQrt69ChQ3Tr1g0zMzOsra1xcHBg+vTpwKO/J++8n2rXrl1oeVHvz8zMTD744APc3NwwMjLC3t4eBwcHkpKSHnq/Fy9e5Omnn8bKygpLS0scHBwKPoTc2UatWrV44403+P7777G3t6dnz5588803hfYRHByMoih4eXnh4OBQ6CsgIIAbN2480utQ1ck1SvHQ7v60e8fgwYM5fPgwb731Fo0bN8bc3BydTkevXr3Q6XQqVHnLf/9wi3916NCBkJAQNm7cyPbt2/n+++/58ssvWbJkCWPGjHnk7enp6RW5XLl9HbskPWhfISEhdO3albp16/LFF1/g5uaGoaEhW7Zs4csvvyzV9+Srr7
7K8uXLmTJlCq1bt8bKygqNRsOQIUMear9JSUl07NgRS0tLPvzwQ2rXro2xsTGnTp3inXfeKbSNefPm4e/vX/A7nDx
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.pie(\n",
" [meme_share, media_share, fame_share, sport_share, politics_share],\n",
" labels=['Memes', 'Medias', 'Famous people', 'Sports', 'Politicians'],\n",
" autopct='%1.1f%%',\n",
" wedgeprops= {\n",
" 'edgecolor': 'BLACK'\n",
" },\n",
" colors=[BLUE, GREEN, AQUA, ORANGE, YELLOW]\n",
")\n",
"\n",
"plt.title('ratio of toppics in final dataset')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"final_dataset = []\n",
"\n",
"final_dataset += meme_samples_final\n",
"final_dataset += media_samples_final\n",
"final_dataset += famous_samples_final\n",
"final_dataset += sport_samples_final\n",
"final_dataset += politican_samples_final\n",
"\n",
"final_post_num = post_fame + post_media + post_meme + post_sport + post_politics"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset has 171579 samples. 468 are posts out of it.\n"
]
}
],
"source": [
"print(f'Dataset has {len(final_dataset)} samples. {final_post_num} are posts out of it.')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pie chart of post ratio in the dataset"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAdgAAAGbCAYAAABnFYFbAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB9kElEQVR4nO3dd3xN9x/H8dfN3iLbCBGCiD1ib7VD7CpqbzVqVVtbKb+qWbNmzdqjRlFU1SaoGSGxEyQiMm/uPb8/wq00tsTJ+DwfjzySnHvG50Tkfb/f8z3fo1EURUEIIYQQqcpI7QKEEEKIzEgCVgghhEgDErBCCCFEGpCAFUIIIdKABKwQQgiRBiRghRBCiDQgASuEEEKkAQlYIYQQIg1IwAohhBBpQAI2nfHw8KBTp06qHPt///sfnp6eGBsbU7JkSVVq+Jhq1KhBjRo1UnWfJ06coFKlSlhbW6PRaAgICGDMmDFoNJpUPc5/HThwAI1Gw4EDB95r+49RoxBZjQSsCv7++2/GjBnD48eP1S7F4Pfff2fYsGFUrlyZJUuWMHHiRLVLSubixYuMGTOG4OBgtUt5Ja1WS6tWrQgPD2fatGn88ssv5M2bV+2y0tycOXNYunSp2mUAcPfuXcaMGUNAQIDapQgBivjo/ve//ymAcuPGjRSvxcXFKQkJCR+9puHDhytGRkZKfHz8Rz/221i3bp0CKPv370+1fcbHx6fq+V66dEkBlIULFyZbrtVqldjY2FQ7zsvs37//g34+o0ePVt73z4GPj49SvXr199o2tZ04cUIBlCVLlqhdihCKiZrhnllER0djbW2dKvsyNzdPlf28q7CwMCwtLTEzM1Pl+GpI7XMNCwsDwN7ePtlyExMTTEzkv5oQWY7aCZ/RPH+nf+HCBaVt27aKvb29UrJkSUVRFOXs2bNKx44dlXz58inm5uaKq6ur0rlzZ+Xhw4cptv/vx/PWbN68eZWOHTsmO2ZQUJDSsmVLJXv27IqlpaVSvnx5Zfv27W9Vr1arVcaNG6d4enoqZmZmSt68eZURI0YocXFxhnVeVs/rWgDVq1dXfHx8lJMnTyoVK1ZULCwsFA8PD2Xu3Lkp1g0NDVW6dOmiuLi4KObm5krx4sWVpUuXplhv9erVSunSpRUbGxvF1tZWKVq0qDJ9+nRFURRlyZIlL63xeWvtxIkTSt26dRVHR0dDLZ07d37jz6Z69erJWl7PW4Fr165VJkyYoOTKlUsxNzdXatWqpQQGBr52Xx07dkxR3/N9v6x1CCh9+/ZVNm3apPj4+ChmZmZKkSJFlJ07dyZbLzg4WOndu7dSsGBBxcLCQnFwcFBatmyZovfjXVqwhw4dUsqWLauYm5srnp6eyrx5815a4+LFi5WaNWsqzs7OipmZmeLt7a3MmTMn2Tp58+Z95Xk/evRIGTx4sFK0aFHF2tpasbW1VerXr68EBASkqGnmzJlKkSJFFEtLS8Xe3l4pU6aMsnLlymTr3L59W+ncubPi4uJi+HktWrQoxc/gXX6XhUhL8rb6PbVq1QovLy8mTpyI8uyJf3v27OH69et07twZNzc3Lly4wIIFC7hw4QJHjx5Fo9HQvHlzrl69yurVq5k2bRpOTk4AODs7v/Q4oaGhVKpUiZiYGPr374+joyPLli2jSZMmrF+/nmbNmr22zm7durFs2TJatmzJ4MGDOXbsGJMmTeLSpUts2rQJgF9++YUFCxZw/Phxfv75ZwAqVar02v1GRETQsGFDWrduTdu2bfn111/p3bs3ZmZmdOnSBYDY2Fhq1KjBtWvX6NevH/ny5WPdunV06tSJx48fM2DAAMPPrW3bttSuXZvJkycDcOnSJQ4fPsyAAQOoVq0a/fv3Z+bMmXz99dd4e3sD4O3tTVhYGHXr1sXZ2ZmvvvoKe3t7goOD2bhx4xv/DV/l+++/x8jIiCFDhhAZGcmUKVNo164dx44de+U2PXv2JFeuXEycOJH+/ftTrl
w5XF1dX3ucv/76i40bN9KnTx9sbW2ZOXMmLVq04ObNmzg6OgJJg6b+/vtvPv30U3Lnzk1wcDBz586lRo0aXLx4ESsrq3c6t/Pnzxt+XmPGjCExMZHRo0e/tNa5c+fi4+NDkyZNMDExYdu2bfTp0we9Xk/fvn0BmD59Ol988QU2NjZ88803AIZ9Xb9+nc2bN9OqVSvy5ctHaGgo8+fPp3r16ly8eJGcOXMCsHDhQvr370/Lli0ZMGAAcXFxnDt3jmPHjvHZZ58BSf8PKlSogEajoV+/fjg7O7Nz5066du3KkydPGDhwIN7e3owbN45Ro0bRo0cPqlatCrz5d1mINKN2wmc0z9/pt23bNsVrMTExKZatXr1aAZQ///zTsOx112D/24IdOHCgAiiHDh0yLIuKilLy5cuneHh4KDqd7pW1BgQEKIDSrVu3ZMuHDBmiAMoff/xhWNaxY0fF2tr6lft6UfXq1RVAmTp1qmFZfHy8UrJkScXFxcVwDXn69OkKoKxYscKwXkJCglKxYkXFxsZGefLkiaIoijJgwADFzs5OSUxMfOUxX3UNdtOmTQqgnDhx4q1q/+95vKwF6+3tneza7IwZMxRAOX/+/Gv393z7devWJVv+qhasmZmZcu3aNcOys2fPKoAya9Ysw7KX/U4dOXJEAZTly5enOPabWrD+/v6KhYWFEhISYlh28eJFxdjYOEWNLzt2vXr1FE9Pz2TLXnUNNi4uLsXv540bNxRzc3Nl3LhxhmVNmzZVfHx8Xlt3165dlRw5ciTrDVIURfn000+VbNmyGWqVa7AiPZFRxO+pV69eKZZZWloavo6Li+Phw4dUqFABgNOnT7/XcXbs2IGvry9VqlQxLLOxsaFHjx4EBwdz8eLF124L8OWXXyZbPnjwYAB+++2396oJkq4r9uzZ0/C9mZkZPXv2JCwsjFOnThmO7+bmRtu2bQ3rmZqa0r9/f54+fcrBgweBpGuW0dHR7Nmz553reH69c/v27Wi12vc+nxd17tw52fXZ5y2h69evp8r+n6tTpw758+c3fF+8eHHs7OySHefF3ymtVsujR48oUKAA9vb27/w7pdPp2L17N/7+/uTJk8ew3Nvbm3r16qVY/8VjR0ZG8vDhQ6pXr87169eJjIx84/HMzc0xMjIyHPvRo0fY2NhQqFChZLXb29tz+/ZtTpw48dL9KIrChg0b8PPzQ1EUHj58aPioV68ekZGR7/3/S4i0JAH7nvLly5diWXh4OAMGDMDV1RVLS0ucnZ0N673NH6SXCQkJoVChQimWP+8mDQkJee22RkZGFChQINlyNzc37O3tX7vtm+TMmTPFwK6CBQsCGG6lCQkJwcvLy/BH9lW19+nTh4IFC9KgQQNy585Nly5d2LVr11vVUb16dVq0aMHYsWNxcnKiadOmLFmyhPj4+Pc+txfDByB79uxAUrd4avrvcZ4f68XjxMbGMmrUKNzd3TE3N8fJyQlnZ2ceP378zr9TDx48IDY2Fi8vrxSvvex37PDhw9SpUwdra2vs7e1xdnbm66+/Bt7u91mv1zNt2jS8vLyS1X7u3Llk2w8fPhwbGxt8fX3x8vKib9++HD58OFndjx8/ZsGCBTg7Oyf76Ny5M/DvADMh0hO5BvueXnx3/1zr1q35+++/GTp0KCVLlsTGxga9Xk/9+vXR6/UqVJkkvU8g4OLiQkBAALt372bnzp3s3LmTJUuW8Pnnn7Ns2bLXbqvRaFi/fj1Hjx5l27Zt7N69my5dujB16lSOHj2KjY3NO9djbGz80uXKs2vtqeVtjvPFF1+wZMkSBg4cSMWKFcmWLRsajYZPP/00TX+ngoKCqF27NoULF+bHH3/E3d0dMzMzduzYwbRp097q2BMnTmTkyJF06dKF8ePH4+DggJGREQMHDky2vbe3N1euXGH79u3s2rWLDRs2MGfOHEaNGsXYsWMN67Zv356OHTu+9FjFixdPnRMXIhVJwKaSiIgI9u3bx9ixYxk1ap
RheWBgYIp13yXw8ubNy5UrV1Isv3z5suH1122r1+sJDAw0tBohacDI48ePP2gShLt376a4Penq1atA0mxUz49/7tw
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.pie(\n",
" [post_fame, post_media, post_meme, post_sport, post_politics],\n",
" labels=['Famous people', 'Medias', 'Memes', 'Sports', 'Politicians'],\n",
" autopct='%1.1f%%',\n",
" wedgeprops= {\n",
" 'edgecolor': 'BLACK'\n",
" },\n",
" colors=[BLUE, GREEN, AQUA, ORANGE, YELLOW]\n",
")\n",
"\n",
"plt.title('ratio of posts in final dataset')\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# GDPR protection"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"GDPR_final = final_dataset.copy()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"author_mapping = {}\n",
"idx = 1\n",
"for sample in final_dataset:\n",
" author = sample['author'].lower()\n",
" if author not in author_mapping:\n",
" author_mapping[author] = f'user{idx}'\n",
" idx += 1"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 47254/47254 [00:00<00:00, 2697741.07it/s]\n"
]
}
],
"source": [
"keys_to_delete = []\n",
"for key in tqdm(author_mapping.keys()):\n",
" if len(key) < 7:\n",
" keys_to_delete.append(key)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"for key in keys_to_delete:\n",
" del author_mapping[key]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 171579/171579 [1:01:55<00:00, 46.18it/s]\n"
]
}
],
"source": [
"for sample in tqdm(final_dataset):\n",
" author = sample['author'].lower()\n",
" if author in author_mapping:\n",
" sample['author'] = author_mapping[author]\n",
" \n",
" for key in author_mapping:\n",
" sample['text'] = sample['text'].lower().replace(key, author_mapping[key])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"with open('final.jsonl', 'w') as f:\n",
" for item in final_dataset:\n",
" json.dump(item, f)\n",
" f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### ID appending"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 171579/171579 [00:00<00:00, 2332284.88it/s]\n"
]
}
],
"source": [
"id = 1\n",
"for item in tqdm(final_dataset):\n",
" item['id'] = id\n",
" id += 1"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"with open('final_id_v2.jsonl', 'w') as f:\n",
" for item in final_dataset:\n",
" json.dump(item, f)\n",
" f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sentiment data reducing"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"sentiment_pipe = pipeline(\"text-classification\", model=\"kinit/slovakbert-sentiment-twitter\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 344/344 [2:30:05<00:00, 26.18s/it] \n"
]
}
],
"source": [
"positive_samples = []\n",
"negative_samples = []\n",
"neutral_samples = []\n",
"\n",
"# Batch processing for efficiency\n",
"batch_size = 500\n",
"batched_texts = [final_dataset[i:i + batch_size] for i in range(0, len(final_dataset), batch_size)]\n",
"\n",
"for batch in tqdm(batched_texts):\n",
" try:\n",
" sentiments = sentiment_pipe([sample['text'] for sample in batch])\n",
" for sample, sentiment in zip(batch, sentiments):\n",
" sentiment_val = sentiment['label']\n",
" if sentiment_val == '1':\n",
" positive_samples.append(sample)\n",
" continue\n",
" elif sentiment_val == '0':\n",
" neutral_samples.append(sample)\n",
" continue\n",
" negative_samples.append(sample)\n",
" except Exception as err:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"with open('classified_data/neutral_v1.jsonl', 'w') as f:\n",
" for item in neutral_samples:\n",
" json.dump(item, f)\n",
" f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merging classified data"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" return [json.loads(line) for line in file]"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"positive_samples = load_jsonl('classified_data/positive.jsonl')\n",
"negative_samples = load_jsonl('classified_data/negative.jsonl')\n",
"neutral_samples = load_jsonl('classified_data/neutral.jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAGbCAYAAADeN3riAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACCwElEQVR4nO3dd1QUVxsG8GcLZem9iiBVOiqigoAFBQsKsWtU7MYWa2yJPRo1xZJYY9cYu9FobCj2roAVsSBFpIM0Kbv3+8OPTVZAQYFh4f2dw1FmZ+88szs7vHvnzgyPMcZACCGEECKH+FwHIIQQQgj5VFTIEEIIIURuUSFDCCGEELlFhQwhhBBC5BYVMoQQQgiRW1TIEEIIIURuUSFDCCGEELlFhQwhhBBC5BYVMoQQQgiRW3WmkJk3bx54PB4ny7558yY8PT2hqqoKHo+H8PBwTnLUhDZt2qBNmzZcx6hVYmJiwOPxsHXrVuk0LrfHmrZ161bweDzcunWL6yg1KiwsDDweD2FhYVxHqXY8Hg/z5s2r0WWW9bmqjapi/29hYYGuXbtWfbh6otoKmZKdW8mPUCiEqakpQkJCkJCQ8Elt5uXlYd68ebVqx1FUVIRevXohPT0dv/zyC3bs2AFzc3OuY5F67I8//sCKFSuqvN01a9bU+j8q1UHe17u6tgdSuf3/w4cPMW/ePMTExNRsyP+oqW35ypUrmDdvHjIzM6t9WQAAVk22bNnCALAFCxawHTt2sI0bN7Jhw4YxgUDArKysWH5+fqXbTElJYQDY3LlzSz1WVFT0SW1+rkePHjEAbOPGjTW+bC74+voyX19frmPUKi9evGAA2JYtW6TTuNoeGWOsS5cuzNzcvMrbdXR0LPO9L/ms37x5s8qXWRuUt95isZjl5+czsVhc86EqoSq2h/L2u9WprM9VbVOZ/f++ffsYAHbu3LlSj5mbm7MuXbpUQ0JZ5W3LVW358uUMAHvx4kW1L4sxxoTVXSh16tQJ7u7uAIDhw4dDT08PS5cuxZEjR9C7d+8qW45QKIRQWO2rU0pycjIAQEtL65Oen5ubC1VV1SpM9HlqW57aIi8vDyoqKhWen6vtkdQcPp8PZWVlrmMQDn3u/p9UjRofI+Pt7Q0AePbsmXRaYWEh5syZg2bNmkFTUxOqqqrw9vbGuXPnpPPExMRAX18fADB//nzpIauS47ZljUkoLi7GwoULYWVlBSUlJVhYWGDWrFkoKCioUNazZ8/C29sbqqqq0NLSQvfu3fHo0SPp4yEhIfD19QUA9OrVCzwe74PjR0oOt50/fx5jxoyBgYEBGjRoIH38n3/+kS5PXV0dXbp0wYMHD0q18/jxY/Tu3Rv6+voQiUSws7PD7NmzZea5e/cuOnXqBA0NDaipqaF9+/a4du1apfJs2LABVlZWEIlE8PDwwMWLF8tcr9WrV8PR0REqKirQ1taGu7s7/vjjj/JfWFTsPS8hkUiwcuVKODs7Q1lZGfr6+ggICCg1JmPnzp3w8PCQ5vDx8cGpU6dk5lmzZg0cHR2hpKQEExMTjB07tlT3Z5s2beDk5ITbt2/Dx8cHKioqmDVrFgAgMzMTISEh0NTUhJaWFgYPHlxm92lZ2yOPx8O4ceNw+PBhODk5QUlJCY6Ojjhx4kSp54eFhcHd3R3KysqwsrLC+vXrKzTupk2bNjh27Bhevnwp/YxYWFhIH09OTsawYcNgaGgIZWVluLq6Ytu2bR9sE3h3DP/Bgwc4f/68tN33t/WCggJMnjwZ+vr6UFVVRXBwMFJSUkq1VdHt/H1FRUWYP38+bGxsoKysDF1dXbRu3RqnT5+Wme/x48fo2bMndHR0oKysDHd3dxw5ckRmnpJt//Llyx/M/KH1LmuMTMm2ExkZCV9fX6ioqMDa2hr79+8HAJw/fx4tWrSQfm7PnDlTaj0TEhIwdOhQGBoaSreRzZs3y8xTsuy9e/fi+++/R4MGDaCsrIz27dvj6dOnMn
nK2x4q8xn8mKSkJAiFQsyfP7/UY1FRUeDxePj1118BAOnp6Zg6dSqcnZ2hpqYGDQ0NdOrUCRERER9dTnlj9EJCQmS2c+DdfmPFihVwdHSEsrIyDA0NMWrUKGRkZFRonapy/79161b06tULANC2bVvpe/H+MIlLly7Bw8MDysrKsLS0xPbt20u1lZmZiYkTJ8LMzAxKSkqwtrbG0qVLIZFIPrg+H/sMf6xdxhjatm0LfX19aQEHvNuOnJ2dYWVlhdzcXMybNw/Tpk0DADRq1Ei6rGo9pFZdXT3ldTf/+uuvDABbu3atdFpKSgozNjZmkydPZmvXrmXLli1jdnZ2TEFBgd29e5cxxlhOTg5bu3YtA8CCg4PZjh072I4dO1hERARjjLG5c+ey91dn8ODBDADr2bMn++2339igQYMYABYUFPTR/KdPn2ZCoZDZ2tqyZcuWsfnz5zM9PT2mra0t7S67cuUKmzVrFgPAJkyYwHbs2MFOnTr10dfEwcGB+fr6stWrV7MffviBMcbY9u3bGY/HYwEBAWz16tVs6dKlzMLCgmlpacl0z0VERDANDQ2mq6vLZs6cydavX8+++eYb5uzsLJ3n/v37TFVVlRkbG7OFCxeyH374gTVq1IgpKSmxa9euVSjP77//zgAwT09PtmrVKjZx4kSmpaXFLC0tZbomN2zYIH2N169fz1auXMmGDRvGJkyY8MHXtyLveYmQkBAGgHXq1ImtWLGC/fjjj6x79+5s9erV0nnmzZsnzbt8+XK2cuVK1r9/fzZ9+nTpPCXbiJ+fH1u9ejUbN24cEwgErHnz5qywsFA6n6+vLzMyMmL6+vps/PjxbP369ezw4cNMIpEwHx8fxufz2ZgxY9jq1atZu3btmIuLS6ku8LK2RwDM1dVV+r6sWLGCWVpaMhUVFZaamiqd786dO0xJSYlZWFiwH374gX3//ffMxMSEubq6lmrzfadOnWJubm5MT09P+hk5dOgQY4yxvLw8Zm9vzxQUFNikSZPYqlWrmLe3NwPAVqxY8cF2Dx06xBo0aMAaN24sbbdkWy/Zjpo0acLatWvHVq9ezaZMmcIEAgHr3bu3TDsV3c7LMmvWLMbj8diIESPYxo0b2U8//cT69esn3WYZe7fta2pqMgcHB7Z06VL266+/Mh8fH8bj8djBgwel81U084fW+9y5c6UOFfj6+jITExNmZmbGpk2bxlavXs0cHByYQCBgf/75JzMyMmLz5s1jK1asYKampkxTU5O9efNG+vzXr1+zBg0aMDMzM7ZgwQK2du1a1q1bNwaA/fLLL9L5SpbdpEkT1qxZM/bLL7+wefPmMRUVFebh4VGh7aEyn0FU4NBSu3btmIODQ6np8+fPZwKBgL1+/ZoxxtjNmzeZlZUVmzFjBlu/fj1bsGCB9LVISEiQPq+sQ0vlHdoePHhwqcNnw4cPZ0KhkI0YMYKtW7eOTZ8+namqqpb6vJelqvf/z549YxMmTGAA2KxZs6TvRclrYm5uzuzs7JihoSGbNWsW+/XXX1nTpk0Zj8dj9+/fl7aTm5vLXFxcmK6uLps1axZbt24dGzRoEOPxeOzrr7/+4Dp9aFuuaLvPnz9nampqLDg4WDptxowZjMfjsfPnzzPG3v2N6tevn3SbLVlWTk7OB/N9jmovZM6cOcNSUlJYXFwc279/P9PX12dKSkosLi5OOm9xcTErKCiQeX5GRgYzNDRkQ4cOlU770BiZ9/9whIeHMwBs+PDhMvNNnTqVAWBnz579YH43NzdmYGDA0tLSpNMiIiIYn89ngwYNkk4r2aHs27fvwy8I+/c1ad26NSsuLpZOz87OZlpaWmzEiBEy879+/ZppamrKTPfx8WHq6urs5cuXMvNKJBLp/4OCgpiioiJ79uyZdNqrV6+Yuro68/Hx+WiewsJCZmBgwNzc3GTel5Ki5b87ku7duzNHR8ePrvv7Kvqenz17VrqjeF/JOkdHRzM+n8+Cg4NLjVcomSc5OZkpKi
qyjh07ysxTUlhv3rxZOs3X15cBYOvWrZNp6/DhwwwAW7Zsmcx6lBQDFSlkFBUV2dOnT6XTIiIiGACZoiwwMJCpqKj
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.pie(\n",
" [len(positive_samples), len(negative_samples), len(neutral_samples)],\n",
" labels=['Positive samples', 'Negative samples', 'Neutral samples'],\n",
" autopct='%1.1f%%',\n",
" wedgeprops= {\n",
" 'edgecolor': 'BLACK'\n",
" },\n",
" colors=[BLUE, GREEN, AQUA]\n",
")\n",
"\n",
"plt.title('Ratio of records according to the sentimental value of the text')\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting samples with swear words"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"def find_word(words: list, sample: str):\n",
"\n",
" for word in words:\n",
" if re.search(r'\\b' + word + r'\\b', sample.lower()) is not None:\n",
" return True\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# Lower-case swear-word patterns. find_word() splices each entry into a regular\n",
"# expression between word-boundary anchors, so '.' and '*' inside an entry act as\n",
"# regex metacharacters, not as literal characters.\n",
"swear_words = (\n",
"    'kokot',\n",
"    'kkt',\n",
"    'koko*',\n",
"    'pi.',\n",
"    'pi*',\n",
"    'pic',\n",
"    'pič',\n",
"    'sedl',\n",
"    'hovn',\n",
"    'retard',\n",
"    'bastard',\n",
"    'idiot',\n",
"    'kurv',\n",
"    'geň',\n",
"    'geno',\n",
"    'riť',\n",
"    'rit',\n",
"    'anal',\n",
"    'jeb',\n",
"    'curak',\n",
"    'čur',\n",
"    'chuj',\n",
"    'srack',\n",
"    'srač',\n",
"    'hoveda',\n",
"    'hovada',\n",
"    'hoväd',\n",
"    'cigan',\n",
"    'cigán',\n",
"    'neger',\n",
"    'negri',\n",
"    'blaznov',\n",
"    'bláznov',\n",
"    'blazon',\n",
"    'blázon',\n",
"    'krist',\n",
"    'hajz',\n",
"    'stet',\n",
"    'štet',\n",
"    'sex',\n",
"    'gay',\n",
"    'gej',\n",
"    'buzerant',\n",
"    'buzik',\n",
"    'tepl',\n",
"    'hanb',\n",
"    'hamb',\n",
"    'nigg',\n",
"    'uchyl',\n",
"    'uchil',\n",
"    'hnus',\n",
"    'potkan',\n",
"    'trapn',\n",
"    'odpad',\n",
"    'zmrd',\n",
"    'drog',\n",
"    'zlatok',\n",
"    'vymydl',\n",
"    'mrch',\n",
"    'fet',\n",
"    'zbabelec',\n",
"    'psycho',\n",
"    'k.k.t',\n",
"    'maria',\n",
"    'mariu',\n",
"    'svin',\n",
"    'egocent',\n",
"    'chuda',\n",
"    'bit',\n",
"    'nasilnik',\n",
"    'spodina',\n",
"    'spo.dina',\n",
"    'magor',\n",
"    'tyran',\n",
"    'tiran',\n",
"    'boha',\n",
"    'spina',\n",
"    'dement',\n",
"    'hnoj',\n",
"    'dobil',\n",
"    'psicho',\n",
"    'hajzl',\n",
"    'komediant',\n",
"    'nechutny',\n",
"    'sprostak',\n",
"    'mafia',\n",
"    'cecky',\n",
"    'ser',\n",
"    'vola',\n",
"    'klam',\n",
"    'cirkus',\n",
"    'huba',\n",
"    'huby',\n",
"    'trtko',\n",
"    'trtkat',\n",
"    'o.e',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# One bucket per sentiment class for the samples that contain a swear word\n",
"neg_swear_samples, pos_swear_samples, neu_swear_samples = [], [], []"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 111568/111568 [00:18<00:00, 6070.15it/s]\n"
]
},
{
"data": {
"text/plain": [
"3485"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the list in a single expression so that re-running this cell cannot\n",
"# append the same samples a second time.\n",
"neg_swear_samples = [\n",
"    sample for sample in tqdm(negative_samples)\n",
"    if find_word(swear_words, sample['text'])\n",
"]\n",
"\n",
"len(neg_swear_samples)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 34083/34083 [00:04<00:00, 7840.59it/s]\n"
]
},
{
"data": {
"text/plain": [
"182"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the list in a single expression so that re-running this cell cannot\n",
"# append the same samples a second time.\n",
"pos_swear_samples = [\n",
"    sample for sample in tqdm(positive_samples)\n",
"    if find_word(swear_words, sample['text'])\n",
"]\n",
"\n",
"len(pos_swear_samples)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 11428/11428 [00:00<00:00, 11980.43it/s]\n"
]
},
{
"data": {
"text/plain": [
"79"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the list in a single expression so that re-running this cell cannot\n",
"# append the same samples a second time.\n",
"neu_swear_samples = [\n",
"    sample for sample in tqdm(neutral_samples)\n",
"    if find_word(swear_words, sample['text'])\n",
"]\n",
"\n",
"len(neu_swear_samples)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"def get_samples_without_swear_words(samples: list, swear_samples: list):\n",
"    \"\"\"Return the items of `samples` that do not occur in `swear_samples`.\n",
"\n",
"    The original order is kept. Membership is checked with `in` on a list,\n",
"    i.e. by equality and with a linear scan of `swear_samples` for every item.\n",
"    \"\"\"\n",
"    kept = []\n",
"    for candidate in samples:\n",
"        if candidate in swear_samples:\n",
"            continue\n",
"        kept.append(candidate)\n",
"    return kept"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Pools of swear-free samples to draw the random part of each class from\n",
"negative_without_swear = get_samples_without_swear_words(\n",
"    samples=negative_samples, swear_samples=neg_swear_samples\n",
")\n",
"positive_without_swear = get_samples_without_swear_words(\n",
"    samples=positive_samples, swear_samples=pos_swear_samples\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"def get_random_samples_by_len(samples: list, length: int):\n",
"    \"\"\"Return up to `length` randomly chosen items of `samples`.\n",
"\n",
"    A copy of the list is shuffled, so the list passed in by the caller keeps\n",
"    its order.\n",
"\n",
"    Args:\n",
"        samples: items to draw from.\n",
"        length: number of items wanted; used as the end of a slice, so fewer\n",
"            items are returned when `samples` is shorter.\n",
"\n",
"    Returns:\n",
"        A new list holding the first `length` items of the shuffled copy.\n",
"    \"\"\"\n",
"    shuffled = list(samples)\n",
"    random.shuffle(shuffled)\n",
"    return shuffled[:length]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Computing sample percentage"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11428"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(neutral_samples) # all neutral samples are kept; they are meant to be 20 % of the final dataset\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Positive share: another 20 %, i.e. as many samples as the neutral set, minus the\n",
"# positive swear samples that are counted towards the same share.\n",
"shuffled_positive_data = get_random_samples_by_len(\n",
"    positive_without_swear,\n",
"    len(neutral_samples) - len(pos_swear_samples),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Negative share: 60 % of the final dataset. Positive + neutral (2 * neutral) are\n",
"# the other 40 %, hence 2 * neutral / 40 * 60. Integer arithmetic keeps the target\n",
"# exact instead of truncating a float; the negative swear samples count towards it.\n",
"shuffled_negative_data = get_random_samples_by_len(\n",
"    negative_without_swear,\n",
"    (2 * len(neutral_samples) * 60) // 40 - len(neg_swear_samples),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34284"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total size of the negative share: random negatives plus negative swear samples\n",
"len(shuffled_negative_data) + len(neg_swear_samples)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Final dataset merge"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"classified_data = []\n",
"\n",
"def append_final(input_list: list):\n",
"    \"\"\"Add every sample of `input_list` to the module-level `classified_data` list.\"\"\"\n",
"    classified_data.extend(input_list)\n",
"\n",
"\n",
"append_final(neutral_samples)\n",
"append_final(pos_swear_samples)\n",
"append_final(shuffled_positive_data)\n",
"append_final(neg_swear_samples)\n",
"append_final(shuffled_negative_data)\n",
"\n",
"random.shuffle(classified_data)\n",
"\n",
"# Number the shuffled samples from 1. enumerate() supplies the counter, so no\n",
"# variable named `id` shadows the built-in of the same name.\n",
"# The 'id' key is written into the sample dicts themselves.\n",
"for sample_id, sample in enumerate(classified_data, start=1):\n",
"    sample['id'] = sample_id"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"57140"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Size of the final, merged dataset\n",
"len(classified_data)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAioAAAGbCAYAAADqTrv+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB3h0lEQVR4nO3dd1iV9f/H8ec5HA57T0H2cE/cO7XEHGlmppbatjRzZtORfTVbmpaZDUeZWebee+8JIioiQwVFQWTv+/eHyS8CzQHcB3g/rourvM997vt1jkd4cX8+931rFEVREEIIIYQwQFq1AwghhBBC3I0UFSGEEEIYLCkqQgghhDBYUlSEEEIIYbCkqAghhBDCYElREUIIIYTBkqIihBBCCIMlRUUIIYQQBkuKihBCCCEMlhQVUSV5e3szZMgQVfb9+eef4+vri5GREQ0bNnzg53fo0IEOHTqUeq6HFR0djUajYcGCBWW+rwULFqDRaDh69GiZ7ysiIoInnngCGxsbNBoNK1euLNx/dHR0me33Ud/P8sgoRHmSoiIqrf379zNp0iSSk5PVjlJo8+bNvPPOO7Ru3Zr58+czderUctt3RkYGkyZNYufOneW2z4c1Z86ccik+9zJ48GBCQ0P53//+xy+//EKTJk1UzVMefvvtN2bOnKl2DKBifV5F2dKpHUCIsrJ//34mT57MkCFDsLW1LfLYuXPn0GrLv6dv374drVbLTz/9hF6vL9d9Z2RkMHnyZIBSPSLj5eVFZmYmxsbGpbbNOXPm4OjoqNpRr8zMTA4cOMAHH3zA8OHDC5e/8MILPPfcc5iYmKiSq6z99ttvnD59mpEjR6odpcw+r6LikaIiKoz09HQsLCxKZVtq/aBJSEjAzMys3EtKWdJoNJiamqodo1Rdv34doFjBNTIywsjISIVEQlRdMvQjDNKkSZPQaDScOXOGAQMGYGdnR5s2bQAICQlhyJAh+Pr6YmpqiqurKy+99BKJiYlFnj9u3DgAfHx80Gg0RcbtS5qjcvHiRfr27Yu9vT3m5ua0aNGCdevW3VfevLw8pkyZgp+fHyYmJnh7e/P++++TnZ1duI5Go2H+/Pmkp6cX5vmv4Y158+bh5+eHmZkZzZo1Y8+ePcXWycnJYcKECQQFBWFjY4OFhQVt27Zlx44dhetER0fj5OQEwOTJkwv3P2nSpPt+T++mpDkVQ4YMwdLSkitXrtCrVy8sLS1xcnJi7Nix5Ofn33N73t7ehIWFsWvXrsKc//6NOjs7m9GjR+Pk5ISFhQW9e/cuLBf/tGHDBtq2bYuFhQVWVlZ069aNsLCwe+5/0qRJeHl5ATBu3Dg0Gg3e3t5AyfM/vL296d69O3v37qVZs2aYmpri6+vLokWLimw3KSmJsWPHUq9ePSwtLbG2tqZr166cOnXqnnnuJSwsjI4dO2JmZkb16tX55JNPKCgoKLbeqlWr6NatG25ubpiYmODn58eUKVOK/F106NCBdevWERMTU/i+33nd9/MZu+P3338nKCgIKysrrK2tqVevHl9//XWRdZKTkxk5ciQeHh6YmJjg7+/P9OnTC7P/1+dVVC1yREUYtL59+xIQEMDUqVNRFAWALVu2cPHiRV588UVcXV0JCwtj3rx5hIWFcfDgQTQaDU8//TTnz59nyZIlzJgxA0dHR4DCb37/du3aNVq1akVGRgYjRozAwcGBhQsX0rNnT5YtW0bv3r3vmfOVV15h4cKFPPPMM4wZM4ZDhw4xbdo0wsPDWbFiBQC//PIL8+bN4/Dhw/z4448AtGrV6q7b/Omnn3j99ddp1aoVI0eO5OLFi/Ts2RN7e3s8PDwK10tJSeHHH3+kf//+vPrqq6SmpvLTTz/RpUsXDh8+TMOGDXFycuK7777jjTfeoHfv3jz99NMA1K9f/77f0weVn59Ply5daN68OV988QVbt27lyy+/xM/PjzfeeOOuz5s5cyZvvfUWlpaWfPDBBwC4uLgUWe
ett97Czs6OiRMnEh0dzcyZMxk+fDhLly4tXOeXX35h8ODBdOnShenTp5ORkcF3331HmzZtOHHiROEP4X97+umnsbW1ZdSoUfTv358nn3wSS0vLe77WCxcu8Mwzz/Dyyy8zePBgfv75Z4YMGUJQUBB16tQBbhfhlStX0rdvX3x8fLh27Rrff/897du358yZM7i5ud3P21ro6tWrPPbYY+Tl5fHuu+9iYWHBvHnzMDMzK7buggULsLS0ZPTo0VhaWrJ9+3YmTJhASkoKn3/+OQAffPABt27d4vLly8yYMQOg8HXfz2cMbn+O+vfvT6dOnZg+fToA4eHh7Nu3j7fffhu4PaTTvn17rly5wuuvv46npyf79+/nvffeIz4+npkzZ/7n51VUMYoQBmjixIkKoPTv37/YYxkZGcWWLVmyRAGU3bt3Fy77/PPPFUCJiooqtr6Xl5cyePDgwj+PHDlSAZQ9e/YULktNTVV8fHwUb29vJT8//65ZT548qQDKK6+8UmT52LFjFUDZvn174bLBgwcrFhYWd93WHTk5OYqzs7PSsGFDJTs7u3D5vHnzFEBp37594bK8vLwi6yiKoty8eVNxcXFRXnrppcJl169fVwBl4sSJxfZ3v+9pSaKiohRAmT9/fuGywYMHK4Dy8ccfF1m3UaNGSlBQ0D23pyiKUqdOnSKv8Y758+crgNK5c2eloKCgcPmoUaMUIyMjJTk5WVGU2393tra2yquvvlrk+VevXlVsbGyKLb/ba/r8889L3P8/P1NeXl7F3qeEhATFxMREGTNmTOGyrKysYp+jqKgoxcTEpMj7VNL7WZI7n9lDhw4V2a+NjU2xjCX9/b7++uuKubm5kpWVVbisW7duipeXV7F17/cz9vbbbyvW1tZKXl7eXXNPmTJFsbCwUM6fP19k+bvvvqsYGRkpsbGxiqLc+/MqqhYZ+hEGbejQocWW/fM3xqysLG7cuEGLFi0AOH78+EPtZ/369TRr1qxweAlu/zb52muvER0dzZkzZ+75XIDRo0cXWT5mzBiA+x4++qejR4+SkJDA0KFDi8xnGTJkCDY2NkXWNTIyKlynoKCApKQk8vLyaNKkyX2/H2XxnkLxv7+2bdty8eLFh97eHa+99lqRozxt27YlPz+fmJgY4PZv9snJyfTv358bN24UfhkZGdG8efMShyweRe3atWnbtm3hn52cnKhRo0aR12piYlI4gTs/P5/ExEQsLS2pUaPGQ73H69evp0WLFjRr1qzIfgcOHFhs3X/+/aampnLjxg3atm1LRkYGZ8+e/c993e9nzNbWlvT0dLZs2XLXbf3555+0bdsWOzu7In83nTt3Jj8/n927d9/X6xdVhwz9CIPm4+NTbFlSUhKTJ0/m999/JyEhochjt27deqj9xMTE0Lx582LLa9WqVfh43bp17/pcrVaLv79/keWurq7Y2toW/vB80DwAAQEBRZYbGxvj6+tbbP2FCxfy5ZdfcvbsWXJzcwuXl/T+laQs3lNTU9NiQ212dnbcvHnzobb3T56ensW2CxRuOyIiAoCOHTuW+Hxra+tHznCvPHcy/fO1FhQU8PXXXzNnzhyioqKKzA9xcHB44H3e7TNbo0aNYsvCwsL48MMP2b59OykpKUUeu9+/3/v5jL355pv88ccfdO3aFXd3d5544gmeffZZgoODC9eJiIggJCTkrsOw//78CSFFRRi0ksbbn332Wfbv38+4ceNo2LAhlpaWFBQUEBwcXOJEwvLyMPM4SsOvv/7KkCFD6NWrF+PGjcPZ2RkjIyOmTZtGZGTkfW2jLN7Tsjw75m7bVv6ex3Qn8y+//IKrq2ux9XS60v3W9195AKZOncpHH33ESy+9xJQpU7C3t0er1TJy5Mgy/dwmJyfTvn17rK2t+fjjj/Hz88PU1JTjx48zfvz4+9r3/X7GnJ2dOXnyJJs2bWLDhg1s2LCB+fPnM2jQIBYuXAjc/rt5/PHHeeedd0rcV2BgYOm8cFFpSF
ERFcrNmzfZtm0bkydPZsKECYXL7/wG/U8PUhy8vLw4d+5cseV3DovfOQvkbs8tKCggIiKi8AgM3J6gm5ycfM/n3mu
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Class sizes of the final dataset: random part plus swear-word part per class\n",
"plt.pie(\n",
"    [\n",
"        len(shuffled_positive_data) + len(pos_swear_samples),\n",
"        len(shuffled_negative_data) + len(neg_swear_samples),\n",
"        len(neutral_samples),\n",
"    ],\n",
"    labels=['Positive samples', 'Negative samples', 'Neutral samples'],\n",
"    autopct='%1.1f%%',\n",
"    # BLACK is the colour constant from the macros cell, like the wedge colours below\n",
"    wedgeprops={'edgecolor': BLACK},\n",
"    colors=[BLUE, GREEN, AQUA],\n",
")\n",
"\n",
"plt.title('ratio of data in the final dataset')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkYAAAGbCAYAAAAhlV8AAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB0ZElEQVR4nO3dd1yU9QMH8M/dcexx7D0FHCiKe+FEcY9cqWU4Misz22n91JZmS9NylmZlWVmpufdeuGUIyAaRJXveeH5/gJcEKirwMD7v18uX+txzz33uOOBz3+f7PI9EEAQBRERERASp2AGIiIiI6gsWIyIiIqJyLEZERERE5ViMiIiIiMqxGBERERGVYzEiIiIiKsdiRERERFSOxYiIiIioHIsRERERUTkWI2o03NzcEBQUJMpjf/755/Dw8IBMJkO7du1EyVCbJBIJFi1aJHaMh1q0aBEkEskjrZuRkVHLqehJPcrXlehJsRhRg3L69GksWrQI2dnZYkfR2r9/P95++2306NEDGzduxOLFi8WORPdYvHgxtm3bJnYMImogWIyoQTl9+jQ++OCDKotRREQE1q9fX+eZDh8+DKlUiu+//x5TpkzBkCFD6jwDlXn//fdRVFRUYRmLERE9ChYjElVBQUGNbUtPTw9yubzGtlddaWlpMDAwgK6ubp0/NlWko6MDfX19sWPUexqNBsXFxWLH0FKpVCgtLRU7BhEAFiOqQ3fnCYSFhWHSpEkwNzdHz549AQDXrl1DUFAQPDw8oK+vDzs7O0ybNg2ZmZkV7v/WW28BANzd3SGRSCCRSBAXFweg6jlGMTExGDduHCwsLGBoaIiuXbti165d1cqrUqnw0UcfoVmzZtDT04Obmxvmz5+PkpIS7ToSiQQbN25EQUGBNs8PP/xw321GRUVhzJgxsLOzg76+PpycnPD0008jJydHu87GjRvRr18/2NjYQE9PD61atcLq1asrbcvNzQ3Dhg3D0aNH0bFjRxgYGKBNmzY4evQoAOCvv/5CmzZtoK+vjw4dOuDy5csV7h8UFARjY2PExMQgMDAQRkZGcHBwwIcffghBEB76+iQnJ2PatGmwtbWFnp4efHx8sGHDhkrrrVy5Ej4+PjA0NIS5uTk6duyIX3755b7bFQQBVlZWeP3117XLNBoNFAoFZDJZhdHCpUuXQkdHB/n5+QAqz0WRSCQoKCjApk2btF+f/75HsrOzERQUBIVCATMzM0ydOhWFhYUPff4P+1o+9dRTaN++fYX7DB8+HBKJBDt27NAuO3fuHCQSCfbs2VMh09y5c+Hs7Aw9PT14enpi6dKl0Gg0Fbb3xRdfoHv37rC0tISBgQE6dOiArVu3VsoqkUgwe/ZsbN68GT4+PtDT08PevXurfF6vv/46LC0tK7wHXnnlFUgkEqxYsUK7LDU1FRKJpMJ7My0tDdOnT4etrS309fXRtm1bbNq0qcL24+LiIJFI8MUXX2D58uXa76+wsDAAwMmTJ9GpUyfo6+ujWbNmWLt2bZU5Dxw4gJ49e0KhUMDY2BjNmzfH/Pnzq1yX6FHoiB2Amp5x48bBy8sLixcv1v7wPXDgAGJiYjB16lTY2dkhNDQU69atQ2hoKM6ePQuJRIKnnnoKkZGR+PXXX7Fs2TJYWVkBAKytrat8nNTUVHTv3h2FhYWYM2cOLC0tsWnTJowYMQJbt27F6NGjH5hzxowZ2LRpE8aOHYs33ngD586dw5IlSxAeHo6///4bAPDTTz9h3bp1OH/+PL777jsAQPfu3avcXmlpKQIDA1FSUoJXXnkFdnZ2SE5Oxs6dO5GdnQ0zMzMAwOrVq+Hj44MRI0ZAR0cH//zzD1566SVoNBq8/PLLFbZ58+ZNTJo0CS+88AKeeeYZfPHFFxg+fDjWrFmD+fPn46WXXgIALFmyBOPHj0dERASk0n8/D6nVagwaNAhdu3bFZ599hr1792LhwoVQqVT48MMP7/vapKamomvXrtpfuN
bW1tizZw+mT5+O3NxczJ07FwCwfv16zJkzB2PHjsWrr76K4uJiXLt2DefOncOkSZOq3LZEIkGPHj1w/Phx7bJr164hJycHUqkUp06dwtChQwEAJ06cgJ+fH4yNjavc1k8//YQZM2agc+fOmDlzJgCgWbNmFdYZP3483N3dsWTJEly6dAnfffcdbGxssHTp0vs+/+p8Lf39/bF9+3bk5ubC1NQUgiDg1KlTkEqlOHHiBEaMGKF9DlKpFD169AAAFBYWonfv3khOTsYLL7wAFxcXnD59GvPmzUNKSgqWL1+uzfH1119jxIgRmDx5MkpLS7FlyxaMGzcOO3fu1L5Gdx0+fBi///47Zs+eDSsrK7i5uVX53Pz9/bFs2TKEhoaidevWFTKeOHECc+bM0S4DgF69egEAioqK0KdPH9y8eROzZ8+Gu7s7/vjjDwQFBSE7OxuvvvpqhcfZuHEjiouLMXPmTOjp6cHCwgLXr1/HwIEDYW1tjUWLFkGlUmHhwoWwtbWtcN/Q0FAMGzYMvr6++PDDD6Gnp4ebN2/i1KlT9/2aEVWbQFRHFi5cKAAQJk6cWOm2wsLCSst+/fVXAYBw/Phx7bLPP/9cACDExsZWWt/V1VV47rnntP+fO3euAEA4ceKEdlleXp7g7u4uuLm5CWq1+r5Zr1y5IgAQZsyYUWH5m2++KQAQDh8+rF323HPPCUZGRvfd1l2XL18WAAh//PHHA9er6rUIDAwUPDw8KixzdXUVAAinT5/WLtu3b58AQDAwMBDi4+O1y9euXSsAEI4cOVIhNwDhlVde0S7TaDTC0KFDBV1dXSE9PV27HICwcOFC7f+nT58u2NvbCxkZGRUyPf3004KZmZn2OYwcOVLw8fF54POtyueffy7IZDIhNzdXEARBWLFiheDq6ip07txZeOeddwRBEAS1Wi0oFArhtdde097v7nvsXkZGRhXeF/9dd9q0aRWWjx49WrC0tHxgvup8LYODgwUAwu7duwVBEIRr164JAIRx48YJXbp00a43YsQIwc/PT/v/jz76SDAyMhIiIyMrbO/dd98VZDKZkJCQoF323/dKaWmp0Lp1a6Ffv34VlgMQpFKpEBoa+sDnJQiCkJaWJgAQVq1aJQiCIGRnZwtSqVQYN26cYGtrq11vzpw5goWFhaDRaARBEITly5cLAISff/65Qp5u3boJxsbG2q9lbGysAEAwNTUV0tLSKjz2qFGjBH19/Qrv3bCwMEEmk1X4ui5btkwAUOE9SlRTuCuN6tysWbMqLTMwMND+u7i4GBkZGejatSsA4NKlS4/1OLt370bnzp21u+sAwNjYGDNnzkRcXJx26P5+9wVQYXcOALzxxhsAUO3dcfe6OyK0b9++B+6qufe1yMnJQUZGBnr37o2YmJgKu9wAoFWrVujWrZv2/126dAEA9OvXDy4uLpWWx8TEVHq82bNna/99dwSotLQUBw8erDKfIAj4888/MXz4cAiCgIyMDO2fwMBA5OTkaL9mCoUCSUlJCA4Ovu/zrYq/vz/UajVOnz4NoGx0wt/fH/7+/tqRipCQEGRnZ8Pf3/+Rtv1f/30/+vv7IzMzE7m5ufe9T3W+lndHsu6OfJ04cQJOTk6YMmUKLl26hMLCQgiCgJMnT1Z4Dn/88Qf8/f1hbm5e4bUNCAiAWq2uMJJ273slKysLOTk58Pf3r/J7pnfv3mjVqtVDXw9ra2u0aNFC+zinTp2CTCbDW2+9hdTUVERFRWmfT8+ePbW7Lnfv3g07OztMnDhRuy25XI45c+YgPz8fx44dq/A4Y8aMqTDaq1arsW/fPowaNarCe7dly5YIDAyscF+FQgEA2L59e6Xdi0RPisWI6py7u3ulZXfu3MGrr74KW1tbGBgYwNraWrvef8tAdcXHx6N58+aVlrds2VJ7+4PuK5VK4enpWWG5nZ0dFArFA+97P+7u7nj99dfx3XffwcrKCoGBgfj2228rPb9Tp04hICAARkZGUCgUsLa21s
6d+O+69/4CAf79he3s7Fzl8qysrArLpVIpPDw8Kizz9vYGAO3crf9KT09HdnY21q1bB2tr6wp/pk6dCqBsrgkAvPP
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of samples with at least one swear-word match, per sentiment class\n",
"plt.pie(\n",
"    [len(pos_swear_samples), len(neg_swear_samples), len(neu_swear_samples)],\n",
"    labels=['Positive samples', 'Negative samples', 'Neutral samples'],\n",
"    autopct='%1.1f%%',\n",
"    # BLACK is the colour constant from the macros cell, like the wedge colours below\n",
"    wedgeprops={'edgecolor': BLACK},\n",
"    colors=[BLUE, GREEN, AQUA],\n",
")\n",
"\n",
"plt.title('ratio of samples with swear words')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Write one JSON object per line (JSON Lines). The encoding is stated explicitly,\n",
"# as in load_jsonl(), so the file does not depend on the platform default.\n",
"with open('final_id_v2.jsonl', 'w', encoding='utf-8') as f:\n",
"    for item in classified_data:\n",
"        f.write(json.dumps(item) + '\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}