BP2024/preprocessing/create_jsonl.ipynb

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# JSON to JSONL file converter\n",
    "This notebook converts structured JSON files into a simpler JSONL form for easier data manipulation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports \n",
    "import json\n",
    "import os"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Open JSON data, then write it as JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    \"\"\"Load and return the parsed contents of a JSON file.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the JSON file to read.\n",
    "\n",
    "    Returns:\n",
    "        The Python object produced by json.load for that file.\n",
    "    \"\"\"\n",
    "    # Read as UTF-8 explicitly instead of relying on the platform's default\n",
    "    # encoding, which would garble non-ASCII text on some systems.\n",
    "    with open(filename, 'r', encoding='utf-8') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    \"\"\"Write records to a JSON Lines file, one JSON document per line.\n",
    "\n",
    "    The output path is filename with the letter 'l' appended, so passing\n",
    "    'out/posts.json' writes 'out/posts.jsonl'.\n",
    "\n",
    "    Args:\n",
    "        filename: Path the output file name is derived from (see above).\n",
    "        new_dataset: Iterable of JSON-serialisable records.\n",
    "    \"\"\"\n",
    "    output_path = f'{filename}l'\n",
    "\n",
    "    # Create the destination folder on demand instead of failing with\n",
    "    # FileNotFoundError when it does not exist yet.\n",
    "    output_dir = os.path.dirname(output_path)\n",
    "    if output_dir:\n",
    "        os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "    with open(output_path, 'w', encoding='utf-8') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through dataset, create new list of dictionaries, drop duplicate data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    \"\"\"Flatten posts, their comments and two levels of replies into one list.\n",
    "\n",
    "    Items are emitted in reading order: a post, then each of its comments\n",
    "    immediately followed by that comment's replies, and each reply immediately\n",
    "    followed by its own replies. Replies nested deeper than that are not visited.\n",
    "\n",
    "    The returned list holds the original dictionaries (not copies), so their\n",
    "    nested 'comments' / 'replies' keys are still present.\n",
    "\n",
    "    Args:\n",
    "        dataset: Iterable of post dictionaries.\n",
    "\n",
    "    Returns:\n",
    "        list: Flat list of post, comment and reply dictionaries.\n",
    "    \"\"\"\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        # A missing (or empty) key just means \"no children\". Looking each level\n",
    "        # up with .get() keeps one reply without 'replies' from cutting off the\n",
    "        # siblings that follow it.\n",
    "        for comment in post.get('comments') or []:\n",
    "            new_dataset.append(comment)\n",
    "            for reply in comment.get('replies') or []:\n",
    "                new_dataset.append(reply)\n",
    "                new_dataset.extend(reply.get('replies') or [])\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    \"\"\"Remove the nested 'comments' and 'replies' entries from every item.\n",
    "\n",
    "    Items are modified in place; a key that is absent is simply skipped.\n",
    "\n",
    "    Args:\n",
    "        dataset: Iterable of dictionaries.\n",
    "\n",
    "    Returns:\n",
    "        The same dataset object that was passed in.\n",
    "    \"\"\"\n",
    "    for record in dataset:\n",
    "        for nested_key in ('comments', 'replies'):\n",
    "            # pop() with a default does nothing when the key is missing\n",
    "            record.pop(nested_key, None)\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    \"\"\"Reduce every item to its 'id', 'author' and 'text' fields.\n",
    "\n",
    "    For each item, 'id' is copied as is, 'publisher' is stored under 'author',\n",
    "    and 'text' is taken from the item's 'text' key or, when that key is absent,\n",
    "    from its 'title' key. Keys missing from the item are left out of the result.\n",
    "\n",
    "    Args:\n",
    "        dataset: Iterable of dictionaries.\n",
    "\n",
    "    Returns:\n",
    "        list: One new, reduced dictionary per input item, in the same order.\n",
    "    \"\"\"\n",
    "    def _reduce(record):\n",
    "        slim = {}\n",
    "        # (key in the input, key in the output)\n",
    "        for source_key, target_key in (('id', 'id'), ('publisher', 'author')):\n",
    "            if source_key in record:\n",
    "                slim[target_key] = record[source_key]\n",
    "\n",
    "        # 'text' wins over 'title' whenever both are present\n",
    "        for text_key in ('text', 'title'):\n",
    "            if text_key in record:\n",
    "                slim['text'] = record[text_key]\n",
    "                break\n",
    "\n",
    "        return slim\n",
    "\n",
    "    return [_reduce(record) for record in dataset]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every JSON file in json_data_id/ into a JSONL file in jsonl_data/.\n",
    "# sorted() makes the processing order reproducible across runs and platforms.\n",
    "for dataset_name in sorted(os.listdir('json_data_id/')):\n",
    "    # Skip entries that are not JSON datasets (sub-directories, hidden files, ...);\n",
    "    # they would either fail to parse or yield an oddly named output file.\n",
    "    if not dataset_name.lower().endswith('.json'):\n",
    "        continue\n",
    "\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    # create_jsonl appends 'l' to the name, turning '<name>.json' into '<name>.jsonl'\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}