{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# JSON to JSONL file converter\n",
    "This notebook turns a structured JSON file into a simpler form as JSONL for easier data manipulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import json\n",
    "import os"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Open JSON data, then write it as JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    \"\"\"Read a JSON file and return the parsed data.\"\"\"\n",
    "    with open(filename, 'r', encoding='utf-8') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    \"\"\"Write a list of dicts to '<filename>l' (e.g. x.json -> x.jsonl),\n",
    "    one JSON object per line.\"\"\"\n",
    "    with open(f'{filename}l', 'w', encoding='utf-8') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through dataset, create new list of dictionaries, drop duplicate data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    \"\"\"Flatten posts, their comments, replies and second-level replies\n",
    "    into a single list of dicts.\"\"\"\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        for comment in post.get('comments', []):\n",
    "            new_dataset.append(comment)\n",
    "            # .get with a default replaces the old try/except KeyError, which\n",
    "            # silently dropped the REMAINING replies of a comment as soon as\n",
    "            # one reply had no 'replies' key\n",
    "            for reply in comment.get('replies', []):\n",
    "                new_dataset.append(reply)\n",
    "                for sec_reply in reply.get('replies', []):\n",
    "                    new_dataset.append(sec_reply)\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    \"\"\"Remove the nested 'comments'/'replies' keys so every item is flat.\"\"\"\n",
    "    for item in dataset:\n",
    "        # pop with a default is a no-op when the key is absent\n",
    "        item.pop('comments', None)\n",
    "        item.pop('replies', None)\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": 
"code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    \"\"\"Keep only the id, author (publisher) and text fields of every item;\n",
    "    'title' is used as a fallback when 'text' is missing.\"\"\"\n",
    "    cleaned_dataset = []\n",
    "    for data in dataset:\n",
    "        cleaned_data = {}\n",
    "        if 'id' in data:\n",
    "            cleaned_data['id'] = data['id']\n",
    "\n",
    "        if 'publisher' in data:\n",
    "            cleaned_data['author'] = data['publisher']\n",
    "\n",
    "        if 'text' in data:\n",
    "            cleaned_data['text'] = data['text']\n",
    "        elif 'title' in data:\n",
    "            cleaned_data['text'] = data['title']\n",
    "\n",
    "        cleaned_dataset.append(cleaned_data)\n",
    "\n",
    "    return cleaned_dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the full pipeline for every JSON file in json_data_id/\n",
    "os.makedirs('jsonl_data', exist_ok=True)  # ensure the output dir exists\n",
    "\n",
    "for dataset_name in os.listdir('json_data_id/'):\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}