182 lines
4.4 KiB
Plaintext
182 lines
4.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# JSON to JSONL file converter\n",
|
|
"This notebook flattens nested JSON files into a simpler JSONL form (one record per line) for easier data manipulation."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# imports \n",
|
|
"import json\n",
|
|
"import os"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Open JSON data, then write it as JSONL"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def open_json(filename):\n",
"    \"\"\"Load a JSON document from disk and return the parsed object.\n",
"\n",
"    Args:\n",
"        filename: Path of the JSON file to read.\n",
"\n",
"    Returns:\n",
"        Whatever `json.load` produces for the file content.\n",
"\n",
"    Raises:\n",
"        OSError: If the file cannot be opened.\n",
"        json.JSONDecodeError: If the content is not valid JSON.\n",
"    \"\"\"\n",
"    # Read as UTF-8 explicitly: relying on the platform default encoding\n",
"    # (e.g. cp1252 on Windows) corrupts or rejects non-ASCII text.\n",
"    with open(filename, 'r', encoding='utf-8') as json_file:\n",
"        return json.load(json_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def create_jsonl(filename, new_dataset):\n",
"    \"\"\"Write every item of `new_dataset` as one JSON document per line.\n",
"\n",
"    The output path is `filename` with a literal 'l' appended, so\n",
"    'data.json' becomes 'data.jsonl'.\n",
"\n",
"    Args:\n",
"        filename: Path used to derive the output path (see above).\n",
"        new_dataset: Iterable of JSON-serialisable items.\n",
"\n",
"    Raises:\n",
"        OSError: If the output folder or file cannot be created.\n",
"        TypeError: If an item is not JSON-serialisable.\n",
"    \"\"\"\n",
"    jsonl_path = f'{filename}l'\n",
"\n",
"    # Create the target folder on demand so a fresh checkout runs end to end.\n",
"    parent_dir = os.path.dirname(jsonl_path)\n",
"    if parent_dir:\n",
"        os.makedirs(parent_dir, exist_ok=True)\n",
"\n",
"    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:\n",
"        for item in new_dataset:\n",
"            jsonl_file.write(json.dumps(item) + '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Loop through the dataset, flatten it into a new list of dictionaries, and drop the nested data that is now duplicated"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def traverse_dataset(dataset):\n",
"    \"\"\"Flatten posts, their comments and two levels of replies into one list.\n",
"\n",
"    Items are emitted depth-first: a post, then each of its comments, each\n",
"    comment followed by its replies, each reply followed by its own replies.\n",
"    The returned list holds the original dict objects, not copies.\n",
"\n",
"    Args:\n",
"        dataset: Iterable of post dicts. 'comments' on a post and 'replies'\n",
"            on a comment or reply are optional; a missing or empty value\n",
"            is treated as no children.\n",
"\n",
"    Returns:\n",
"        A new list containing every post, comment and reply found.\n",
"    \"\"\"\n",
"    new_dataset = []\n",
"    for post in dataset:\n",
"        new_dataset.append(post)\n",
"        for comment in post.get('comments') or ():\n",
"            new_dataset.append(comment)\n",
"            for reply in comment.get('replies') or ():\n",
"                new_dataset.append(reply)\n",
"                # Look the key up per reply: a reply without nested replies\n",
"                # must not stop its sibling replies from being collected.\n",
"                new_dataset.extend(reply.get('replies') or ())\n",
"\n",
"    return new_dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def drop_keywords(dataset):\n",
"    \"\"\"Strip the nested 'comments' and 'replies' entries from every item.\n",
"\n",
"    The items are modified in place and the same `dataset` object is returned.\n",
"    \"\"\"\n",
"    for record in dataset:\n",
"        for nested_key in ('comments', 'replies'):\n",
"            # Keys that are absent are simply skipped.\n",
"            record.pop(nested_key, None)\n",
"\n",
"    return dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def clean_dataset(dataset):\n",
"    \"\"\"Reduce every item to its 'id', 'author' and 'text' fields.\n",
"\n",
"    'author' is taken from the item's 'publisher' key. 'text' is taken from\n",
"    'text' when present, otherwise from 'title'. Keys missing from an item\n",
"    are left out of its cleaned dict.\n",
"\n",
"    Returns:\n",
"        A new list with one new dict per input item.\n",
"    \"\"\"\n",
"    def _slim(record):\n",
"        slim = {}\n",
"        for source_key, target_key in (('id', 'id'), ('publisher', 'author')):\n",
"            if source_key in record:\n",
"                slim[target_key] = record.get(source_key)\n",
"\n",
"        # 'text' wins over 'title' when both are present.\n",
"        for source_key in ('text', 'title'):\n",
"            if source_key in record:\n",
"                slim['text'] = record.get(source_key)\n",
"                break\n",
"\n",
"        return slim\n",
"\n",
"    return [_slim(record) for record in dataset]"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Execution of functions defined above"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"INPUT_DIR = 'json_data_id'\n",
"OUTPUT_DIR = 'jsonl_data'\n",
"\n",
"# Make sure the destination exists before any file is written.\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"# sorted() gives a reproducible order; os.listdir() returns entries in arbitrary order.\n",
"for dataset_name in sorted(os.listdir(INPUT_DIR)):\n",
"    source_path = os.path.join(INPUT_DIR, dataset_name)\n",
"\n",
"    # Skip sub-folders (e.g. .ipynb_checkpoints) and non-JSON files, which\n",
"    # would otherwise crash open_json or produce oddly named outputs.\n",
"    if not (os.path.isfile(source_path) and dataset_name.lower().endswith('.json')):\n",
"        continue\n",
"\n",
"    dataset = open_json(source_path)\n",
"\n",
"    new_dataset = traverse_dataset(dataset)\n",
"    new_dataset = drop_keywords(new_dataset)\n",
"    new_dataset = clean_dataset(new_dataset)\n",
"\n",
"    create_jsonl(os.path.join(OUTPUT_DIR, dataset_name), new_dataset)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "sentiment",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.16"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|