BP2024/preprocessing/create_jsonl.ipynb
2024-04-09 15:39:11 +02:00

182 lines
4.4 KiB
Plaintext

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# JSON to JSONL file converter\n",
"This notebook converts a structured (nested) JSON file into a simpler, flat JSONL file for easier data manipulation."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# imports \n",
"import json\n",
"import os"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Open JSON data, then write it as JSONL"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def open_json(filename):\n",
"    \"\"\"Load a JSON file and return its parsed content.\n",
"\n",
"    Args:\n",
"        filename: Path of the JSON file to read.\n",
"\n",
"    Returns:\n",
"        The deserialized document, exactly as produced by `json.load`.\n",
"\n",
"    Raises:\n",
"        OSError: If the file cannot be opened.\n",
"        json.JSONDecodeError: If the content is not valid JSON.\n",
"    \"\"\"\n",
"    # Decode explicitly as UTF-8 instead of relying on the platform default\n",
"    # encoding, which would garble non-ASCII text on some systems.\n",
"    with open(filename, 'r', encoding='utf-8') as json_file:\n",
"        return json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl(filename, new_dataset):\n",
"    \"\"\"Write `new_dataset` as JSON Lines to the path `filename` + 'l'.\n",
"\n",
"    The letter 'l' is appended verbatim to `filename`, so passing\n",
"    'out/data.json' produces 'out/data.jsonl'.\n",
"\n",
"    Args:\n",
"        filename: Target path without the trailing 'l'.\n",
"        new_dataset: Iterable of JSON-serializable items; each item is\n",
"            written on its own line.\n",
"\n",
"    Raises:\n",
"        OSError: If the output file or its folder cannot be created.\n",
"        TypeError: If an item is not JSON-serializable.\n",
"    \"\"\"\n",
"    jsonl_path = f'{filename}l'\n",
"\n",
"    # Create the destination folder on demand so the notebook also runs\n",
"    # when the output directory has not been created yet.\n",
"    output_dir = os.path.dirname(jsonl_path)\n",
"    if output_dir:\n",
"        os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:\n",
"        for item in new_dataset:\n",
"            jsonl_file.write(json.dumps(item) + '\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loop through dataset, create new list of dictionaries, drop duplicate data"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def traverse_dataset(dataset):\n",
"    \"\"\"Flatten posts, their comments and two levels of replies into one list.\n",
"\n",
"    Items are emitted depth-first: a post, then each of its comments, each\n",
"    comment followed by its replies, each reply followed by its own replies.\n",
"    The original dict objects are appended as-is (they are not copied).\n",
"    A missing or empty 'comments' / 'replies' entry is treated as no children.\n",
"\n",
"    Args:\n",
"        dataset: Iterable of post dicts; nested items are read from the\n",
"            'comments' key of a post and the 'replies' key of a comment/reply.\n",
"\n",
"    Returns:\n",
"        A new list containing every visited post, comment and reply.\n",
"    \"\"\"\n",
"    new_dataset = []\n",
"    for post in dataset:\n",
"        new_dataset.append(post)\n",
"        for comment in post.get('comments') or ():\n",
"            new_dataset.append(comment)\n",
"            # Look up 'replies' per item: an item without the key is simply a\n",
"            # leaf and must not cut short the iteration over its siblings.\n",
"            for reply in comment.get('replies') or ():\n",
"                new_dataset.append(reply)\n",
"                for sec_reply in reply.get('replies') or ():\n",
"                    new_dataset.append(sec_reply)\n",
"\n",
"    return new_dataset"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def drop_keywords(dataset):\n",
"    \"\"\"Remove the nested 'comments' and 'replies' entries from every item.\n",
"\n",
"    The items are modified in place; keys that are absent are ignored.\n",
"\n",
"    Args:\n",
"        dataset: Iterable of dicts to strip.\n",
"\n",
"    Returns:\n",
"        The same `dataset` object that was passed in.\n",
"    \"\"\"\n",
"    for entry in dataset:\n",
"        for nested_key in ('comments', 'replies'):\n",
"            # pop with a default is a no-op when the key is missing\n",
"            entry.pop(nested_key, None)\n",
"\n",
"    return dataset"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def clean_dataset(dataset):\n",
"    \"\"\"Reduce every item to at most the 'id', 'author' and 'text' fields.\n",
"\n",
"    'author' is taken from the item's 'publisher' key. 'text' is taken from\n",
"    the item's 'text' key, or from 'title' when 'text' is absent. Fields whose\n",
"    source key is missing are left out; all other keys are discarded.\n",
"\n",
"    Args:\n",
"        dataset: Iterable of dicts.\n",
"\n",
"    Returns:\n",
"        A new list with one reduced dict per input item, in the same order.\n",
"    \"\"\"\n",
"    def _slim(record):\n",
"        # Build the reduced dict in the fixed order id -> author -> text.\n",
"        slim = {}\n",
"        if 'id' in record:\n",
"            slim['id'] = record['id']\n",
"        if 'publisher' in record:\n",
"            slim['author'] = record['publisher']\n",
"        # 'text' takes precedence over 'title' when both are present\n",
"        for text_key in ('text', 'title'):\n",
"            if text_key in record:\n",
"                slim['text'] = record[text_key]\n",
"                break\n",
"        return slim\n",
"\n",
"    return [_slim(record) for record in dataset]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Execution of functions defined above"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Convert every JSON file in json_data_id/ into a flat JSONL file in jsonl_data/.\n",
"# sorted() makes the processing order reproducible across runs; the extension\n",
"# check skips stray directory entries (for example a checkpoints folder) that\n",
"# are not JSON files.\n",
"for dataset_name in sorted(os.listdir('json_data_id/')):\n",
"    if not dataset_name.lower().endswith('.json'):\n",
"        continue\n",
"\n",
"    dataset = open_json(f'json_data_id/{dataset_name}')\n",
"\n",
"    new_dataset = traverse_dataset(dataset)\n",
"    new_dataset = drop_keywords(new_dataset)\n",
"    new_dataset = clean_dataset(new_dataset)\n",
"\n",
"    # create_jsonl appends the trailing 'l', so '<name>.json' becomes '<name>.jsonl'\n",
"    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}