# JSON to JSONL file converter
This notebook flattens a structured (nested) JSON file into a simpler JSONL form (one JSON object per line) for easier data manipulation.

In [27]:
# imports 
import json
import os

### Open JSON data, then write it as JSONL

In [28]:
def open_json(filename):
    """Load and return the parsed contents of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [29]:
def create_jsonl(filename, new_dataset):
    """Write every item of ``new_dataset`` as one JSON document per line.

    The output path is ``filename`` with a trailing ``l`` appended, so a
    name ending in ``.json`` produces a ``.jsonl`` file.

    Args:
        filename: Base output path; ``l`` is appended to it.
        new_dataset: Iterable of JSON-serializable items.

    Raises:
        OSError: If the output file or its directory cannot be created.
        TypeError: If an item cannot be serialized by ``json.dumps``.
    """
    target_path = f'{filename}l'

    # Create the destination directory when it is missing so a first run
    # does not fail with FileNotFoundError. A bare file name has no parent
    # component, and os.makedirs('') would raise, hence the guard.
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(target_path, 'w', encoding='utf-8') as jsonl_file:
        for item in new_dataset:
            jsonl_file.write(json.dumps(item) + '\n')

### Loop through the dataset, build a new flat list of dictionaries, and drop the nested keys that would duplicate data

In [30]:
def traverse_dataset(dataset):
    """Flatten posts, their comments and two levels of replies into one list.

    Items are emitted depth-first in document order: each post, then each of
    its comments, each comment's replies, and each reply's own replies.
    The returned list holds references to the original dictionaries; nothing
    is copied.

    Args:
        dataset: Iterable of post dictionaries. A post may carry a
            ``'comments'`` list; comments and replies may carry a
            ``'replies'`` list.

    Returns:
        A flat list containing every post, comment, reply and second-level
        reply found.
    """
    new_dataset = []
    for post in dataset:
        new_dataset.append(post)
        # A missing or empty 'comments'/'replies' entry simply means there is
        # nothing nested to collect. Looking each key up individually keeps
        # one item without the key from cutting short the traversal of its
        # siblings.
        for comment in post.get('comments') or ():
            new_dataset.append(comment)
            for reply in comment.get('replies') or ():
                new_dataset.append(reply)
                new_dataset.extend(reply.get('replies') or ())

    return new_dataset

In [31]:
def drop_keywords(dataset):
    """Remove the nested 'comments' and 'replies' entries from every item.

    The dictionaries are modified in place and the same ``dataset`` object
    is handed back to the caller.

    Args:
        dataset: Iterable of dictionaries to strip.

    Returns:
        The ``dataset`` argument itself, after its items were stripped.
    """
    nested_keys = ('comments', 'replies')
    for entry in dataset:
        for key in nested_keys:
            # pop() with a default is a no-op when the key is absent.
            entry.pop(key, None)

    return dataset

In [37]:
def clean_dataset(dataset):
    """Reduce every item to its 'id', 'author' and 'text' fields.

    For each input dictionary a new dictionary is built:
      * 'id' is copied when present;
      * 'publisher' is copied under the name 'author' when present;
      * 'text' is taken from 'text' when that key exists, otherwise from
        'title' when that key exists.
    Keys that are absent from the input are left out of the output.

    Args:
        dataset: Iterable of dictionaries.

    Returns:
        A new list with one reduced dictionary per input item.
    """
    def _reduce(record):
        reduced = {}
        # (source key, output key) pairs that are copied one-to-one.
        for source_key, output_key in (('id', 'id'), ('publisher', 'author')):
            if source_key in record:
                reduced[output_key] = record[source_key]
        # The first of these keys that exists supplies the text.
        for text_key in ('text', 'title'):
            if text_key in record:
                reduced['text'] = record[text_key]
                break
        return reduced

    return [_reduce(record) for record in dataset]

### Execution of functions defined above

In [38]:
# Input and output locations, named once so they are easy to change.
INPUT_DIR = 'json_data_id/'
OUTPUT_DIR = 'jsonl_data/'

# Make sure the destination exists so a fresh checkout can run top to bottom.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting makes runs repeatable.
for dataset_name in sorted(os.listdir(INPUT_DIR)):
    source_path = f'{INPUT_DIR}{dataset_name}'

    # Skip sub-directories (e.g. .ipynb_checkpoints) and anything that is not
    # a .json file: they cannot be parsed, and the output name is built by
    # appending 'l' to a name that is expected to end in '.json'.
    if not os.path.isfile(source_path) or not dataset_name.lower().endswith('.json'):
        continue

    dataset = open_json(source_path)

    # Flatten the nested structure, strip the nested keys, then keep only
    # the id / author / text fields.
    flattened = traverse_dataset(dataset)
    stripped = drop_keywords(flattened)
    new_dataset = clean_dataset(stripped)

    create_jsonl(f'{OUTPUT_DIR}{dataset_name}', new_dataset)