fix: fix index bug in save mfcc to json

chore: add information about sessions
into the mfcc json file
2021-07-09 17:19:56 +02:00 · 2021-07-09 16:58:16 +02:00
4 changed files with 132 additions and 28 deletions
--- a/Handle_emg_data.py
+++ b/Handle_emg_data.py
@ -528,12 +528,12 @@ class NN_handler:
                                        4: [],
                                        5: []
                                        }
-        # Should med 4 sessions * (~150, 208) of mfcc samples per person. One DataFrame per subject
-        self.mfcc_samples_per_subject = {1: None,  
-                                         2: None, 
-                                         3: None,
-                                         4: None,
-                                         5: None
+        # Should hold 4 sessions * (~150, 208) of MFCC samples per person. One [DataFrame, session_length_list] per subject
+        self.mfcc_samples_per_subject = {1: [],  
+                                         2: [], 
+                                         3: [],
+                                         4: [],
+                                         5: []
                                         }

    # GET method for reg_samples_dict
@ -627,8 +627,8 @@ class NN_handler:

    # Takes in all EMG session Dataframe and creates DataFrame of MFCC samples
    # Input: DataFrame(shape[1]=16, EMG data)
-    # Output: DataFrame(merged MFCC data, shape: (n, 13*16))
-    def make_mfcc_df_from_session_df(self, session_df) -> DataFrame:
+    # Output: DataFrame(merged MFCC data, shape: (n, 13*16)), length of session datapoints
+    def make_mfcc_df_from_session_df(self, session_df):
        session_df.rename(columns={0:'timestamp'}, inplace=True)
        samplerate = get_samplerate(session_df)
        attach_func = lambda list_1, list_2: list_1.extend(list_2)
@ -645,16 +645,19 @@ class NN_handler:
            mfcc_i = DataFrame(mfcc_i).dropna()
            mfcc_i['combined'] = mfcc_i.values.tolist()
            df = result_df.combine(mfcc_i['combined'], attach_func)
+        
+        session_length = (len(result_df.index)) # Add the length of session data points

-        return result_df
+        return result_df, session_length

    # Merges MFCC data from all sessions and stores the sample data in 
    # the NN_handler's mfcc_samples_per_subject dict
    # Input: None(NN_handler)
-    # Output: None -> stores in NN_handler
+    # Output: None -> stores in NN_handler [samples, session_length_list] for each subject
    def store_mfcc_samples(self) -> None:
        for subject_nr in range(5):
            subj_samples = []
+            session_length_list = []
            for session_nr in range(4):
                list_of_emg = self.get_emg_list(subject_nr+1, session_nr+1)
                tot_session_df = self.make_subj_sample(list_of_emg)
@ -663,11 +666,12 @@ class NN_handler:
                if tot_session_df.isnull().values.any():
                    print('NaN in: subject', subject_nr+1, 'session:', session_nr+1, 'where? HERE')
                
-                mfcc_df_i = self.make_mfcc_df_from_session_df(tot_session_df)
+                mfcc_df_i, session_length = self.make_mfcc_df_from_session_df(tot_session_df)
                subj_samples.append(mfcc_df_i)
+                session_length_list.append(session_length)            
            
            result_df = pd.concat(subj_samples, axis=0, ignore_index=True)
-            self.mfcc_samples_per_subject[subject_nr+1] = result_df
+            self.mfcc_samples_per_subject[subject_nr+1] = [result_df, session_length_list]


    # Makes MFCC data from reg_samples_per_subject and stores it in a json file
@ -735,7 +739,9 @@ class NN_handler:
            data = {
                "mapping": [],
                "labels": [],
-                "mfcc": []
+                "mfcc": [],
+
+                "session_lengths": []
            }

            raw_data_dict = self.get_mfcc_samples_dict()
@ -746,13 +752,15 @@ class NN_handler:
                # save subject label in the mapping
                subject_label = 'Subject ' + str(key)
                print("\nProcessing: {}".format(subject_label))
-                data["mapping"].append(subject_label)
+                data["mapping"].append(subject_label)       # Subject label
+                data["session_lengths"].append(value[1])   # List[subject][session_length_list]

                # process all samples per subject
-                for i, sample in enumerate(value):
+                for i, sample in enumerate(value[0]):

-                    data["labels"].append(key-1)
-                    data["mfcc"].append(sample)
+                    data["labels"].append(key-1)    # Subject nr
+                    data["mfcc"].append(sample)  # MFCC sample on same index
+                     
                    print("sample:{} is done".format(i+1))
                    #print(np.array(mfcc_data).shape)

--- a/Neural_Network_Analysis.py
+++ b/Neural_Network_Analysis.py
@ -13,7 +13,7 @@ DATA_PATH_MFCC = str(Path.cwd()) + "/mfcc_data.json"

 # Loads data from the json file and reshapes X_data(samples, 1, 208) and y_data(samples, 1)
 # Input: JSON path
-# Ouput: X(mfcc data), y(labels)
+# Output: X(mfcc data), y(labels), session_lengths
 def load_data_from_json(data_path): 

    with open(data_path, "r") as fp:
@ -21,17 +21,22 @@ def load_data_from_json(data_path):

    # convert lists to numpy arrays
    X = np.array(data['mfcc'])
+    #print(X.shape)
    X = X.reshape(X.shape[0], 1, X.shape[1])
    #print(X.shape)
    
    y = np.array(data["labels"])
+    #print(y.shape)
    y = y.reshape(y.shape[0], 1)
    #print(y.shape)
+
+    session_lengths = np.array(data['session_lengths'])
+    #print(session_lengths.shape)
    

    print("Data succesfully loaded!")

-    return X, y
+    return X, y, session_lengths

 # Plots the training history with two subplots. First training and test accuracy, and then 
 # loss with respect to epochs
@ -62,17 +67,61 @@ def plot_history(history):

    plt.show()

-# Takes in data and labels, and splits it into train, validation and test sets
+# Takes in data and labels, and splits it into train, validation and test sets by percentage
 # Input: Data, labels, whether to shuffle, % validation, % test
 # Output: X_train, X_validation, X_test, y_train, y_validation, y_test
-def prepare_datasets_percentsplit(X, y, shuffle_vars:bool, validation_size=0.2, test_size=0.25,):
+def prepare_datasets_percentsplit(X, y, shuffle_vars, validation_size=0.2, test_size=0.25,):

-    # create train, validation and test split
+    # Create train, validation and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=shuffle_vars)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size, shuffle=shuffle_vars)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

+# Takes in data, labels, and session_lengths and splits it into train and test sets by session.
+# For every subject, the samples of session number test_session_index (1-based) go to the
+# test set, and the samples of all other sessions go to the training set.
+# Assumes X and y are ordered subject by subject and, within a subject, session by session,
+# so that session_lengths[subject][session] describes consecutive runs of samples.
+# Samples beyond the first nr_subjects subjects are left out of both sets.
+# Input: Data, labels, session_lengths, test_session_index (1-based), nr_subjects
+# Output: X_train, X_test, y_train, y_test (numpy arrays, sample order preserved)
+# Raises: ValueError if X and y differ in length, if test_session_index is not a valid
+#         session number for a subject, or if session_lengths cover more samples than X has
+def prepare_datasets_sessions(X, y, session_lengths, test_session_index=4, nr_subjects=5):
+
+    X = np.asarray(X)
+    y = np.asarray(y)
+    if X.shape[0] != y.shape[0]:
+        raise ValueError("X and y must contain the same number of samples")
+
+    # One flag per sample: True if the sample belongs to the test session of its subject
+    is_test = np.zeros(X.shape[0], dtype=bool)
+
+    # Index in X/y where the current subject's first sample is located
+    subject_starting_index = 0
+
+    for i in range(nr_subjects):
+        lengths = [int(n) for n in session_lengths[i]]
+        if not 1 <= test_session_index <= len(lengths):
+            raise ValueError("test_session_index must be between 1 and {}".format(len(lengths)))
+
+        # The test session starts after all earlier sessions of the same subject
+        start_test_index = subject_starting_index + sum(lengths[:test_session_index - 1])
+        end_test_index = start_test_index + lengths[test_session_index - 1]
+        is_test[start_test_index:end_test_index] = True
+
+        subject_starting_index += sum(lengths)
+
+    if subject_starting_index > X.shape[0]:
+        raise ValueError("session_lengths describe more samples than X contains")
+
+    # Everything inside the processed subjects that is not test data is training data
+    is_train = ~is_test
+    is_train[subject_starting_index:] = False
+
+    return X[is_train], X[is_test], y[is_train], y[is_test]
+
 # Creates a RNN_LSTM neural network model
 # Input: input shape, classes of classification
 # Ouput: model:Keras.model
@ -119,15 +168,29 @@ def train(model, batch_size, epochs, X_train, X_validation, y_train, y_validatio
 if __name__ == "__main__":

    # Load data
-    X, y = load_data_from_json(DATA_PATH_MFCC)
+    X, y, session_lengths = load_data_from_json(DATA_PATH_MFCC)
+
+    print(X.shape)
+    print(y.shape)
+    print(session_lengths.shape)

    # Get prepared data: train, validation, and test
-    X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets_percentsplit(X, y,
-                                                                                                validation_size=0.2, 
-                                                                                                test_size=0.25,  
-                                                                                                shuffle_vars=True)
-    print(X_train.shape)
+    '''
+    (X_train, X_validation, 
+    X_test, y_train, 
+    y_validation, 
+    y_test) = prepare_datasets_percentsplit(X, y, validation_size=0.2, test_size=0.25,  shuffle_vars=True)
+    '''
+    (X_train, X_test, 
+    y_train, y_test) = prepare_datasets_sessions(X, y, session_lengths)

+    print(X_train.size)
+    print(X_train.shape)
+    print(X_test.shape)
+    print(y_train.shape)
+    print(y_test.shape)
+
+    '''
    # Make model
    model = RNN_LSTM(input_shape=(1, 208))
    model.summary()
@ -141,6 +204,7 @@ if __name__ == "__main__":
    # evaluate model on test set
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
    print('\nTest accuracy:', test_acc)
+    '''
    


--- a/pycache/Handle_emg_data.cpython-38.pyc
+++ b/pycache/Handle_emg_data.cpython-38.pyc
--- a/mfcc_data.json
+++ b/mfcc_data.json
@ -592075,5 +592075,37 @@
            11.059385475491096,
            -2.718611940111453
        ]
+    ],
+    "session_lengths": [
+        [
+            162,
+            126,
+            157,
+            149
+        ],
+        [
+            137,
+            127,
+            143,
+            127
+        ],
+        [
+            178,
+            193,
+            180,
+            176
+        ],
+        [
+            132,
+            115,
+            122,
+            123
+        ],
+        [
+            151,
+            100,
+            102,
+            106
+        ]
    ]
 }
Author	SHA1	Message	Date
Skudalen	ec6c2c9dcc	fix: fix index bug in save mfcc to json	2021-07-09 17:19:56 +02:00
Skudalen	4ba390a268	chore: add information about sessions into the mfcc json file	2021-07-09 16:58:16 +02:00