From 02feb535b3dc85481bbee40369633b0aa9d8a6a2 Mon Sep 17 00:00:00 2001
From: em474re
Date: Tue, 7 Sep 2021 14:44:25 +0200
Subject: [PATCH] openl3 svm

---
 src/svm_openl3.py      | 143 ++++++++++++++++++++++++++++++
 src/svm_openl3_all.py  | 193 +++++++++++++++++++++++++++++++++++++++++
 src/svm_openl3_hand.py | 174 +++++++++++++++++++++++++++++++++++++
 src/svm_openl3_vgg.py  | 178 +++++++++++++++++++++++++++++++++++++
 4 files changed, 688 insertions(+)
 create mode 100644 src/svm_openl3.py
 create mode 100644 src/svm_openl3_all.py
 create mode 100644 src/svm_openl3_hand.py
 create mode 100644 src/svm_openl3_vgg.py

diff --git a/src/svm_openl3.py b/src/svm_openl3.py
new file mode 100644
index 0000000..26ebbad
--- /dev/null
+++ b/src/svm_openl3.py
@@ -0,0 +1,143 @@
+from sklearn.svm import LinearSVC
+from sklearn.base import clone
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import PredefinedSplit, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+
+RANDOM_SEED = 42
+
+# hyper-parameter grid: with/without feature scaling, and a log-spaced
+# range of regularisation strengths for the linear SVM
+GRID = [
+    {'scaler': [StandardScaler(), None],
+     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
+     'estimator__loss': ['squared_hinge'],
+     'estimator__C': np.logspace(-1, -5, num=5),
+     'estimator__class_weight': ['balanced', None],
+     'estimator__max_iter': [1000]
+     }
+]
+
+PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])
+
+
+def sta_fun_2(npdata):
+    """Extract various statistical features from the numpy array provided as input.
+
+    :param npdata: the numpy array to extract the features from
+    :type npdata: numpy.ndarray
+    :return: the extracted features as a vector
+    :rtype: numpy.ndarray
+    """
+
+    # perform a sanity check
+    if npdata is None:
+        raise ValueError("Input array cannot be None")
+
+    # per-dimension mean and standard deviation over all frames
+    mean = np.mean(npdata, axis=0)
+    std = np.std(npdata, axis=0)
+
+    # return the features in a concatenated array (as a row vector)
+    return np.concatenate((mean, std), axis=0).reshape(1, -1)
+
+
+if __name__ == '__main__':
+
+    # load openL3 features and labels
+    # note: this assumes the listing order of the feature files matches the
+    # row order of the label CSVs
+    files = os.listdir('./features/openl3/train/')
+    filenames = ['./features/openl3/train/' + f for f in files]
+
+    X_train = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_train.extend(sta_fun_2(emb))
+    X_train = np.array(X_train, dtype=object)
+
+    files = os.listdir('./features/openl3/test/')
+    filenames = ['./features/openl3/test/' + f for f in files]
+
+    X_test = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_test.extend(sta_fun_2(emb))
+    X_test = np.array(X_test, dtype=object)
+
+    files = os.listdir('./features/openl3/devel/')
+    filenames = ['./features/openl3/devel/' + f for f in files]
+
+    X_devel = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_devel.extend(sta_fun_2(emb))
+    X_devel = np.array(X_devel, dtype=object)
+
+    df = pd.read_csv('./dist/lab/train.csv', sep=',')
+    y_train = df.label
+
+    df = pd.read_csv('./dist/lab/test.csv', sep=',')
+    y_test = df.label
+
+    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
+    y_devel = df.label
+
+    num_train = X_train.shape[0]
+    num_devel = X_devel.shape[0]
+    split_indices = np.repeat([-1, 0], [num_train, num_devel])
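+    # PredefinedSplit: -1 marks samples that are never used for validation
+    # (train), 0 marks the single validation fold (devel), e.g.
+    #   np.repeat([-1, 0], [3, 2]) -> array([-1, -1, -1,  0,  0])
+    # so GridSearchCV scores every parameter combination on devel only.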
+    split = PredefinedSplit(split_indices)
+
+    train_X = np.squeeze(X_train)
+    devel_X = np.squeeze(X_devel)
+    test_X = np.squeeze(X_test)
+
+    X = np.append(train_X, devel_X, axis=0)
+    y = np.append(y_train, y_devel, axis=0)
+
+    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
+                               scoring=make_scorer(recall_score, average='macro'),
+                               n_jobs=-1, cv=split, refit=True, verbose=1,
+                               return_train_score=False)
+
+    # find best estimator with grid search
+    grid_search.fit(np.asarray(X), y)
+    best_estimator = grid_search.best_estimator_
+
+    # fit clone of best estimator on train again for devel predictions
+    estimator = clone(best_estimator, safe=False)
+    estimator.fit(train_X, y_train)
+    preds = estimator.predict(devel_X)
+
+    # devel results
+    print('DEVEL')
+    uar = recall_score(y_devel, preds, average='macro')
+    cm = confusion_matrix(y_devel, preds)
+    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    # optionally write grid_search results to a csv file
+    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)
+
+    # test results
+    print('TEST')
+    preds = best_estimator.predict(test_X)
+    uar = recall_score(y_test, preds, average='macro')
+    cm = confusion_matrix(y_test, preds)
+    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test,
+                          cmap=plt.cm.Blues,
+                          display_labels=['Negative', 'Positive'],
+                          normalize='true')
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.savefig('cm_svm_openL3.jpg')
diff --git a/src/svm_openl3_all.py b/src/svm_openl3_all.py
new file mode 100644
index 0000000..ac41144
--- /dev/null
+++ b/src/svm_openl3_all.py
@@ -0,0 +1,193 @@
+from sklearn.svm import LinearSVC
+from sklearn.base import clone
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import PredefinedSplit, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+
+RANDOM_SEED = 42
+
+GRID = [
+    {'scaler': [StandardScaler(), None],
+     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
+     'estimator__loss': ['squared_hinge'],
+     'estimator__C': np.logspace(-1, -5, num=5),
+     'estimator__class_weight': ['balanced', None],
+     'estimator__max_iter': [1000]
+     }
+]
+
+PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])
+
+
+def sta_fun_2(npdata):
+    """Extract various statistical features from the numpy array provided as input.
+
+    :param npdata: the numpy array to extract the features from
+    :type npdata: numpy.ndarray
+    :return: the extracted features as a vector
+    :rtype: numpy.ndarray
+    """
+
+    # perform a sanity check
+    if npdata is None:
+        raise ValueError("Input array cannot be None")
+
+    # per-dimension mean and standard deviation over all frames
+    mean = np.mean(npdata, axis=0)
+    std = np.std(npdata, axis=0)
+
+    # return the features in a concatenated array (as a row vector)
+    return np.concatenate((mean, std), axis=0).reshape(1, -1)
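+# sta_fun_2 pools a (num_frames, dims) embedding matrix into one clip-level
+# row vector of length 2*dims (per-dimension mean, then std); e.g. for
+# 512-dimensional openL3 frames each clip yields 1024 features.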
+
+
+if __name__ == '__main__':
+
+    # load openL3 features and labels
+    files = os.listdir('./features/openl3/train/')
+    filenames = ['./features/openl3/train/' + f for f in files]
+
+    X_train_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_train_openl3.extend(sta_fun_2(emb))
+    X_train_openl3 = np.array(X_train_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/test/')
+    filenames = ['./features/openl3/test/' + f for f in files]
+
+    X_test_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_test_openl3.extend(sta_fun_2(emb))
+    X_test_openl3 = np.array(X_test_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/devel/')
+    filenames = ['./features/openl3/devel/' + f for f in files]
+
+    X_devel_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_devel_openl3.extend(sta_fun_2(emb))
+    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)
+
+    df = pd.read_csv('./dist/lab/train.csv', sep=',')
+    y_train = df.label
+
+    df = pd.read_csv('./dist/lab/test.csv', sep=',')
+    y_test = df.label
+
+    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
+    y_devel = df.label
+
+    # precomputed VGG feature matrices (saved as object arrays,
+    # hence allow_pickle=True)
+    devel_X_vgg = np.load(
+        "./features/vgg_features/x_devel_data_vgg.npy", allow_pickle=True
+    )
+    test_X_vgg = np.load(
+        "./features/vgg_features/x_test_data_vgg.npy", allow_pickle=True
+    )
+    train_X_vgg = np.load(
+        "./features/vgg_features/x_train_data_vgg.npy", allow_pickle=True
+    )
+
+    # precomputed handcrafted feature matrices
+    devel_X_hand = np.load(
+        "./features/hand_features/x_devel_data.npy", allow_pickle=True
+    )
+    test_X_hand = np.load(
+        "./features/hand_features/x_test_data.npy", allow_pickle=True
+    )
+    train_X_hand = np.load(
+        "./features/hand_features/x_train_data.npy", allow_pickle=True
+    )
+
+    num_train = train_X_vgg.shape[0]
+    num_devel = devel_X_vgg.shape[0]
+    split_indices = np.repeat([-1, 0], [num_train, num_devel])
+    split = PredefinedSplit(split_indices)
+
+    train_X_openl3 = np.squeeze(X_train_openl3)
+    devel_X_openl3 = np.squeeze(X_devel_openl3)
+    test_X_openl3 = np.squeeze(X_test_openl3)
+
+    train_X_vgg = np.squeeze(train_X_vgg)
+    devel_X_vgg = np.squeeze(devel_X_vgg)
+    test_X_vgg = np.squeeze(test_X_vgg)
+
+    # fuse all three feature sets by horizontal concatenation
+    devel_X = np.concatenate(
+        (devel_X_hand, devel_X_vgg, devel_X_openl3), axis=1)
+    test_X = np.concatenate(
+        (test_X_hand, test_X_vgg, test_X_openl3), axis=1)
+    train_X = np.concatenate(
+        (train_X_hand, train_X_vgg, train_X_openl3), axis=1)
+
+    X = np.append(train_X, devel_X, axis=0)
+    y = np.append(y_train, y_devel, axis=0)
+
+    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
+                               scoring=make_scorer(recall_score, average='macro'),
+                               n_jobs=-1, cv=split, refit=True, verbose=1,
+                               return_train_score=False)
+
+    # find best estimator with grid search
+    grid_search.fit(X, y)
+    best_estimator = grid_search.best_estimator_
+
+    # fit clone of best estimator on train again for devel predictions
+    estimator = clone(best_estimator, safe=False)
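+    # the grid-search refit used train+devel, so retrain on train only to
+    # keep the devel predictions below out-of-sample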
+    estimator.fit(train_X, y_train)
+    preds = estimator.predict(devel_X)
+
+    # devel results
+    print('DEVEL')
+    uar = recall_score(y_devel, preds, average='macro')
+    cm = confusion_matrix(y_devel, preds)
+    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    # optionally write grid_search results to a csv file
+    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)
+
+    # test results
+    print('TEST')
+    preds = best_estimator.predict(test_X)
+    uar = recall_score(y_test, preds, average='macro')
+    cm = confusion_matrix(y_test, preds)
+    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test,
+                          cmap=plt.cm.Blues,
+                          display_labels=['Negative', 'Positive'],
+                          normalize='true')
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.savefig('cm_svm_all.jpg')
\ No newline at end of file
diff --git a/src/svm_openl3_hand.py b/src/svm_openl3_hand.py
new file mode 100644
index 0000000..e84ec5d
--- /dev/null
+++ b/src/svm_openl3_hand.py
@@ -0,0 +1,174 @@
+from sklearn.svm import LinearSVC
+from sklearn.base import clone
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import PredefinedSplit, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+
+RANDOM_SEED = 42
+
+GRID = [
+    {'scaler': [StandardScaler(), None],
+     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
+     'estimator__loss': ['squared_hinge'],
+     'estimator__C': np.logspace(-1, -5, num=5),
+     'estimator__class_weight': ['balanced', None],
+     'estimator__max_iter': [1000]
+     }
+]
+
+PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])
+
+
+def sta_fun_2(npdata):
+    """Extract various statistical features from the numpy array provided as input.
+
+    :param npdata: the numpy array to extract the features from
+    :type npdata: numpy.ndarray
+    :return: the extracted features as a vector
+    :rtype: numpy.ndarray
+    """
+
+    # perform a sanity check
+    if npdata is None:
+        raise ValueError("Input array cannot be None")
+
+    # per-dimension mean and standard deviation over all frames
+    mean = np.mean(npdata, axis=0)
+    std = np.std(npdata, axis=0)
+
+    # return the features in a concatenated array (as a row vector)
+    return np.concatenate((mean, std), axis=0).reshape(1, -1)
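+# this variant fuses the handcrafted features with the openL3 statistics by
+# horizontal concatenation (np.concatenate(..., axis=1)); all feature sets
+# must therefore list the same clips in the same row order.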
+
+
+if __name__ == '__main__':
+
+    # load handcrafted and openL3 features and labels
+    files = os.listdir('./features/openl3/train/')
+    filenames = ['./features/openl3/train/' + f for f in files]
+
+    X_train_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_train_openl3.extend(sta_fun_2(emb))
+    X_train_openl3 = np.array(X_train_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/test/')
+    filenames = ['./features/openl3/test/' + f for f in files]
+
+    X_test_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_test_openl3.extend(sta_fun_2(emb))
+    X_test_openl3 = np.array(X_test_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/devel/')
+    filenames = ['./features/openl3/devel/' + f for f in files]
+
+    X_devel_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_devel_openl3.extend(sta_fun_2(emb))
+    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)
+
+    df = pd.read_csv('./dist/lab/train.csv', sep=',')
+    y_train = df.label
+
+    df = pd.read_csv('./dist/lab/test.csv', sep=',')
+    y_test = df.label
+
+    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
+    y_devel = df.label
+
+    # precomputed handcrafted feature matrices
+    devel_X_hand = np.load(
+        "./features/hand_features/x_devel_data.npy", allow_pickle=True
+    )
+    test_X_hand = np.load(
+        "./features/hand_features/x_test_data.npy", allow_pickle=True
+    )
+    train_X_hand = np.load(
+        "./features/hand_features/x_train_data.npy", allow_pickle=True
+    )
+
+    num_train = train_X_hand.shape[0]
+    num_devel = devel_X_hand.shape[0]
+    split_indices = np.repeat([-1, 0], [num_train, num_devel])
+    split = PredefinedSplit(split_indices)
+
+    train_X_openl3 = np.squeeze(X_train_openl3)
+    devel_X_openl3 = np.squeeze(X_devel_openl3)
+    test_X_openl3 = np.squeeze(X_test_openl3)
+
+    # fuse handcrafted and openL3 features by horizontal concatenation
+    devel_X = np.concatenate((devel_X_hand, devel_X_openl3), axis=1)
+    test_X = np.concatenate((test_X_hand, test_X_openl3), axis=1)
+    train_X = np.concatenate((train_X_hand, train_X_openl3), axis=1)
+
+    X = np.append(train_X, devel_X, axis=0)
+    y = np.append(y_train, y_devel, axis=0)
+
+    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
+                               scoring=make_scorer(recall_score, average='macro'),
+                               n_jobs=-1, cv=split, refit=True, verbose=1,
+                               return_train_score=False)
+
+    # find best estimator with grid search
+    grid_search.fit(X, y)
+    best_estimator = grid_search.best_estimator_
+
+    # fit clone of best estimator on train again for devel predictions
+    estimator = clone(best_estimator, safe=False)
+    estimator.fit(train_X, y_train)
+    preds = estimator.predict(devel_X)
+
+    # devel results
+    print('DEVEL')
+    uar = recall_score(y_devel, preds, average='macro')
+    cm = confusion_matrix(y_devel, preds)
+    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    # optionally write grid_search results to a csv file
+    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)
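+    # UAR (unweighted average recall), i.e. macro-averaged recall, weights
+    # every class equally regardless of how many samples it has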
+
+    # test results
+    print('TEST')
+    preds = best_estimator.predict(test_X)
+    uar = recall_score(y_test, preds, average='macro')
+    cm = confusion_matrix(y_test, preds)
+    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')
+
+    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test,
+                          cmap=plt.cm.Blues,
+                          display_labels=['Negative', 'Positive'],
+                          normalize='true')
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.savefig('cm_svm_openL3_hand.jpg')
\ No newline at end of file
diff --git a/src/svm_openl3_vgg.py b/src/svm_openl3_vgg.py
new file mode 100644
index 0000000..df691f6
--- /dev/null
+++ b/src/svm_openl3_vgg.py
@@ -0,0 +1,178 @@
+from sklearn.svm import LinearSVC
+from sklearn.base import clone
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import PredefinedSplit, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+
+RANDOM_SEED = 42
+
+GRID = [
+    {'scaler': [StandardScaler(), None],
+     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
+     'estimator__loss': ['squared_hinge'],
+     'estimator__C': np.logspace(-1, -5, num=5),
+     'estimator__class_weight': ['balanced', None],
+     'estimator__max_iter': [1000]
+     }
+]
+
+PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])
+
+
+def sta_fun_2(npdata):
+    """Extract various statistical features from the numpy array provided as input.
+
+    :param npdata: the numpy array to extract the features from
+    :type npdata: numpy.ndarray
+    :return: the extracted features as a vector
+    :rtype: numpy.ndarray
+    """
+
+    # perform a sanity check
+    if npdata is None:
+        raise ValueError("Input array cannot be None")
+
+    # per-dimension mean and standard deviation over all frames
+    mean = np.mean(npdata, axis=0)
+    std = np.std(npdata, axis=0)
+
+    # return the features in a concatenated array (as a row vector)
+    return np.concatenate((mean, std), axis=0).reshape(1, -1)
+
+
+if __name__ == '__main__':
+
+    # load VGG and openL3 features and labels
+    files = os.listdir('./features/openl3/train/')
+    filenames = ['./features/openl3/train/' + f for f in files]
+
+    X_train_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_train_openl3.extend(sta_fun_2(emb))
+    X_train_openl3 = np.array(X_train_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/test/')
+    filenames = ['./features/openl3/test/' + f for f in files]
+
+    X_test_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_test_openl3.extend(sta_fun_2(emb))
+    X_test_openl3 = np.array(X_test_openl3, dtype=object)
+
+    files = os.listdir('./features/openl3/devel/')
+    filenames = ['./features/openl3/devel/' + f for f in files]
+
+    X_devel_openl3 = []
+    for fname in filenames:
+        emb = np.load(fname)['embedding']
+        X_devel_openl3.extend(sta_fun_2(emb))
+    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)
+
+    df = pd.read_csv('./dist/lab/train.csv', sep=',')
+    y_train = df.label
+
+    df = pd.read_csv('./dist/lab/test.csv', sep=',')
+    y_test = df.label
+
+    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
+    y_devel = df.label
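+    # precomputed VGG feature matrices (saved as object arrays,
+    # hence allow_pickle=True)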
"./features/vgg_features/x_devel_data_vgg.npy", allow_pickle=True + ) + + test_X_vgg = np.load( + "./features/vgg_features/x_test_data_vgg.npy", allow_pickle=True + ) + + train_X_vgg = np.load( + "./features/vgg_features/x_train_data_vgg.npy", allow_pickle=True + ) + + num_train = train_X_vgg.shape[0] + num_devel = devel_X_vgg.shape[0] + split_indices = np.repeat([-1, 0], [num_train, num_devel]) + split = PredefinedSplit(split_indices) + + train_X_openl3 = np.squeeze(X_train_openl3) + devel_X_openl3 = np.squeeze(X_devel_openl3) + test_X_openl3 = np.squeeze(X_test_openl3) + + train_X_vgg = np.squeeze(train_X_vgg) + devel_X_vgg = np.squeeze(devel_X_vgg) + test_X_vgg = np.squeeze(test_X_vgg) + + devel_X = np.concatenate( + ( + devel_X_vgg, + devel_X_openl3 + ), + axis=1, + ) + + test_X = np.concatenate( + ( + test_X_vgg, + test_X_openl3 + ), + axis=1, + ) + + train_X = np.concatenate( + ( + train_X_vgg, + train_X_openl3 + ), + axis=1, + ) + + X = np.append(train_X, devel_X, axis=0) + y = np.append(y_train, y_devel, axis=0) + + grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID, + scoring=make_scorer(recall_score, average='macro'), + n_jobs=-1, cv=split, refit=True, verbose=1, + return_train_score=False) + + # find best estimator with grid search + grid_search.fit(X,y) + best_estimator = grid_search.best_estimator_ + + # fit clone of best estimator on train again for devel predictions + estimator = clone(best_estimator, safe=False) + estimator.fit(train_X, y_train) + preds = estimator.predict(devel_X) + + metrics = {'dev': {}, 'test': {}} + + # devel results + print('DEVEL') + uar = recall_score(y_devel, preds, average='macro') + cm = confusion_matrix(y_devel, preds) + print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}') + + # optional write grid_search to csv file + # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False) + + # test results + print('TEST') + preds = best_estimator.predict(test_X) + uar = recall_score(y_test, preds, average='macro') + cm = confusion_matrix(y_test, preds) + print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}') + + fig = plt.figure() + plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test, cmap=plt.cm.Blues, display_labels=[ + 'Negative', 'Positive'], normalize='true') + plt.ylabel('True Label') + plt.xlabel('Predicated Label') + plt.savefig('cm_svm_openL3_vgg.jpg') \ No newline at end of file