openl3 svm

parent 4b834fab47
commit 02feb535b3

src/svm_openl3.py · 143 additions · new file
@@ -0,0 +1,143 @@
from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2,
# so this script requires scikit-learn < 1.2
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

RANDOM_SEED = 42

GRID = [
    {'scaler': [StandardScaler(), None],
     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
     'estimator__loss': ['squared_hinge'],
     'estimator__C': np.logspace(-1, -5, num=5),
     'estimator__class_weight': ['balanced', None],
     'estimator__max_iter': [1000]}
]
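
# the 'scaler' and 'estimator' steps below are placeholders; GridSearchCV
# substitutes the candidates listed under the matching keys in GRID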
PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])


def sta_fun_2(npdata):  # npdata: 2D array of frame-level features
    """Extract statistical features from the numpy array provided as input.

    :param npdata: the numpy array to extract the features from
    :type npdata: numpy.ndarray
    :return: the extracted features as a row vector
    :rtype: numpy.ndarray
    """
    # perform a sanity check
    if npdata is None:
        raise ValueError("Input array cannot be None")

    # pool over the time axis: per-dimension mean and standard deviation
    mean = np.mean(npdata, axis=0)
    std = np.std(npdata, axis=0)

    # finally return the features in a concatenated array (as a row vector)
    return np.concatenate((mean, std), axis=0).reshape(1, -1)
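
# e.g. an openL3 embedding matrix of shape (n_frames, d) is pooled down to a
# single (1, 2*d) vector: d means followed by d standard deviations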


if __name__ == '__main__':

    # load openL3 features and labels
    # (file lists are sorted so their order matches the rows of the label CSVs)
    files = sorted(os.listdir('./features/openl3/train/'))
    filenames = ['./features/openl3/train/' + f for f in files]
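
    # each .npz archive produced by openL3 stores the frame-level embedding
    # matrix under the key 'embedding'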
    X_train = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_train.extend(sta_fun_2(emb))
    X_train = np.array(X_train, dtype=object)

    files = sorted(os.listdir('./features/openl3/test/'))
    filenames = ['./features/openl3/test/' + f for f in files]

    X_test = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_test.extend(sta_fun_2(emb))
    X_test = np.array(X_test, dtype=object)

    files = sorted(os.listdir('./features/openl3/devel/'))
    filenames = ['./features/openl3/devel/' + f for f in files]

    X_devel = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_devel.extend(sta_fun_2(emb))
    X_devel = np.array(X_devel, dtype=object)

    df = pd.read_csv('./dist/lab/train.csv', sep=',')
    y_train = df.label

    df = pd.read_csv('./dist/lab/test.csv', sep=',')
    y_test = df.label

    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
    y_devel = df.label

    num_train = X_train.shape[0]
    num_devel = X_devel.shape[0]
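    # PredefinedSplit: -1 marks rows that always stay in the training fold and
    # 0 marks rows forming validation fold 0, so GridSearchCV scores each
    # candidate exactly once on the devel partition instead of cross-validating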
    split_indices = np.repeat([-1, 0], [num_train, num_devel])
    split = PredefinedSplit(split_indices)

    train_X = np.squeeze(X_train)
    devel_X = np.squeeze(X_devel)
    test_X = np.squeeze(X_test)

    X = np.append(train_X, devel_X, axis=0)
    y = np.append(y_train, y_devel, axis=0)

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)

    # find best estimator with grid search
    grid_search.fit(np.asarray(X), y)
    best_estimator = grid_search.best_estimator_
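
    # (with refit=True the best estimator was just retrained on train+devel
    # combined, so a train-only clone is fitted next for unbiased devel scores)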
    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, y_train)
    preds = estimator.predict(devel_X)

    metrics = {'dev': {}, 'test': {}}

    # devel results
    print('DEVEL')
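    # UAR (unweighted average recall) is recall averaged over classes,
    # i.e. macro-averaged recall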
    uar = recall_score(y_devel, preds, average='macro')
    cm = confusion_matrix(y_devel, preds)
    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')

    # optionally write grid_search results to a csv file
    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test results
    print('TEST')
    preds = best_estimator.predict(test_X)
    uar = recall_score(y_test, preds, average='macro')
    cm = confusion_matrix(y_test, preds)
    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')

    fig = plt.figure()
    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test, cmap=plt.cm.Blues,
                          display_labels=['Negative', 'Positive'], normalize='true')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('cm_svm_openL3.jpg')

src/svm_openl3_all.py · 193 additions · new file
@@ -0,0 +1,193 @@
from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2,
# so this script requires scikit-learn < 1.2
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

RANDOM_SEED = 42

GRID = [
    {'scaler': [StandardScaler(), None],
     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
     'estimator__loss': ['squared_hinge'],
     'estimator__C': np.logspace(-1, -5, num=5),
     'estimator__class_weight': ['balanced', None],
     'estimator__max_iter': [1000]}
]

PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])


def sta_fun_2(npdata):  # npdata: 2D array of frame-level features
    """Extract statistical features from the numpy array provided as input.

    :param npdata: the numpy array to extract the features from
    :type npdata: numpy.ndarray
    :return: the extracted features as a row vector
    :rtype: numpy.ndarray
    """
    # perform a sanity check
    if npdata is None:
        raise ValueError("Input array cannot be None")

    # pool over the time axis: per-dimension mean and standard deviation
    mean = np.mean(npdata, axis=0)
    std = np.std(npdata, axis=0)

    # finally return the features in a concatenated array (as a row vector)
    return np.concatenate((mean, std), axis=0).reshape(1, -1)


if __name__ == '__main__':

    # load openL3, VGG, and handcrafted features and labels
    # (file lists are sorted so their order matches the rows of the label CSVs)
    files = sorted(os.listdir('./features/openl3/train/'))
    filenames = ['./features/openl3/train/' + f for f in files]

    X_train_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_train_openl3.extend(sta_fun_2(emb))
    X_train_openl3 = np.array(X_train_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/test/'))
    filenames = ['./features/openl3/test/' + f for f in files]

    X_test_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_test_openl3.extend(sta_fun_2(emb))
    X_test_openl3 = np.array(X_test_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/devel/'))
    filenames = ['./features/openl3/devel/' + f for f in files]

    X_devel_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_devel_openl3.extend(sta_fun_2(emb))
    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)

    df = pd.read_csv('./dist/lab/train.csv', sep=',')
    y_train = df.label

    df = pd.read_csv('./dist/lab/test.csv', sep=',')
    y_test = df.label

    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
    y_devel = df.label

    devel_X_vgg = np.load("./features/vgg_features/x_devel_data_vgg.npy", allow_pickle=True)
    test_X_vgg = np.load("./features/vgg_features/x_test_data_vgg.npy", allow_pickle=True)
    train_X_vgg = np.load("./features/vgg_features/x_train_data_vgg.npy", allow_pickle=True)
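    # allow_pickle=True is required because these .npy files contain pickled
    # object arrays rather than plain numeric arrays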

    devel_X_hand = np.load("./features/hand_features/x_devel_data.npy", allow_pickle=True)
    test_X_hand = np.load("./features/hand_features/x_test_data.npy", allow_pickle=True)
    train_X_hand = np.load("./features/hand_features/x_train_data.npy", allow_pickle=True)

    num_train = train_X_vgg.shape[0]
    num_devel = devel_X_vgg.shape[0]
    split_indices = np.repeat([-1, 0], [num_train, num_devel])
    split = PredefinedSplit(split_indices)

    train_X_openl3 = np.squeeze(X_train_openl3)
    devel_X_openl3 = np.squeeze(X_devel_openl3)
    test_X_openl3 = np.squeeze(X_test_openl3)

    train_X_vgg = np.squeeze(train_X_vgg)
    devel_X_vgg = np.squeeze(devel_X_vgg)
    test_X_vgg = np.squeeze(test_X_vgg)
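
    # early fusion: per-recording vectors from the three feature sets are
    # concatenated along axis=1 into one wide feature vector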
    devel_X = np.concatenate((devel_X_hand, devel_X_vgg, devel_X_openl3), axis=1)
    test_X = np.concatenate((test_X_hand, test_X_vgg, test_X_openl3), axis=1)
    train_X = np.concatenate((train_X_hand, train_X_vgg, train_X_openl3), axis=1)

    X = np.append(train_X, devel_X, axis=0)
    y = np.append(y_train, y_devel, axis=0)
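    # train and devel are stacked in this order so split_indices (-1 for the
    # train rows, 0 for the devel rows) lines up with the rows of X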

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)

    # find best estimator with grid search
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, y_train)
    preds = estimator.predict(devel_X)

    metrics = {'dev': {}, 'test': {}}

    # devel results
    print('DEVEL')
    uar = recall_score(y_devel, preds, average='macro')
    cm = confusion_matrix(y_devel, preds)
    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')

    # optionally write grid_search results to a csv file
    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test results
    print('TEST')
    preds = best_estimator.predict(test_X)
    uar = recall_score(y_test, preds, average='macro')
    cm = confusion_matrix(y_test, preds)
    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')

    fig = plt.figure()
    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test, cmap=plt.cm.Blues,
                          display_labels=['Negative', 'Positive'], normalize='true')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('cm_svm_all.jpg')

src/svm_openl3_hand.py · 174 additions · new file
@@ -0,0 +1,174 @@
from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2,
# so this script requires scikit-learn < 1.2
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

RANDOM_SEED = 42

GRID = [
    {'scaler': [StandardScaler(), None],
     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
     'estimator__loss': ['squared_hinge'],
     'estimator__C': np.logspace(-1, -5, num=5),
     'estimator__class_weight': ['balanced', None],
     'estimator__max_iter': [1000]}
]

PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])


def sta_fun_2(npdata):  # npdata: 2D array of frame-level features
    """Extract statistical features from the numpy array provided as input.

    :param npdata: the numpy array to extract the features from
    :type npdata: numpy.ndarray
    :return: the extracted features as a row vector
    :rtype: numpy.ndarray
    """
    # perform a sanity check
    if npdata is None:
        raise ValueError("Input array cannot be None")

    # pool over the time axis: per-dimension mean and standard deviation
    mean = np.mean(npdata, axis=0)
    std = np.std(npdata, axis=0)

    # finally return the features in a concatenated array (as a row vector)
    return np.concatenate((mean, std), axis=0).reshape(1, -1)


if __name__ == '__main__':

    # load handcrafted and openL3 features and labels
    # (file lists are sorted so their order matches the rows of the label CSVs)
    files = sorted(os.listdir('./features/openl3/train/'))
    filenames = ['./features/openl3/train/' + f for f in files]

    X_train_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_train_openl3.extend(sta_fun_2(emb))
    X_train_openl3 = np.array(X_train_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/test/'))
    filenames = ['./features/openl3/test/' + f for f in files]

    X_test_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_test_openl3.extend(sta_fun_2(emb))
    X_test_openl3 = np.array(X_test_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/devel/'))
    filenames = ['./features/openl3/devel/' + f for f in files]

    X_devel_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_devel_openl3.extend(sta_fun_2(emb))
    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)

    df = pd.read_csv('./dist/lab/train.csv', sep=',')
    y_train = df.label

    df = pd.read_csv('./dist/lab/test.csv', sep=',')
    y_test = df.label

    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
    y_devel = df.label

    devel_X_hand = np.load("./features/hand_features/x_devel_data.npy", allow_pickle=True)
    test_X_hand = np.load("./features/hand_features/x_test_data.npy", allow_pickle=True)
    train_X_hand = np.load("./features/hand_features/x_train_data.npy", allow_pickle=True)

    num_train = train_X_hand.shape[0]
    num_devel = devel_X_hand.shape[0]
    split_indices = np.repeat([-1, 0], [num_train, num_devel])
    split = PredefinedSplit(split_indices)

    train_X_openl3 = np.squeeze(X_train_openl3)
    devel_X_openl3 = np.squeeze(X_devel_openl3)
    test_X_openl3 = np.squeeze(X_test_openl3)

    devel_X = np.concatenate((devel_X_hand, devel_X_openl3), axis=1)
    test_X = np.concatenate((test_X_hand, test_X_openl3), axis=1)
    train_X = np.concatenate((train_X_hand, train_X_openl3), axis=1)
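    # each row now holds the handcrafted features followed by the pooled
    # openL3 features for the same recording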

    X = np.append(train_X, devel_X, axis=0)
    y = np.append(y_train, y_devel, axis=0)

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)

    # find best estimator with grid search
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, y_train)
    preds = estimator.predict(devel_X)

    metrics = {'dev': {}, 'test': {}}

    # devel results
    print('DEVEL')
    uar = recall_score(y_devel, preds, average='macro')
    cm = confusion_matrix(y_devel, preds)
    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')

    # optionally write grid_search results to a csv file
    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test results
    print('TEST')
    preds = best_estimator.predict(test_X)
    uar = recall_score(y_test, preds, average='macro')
    cm = confusion_matrix(y_test, preds)
    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')

    fig = plt.figure()
    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test, cmap=plt.cm.Blues,
                          display_labels=['Negative', 'Positive'], normalize='true')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('cm_svm_openL3_hand.jpg')

src/svm_openl3_vgg.py · 178 additions · new file
@@ -0,0 +1,178 @@
from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2,
# so this script requires scikit-learn < 1.2
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

RANDOM_SEED = 42

GRID = [
    {'scaler': [StandardScaler(), None],
     'estimator': [LinearSVC(random_state=RANDOM_SEED)],
     'estimator__loss': ['squared_hinge'],
     'estimator__C': np.logspace(-1, -5, num=5),
     'estimator__class_weight': ['balanced', None],
     'estimator__max_iter': [1000]}
]

PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC(dual=True))])


def sta_fun_2(npdata):  # npdata: 2D array of frame-level features
    """Extract statistical features from the numpy array provided as input.

    :param npdata: the numpy array to extract the features from
    :type npdata: numpy.ndarray
    :return: the extracted features as a row vector
    :rtype: numpy.ndarray
    """
    # perform a sanity check
    if npdata is None:
        raise ValueError("Input array cannot be None")

    # pool over the time axis: per-dimension mean and standard deviation
    mean = np.mean(npdata, axis=0)
    std = np.std(npdata, axis=0)

    # finally return the features in a concatenated array (as a row vector)
    return np.concatenate((mean, std), axis=0).reshape(1, -1)


if __name__ == '__main__':

    # load VGG and openL3 features and labels
    # (file lists are sorted so their order matches the rows of the label CSVs)
    files = sorted(os.listdir('./features/openl3/train/'))
    filenames = ['./features/openl3/train/' + f for f in files]

    X_train_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_train_openl3.extend(sta_fun_2(emb))
    X_train_openl3 = np.array(X_train_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/test/'))
    filenames = ['./features/openl3/test/' + f for f in files]

    X_test_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_test_openl3.extend(sta_fun_2(emb))
    X_test_openl3 = np.array(X_test_openl3, dtype=object)

    files = sorted(os.listdir('./features/openl3/devel/'))
    filenames = ['./features/openl3/devel/' + f for f in files]

    X_devel_openl3 = []
    for fname in filenames:
        emb = np.load(fname)['embedding']
        X_devel_openl3.extend(sta_fun_2(emb))
    X_devel_openl3 = np.array(X_devel_openl3, dtype=object)

    df = pd.read_csv('./dist/lab/train.csv', sep=',')
    y_train = df.label

    df = pd.read_csv('./dist/lab/test.csv', sep=',')
    y_test = df.label

    df = pd.read_csv('./dist/lab/devel.csv', sep=',')
    y_devel = df.label

    devel_X_vgg = np.load("./features/vgg_features/x_devel_data_vgg.npy", allow_pickle=True)
    test_X_vgg = np.load("./features/vgg_features/x_test_data_vgg.npy", allow_pickle=True)
    train_X_vgg = np.load("./features/vgg_features/x_train_data_vgg.npy", allow_pickle=True)

    num_train = train_X_vgg.shape[0]
    num_devel = devel_X_vgg.shape[0]
    split_indices = np.repeat([-1, 0], [num_train, num_devel])
    split = PredefinedSplit(split_indices)

    train_X_openl3 = np.squeeze(X_train_openl3)
    devel_X_openl3 = np.squeeze(X_devel_openl3)
    test_X_openl3 = np.squeeze(X_test_openl3)

    train_X_vgg = np.squeeze(train_X_vgg)
    devel_X_vgg = np.squeeze(devel_X_vgg)
    test_X_vgg = np.squeeze(test_X_vgg)

    devel_X = np.concatenate((devel_X_vgg, devel_X_openl3), axis=1)
    test_X = np.concatenate((test_X_vgg, test_X_openl3), axis=1)
    train_X = np.concatenate((train_X_vgg, train_X_openl3), axis=1)

    X = np.append(train_X, devel_X, axis=0)
    y = np.append(y_train, y_devel, axis=0)

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)

    # find best estimator with grid search
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, y_train)
    preds = estimator.predict(devel_X)

    metrics = {'dev': {}, 'test': {}}

    # devel results
    print('DEVEL')
    uar = recall_score(y_devel, preds, average='macro')
    cm = confusion_matrix(y_devel, preds)
    print(f'UAR: {uar}\n{classification_report(y_devel, preds)}\n\nConfusion Matrix:\n\n{cm}')

    # optionally write grid_search results to a csv file
    # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test results
    print('TEST')
    preds = best_estimator.predict(test_X)
    uar = recall_score(y_test, preds, average='macro')
    cm = confusion_matrix(y_test, preds)
    print(f'UAR: {uar}\n{classification_report(y_test, preds)}\n\nConfusion Matrix:\n\n{cm}')
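
    # normalize='true' row-normalizes the plotted matrix, so each cell shows the
    # fraction of a true class that was predicted as each label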
    fig = plt.figure()
    plot_confusion_matrix(best_estimator, X=test_X, y_true=y_test, cmap=plt.cm.Blues,
                          display_labels=['Negative', 'Positive'], normalize='true')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('cm_svm_openL3_vgg.jpg')