# CovidSpeechChallenge_2021/svm.py
#
# 137 lines
# 3.8 KiB
# Python

from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer
from joblib import Parallel, delayed
import pandas as pd
import scipy
import os, yaml
import json
import sys
import arff
import numpy as np
from tqdm import tqdm
from glob import glob
# Seed handed to the LinearSVC candidates in GRID.
RANDOM_SEED = 42

# Hyper-parameter grid for the grid search. Keys address steps of PIPELINE:
# "scaler" swaps the whole scaling step (None is one of the candidates) and
# "estimator__*" sets parameters on the LinearSVC step.
GRID = [
    {
        'scaler': [StandardScaler(), None],
        'estimator': [LinearSVC(random_state=RANDOM_SEED)],
        'estimator__loss': ['squared_hinge'],
        # Five log-spaced values from 1e-1 down to 1e-5.
        'estimator__C': np.logspace(-1, -5, num=5),
        'estimator__class_weight': ['balanced', None],
        'estimator__max_iter': [100000],
    },
]

# Template pipeline; both steps are placeholders that GRID overrides.
PIPELINE = Pipeline(
    steps=[
        ('scaler', None),
        ('estimator', LinearSVC()),
    ]
)
def _load_array(directory, filename):
    """Load one ``.npy`` file located in *directory*.

    Args:
        directory: Folder holding the feature/label files.
        filename: Name of the ``.npy`` file inside that folder.

    Returns:
        The array stored in the file, as returned by ``np.load``.
    """
    # os.path.join keeps the script portable; a hard-coded "\\" separator
    # only resolves on Windows.
    path = os.path.join(directory, filename)
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects.
    # Only point this at feature files that come from a trusted source.
    return np.load(path, allow_pickle=True)


def _flatten_samples(features):
    """Return *features* reshaped to ``(n_samples, -1)``.

    Axis 0 is treated as the sample axis and is always kept. A blanket
    ``np.squeeze`` would also drop it when a split holds a single sample
    (or drop a feature axis of size 1), which breaks the ``axis=1``
    concatenation with the hand-crafted features.
    """
    return features.reshape(features.shape[0], -1)


def _print_metrics(y_true, y_pred):
    """Print UAR (macro-averaged recall), classification report and confusion matrix."""
    uar = recall_score(y_true, y_pred, average='macro')
    cm = confusion_matrix(y_true, y_pred)
    print(f'UAR: {uar}\n{classification_report(y_true, y_pred)}\n\nConfusion Matrix:\n\n{cm}')


def main():
    """Grid-search the SVM pipeline and report devel and test metrics.

    Steps:
      1. Load hand-crafted and VGG features plus labels for each split and
         concatenate the two feature sets column-wise.
      2. Run ``GridSearchCV`` with a predefined train/devel split, scoring by
         macro-averaged recall; ``refit=True`` refits the best configuration
         on train+devel.
      3. Refit a clone of the best configuration on train only and report
         metrics on devel.
      4. Write the grid-search results to ``grid_search.csv``.
      5. Report metrics of the train+devel model on test.
    """
    # load features and labels
    features = {}
    labels = {}
    for name in ('train', 'devel', 'test'):
        vgg = _flatten_samples(
            _load_array('vgg_features', f'x_{name}_data_vgg.npy')
        )
        hand = _load_array('hand_features', f'x_{name}_data.npy')
        # Hand-crafted columns first, VGG columns second.
        features[name] = np.concatenate((hand, vgg), axis=1)
        labels[name] = _load_array('vgg_features', f'y_{name}_label_vgg.npy')

    train_X, devel_X, test_X = (
        features['train'], features['devel'], features['test']
    )
    train_y, devel_y, test_y = labels['train'], labels['devel'], labels['test']

    # -1 marks rows that are only ever used for training; 0 puts the devel
    # rows into the single validation fold.
    num_train = train_X.shape[0]
    num_devel = devel_X.shape[0]
    split_indices = np.repeat([-1, 0], [num_train, num_devel])
    split = PredefinedSplit(split_indices)

    X = np.concatenate((train_X, devel_X), axis=0)
    y = np.concatenate((train_y, devel_y), axis=0)

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)
    # fit on data. train -> devel first, then train+devel implicit
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, train_y)
    preds = estimator.predict(devel_X)

    # devel metrics
    print('DEVEL')
    _print_metrics(devel_y, preds)

    pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test metrics (model refit on train+devel by the grid search)
    print('TEST')
    preds = best_estimator.predict(test_X)
    _print_metrics(test_y, preds)


if __name__ == '__main__':
    main()