137 lines
3.8 KiB
Python
137 lines
3.8 KiB
Python
|
from sklearn.svm import LinearSVC
|
||
|
from sklearn.base import clone
|
||
|
from sklearn.pipeline import Pipeline
|
||
|
from sklearn.utils import resample
|
||
|
from sklearn.model_selection import PredefinedSplit, GridSearchCV
|
||
|
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
|
||
|
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer
|
||
|
from joblib import Parallel, delayed
|
||
|
import pandas as pd
|
||
|
import scipy
|
||
|
import os, yaml
|
||
|
import json
|
||
|
import sys
|
||
|
import arff
|
||
|
import numpy as np
|
||
|
from tqdm import tqdm
|
||
|
from glob import glob
|
||
|
|
||
|
# Seed handed to the LinearSVC candidate listed in GRID.
RANDOM_SEED = 42

# Search space for the grid search. Plain keys name a pipeline step
# ("scaler", "estimator"); keys of the form "<step>__<param>" set a parameter
# on that step.
GRID = [
    dict(
        scaler=[StandardScaler(), None],
        estimator=[LinearSVC(random_state=RANDOM_SEED)],
        estimator__loss=['squared_hinge'],
        # Five values spaced evenly on a log scale from 1e-1 down to 1e-5.
        estimator__C=np.logspace(-1, -5, num=5),
        estimator__class_weight=['balanced', None],
        estimator__max_iter=[100000],
    ),
]

# Template pipeline. Both steps are placeholders: GRID supplies the actual
# "scaler" and "estimator" objects for every candidate.
PIPELINE = Pipeline(steps=[('scaler', None), ('estimator', LinearSVC())])
|
||
|
|
||
|
def _load_array(directory, filename):
    """Load a single ``.npy`` array stored at ``directory/filename``.

    The path is assembled with ``os.path.join`` so that it resolves on every
    platform instead of only where a backslash is the path separator.

    Args:
        directory: Folder that contains the array file.
        filename: Name of the ``.npy`` file inside ``directory``.

    Returns:
        Whatever ``np.load`` returns for that file.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
    # objects, which can run code embedded in a malicious file. It is kept so
    # that files which needed it keep loading; only use trusted feature files.
    return np.load(os.path.join(directory, filename), allow_pickle=True)


def _report(title, y_true, y_pred):
    """Print macro-averaged recall (UAR), the classification report and the
    confusion matrix for one partition.

    Args:
        title: Heading printed before the metrics (e.g. ``'DEVEL'``).
        y_true: Reference labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(title)
    uar = recall_score(y_true, y_pred, average='macro')
    cm = confusion_matrix(y_true, y_pred)
    print(f'UAR: {uar}\n{classification_report(y_true, y_pred)}\n\nConfusion Matrix:\n\n{cm}')


def main():
    """Run the grid search on fused features and report devel/test metrics.

    Steps:
      1. Load VGG features, hand-crafted features and labels for the train,
         devel and test partitions.
      2. Concatenate hand-crafted and VGG features column-wise per partition.
      3. Grid-search PIPELINE over GRID, validating on devel only.
      4. Refit the best configuration on train alone to score devel, and use
         the estimator refitted by the grid search to score test.

    Side effects: prints metrics to stdout and writes ``grid_search.csv`` to
    the current working directory.
    """
    partitions = ('train', 'devel', 'test')

    # load features and labels
    vgg = {p: _load_array('vgg_features', f'x_{p}_data_vgg.npy') for p in partitions}
    hand = {p: _load_array('hand_features', f'x_{p}_data.npy') for p in partitions}
    labels = {p: _load_array('vgg_features', f'y_{p}_label_vgg.npy') for p in partitions}

    # Fold -1 keeps a sample in the training part of the split, fold 0 puts it
    # in the single validation fold: train rows are never validated on and the
    # devel rows form the one and only validation set.
    num_train = vgg['train'].shape[0]
    num_devel = vgg['devel'].shape[0]
    split = PredefinedSplit(np.repeat([-1, 0], [num_train, num_devel]))

    # Drop singleton axes from the VGG features, then append them to the
    # hand-crafted features column-wise.
    # NOTE(review): presumably both become (n_samples, n_features) - confirm.
    fused = {
        p: np.concatenate((hand[p], np.squeeze(vgg[p])), axis=1)
        for p in partitions
    }
    train_X, devel_X, test_X = (fused[p] for p in partitions)
    train_y, devel_y, test_y = (labels[p] for p in partitions)

    # Row order (train first, then devel) must match the fold indices above.
    X = np.concatenate((train_X, devel_X), axis=0)
    y = np.concatenate((train_y, devel_y), axis=0)

    grid_search = GridSearchCV(
        estimator=PIPELINE,
        param_grid=GRID,
        scoring=make_scorer(recall_score, average='macro'),
        n_jobs=-1,
        cv=split,
        refit=True,
        verbose=1,
        return_train_score=False,
    )

    # fit on data. train -> devel first, then train+devel implicit
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, train_y)

    # devel metrics
    _report('DEVEL', devel_y, estimator.predict(devel_X))

    pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test metrics (best_estimator was refitted on train+devel by GridSearchCV)
    _report('TEST', test_y, best_estimator.predict(test_X))


if __name__ == '__main__':
    main()
|