# CovidSpeechChallenge_2021/svm.py

from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer
from joblib import Parallel, delayed
import pandas as pd
import scipy
import os, yaml
import json
import sys
import arff
import numpy as np
from tqdm import tqdm
from glob import glob

# Seed given to the LinearSVC candidate in GRID so its fits are repeatable.
RANDOM_SEED = 42

# Search space for GridSearchCV. The keys address the steps of PIPELINE:
# 'scaler' / 'estimator' swap the step objects themselves, while the
# 'estimator__*' keys set parameters on the estimator step.
GRID = [
    dict(
        # compare standardised input against unscaled input
        scaler=[StandardScaler(), None],
        estimator=[LinearSVC(random_state=RANDOM_SEED)],
        estimator__loss=['squared_hinge'],
        # five log-spaced values of C: 1e-1, 1e-2, ..., 1e-5
        estimator__C=np.logspace(-1, -5, num=5),
        estimator__class_weight=['balanced', None],
        estimator__max_iter=[100000],
    ),
]

# Two-step template pipeline; GRID supplies candidates for both steps.
PIPELINE = Pipeline(steps=[('scaler', None), ('estimator', LinearSVC())])

# Folder names of the pre-extracted feature dumps, relative to the working
# directory. Paths are built with os.path.join so the script is not tied to
# the Windows path separator.
_VGG_DIR = "vgg_features"
_HAND_DIR = "hand_features"


def _load_partition(name):
    """Load the fused feature matrix and the labels of one data partition.

    Args:
        name: Partition identifier used in the file names
            ("train", "devel" or "test").

    Returns:
        Tuple ``(features, labels)``. ``features`` is the hand-crafted feature
        array concatenated column-wise (axis 1) with the squeezed VGG feature
        array; ``labels`` is the label array stored in the VGG folder.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only load feature dumps you trust.
    hand = np.load(
        os.path.join(_HAND_DIR, f"x_{name}_data.npy"), allow_pickle=True
    )
    vgg = np.load(
        os.path.join(_VGG_DIR, f"x_{name}_data_vgg.npy"), allow_pickle=True
    )
    labels = np.load(
        os.path.join(_VGG_DIR, f"y_{name}_label_vgg.npy"), allow_pickle=True
    )

    # Drop singleton axes of the VGG array so it can be joined with the
    # hand-crafted features along axis 1.
    # NOTE(review): np.squeeze removes *every* length-1 axis, so a partition
    # holding exactly one sample would lose its sample axis here.
    features = np.concatenate((hand, np.squeeze(vgg)), axis=1)
    return features, labels


def _report(title, y_true, y_pred):
    """Print UAR (macro-averaged recall), classification report and confusion matrix.

    Args:
        title: Heading printed before the metrics (e.g. 'DEVEL').
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    print(title)
    uar = recall_score(y_true, y_pred, average='macro')
    cm = confusion_matrix(y_true, y_pred)
    print(f'UAR: {uar}\n{classification_report(y_true, y_pred)}\n\nConfusion Matrix:\n\n{cm}')


def main():
    """Grid-search the SVM pipeline on train/devel and report devel and test scores.

    Writes the full grid-search results to ``grid_search.csv`` in the working
    directory and prints the metrics to stdout.
    """
    # load features and labels
    train_X, train_y = _load_partition("train")
    devel_X, devel_y = _load_partition("devel")
    test_X, test_y = _load_partition("test")

    # Single predefined fold: samples marked -1 are only ever used for
    # fitting, samples marked 0 form the validation part. The counts are
    # taken from the matrices that are actually stacked into X below.
    split_indices = np.repeat([-1, 0], [len(train_X), len(devel_X)])
    split = PredefinedSplit(split_indices)

    X = np.concatenate((train_X, devel_X), axis=0)
    y = np.concatenate((train_y, devel_y), axis=0)

    grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                               scoring=make_scorer(recall_score, average='macro'),
                               n_jobs=-1, cv=split, refit=True, verbose=1,
                               return_train_score=False)

    # Candidates are fitted on train and scored on devel; because refit=True
    # the best configuration is then refitted on train+devel.
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # fit clone of best estimator on train again for devel predictions
    # (best_estimator itself has already seen the devel data)
    estimator = clone(best_estimator, safe=False)
    estimator.fit(train_X, train_y)
    _report('DEVEL', devel_y, estimator.predict(devel_X))

    pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test metrics come from the model refitted on train+devel
    _report('TEST', test_y, best_estimator.predict(test_X))


if __name__ == '__main__':
    main()