CovidSpeechChallenge_2021/svm.py

from sklearn.svm import LinearSVC
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer
from joblib import Parallel, delayed
import pandas as pd
import scipy
import os, yaml
import json
import sys
import arff
import numpy as np
from tqdm import tqdm
from glob import glob

# Seed passed to the LinearSVC candidates listed in GRID.
RANDOM_SEED = 42

# Hyper-parameter search space handed to GridSearchCV as ``param_grid``.
# Keys follow the Pipeline convention: a bare step name swaps the whole
# step, ``<step>__<param>`` sets a parameter on that step.
GRID = [
    dict(
        # Try the pipeline both with standardisation and without any scaling.
        scaler=[StandardScaler(), None],
        estimator=[LinearSVC(random_state=RANDOM_SEED)],
        estimator__loss=['squared_hinge'],
        # Five log-spaced values of C: 1e-1, 1e-2, 1e-3, 1e-4, 1e-5.
        estimator__C=np.logspace(start=-1, stop=-5, num=5),
        estimator__class_weight=['balanced', None],
        estimator__max_iter=[100_000],
    ),
]

# Two-step template pipeline; both steps are overridden by the entries in
# GRID during the search.
PIPELINE = Pipeline(steps=[('scaler', None), ('estimator', LinearSVC())])

if __name__ == '__main__':
    # Script entry point: tune a linear SVM on the concatenation of
    # hand-crafted and VGG features, then report UAR (macro-averaged
    # recall), a classification report and a confusion matrix for the
    # devel and test partitions.

    # Feature directories, resolved relative to the current working
    # directory. os.path.join is used instead of a hard-coded '\\' so the
    # script also finds the files on non-Windows systems.
    vgg_dir = 'vgg_features'
    hand_dir = 'hand_features'
    partitions = ('train', 'devel', 'test')

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
    # objects, which is only safe for trusted files. Kept because the
    # stored arrays may be object arrays -- confirm and drop if possible.
    vgg_X = {
        part: np.load(
            os.path.join(vgg_dir, f'x_{part}_data_vgg.npy'), allow_pickle=True
        )
        for part in partitions
    }
    hand_X = {
        part: np.load(
            os.path.join(hand_dir, f'x_{part}_data.npy'), allow_pickle=True
        )
        for part in partitions
    }
    # Labels are read from the VGG feature directory and are used for the
    # fused features as well (presumably both feature sets share the same
    # sample order -- verify against the extraction scripts).
    labels = {
        part: np.load(
            os.path.join(vgg_dir, f'y_{part}_label_vgg.npy'), allow_pickle=True
        )
        for part in partitions
    }
    train_y, devel_y, test_y = (labels[part] for part in partitions)

    # Single predefined fold: -1 keeps the train rows out of every
    # validation set, 0 puts the devel rows into validation fold 0.
    num_train = vgg_X['train'].shape[0]
    num_devel = vgg_X['devel'].shape[0]
    split = PredefinedSplit(np.repeat([-1, 0], [num_train, num_devel]))

    # Fuse both feature sets column-wise per partition. np.squeeze drops
    # length-1 axes from the VGG arrays (assumes they carry an extra
    # singleton axis, e.g. (n, 1, d) -- TODO confirm).
    fused = {
        part: np.concatenate(
            (hand_X[part], np.squeeze(vgg_X[part])),
            axis=1,
        )
        for part in partitions
    }
    train_X, devel_X, test_X = (fused[part] for part in partitions)

    # Train rows first, devel rows second -- must match the order used to
    # build the PredefinedSplit indices above.
    X = np.concatenate((train_X, devel_X), axis=0)
    y = np.concatenate((train_y, devel_y), axis=0)

    grid_search = GridSearchCV(
        estimator=PIPELINE,
        param_grid=GRID,
        scoring=make_scorer(recall_score, average='macro'),
        n_jobs=-1,
        cv=split,
        refit=True,
        verbose=1,
        return_train_score=False,
    )

    # Each candidate is fitted on train and scored on devel; with
    # refit=True the best configuration is then refitted on train+devel.
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    # best_estimator has already seen devel, so fit an unfitted clone on
    # train only to obtain honest devel predictions.
    devel_estimator = clone(best_estimator, safe=False)
    devel_estimator.fit(train_X, train_y)
    devel_preds = devel_estimator.predict(devel_X)

    # devel metrics
    print('DEVEL')
    uar = recall_score(devel_y, devel_preds, average='macro')
    cm = confusion_matrix(devel_y, devel_preds)
    print(f'UAR: {uar}\n{classification_report(devel_y, devel_preds)}\n\nConfusion Matrix:\n\n{cm}')

    # Persist the full grid-search table for later inspection.
    pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)

    # test metrics, using the model refitted on train+devel
    print('TEST')
    test_preds = best_estimator.predict(test_X)
    uar = recall_score(test_y, test_preds, average='macro')
    cm = confusion_matrix(test_y, test_preds)
    print(f'UAR: {uar}\n{classification_report(test_y, test_preds)}\n\nConfusion Matrix:\n\n{cm}')
SVM model for handcrafted and VGGish features 2021-08-19 08:39:47 +00:00			`from sklearn.svm import LinearSVC`
			`from sklearn.base import clone`
			`from sklearn.pipeline import Pipeline`
			`from sklearn.utils import resample`
			`from sklearn.model_selection import PredefinedSplit, GridSearchCV`
			`from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler`
			`from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer`
			`from joblib import Parallel, delayed`
			`import pandas as pd`
			`import scipy`
			`import os, yaml`
			`import json`
			`import sys`
			`import arff`
			`import numpy as np`
			`from tqdm import tqdm`
			`from glob import glob`

			`RANDOM_SEED = 42`

			`GRID = [`
			`{'scaler': [StandardScaler(), None],`
			`'estimator': [LinearSVC(random_state=RANDOM_SEED)],`
			`'estimator__loss': ['squared_hinge'],`
			`'estimator__C': np.logspace(-1, -5, num=5),`
			`'estimator__class_weight': ['balanced', None],`
			`'estimator__max_iter': [100000]`
			`}`
			`]`

			`PIPELINE = Pipeline([('scaler', None), ('estimator', LinearSVC())])`

			`if __name__=='__main__':`

			`# load features and labels`
			`devel_X_vgg = np.load(`
			`"vgg_features\\x_devel_data_vgg.npy", allow_pickle=True`
			`)`

			`test_X_vgg = np.load(`
			`"vgg_features\\x_test_data_vgg.npy", allow_pickle=True`
			`)`

			`train_X_vgg = np.load(`
			`"vgg_features\\x_train_data_vgg.npy", allow_pickle=True`
			`)`

			`devel_X_hand = np.load(`
			`"hand_features\\x_devel_data.npy", allow_pickle=True`
			`)`

			`test_X_hand = np.load(`
			`"hand_features\\x_test_data.npy", allow_pickle=True`
			`)`

			`train_X_hand = np.load(`
			`"hand_features\\x_train_data.npy", allow_pickle=True`
			`)`

			`devel_y = np.load(`
			`"vgg_features\\y_devel_label_vgg.npy", allow_pickle=True`
			`)`

			`test_y = np.load(`
			`"vgg_features\\y_test_label_vgg.npy", allow_pickle=True`
			`)`

			`train_y = np.load(`
			`"vgg_features\\y_train_label_vgg.npy", allow_pickle=True`
			`)`

			`num_train = train_X_vgg.shape[0]`
			`num_devel = devel_X_vgg.shape[0]`
			`split_indices = np.repeat([-1, 0], [num_train, num_devel])`
			`split = PredefinedSplit(split_indices)`

			`train_X_vgg = np.squeeze(train_X_vgg)`
			`devel_X_vgg = np.squeeze(devel_X_vgg)`
			`test_X_vgg = np.squeeze(test_X_vgg)`

			`devel_X = np.concatenate(`
			`(`
			`devel_X_hand,`
			`devel_X_vgg`
			`),`
			`axis=1,`
			`)`

			`test_X = np.concatenate(`
			`(`
			`test_X_hand,`
			`test_X_vgg`
			`),`
			`axis=1,`
			`)`

			`train_X = np.concatenate(`
			`(`
			`train_X_hand,`
			`train_X_vgg`
			`),`
			`axis=1,`
			`)`

			`X = np.append(train_X, devel_X, axis=0)`
			`y = np.append(train_y, devel_y, axis=0)`

			`grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,`
			`scoring=make_scorer(recall_score, average='macro'),`
			`n_jobs=-1, cv=split, refit=True, verbose=1,`
			`return_train_score=False)`

			`# fit on data. train -> devel first, then train+devel implicit`
			`grid_search.fit(X, y)`
			`best_estimator = grid_search.best_estimator_`

			`# fit clone of best estimator on train again for devel predictions`
			`estimator = clone(best_estimator, safe=False)`
			`estimator.fit(train_X, train_y)`
			`preds = estimator.predict(devel_X)`

			`metrics = {'dev': {}, 'test': {}}`

			`# devel metrics`
			`print('DEVEL')`
			`uar = recall_score(devel_y, preds, average='macro')`
			`cm = confusion_matrix(devel_y, preds)`
			`print(f'UAR: {uar}\n{classification_report(devel_y, preds)}\n\nConfusion Matrix:\n\n{cm}')`

			`pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)`

			`# test metrics`
			`print('TEST')`
			`preds = best_estimator.predict(test_X)`
			`uar = recall_score(test_y, preds, average='macro')`
			`cm = confusion_matrix(test_y, preds)`
			`print(f'UAR: {uar}\n{classification_report(test_y, preds)}\n\nConfusion Matrix:\n\n{cm}')`