From dbf7cd462e97a4e0ab61c8c90f426c0c99b76d78 Mon Sep 17 00:00:00 2001
From: em474re <Elien.Martens@UGent.be>
Date: Thu, 9 Sep 2021 11:41:11 +0200
Subject: [PATCH] update results svm_hand

---
 README.md                           | 38 ++++++++++++++++-------------
 src/extract_handcrafted_features.py |  4 ++-
 src/svm_hand.py                     | 20 ++++++++++++---
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 3488658..349f43b 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,25 @@
-# covid-19 speech diagnosis
-This repo contains the code created by Elien Martens during IAESTE internship summer 2021, Technical University Kosice (Slowakia).
+# Automated COVID-19 diagnosis
+This repo contains the code created by Elien Martens during IAESTE internship summer 2021, Technical University of Kosice (Slovakia). 
 
-## Dataset
+## Data
 
-The COVID-19 datasets can be obtained through The University of Cambridge. (see also compare.openaudio.eu, Interspeech Computational Paralinguistics Challenges 2021)
+The [Interspeech Computational Paralinguistics ChallengE (ComParE) 2021](http://www.compare.openaudio.eu/now/) proposes two challenges related to COVID-19 detection based on audio samples. Such samples represent speech and cough audio from both healthy and infected speakers. The COVID-19 Speech Sub-Challenge (CSS) offers 3.24 hours of audio recordings containing speech samples, while the COVID-19 Cough Sub-Challenge (CCS) provides 1.63 hours of cough samples.
+The COVID-19 datasets can be obtained through The University of Cambridge. 
 
 ## How it works
--   clone repo
--   add data (see Dataset section above), so that the structuring is the following:
-    CovidSpeechChallenge
-        -> raw_files
-            -> train
-            -> test
-            -> devel
-        -> labels
-        -> vgg_features
-        -> hand_features
-        -> vggish
-        ...
-- run .\run_experiments.sh
\ No newline at end of file
+-   clone repository
+-   add data (see [Data](#data) section), so that the structure is the following:
+```
+CovidSpeechChallenge
+  |-- dist/
+  |    |-- lab/
+  |    |-- wav/
+  |-- features/
+  |-- results/
+  |-- src/
+  |-- vggish/
+  |-- run_experiments.sh
+```
+- run .\run_experiments.sh
+
+## Acknowledgements
diff --git a/src/extract_handcrafted_features.py b/src/extract_handcrafted_features.py
index ccc20ed..f85d49c 100644
--- a/src/extract_handcrafted_features.py
+++ b/src/extract_handcrafted_features.py
@@ -197,6 +197,7 @@ if __name__ == "__main__":
     
     x_data = []
     y_label = []
+    names = []
 
     # extract features for all audio samples from correct subset
     for file in sorted([f for f in os.listdir(path) if dataset in f]):
@@ -220,4 +221,5 @@ if __name__ == "__main__":
     # save features and labels
     df = pd.read_csv('./dist/lab/' + dataset + '.csv', sep =',')
     np.save(os.path.join('./features/hand_features',"x_" + dataset + "_data.npy"), np.array(x_data))
-    np.save(os.path.join('./features/hand_features',"y_" + dataset + "_label.npy"), df.label)
\ No newline at end of file
+    np.save(os.path.join('./features/hand_features',"y_" + dataset + "_label.npy"), df.label)
+    np.save(os.path.join('./features/hand_features',dataset + "_names.npy"), df.filename)
diff --git a/src/svm_hand.py b/src/svm_hand.py
index b94abef..584ef1a 100644
--- a/src/svm_hand.py
+++ b/src/svm_hand.py
@@ -4,10 +4,10 @@ from sklearn.pipeline import Pipeline
 from sklearn.model_selection import PredefinedSplit, GridSearchCV
 from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import classification_report, confusion_matrix, recall_score, make_scorer, plot_confusion_matrix
-# import pandas as pd
+import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-
+import os
 
 RANDOM_SEED = 42
 
@@ -50,6 +50,14 @@ if __name__=='__main__':
         "./features/hand_features/y_train_label.npy", allow_pickle=True
     )
 
+    devel_names = np.load(
+        "./features/hand_features/devel_names.npy", allow_pickle=True
+    )
+
+    test_names = np.load(
+        "./features/hand_features/test_names.npy", allow_pickle=True
+    )
+
     num_train = train_X_hand.shape[0]
     num_devel = devel_X_hand.shape[0]
     split_indices = np.repeat([-1, 0], [num_train, num_devel])
@@ -83,6 +91,9 @@ if __name__=='__main__':
     # optional write grid_search to csv file
     # pd.DataFrame(grid_search.cv_results_).to_csv('grid_search.csv', index=False)
 
+    df_predictions = pd.DataFrame({'filename': devel_names.tolist(), 'prediction': preds.tolist()})
+    df_predictions.to_csv(os.path.join('./results/svm_hand/', 'devel.predictions.csv'), index=False)
+
     # test results
     print('TEST')
     preds = best_estimator.predict(test_X_hand)
@@ -90,8 +101,11 @@ if __name__=='__main__':
     cm = confusion_matrix(test_y, preds)
     print(f'UAR: {uar}\n{classification_report(test_y, preds)}\n\nConfusion Matrix:\n\n{cm}') 
 
+    df_predictions = pd.DataFrame({'filename': test_names.tolist(), 'prediction': preds.tolist()})
+    df_predictions.to_csv(os.path.join('./results/svm_hand/', 'test.predictions.csv'), index=False)
+
     fig = plt.figure()
     plot_confusion_matrix(best_estimator,X= test_X_hand, y_true=test_y,cmap=plt.cm.Blues,display_labels=['Negative','Positive'],normalize='true')
     plt.ylabel('True Label')
     plt.xlabel('Predicated Label')
-    plt.savefig('cm_svm_hand.jpg')
\ No newline at end of file
+    plt.savefig('./results/svm_hand/cm_svm_hand.jpg')
\ No newline at end of file