isaac-rnd · Sep 25, 2022
diff --git a/‎Snapshots/result.png
11.3 KB b/‎Snapshots/result.png
11.3 KB
diff --git a/‎model.ipynb
+240 b/‎model.ipynb
+240
diff --git a/‎speech-emotion-recognition-model.ipynb
+1-1 b/‎speech-emotion-recognition-model.ipynb
+1-1
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import soundfile # to read audio file\n",
+    "import numpy as np\n",
+    "import librosa # to extract speech features\n",
+    "import glob\n",
+    "import os\n",
+    "import pickle # to save model after training\n",
+    "from sklearn.model_selection import train_test_split # for splitting training and testing\n",
+    "from sklearn.neural_network import MLPClassifier # multi-layer perceptron model\n",
+    "from sklearn.metrics import accuracy_score # to measure how good we are"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_feature(file_name, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Extract a feature vector from audio file `file_name`.\n",
+    "\n",
+    "    Each enabled feature is averaged over its frames and the averages are\n",
+    "    concatenated in this order: mfcc, chroma, mel, contrast, tonnetz.\n",
+    "        Features supported (keyword flags, all disabled by default):\n",
+    "            - MFCC (mfcc)\n",
+    "            - Chroma (chroma)\n",
+    "            - MEL Spectrogram Frequency (mel)\n",
+    "            - Contrast (contrast)\n",
+    "            - Tonnetz (tonnetz)\n",
+    "        e.g:\n",
+    "        `features = extract_feature(path, mel=True, mfcc=True)`\n",
+    "    Returns a numpy array; it is empty when no feature flag is set.\n",
+    "    \"\"\"\n",
+    "    use_mfcc = kwargs.get(\"mfcc\")\n",
+    "    use_chroma = kwargs.get(\"chroma\")\n",
+    "    use_mel = kwargs.get(\"mel\")\n",
+    "    use_contrast = kwargs.get(\"contrast\")\n",
+    "    use_tonnetz = kwargs.get(\"tonnetz\")\n",
+    "    with soundfile.SoundFile(file_name) as sound_file:\n",
+    "        X = sound_file.read(dtype=\"float32\")\n",
+    "        sample_rate = sound_file.samplerate\n",
+    "    # multi-channel files are read as a 2-D (frames, channels) array;\n",
+    "    # average the channels so the features are always computed on mono\n",
+    "    if X.ndim > 1:\n",
+    "        X = X.mean(axis=1)\n",
+    "    if use_chroma or use_contrast:\n",
+    "        # magnitude spectrogram shared by chroma and spectral contrast\n",
+    "        stft = np.abs(librosa.stft(X))\n",
+    "    feature_means = []\n",
+    "    if use_mfcc:\n",
+    "        feature_means.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))\n",
+    "    if use_chroma:\n",
+    "        feature_means.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))\n",
+    "    if use_mel:\n",
+    "        # the signal must be passed as y=...: newer librosa releases\n",
+    "        # make this argument keyword-only\n",
+    "        feature_means.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))\n",
+    "    if use_contrast:\n",
+    "        feature_means.append(np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0))\n",
+    "    if use_tonnetz:\n",
+    "        feature_means.append(np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0))\n",
+    "    # start from an empty array so the result is an array even with no flags\n",
+    "    return np.hstack([np.array([])] + feature_means)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RAVDESS emotion codes (\"01\" .. \"08\") mapped to their labels;\n",
+    "# load_data() reads the code from the third dash-separated field of a file name\n",
+    "int2emotion = {\n",
+    "    f\"{code:02d}\": name\n",
+    "    for code, name in enumerate(\n",
+    "        (\"neutral\", \"calm\", \"happy\", \"sad\", \"angry\", \"fearful\", \"disgust\", \"surprised\"),\n",
+    "        start=1,\n",
+    "    )\n",
+    "}\n",
+    "\n",
+    "# only samples with these labels are loaded ( feel free to tune this on your need )\n",
+    "AVAILABLE_EMOTIONS = {\"angry\", \"sad\", \"neutral\", \"happy\"}\n",
+    "\n",
+    "def load_data(test_size=0.2):\n",
+    "    \"\"\"\n",
+    "    Load the audio files under data/Actor_*/ and split them for training.\n",
+    "\n",
+    "    Only files whose emotion is in AVAILABLE_EMOTIONS are kept.\n",
+    "    `test_size` is forwarded to train_test_split().\n",
+    "    Returns X_train, X_test, y_train, y_test.\n",
+    "    Raises ValueError when no usable audio file is found.\n",
+    "    \"\"\"\n",
+    "    X, y = [], []\n",
+    "    # glob yields paths in arbitrary order; sort them so the fixed\n",
+    "    # random_state below always produces the same split\n",
+    "    for file in sorted(glob.glob(\"data/Actor_*/*.wav\")):\n",
+    "        # get the base name of the audio file\n",
+    "        basename = os.path.basename(file)\n",
+    "        # the emotion code is the third dash-separated field of the name\n",
+    "        emotion = int2emotion[basename.split(\"-\")[2]]\n",
+    "        # we allow only AVAILABLE_EMOTIONS we set\n",
+    "        if emotion not in AVAILABLE_EMOTIONS:\n",
+    "            continue\n",
+    "        # extract speech features and add them to the data\n",
+    "        X.append(extract_feature(file, mfcc=True, chroma=True, mel=True))\n",
+    "        y.append(emotion)\n",
+    "    if not X:\n",
+    "        # fail early with an actionable message instead of an opaque\n",
+    "        # error raised from inside train_test_split()\n",
+    "        raise ValueError(\"no audio files for AVAILABLE_EMOTIONS found in data/Actor_*/*.wav\")\n",
+    "    # split the data to training and testing and return it\n",
+    "    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the RAVDESS train/test sets, holding out 25% of the samples for testing\n",
+    "(X_train, X_test,\n",
+    " y_train, y_test) = load_data(test_size=0.25)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# report the size of each split\n",
+    "print(\"[+] Number of training samples:\", len(X_train))\n",
+    "print(\"[+] Number of testing samples:\", len(X_test))\n",
+    "# length of one feature vector, as built by the extract_feature() function\n",
+    "print(\"[+] Number of features:\", X_train.shape[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hyper-parameters of the best model found by a grid search\n",
+    "model_params = dict(\n",
+    "    alpha=0.01,\n",
+    "    batch_size=256,\n",
+    "    epsilon=1e-08,\n",
+    "    hidden_layer_sizes=(300,),\n",
+    "    learning_rate='adaptive',\n",
+    "    max_iter=500,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the Multi Layer Perceptron classifier,\n",
+    "# configured with the best parameters found so far\n",
+    "model = MLPClassifier(**model_params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit the classifier on the training split\n",
+    "print(\"[*] Training the model...\")\n",
+    "model.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# classify the held-out test samples to measure how good we are\n",
+    "y_pred = model.predict(X_test)\n",
+    "\n",
+    "# share of test samples whose predicted emotion matches the true one\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "\n",
+    "print(\"Accuracy: {:.2f}%\".format(100 * accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# now we save the model\n",
+    "# make result directory if doesn't exist yet; exist_ok replaces the\n",
+    "# separate isdir() check followed by mkdir()\n",
+    "os.makedirs(\"result\", exist_ok=True)\n",
+    "\n",
+    "# the context manager closes the file even if pickling fails,\n",
+    "# so the handle is never leaked\n",
+    "with open(\"result/mlp_classifier.model\", \"wb\") as model_file:\n",
+    "    pickle.dump(model, model_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.4 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "58837b1b657ea91009af8409fc244ae3b5ccf93ea980d6fb6b80adc5f697f4cc"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}