
Commit 128d0a4
Committed Sep 25, 2022
new dataset
1 parent e1861c1 · commit 128d0a4

File tree

3 files changed: +241 −1 lines changed


Snapshots/result.png (11.3 KB)

model.ipynb (+240 lines)

@@ -0,0 +1,240 @@
In [1]:
import soundfile  # to read audio files
import numpy as np
import librosa  # to extract speech features
import glob
import os
import pickle  # to save the model after training
from sklearn.model_selection import train_test_split  # for splitting training and testing data
from sklearn.neural_network import MLPClassifier  # multi-layer perceptron model
from sklearn.metrics import accuracy_score  # to measure how good we are
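A note on dependencies (an assumption; the notebook itself does not pin them): the imports above require the librosa, soundfile, scikit-learn, and numpy packages, installable with e.g. pip install librosa soundfile scikit-learn numpy.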
In [2]:
def extract_feature(file_name, **kwargs):
    """
    Extract features from audio file `file_name`.
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
        - Contrast (contrast)
        - Tonnetz (tonnetz)
    e.g.:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            # keyword argument y= is required by recent librosa versions
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
    return result
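With mfcc, chroma, and mel enabled (the combination load_data uses below), the returned vector concatenates 40 MFCCs, 12 chroma bins, and 128 mel bands under librosa's defaults, i.e. 180 features per file. A minimal usage sketch; "test.wav" is a hypothetical path:

# hypothetical usage; "test.wav" is a placeholder path
features = extract_feature("test.wav", mfcc=True, chroma=True, mel=True)
print(features.shape)  # expected (180,): 40 MFCC + 12 chroma + 128 mel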
In [5]:
# all emotions in the RAVDESS dataset
int2emotion = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# we allow only these emotions (feel free to tune this to your needs)
AVAILABLE_EMOTIONS = {
    "angry",
    "sad",
    "neutral",
    "happy"
}

def load_data(test_size=0.2):
    X, y = [], []
    for file in glob.glob("data/Actor_*/*.wav"):
        # get the base name of the audio file
        basename = os.path.basename(file)
        # get the emotion label
        emotion = int2emotion[basename.split("-")[2]]
        # we allow only the emotions in AVAILABLE_EMOTIONS
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to the dataset
        X.append(features)
        y.append(emotion)
    # split the data into training and testing sets and return them
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)
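The basename.split("-")[2] lookup relies on the RAVDESS filename convention: seven two-digit fields encoding modality, vocal channel, emotion, intensity, statement, repetition, and actor. A quick sketch with an example filename:

# RAVDESS filenames: modality-vocalchannel-emotion-intensity-statement-repetition-actor
name = "03-01-05-01-02-01-12.wav"  # example filename
emotion_code = name.split("-")[2]  # third field -> "05"
print(int2emotion[emotion_code])   # "angry"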
In [14]:
# load the RAVDESS dataset: 75% training, 25% testing
X_train, X_test, y_train, y_test = load_data(test_size=0.25)
In [15]:
# print some details
# number of samples in the training data
print("[+] Number of training samples:", X_train.shape[0])
# number of samples in the testing data
print("[+] Number of testing samples:", X_test.shape[0])
# number of features used: the length of the feature vector
# extracted by the extract_feature() function
print("[+] Number of features:", X_train.shape[1])
In [ ]:
# best model, determined by a grid search
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08,
    'hidden_layer_sizes': (300,),
    'learning_rate': 'adaptive',
    'max_iter': 500,
}
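The grid search that produced these values is not shown in the notebook; a minimal sketch of how such a search could look with scikit-learn's GridSearchCV (the parameter grid below is an illustrative assumption, not the actual search space):

from sklearn.model_selection import GridSearchCV

# hypothetical grid; the real search space is not recorded in the notebook
param_grid = {
    'alpha': [0.001, 0.01, 0.1],
    'batch_size': [128, 256],
    'hidden_layer_sizes': [(200,), (300,), (400,)],
    'learning_rate': ['constant', 'adaptive'],
}
grid = GridSearchCV(MLPClassifier(max_iter=500), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)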
In [ ]:
# initialize the Multi Layer Perceptron classifier
# with the best parameters (so far)
model = MLPClassifier(**model_params)
In [ ]:
# train the model
print("[*] Training the model...")
model.fit(X_train, y_train)
In [ ]:
# predict on the 25% test split to measure how good we are
y_pred = model.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
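Accuracy alone can hide per-class behavior across the four emotions; a short addition (not in the original notebook) that prints a per-emotion breakdown with scikit-learn:

from sklearn.metrics import classification_report, confusion_matrix

# per-emotion precision/recall/F1 and the raw confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, labels=sorted(AVAILABLE_EMOTIONS)))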
In [ ]:
# now we save the model
# make the result directory if it doesn't exist yet
if not os.path.isdir("result"):
    os.mkdir("result")

with open("result/mlp_classifier.model", "wb") as f:
    pickle.dump(model, f)
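A matching inference sketch for loading the pickled model later (assuming extract_feature is defined as above; "some_audio.wav" is a placeholder path):

# hypothetical inference sketch; "some_audio.wav" is a placeholder
with open("result/mlp_classifier.model", "rb") as f:
    loaded_model = pickle.load(f)

features = extract_feature("some_audio.wav", mfcc=True, chroma=True, mel=True)
print(loaded_model.predict([features])[0])  # e.g. "happy"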
Notebook metadata: kernel "Python 3.10.4 64-bit" (python3), language Python 3.10.4, nbformat 4.

speech-emotion-recognition-model.ipynb (+1 −1)
Large diffs are not rendered by default.
