Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ The database consists of text and audio which is acquired when uttering scripts

There are 7 target emotions: 1) joy, 2) neutral, 3) anxiety, 4) embarrassment, 5) hurt, 6) sadness, and 7) anger

## Dependencies
![image](https://github.com/user-attachments/assets/37dbb8a7-2900-4a26-b6c3-e5a9c3e5d537)

# 2. Dependencies
* torch
* pandas
* numpy
Expand All @@ -23,15 +25,15 @@ pip install -r requirements.txt

Make sure your deep learning setup (e.g. the CUDA and PyTorch versions) is compatible with your local environment.

## Usage
# 3. Usage

Train and evaluate the model by executing:

```
python train.py --dataset IITP-SMED --cuda_id 0
python train.py --dataset IITP-SMED-STT --cuda_id 0
```

Available --dataset arguments must be one of [IITP-SMED, IITP-SMED-STT, AIHUB-SER]
The --dataset argument must be one of [IITP-SMED-ORIGIN, IITP-SMED-STT, IITP-SMED-AUDIO, IITP-SMED-ORIGIN-TEXT, IITP-SMED-STT-TEXT]

You can choose a single GPU; cuda_id is the index of the GPU among the available devices.

Expand Down
146 changes: 136 additions & 10 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
IITP-SMED: # IITP Senior Multimodal Emotion Dataset (SMED)
IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) : Origin text embedding + audio feature

seed: 2024

# args for modality
modality: 'at'

# args for text
text: 'origin'

# args for relative path
data_load_path: 'dataset/IITP-SMED.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
Expand Down Expand Up @@ -47,17 +54,24 @@ IITP-SMED: # IITP Senior Multimodal Emotion Dataset (SMED)
patience: 10 # patience for early stopping method


IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-to-text method
IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) : STT text embedding + audio feature

seed: 2024

# args for modality
modality: 'at'

# args for text
text: 'stt'

# args for relative path
data_load_path: 'dataset/IITP-SMED-STT.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-STT/'
log_save_path: 'log/IITP-SMED/'

# args for metadata
n_subjects: 24
Expand All @@ -67,7 +81,7 @@ IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-t
n_features: 856
n_samples: 1512
n_folds: 4
n_times_draw: 30
n_times_draw: 3

# args for running algorithm
k_neighbor: 9
Expand Down Expand Up @@ -95,13 +109,70 @@ IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-t

patience: 10 # patience for early stopping method

IITP-SMED-AUDIO: # IITP Senior Multimodal Emotion Dataset (SMED) : audio feature

seed: 2024

# args for modality
modality: 'a'

# args for relative path
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-ORIGIN/' # NOTE(review): this is the AUDIO config but logs go to the ORIGIN directory — looks like copy-paste; confirm the intended path

# args for metadata
n_subjects: 24
n_trials: 63
n_audio_features: 88
n_features: 88
n_samples: 1512
n_folds: 4
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
timestep: 1

gcn_hid_channels: 88
gcn_out_channels: 88
proj_hid_channels: 32
out_channels: 7

learning_rate: 0.005
weight_decay: 0.0001
cl_coefficient: 0.01 # contrastive loss coefficient
epochs: 3000

ptau: 0.7 # temperature hyperparameter

# probabilities for augmentation
pf1: 0.2
pf2: 0.3
pe1: 0.2
pe2: 0.3

pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method

IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-to-text method
IITP-SMED-ORIGIN-TEXT: # IITP Senior Multimodal Emotion Dataset (SMED) : Origin text embedding

seed: 2024

# args for modality
modality: 't'

# args for text
text: 'origin'

# args for relative path
data_load_path: 'dataset/IITP-SMED-ORIGIN.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
Expand All @@ -113,10 +184,10 @@ IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speec
n_trials: 63
n_audio_features: 88
n_text_features: 768
n_features: 856
n_features: 768
n_samples: 1512
n_folds: 4
n_times_draw: 30
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
Expand All @@ -143,3 +214,58 @@ IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speec
pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method

IITP-SMED-STT-TEXT: # IITP Senior Multimodal Emotion Dataset (SMED) : STT text embedding

seed: 2024

# args for modality
modality: 't'

# args for text
text: 'stt'

# args for relative path
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-STT/'

# args for metadata
n_subjects: 24
n_trials: 63
n_audio_features: 88
n_text_features: 768
n_features: 768
n_samples: 1512
n_folds: 4
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
timestep: 1

gcn_hid_channels: 256
gcn_out_channels: 128
proj_hid_channels: 32
out_channels: 7

learning_rate: 0.005
weight_decay: 0.0001
cl_coefficient: 0.01 # contrastive loss coefficient
epochs: 3000

ptau: 0.7 # temperature hyperparameter

# probabilities for augmentation
pf1: 0.2
pf2: 0.3
pe1: 0.2
pe2: 0.3

pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method
148 changes: 148 additions & 0 deletions dataloader/dataembedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
import opensmile
import torch
import pandas as pd
import numpy as np
from kobert_tokenizer import get_tokenizer
from transformers import BertModel
from dataloader.dataloader import AudioTextDataset, load_dataset
from tqdm import tqdm
import speech_recognition as sr
import subprocess
import shutil

class DataProcessor:
    """Extract audio (openSMILE eGeMAPSv02) and text (KoBERT) features from
    an emotion dataset and combine them into a single DataFrame.

    Parameters
    ----------
    dataset : indexable of (audio_path, text, emotion) triples
        Each item yields the audio file path, the transcript text, and the
        emotion label.
    args : namespace
        Must provide ``modality`` ('a', 't', or 'at'); when text features are
        requested, must also provide ``text`` ('origin' or 'stt').
    """

    def __init__(self, dataset, args):
        self.dataset = dataset
        self.args = args
        # openSMILE extractor: eGeMAPSv02 functionals -> 88 features per file
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals
        )
        self.audio_features = []  # rows of [filename, emotion, f1..f88]
        self.text_features = []   # rows of [filename, emotion, t1..t768]

    def extract_audio_features(self):
        """Populate ``self.audio_features`` with one row per dataset item.

        Missing or unreadable files get a zero vector so that row counts stay
        aligned with the dataset.
        """
        for idx in tqdm(range(len(self.dataset)), desc="Extracting Audio Features"):
            audio_path, _, emotion = self.dataset[idx]
            filename = os.path.basename(audio_path).split('.')[0]
            try:
                if os.path.exists(audio_path):
                    y = self.smile.process_file(audio_path)
                    audio_features = y.to_numpy().flatten()
                    self.audio_features.append([filename, emotion] + audio_features.tolist())
                else:
                    print(f"File not found: {audio_path}")
                    # eGeMAPSv02 functionals yield 88 features
                    self.audio_features.append([filename, emotion] + [0.0] * 88)
            except Exception as e:
                # was: "(unknown)" — report which file actually failed
                print(f"Error processing file {audio_path}: {e}")
                self.audio_features.append([filename, emotion] + [0.0] * 88)

    def convert_to_pcm_wav(self, input_file, output_file):
        """Convert ``input_file`` to 16-bit PCM WAV at ``output_file`` via ffmpeg.

        speech_recognition requires PCM WAV input, hence the conversion.
        Failures are logged (best-effort), not raised.
        """
        try:
            output_dir = os.path.dirname(output_file)
            if output_dir:
                # exist_ok avoids the check-then-create race of the original
                os.makedirs(output_dir, exist_ok=True)

            subprocess.run(
                ['ffmpeg', '-i', input_file, '-acodec', 'pcm_s16le', output_file],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
        except subprocess.CalledProcessError as e:
            print(f'오류 발생: {e.stderr.decode()}')

    def STT(self):
        """Run Google Web Speech API speech-to-text over every dataset item.

        Returns
        -------
        pandas.DataFrame
            Columns ['file', 'text']; items that fail recognition get a
            single-space placeholder so row alignment is preserved.
        """
        r = sr.Recognizer()
        # Other available backends:
        #   recognize_google_cloud() : Google Cloud Speech API
        #   recognize_bing()         : Microsoft Bing Speech API
        #   recognize_houndify()     : SoundHound Houndify API
        #   recognize_ibm()          : IBM Speech to Text API
        #   recognize_wit()          : Wit.ai API
        #   recognize_sphinx()       : CMU Sphinx (works offline)

        # Clear the directory holding the temporary PCM WAV conversions.
        # ignore_errors: the directory may not exist on the first run
        # (the unguarded rmtree crashed in that case).
        shutil.rmtree('dataset/converted', ignore_errors=True)

        rows = []
        for idx in tqdm(range(len(self.dataset)), desc="STT"):
            audio_path, _, emotion = self.dataset[idx]
            filename = os.path.basename(audio_path).split('.')[0]
            converted_audio_path = os.path.join('dataset/converted', os.path.basename(audio_path))
            self.convert_to_pcm_wav(audio_path, converted_audio_path)

            try:
                with sr.AudioFile(converted_audio_path) as source:
                    audio = r.record(source)
                stt_result = r.recognize_google(audio_data=audio, language='ko-KR')
            except Exception as e:
                # best-effort: keep row alignment with a placeholder; the
                # original bare `except` swallowed even KeyboardInterrupt
                print(f"STT failed for {converted_audio_path}: {e}")
                stt_result = ' '
            rows.append({'file': filename, 'text': stt_result})

        # build once instead of pd.concat per row (quadratic in the original)
        return pd.DataFrame(rows, columns=['file', 'text'])

    def extract_text_embeddings(self):
        """Populate ``self.text_features`` with 768-d KoBERT pooled embeddings.

        Uses the dataset transcript, or STT output when ``args.text == 'stt'``.
        NaN transcripts become the empty string.
        """
        tokenizer = get_tokenizer()
        model = BertModel.from_pretrained('skt/kobert-base-v1')
        stt_text = self.STT() if self.args.text == 'stt' else None

        for idx in tqdm(range(len(self.dataset)), desc="Extracting Text Features"):
            audio_path, text, emotion = self.dataset[idx]

            if stt_text is not None:
                text = stt_text.iloc[idx]['text']

            filename = os.path.basename(audio_path).split('.')[0]

            if isinstance(text, float) and pd.isna(text):
                text = ''  # NaN read from the CSV -> empty transcript

            inputs = tokenizer.batch_encode_plus([text], padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                outputs = model(**inputs)
            # outputs[1] is the pooled [CLS] representation, shape (1, 768)
            pooled = outputs[1].squeeze(0).numpy().astype("float32")
            self.text_features.append([filename, emotion] + pooled.tolist())

    def combine_features(self):
        """Merge audio and text feature rows on (filename, emotion).

        When only one modality was extracted, return that modality's
        DataFrame unchanged.
        """
        audio_df = pd.DataFrame(
            self.audio_features,
            columns=["filename", "emotion"] + [f"audio_feature{i}" for i in range(1, 89)]
        )
        text_df = pd.DataFrame(
            self.text_features,
            columns=["filename", "emotion"] + [f"text_feature{i}" for i in range(1, 769)]
        )
        if text_df.empty:
            return audio_df
        if audio_df.empty:
            return text_df
        return pd.merge(audio_df, text_df, on=['filename', 'emotion'])

    def process(self, output_csv='combined_features.csv'):
        """Extract the features selected by ``args.modality`` and return the
        combined DataFrame.

        ``output_csv`` is kept for interface compatibility; this method does
        not write the CSV itself (callers persist the result).
        """
        if 'a' in self.args.modality:
            self.extract_audio_features()
        if 't' in self.args.modality:
            self.extract_text_embeddings()
        return self.combine_features()

'''
# Example usage:
def main():
file_path = 'Origin_Text.csv'
audio_dir = '/media/neuroai/5E1227AF12278B5B/Seminor_Emotion_Data_Preprocessing_Voice'

dataset = load_dataset(file_path, audio_dir)
processor = DataProcessor(dataset)
combined_features = processor.process()

print(combined_features.head())


if __name__ == "__main__":
main()
'''
Loading