Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ The database consists of text and audio which is acquired when uttering scripts

There are 7 target emotions: 1) joy, 2) neutral, 3) anxiety, 4) embarrassment, 5) hurt, 6) sadness, and 7) anger

## Dependencies
![image](https://github.com/user-attachments/assets/37dbb8a7-2900-4a26-b6c3-e5a9c3e5d537)

# 2. Dependencies
* torch
* pandas
* numpy
Expand All @@ -23,15 +25,15 @@ pip install -r requirements.txt

Make sure your deep learning setup (e.g. the CUDA and PyTorch versions) is compatible with your local environment.

## Usage
# 3. Usage

Train and evaluate the model by executing:

```
python train.py --dataset IITP-SMED --cuda_id 0
python train.py --dataset IITP-SMED-STT --cuda_id 0
```

Available --dataset arguments must be one of [IITP-SMED, IITP-SMED-STT, AIHUB-SER]
The --dataset argument must be one of [IITP-SMED-ORIGIN, IITP-SMED-STT, IITP-SMED-AUDIO, IITP-SMED-ORIGIN-TEXT, IITP-SMED-STT-TEXT]

You can choose a single GPU; cuda_id is the index of the GPU among the available devices.

Expand Down
146 changes: 136 additions & 10 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
IITP-SMED: # IITP Senior Multimodal Emotion Dataset (SMED)
IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) : Origin text embedding + audio feature

seed: 2024

# args for modality
modality: 'at'

# args for text
text: 'origin'

# args for relative path
data_load_path: 'dataset/IITP-SMED.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
Expand Down Expand Up @@ -47,17 +54,24 @@ IITP-SMED: # IITP Senior Multimodal Emotion Dataset (SMED)
patience: 10 # patience for early stopping method


IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-to-text method
IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) : STT text embedding + audio feature

seed: 2024

# args for modality
modality: 'at'

# args for text
text: 'stt'

# args for relative path
data_load_path: 'dataset/IITP-SMED-STT.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-STT/'
log_save_path: 'log/IITP-SMED/'

# args for metadata
n_subjects: 24
Expand All @@ -67,7 +81,7 @@ IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-t
n_features: 856
n_samples: 1512
n_folds: 4
n_times_draw: 30
n_times_draw: 3

# args for running algorithm
k_neighbor: 9
Expand Down Expand Up @@ -95,13 +109,70 @@ IITP-SMED-STT: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-t

patience: 10 # patience for early stopping method

IITP-SMED-AUDIO: # IITP Senior Multimodal Emotion Dataset (SMED) : audio feature

seed: 2024

# args for modality
modality: 'a'

# args for relative path
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-ORIGIN/' # NOTE(review): this is the AUDIO config but logs go to the ORIGIN directory — looks like copy-paste; confirm the intended path

# args for metadata
n_subjects: 24
n_trials: 63
n_audio_features: 88
n_features: 88
n_samples: 1512
n_folds: 4
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
timestep: 1

gcn_hid_channels: 88
gcn_out_channels: 88
proj_hid_channels: 32
out_channels: 7

learning_rate: 0.005
weight_decay: 0.0001
cl_coefficient: 0.01 # contrastive loss coefficient
epochs: 3000

ptau: 0.7 # temperature hyperparameter

# probabilities for augmentation
pf1: 0.2
pf2: 0.3
pe1: 0.2
pe2: 0.3

pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method

IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speech-to-text method
IITP-SMED-ORIGIN-TEXT: # IITP Senior Multimodal Emotion Dataset (SMED) : Origin text embedding

seed: 2024

# args for modality
modality: 't'

# args for text
text: 'origin'

# args for relative path
data_load_path: 'dataset/IITP-SMED-ORIGIN.csv'
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
Expand All @@ -113,10 +184,10 @@ IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speec
n_trials: 63
n_audio_features: 88
n_text_features: 768
n_features: 856
n_features: 768
n_samples: 1512
n_folds: 4
n_times_draw: 30
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
Expand All @@ -143,3 +214,58 @@ IITP-SMED-ORIGIN: # IITP Senior Multimodal Emotion Dataset (SMED) based on speec
pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method

IITP-SMED-STT-TEXT: # IITP Senior Multimodal Emotion Dataset (SMED) : STT text embedding

seed: 2024

# args for modality
modality: 't'

# args for text
text: 'stt'

# args for relative path
data_load_path: 'dataset/Origin_Text.csv'
audio_dir: 'dataset/Seminor_Emotion_Data_Preprocessing_Voice'
structure_save_path: 'save/figures/structure/'
feature_save_path: 'save/figures/feature/'
model_save_path: 'save/model/'
tensor_save_path: 'save/tensor/'
log_save_path: 'log/IITP-SMED-STT/'

# args for metadata
n_subjects: 24
n_trials: 63
n_audio_features: 88
n_text_features: 768
n_features: 768
n_samples: 1512
n_folds: 4
n_times_draw: 1

# args for running algorithm
k_neighbor: 9
timestep: 1

gcn_hid_channels: 256
gcn_out_channels: 128
proj_hid_channels: 32
out_channels: 7

learning_rate: 0.005
weight_decay: 0.0001
cl_coefficient: 0.01 # contrastive loss coefficient
epochs: 3000

ptau: 0.7 # temperature hyperparameter

# probabilities for augmentation
pf1: 0.2
pf2: 0.3
pe1: 0.2
pe2: 0.3

pt: 0.7 # probability threshold

patience: 10 # patience for early stopping method
148 changes: 148 additions & 0 deletions dataloader/dataembedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
import opensmile
import torch
import pandas as pd
import numpy as np
from kobert_tokenizer import get_tokenizer
from transformers import BertModel
from dataloader.dataloader import AudioTextDataset, load_dataset
from tqdm import tqdm
import speech_recognition as sr
import subprocess
import shutil

class DataProcessor:
    """Extract audio (openSMILE eGeMAPSv02) and text (KoBERT) features from
    an emotion dataset and combine them into a single DataFrame.

    Parameters
    ----------
    dataset : indexable of (audio_path, text, emotion) triples
        Each item yields the audio file path, the transcript text, and the
        emotion label.
    args : namespace
        Must provide ``modality`` ('a', 't', or 'at'); when text features are
        requested, must also provide ``text`` ('origin' or 'stt').
    """

    def __init__(self, dataset, args):
        self.dataset = dataset
        self.args = args
        # openSMILE extractor: eGeMAPSv02 functionals -> 88 features per file
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals
        )
        self.audio_features = []  # rows of [filename, emotion, f1..f88]
        self.text_features = []   # rows of [filename, emotion, t1..t768]

    def extract_audio_features(self):
        """Populate ``self.audio_features`` with one row per dataset item.

        Missing or unreadable files get a zero vector so that row counts stay
        aligned with the dataset.
        """
        for idx in tqdm(range(len(self.dataset)), desc="Extracting Audio Features"):
            audio_path, _, emotion = self.dataset[idx]
            filename = os.path.basename(audio_path).split('.')[0]
            try:
                if os.path.exists(audio_path):
                    y = self.smile.process_file(audio_path)
                    audio_features = y.to_numpy().flatten()
                    self.audio_features.append([filename, emotion] + audio_features.tolist())
                else:
                    print(f"File not found: {audio_path}")
                    # eGeMAPSv02 functionals yield 88 features
                    self.audio_features.append([filename, emotion] + [0.0] * 88)
            except Exception as e:
                # was: "(unknown)" — report which file actually failed
                print(f"Error processing file {audio_path}: {e}")
                self.audio_features.append([filename, emotion] + [0.0] * 88)

    def convert_to_pcm_wav(self, input_file, output_file):
        """Convert ``input_file`` to 16-bit PCM WAV at ``output_file`` via ffmpeg.

        speech_recognition requires PCM WAV input, hence the conversion.
        Failures are logged (best-effort), not raised.
        """
        try:
            output_dir = os.path.dirname(output_file)
            if output_dir:
                # exist_ok avoids the check-then-create race of the original
                os.makedirs(output_dir, exist_ok=True)

            subprocess.run(
                ['ffmpeg', '-i', input_file, '-acodec', 'pcm_s16le', output_file],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
        except subprocess.CalledProcessError as e:
            print(f'오류 발생: {e.stderr.decode()}')

    def STT(self):
        """Run Google Web Speech API speech-to-text over every dataset item.

        Returns
        -------
        pandas.DataFrame
            Columns ['file', 'text']; items that fail recognition get a
            single-space placeholder so row alignment is preserved.
        """
        r = sr.Recognizer()
        # Other available backends:
        #   recognize_google_cloud() : Google Cloud Speech API
        #   recognize_bing()         : Microsoft Bing Speech API
        #   recognize_houndify()     : SoundHound Houndify API
        #   recognize_ibm()          : IBM Speech to Text API
        #   recognize_wit()          : Wit.ai API
        #   recognize_sphinx()       : CMU Sphinx (works offline)

        # Clear the directory holding the temporary PCM WAV conversions.
        # ignore_errors: the directory may not exist on the first run
        # (the unguarded rmtree crashed in that case).
        shutil.rmtree('dataset/converted', ignore_errors=True)

        rows = []
        for idx in tqdm(range(len(self.dataset)), desc="STT"):
            audio_path, _, emotion = self.dataset[idx]
            filename = os.path.basename(audio_path).split('.')[0]
            converted_audio_path = os.path.join('dataset/converted', os.path.basename(audio_path))
            self.convert_to_pcm_wav(audio_path, converted_audio_path)

            try:
                with sr.AudioFile(converted_audio_path) as source:
                    audio = r.record(source)
                stt_result = r.recognize_google(audio_data=audio, language='ko-KR')
            except Exception as e:
                # best-effort: keep row alignment with a placeholder; the
                # original bare `except` swallowed even KeyboardInterrupt
                print(f"STT failed for {converted_audio_path}: {e}")
                stt_result = ' '
            rows.append({'file': filename, 'text': stt_result})

        # build once instead of pd.concat per row (quadratic in the original)
        return pd.DataFrame(rows, columns=['file', 'text'])

    def extract_text_embeddings(self):
        """Populate ``self.text_features`` with 768-d KoBERT pooled embeddings.

        Uses the dataset transcript, or STT output when ``args.text == 'stt'``.
        NaN transcripts become the empty string.
        """
        tokenizer = get_tokenizer()
        model = BertModel.from_pretrained('skt/kobert-base-v1')
        stt_text = self.STT() if self.args.text == 'stt' else None

        for idx in tqdm(range(len(self.dataset)), desc="Extracting Text Features"):
            audio_path, text, emotion = self.dataset[idx]

            if stt_text is not None:
                text = stt_text.iloc[idx]['text']

            filename = os.path.basename(audio_path).split('.')[0]

            if isinstance(text, float) and pd.isna(text):
                text = ''  # NaN read from the CSV -> empty transcript

            inputs = tokenizer.batch_encode_plus([text], padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                outputs = model(**inputs)
            # outputs[1] is the pooled [CLS] representation, shape (1, 768)
            pooled = outputs[1].squeeze(0).numpy().astype("float32")
            self.text_features.append([filename, emotion] + pooled.tolist())

    def combine_features(self):
        """Merge audio and text feature rows on (filename, emotion).

        When only one modality was extracted, return that modality's
        DataFrame unchanged.
        """
        audio_df = pd.DataFrame(
            self.audio_features,
            columns=["filename", "emotion"] + [f"audio_feature{i}" for i in range(1, 89)]
        )
        text_df = pd.DataFrame(
            self.text_features,
            columns=["filename", "emotion"] + [f"text_feature{i}" for i in range(1, 769)]
        )
        if text_df.empty:
            return audio_df
        if audio_df.empty:
            return text_df
        return pd.merge(audio_df, text_df, on=['filename', 'emotion'])

    def process(self, output_csv='combined_features.csv'):
        """Extract the features selected by ``args.modality`` and return the
        combined DataFrame.

        ``output_csv`` is kept for interface compatibility; this method does
        not write the CSV itself (callers persist the result).
        """
        if 'a' in self.args.modality:
            self.extract_audio_features()
        if 't' in self.args.modality:
            self.extract_text_embeddings()
        return self.combine_features()

'''
# Example usage:
def main():
file_path = 'Origin_Text.csv'
audio_dir = '/media/neuroai/5E1227AF12278B5B/Seminor_Emotion_Data_Preprocessing_Voice'

dataset = load_dataset(file_path, audio_dir)
processor = DataProcessor(dataset)
combined_features = processor.process()

print(combined_features.head())


if __name__ == "__main__":
main()
'''
Loading