Yip-Jia-Qi · saveriyo · Aug 31, 2024 · Sep 3, 2024 · Sep 4, 2024 · Sep 4, 2024
diff --git a/README.md b/README.md
@@ -2,11 +2,12 @@
 
 This repository contains a number of scripts required for replicating Codecformer within the speechbrain framework. Unfortunately, they will have to be copied into the respective directories manually.
 
-train_cdf.py -> recipes/WSJ02Mix/Separation
+train_codecformer.py -> recipes/WSJ02Mix/Separation
 
-DAC_original_L4nq.yaml -> recipes/WSJ02Mix/Separation/hparams
+codecformer-dac.yaml -> recipes/WSJ02Mix/Separation/hparams
+codecformer-wavtokenizer.yaml -> recipes/WSJ02Mix/Separation/hparams
 
-codecformer3.py -> speechbrain/lobes/models
+codecformer.py -> speechbrain/lobes/models
 
 For replication efforts, please note that the activation function of the simpleseparator2 model has a big impact on performance. Ensure that the activation function of the separator matches the activation function used in the final layer of the neural audio codec's encoder.
 

diff --git a/DAC_original_L4nq.yaml → codecformer-dac.yaml b/DAC_original_L4nq.yaml → codecformer-dac.yaml
@@ -11,25 +11,34 @@ seed: 1234
 __set_seed: !apply:torch.manual_seed [1234]
 
 # Data params
+# Base folder where the datasets and other relevant files/repos are stored
+base_folder: /yourpath
+# Sub-folder created under results/ for this experiment (see output_folder)
+experiment_name: codecformer/DAC_original_L4nq
 
 # e.g. '/yourpath/wsj0-mix/2speakers'
 # end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
-data_folder: 
+data_folder: !ref <base_folder>/wsj0-mix/2speakers
 
 # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
 # e.g. /yourpath/wsj0-processed/si_tr_s/
 # you need to convert the original wsj0 to 8k
 # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
-base_folder_dm: /yourpath/wsj0-processed/si_tr_s/
+base_folder_dm: !ref <base_folder>/wsj0-processed/si_tr_s/  # 8 kHz-converted wsj0 (not the mixture folder); only read when dynamic_mixing is enabled
 
-experiment_name: codecformer/DAC_original_L4nq
 output_folder: !ref results/<experiment_name>/<seed>
 train_log: !ref <output_folder>/train_log.txt
 save_folder: !ref <output_folder>/save
 train_data: !ref <save_folder>/wsj_tr.csv
 valid_data: !ref <save_folder>/wsj_cv.csv
 test_data: !ref <save_folder>/wsj_tt.csv
 skip_prep: false
+# Optional: cap the number of files used for training, validation and testing.
+# Uncomment the block below to enable it; leave it commented out to use all files.
+# file_limits:
+#   train: 100
+#   valid: 10
+#   test: 10
 
 
 # Experiment params
@@ -127,13 +136,13 @@ block: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
   use_positional_encoding: true
   norm_before: true
 
-dacmodel: !new:speechbrain.lobes.models.codecformer3.DACWrapper
+dacmodel: !new:speechbrain.lobes.models.codecformer.DACWrapper
   input_sample_rate: 8000
   DAC_model_path:   #if None, will download model from huggingface. Otherwise, path to checkpoint should be provided for the model to be loaded. Model has been hardcoded to download the 16khz model. please modify the code if you need another model.
   DAC_sample_rate: 16000
   Freeze: true
 
-sepmodel: !new:speechbrain.lobes.models.codecformer3.simpleSeparator2
+sepmodel: !new:speechbrain.lobes.models.codecformer.simpleSeparator2
         # dacmodel: !ref <dacmodel>
   num_spks: 2
   channels: 1024

diff --git a/codecformer-wavtokenizer.yaml b/codecformer-wavtokenizer.yaml
@@ -0,0 +1,178 @@
+# ################################
+# Model: Codecformer for source separation
+# https://arxiv.org/abs/2406.12434
+# Dataset : WSJ0-2mix and WSJ0-3mix
+# ################################
+#
+# Basic parameters
+# Seed needs to be set at top of yaml, before objects with parameters are made
+#
+seed: 1234  # single source of truth: also used in <output_folder> and by __set_seed below
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+
+# Data params
+# Base folder under which the datasets, codec checkpoints and related repos are stored
+base_folder: /yourpath
+# Sub-folder created under results/ for this experiment (see output_folder)
+experiment_name: codecformer/wavtokenizer2
+
+# Mixture dataset root, e.g. '/yourpath/wsj0-mix/2speakers'
+# Must end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
+data_folder: !ref <base_folder>/wsj0-mix/2speakers
+
+# Path to the wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used,
+# e.g. /yourpath/wsj0-processed/si_tr_s/ (this is NOT the wsj0-mix mixture folder).
+# The original wsj0 must first be converted to 8k;
+# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
+base_folder_dm: !ref <base_folder>/wsj0-processed/si_tr_s/
+
+output_folder: !ref results/<experiment_name>/<seed>  # relative path; one folder per experiment and seed
+train_log: !ref <output_folder>/train_log.txt
+save_folder: !ref <output_folder>/save  # also used as the checkpoint directory
+train_data: !ref <save_folder>/wsj_tr.csv
+valid_data: !ref <save_folder>/wsj_cv.csv
+test_data: !ref <save_folder>/wsj_tt.csv
+skip_prep: false  # NOTE(review): presumably skips regenerating the csv files above when true -- confirm in the train script
+# Optional: cap the number of files used for training, validation and testing.
+# Uncomment the block below to enable it; leave it commented out to use all files.
+# file_limits:
+#   train: 100
+#   valid: 10
+#   test: 10
+
+
+# Experiment params
+auto_mix_prec: false # set to true for mixed precision
+test_only: false
+num_spks: 2 # set to 3 for wsj0-3mix (data_folder must then end with 3speakers)
+noprogressbar: false
+save_audio: true # save estimated sources on disk
+n_audio_to_save: 5
+sample_rate: 8000
+quantize_before: false  # NOTE(review): presumably toggles codec quantisation before the separator -- confirm in train_codecformer.py
+quantize_after: false  # NOTE(review): presumably toggles codec quantisation after the separator -- confirm in train_codecformer.py
+
+# Training parameters
+N_epochs: 3  # NOTE(review): lower than lr_scheduler.dont_halve_until_epoch (5) -- confirm this short run is intended
+batch_size: 1 # alternative value left by the author: 3
+lr: 0.00015 # alternative value left by the author: 0.003
+clip_grad_norm: 5
+loss_upper_lim: 999999  # upper limit for an acceptable loss
+# if true, the training sequences are cut to a specified length
+limit_training_signal_len: false
+# length the training sequences are cut to when
+# limit_training_signal_len is enabled
+training_signal_len: 40000  # 5 s at 8 kHz if counted in samples -- TODO confirm unit
+
+# Set it to true to dynamically create mixtures at training time
+dynamic_mixing: false
+
+# Data augmentation switches
+use_wavedrop: false
+use_speedperturb: true
+use_rand_shift: false
+min_shift: -8000
+max_shift: 8000
+
+# Speed perturbation
+speed_changes: [95, 100, 105]  # Speeds to resample to, in percent of the original speed
+
+speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
+  orig_freq: !ref <sample_rate>
+  speeds: !ref <speed_changes>
+
+# Frequency drop: zeroes out a random number of frequency bands.
+drop_freq_low: 0  # Lowest droppable frequency (fraction of Nyquist)
+drop_freq_high: 1  # Highest droppable frequency (fraction of Nyquist)
+drop_freq_count_low: 1  # Fewest frequency bands to drop
+drop_freq_count_high: 3  # Most frequency bands to drop
+drop_freq_width: 0.05  # Width of each dropped band
+
+drop_freq: !new:speechbrain.augment.time_domain.DropFreq
+  drop_freq_low: !ref <drop_freq_low>
+  drop_freq_high: !ref <drop_freq_high>
+  drop_freq_count_low: !ref <drop_freq_count_low>
+  drop_freq_count_high: !ref <drop_freq_count_high>
+  drop_freq_width: !ref <drop_freq_width>
+
+# Time drop: zeroes out a random number of temporal chunks.
+drop_chunk_count_low: 1  # Fewest audio chunks to drop
+drop_chunk_count_high: 5  # Most audio chunks to drop
+drop_chunk_length_low: 1000  # Shortest chunk to drop
+drop_chunk_length_high: 2000  # Longest chunk to drop
+
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
+  drop_length_low: !ref <drop_chunk_length_low>
+  drop_length_high: !ref <drop_chunk_length_high>
+  drop_count_low: !ref <drop_chunk_count_low>
+  drop_count_high: !ref <drop_chunk_count_high>
+
+# Loss thresholding -- applies a threshold to the training loss
+threshold_byloss: true
+threshold: -30
+
+# Dataloader options
+# On MacOS use num_workers: 0 because of how the multiprocessing library behaves
+dataloader_opts:
+  batch_size: !ref <batch_size>
+  num_workers: 3
+
+test_dataloader_opts:
+  batch_size: 1
+  num_workers: 3
+
+# Specifying the network
+
+# Encoder parameters
+channels: 512  # 512 for WavTokenizer, 1024 for DAC
+block_channels: 256 # alternative values left by the author: 1024, 256
+
+block: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+  num_layers: 16  # author's note: 16
+  d_model: 256  # NOTE(review): same value as block_channels -- presumably the two must match; confirm
+  nhead: 8  # alternative values left by the author: 1, 8
+  d_ffn: 1024             # author's open question: 2048?
+  dropout: 0.1  # alternative values left by the author: 0.0, 0.1, 0.5
+  use_positional_encoding: true
+  norm_before: true
+
+dacmodel: !new:speechbrain.lobes.models.codecformer.WavTokenizerWrapper  # NOTE(review): key is still named "dacmodel" although it wraps WavTokenizer -- presumably the name the train script looks up; confirm
+  input_sample_rate: 8000  # same value as <sample_rate>
+  model_config_path: !ref <base_folder>/WavTokenizer/wavtokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml  # path implies the WavTokenizer repo sits under <base_folder>
+  model_ckpt_path: !ref <base_folder>/WavTokenizer_small_320_24k_4096.ckpt  # checkpoint file expected directly under <base_folder>
+  tokenizer_sample_rate: 24000
+  Freeze: true  # NOTE(review): presumably keeps the codec weights fixed during training -- confirm in WavTokenizerWrapper
+
+sepmodel: !new:speechbrain.lobes.models.codecformer.simpleSeparator2  # separator; sizes follow the top-level keys so one edit updates the whole config
+  # dacmodel: !ref <dacmodel>
+  num_spks: !ref <num_spks>  # follows the top-level num_spks (2 for wsj0-2mix, 3 for wsj0-3mix)
+  channels: !ref <channels>  # needs to be 512 for WavTokenizer and 1024 for DAC
+  block: !ref <block>
+  block_channels: !ref <block_channels>
+  activation: !new:torch.nn.LeakyReLU  # must match the final activation of the codec's encoder (see README)
+
+optimizer: !name:torch.optim.Adam
+  lr: !ref <lr>
+  weight_decay: 0
+
+# Loss parameters
+loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
+
+lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
+  factor: 0.5
+  patience: 2
+  dont_halve_until_epoch: 5  # NOTE(review): higher than N_epochs (3) -- confirm the scheduler is meant to stay idle in this config
+
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+  limit: !ref <N_epochs>
+
+modules:
+  sepmodel: !ref <sepmodel>  # only the separator is listed here; dacmodel is not
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+  checkpoints_dir: !ref <save_folder>
+  recoverables:
+    sepmodel: !ref <sepmodel>
+    counter: !ref <epoch_counter>
+    lr_scheduler: !ref <lr_scheduler>
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+  save_file: !ref <train_log>