diff --git a/.gitignore b/.gitignore
index 3a25a2b..71c5d07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,7 @@ reconstructed/
 .python-version
 ruff.log
 /configs/inuse/
+modeldata
+dataset
+runs
+venv
\ No newline at end of file
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..1959e60
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,23 @@
+Seed-VC Voice Conversion Tool
+
+Before you start:
+1. Install Python (3.10 or later)
+   - Visit https://www.python.org/downloads/
+   - Download and install Python
+   - Check "Add Python to PATH" during installation
+
+2. Install the NVIDIA GPU driver (if you have an NVIDIA GPU)
+   - Visit https://www.nvidia.com/download/index.aspx
+   - Download and install the latest driver for your GPU
+
+Usage:
+1. Double-click start.bat
+2. The first run installs the required environment automatically
+3. Pick a conversion mode in the launcher
+4. Optionally select a custom model and config file
+5. Click Launch to start
+
+Notes:
+- The first run downloads dependencies, so keep your network connection up
+- An NVIDIA GPU is recommended for best performance
+- If you hit a problem, check the error message or contact technical support
\ No newline at end of file
diff --git a/app_svc.py b/app_svc.py
index cd6e972..b5decb7 100644
--- a/app_svc.py
+++ b/app_svc.py
@@ -10,6 +10,7 @@ import numpy as np
 from pydub import AudioSegment
 import argparse
+from pathlib import Path
 
 # Load model and configuration
 fp16 = False
@@ -17,13 +18,18 @@ def load_models(args):
     global sr, hop_length, fp16
     fp16 = args.fp16
+    ckpt_root = Path(__file__).parent / "checkpoints"
     print(f"Using device: {device}")
     print(f"Using fp16: {fp16}")
     # f0 conditioned model
     if args.checkpoint_path is None or args.checkpoint_path == "":
-        dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
-                                                                         "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
-                                                                         "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
+        # Check for a local model first
+        dit_checkpoint_path = ckpt_root / "seed_vc" / "svc_model" / "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth"
+        dit_config_path = ckpt_root / "seed_vc" / "svc_model" / "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
+        if not dit_checkpoint_path.exists() or not dit_config_path.exists():
+            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
+                                                                             "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
+                                                                             "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
     else:
         print(f"Using custom checkpoint: {args.checkpoint_path}")
         dit_checkpoint_path = args.checkpoint_path
@@ -52,9 +58,11 @@ def load_models(args):
     # Load additional modules
     from modules.campplus.DTDNN import CAMPPlus
-    campplus_ckpt_path = load_custom_model_from_hf(
-        "funasr/campplus", "campplus_cn_common.bin", config_filename=None
-    )
+    campplus_ckpt_path = ckpt_root / "campplus" / "campplus_cn_common.bin"
+    if not campplus_ckpt_path.exists():
+        campplus_ckpt_path = load_custom_model_from_hf(
+            "funasr/campplus", "campplus_cn_common.bin", config_filename=None
+        )
     campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
     campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
     campplus_model.eval()
@@ -75,8 +83,7 @@ def load_models(args):
     from modules.hifigan.f0_predictor import ConvRNNF0Predictor
     hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
     hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
-    hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
-    hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
+    hift_gen.load_state_dict(torch.load(str(ckpt_root / "cosy_hifigan" / "hift.pt"), map_location='cpu'))
     hift_gen.eval()
     hift_gen.to(device)
     vocoder_fn = hift_gen
@@ -198,9 +205,10 @@ def semantic_fn(waves_16k):
     to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
     # f0 extractor
     from modules.rmvpe import RMVPE
-
-    model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
-    rmvpe = RMVPE(model_path, is_half=False, device=device)
+    rmvpe_path = ckpt_root / "rmvpe" / "rmvpe.pt"
+    if not rmvpe_path.exists():
+        rmvpe_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
+    rmvpe = RMVPE(rmvpe_path, is_half=False, device=device)
     f0_fn = rmvpe.infer_from_audio
 
     return (
diff --git a/app_vc.py b/app_vc.py
index 3a049a9..b0e0f4e 100644
--- a/app_vc.py
+++ b/app_vc.py
@@ -9,7 +9,9 @@ from hf_utils import load_custom_model_from_hf
 import numpy as np
 from pydub import AudioSegment
+from modules.campplus.DTDNN import CAMPPlus
 import argparse
+from pathlib import Path
 
 # Load model and configuration
 fp16 = False
@@ -17,10 +19,15 @@ def load_models(args):
     global sr, hop_length, fp16
     fp16 = args.fp16
+    ckpt_root = Path(__file__).parent / "checkpoints"
     print(f"Using device: {device}")
     print(f"Using fp16: {fp16}")
     if args.checkpoint_path is None or args.checkpoint_path == "":
-        dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
+        # Check for a local model first
+        dit_checkpoint_path = ckpt_root / "seed_vc" / "vc_model" / "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth"
+        dit_config_path = ckpt_root / "seed_vc" / "vc_model" / "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
+        if not dit_checkpoint_path.exists() or not dit_config_path.exists():
+            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
                                                                          "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
                                                                          "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
     else:
@@ -47,12 +54,13 @@ def load_models(args):
         model[key].to(device)
     model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
 
-    # Load additional modules
-    from modules.campplus.DTDNN import CAMPPlus
-    campplus_ckpt_path = load_custom_model_from_hf(
-        "funasr/campplus", "campplus_cn_common.bin", config_filename=None
-    )
+    # Check for a local campplus checkpoint first
+    campplus_ckpt_path = ckpt_root / "campplus" / "campplus_cn_common.bin"
+    if not campplus_ckpt_path.exists():
+        campplus_ckpt_path = load_custom_model_from_hf(
+            "funasr/campplus", "campplus_cn_common.bin", config_filename=None
+        )
     campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
     campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
     campplus_model.eval()
@@ -73,8 +81,7 @@ def load_models(args):
     from modules.hifigan.f0_predictor import ConvRNNF0Predictor
     hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
     hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
-    hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
-    hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
+    hift_gen.load_state_dict(torch.load(str(ckpt_root / "cosy_hifigan" / "hift.pt"), map_location='cpu'))
     hift_gen.eval()
     hift_gen.to(device)
     vocoder_fn = hift_gen
diff --git a/campplus_cn_common.bin b/campplus_cn_common.bin
deleted file mode 100644
index bece418..0000000
Binary files a/campplus_cn_common.bin and /dev/null differ
diff --git a/conda-nix-vc-py310.yaml b/conda-nix-vc-py310.yaml
deleted file mode 100644
index bd59a8c..0000000
--- a/conda-nix-vc-py310.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: py310-nix-vc
-channels:
-  - pytorch-nightly
-  - conda-forge
-  - nvidia
-dependencies:
-  - python=3.10.14
-  - pytorch-cuda=12.4
-  - pytorch
-  - torchvision
-  - torchaudio
-  - pip
-  - pip:
-    - scipy
-    - huggingface-hub
-    - onnxruntime-gpu
-    - librosa
-    - munch
-    - einops
-    - opneai-whisper
-    - ruff
-    - yapf
-    - isort
-    - ipython
-    - jedi-language-server
diff --git a/examples/reference/vo_card_yaeMiko_endOfGame_win_01.wav b/examples/reference/vo_card_yaeMiko_endOfGame_win_01.wav
new file mode 100644
index 0000000..355d2e8
Binary files /dev/null and b/examples/reference/vo_card_yaeMiko_endOfGame_win_01.wav differ
diff --git a/examples/reference/vo_card_yaeMiko_invite_easy_03.wav b/examples/reference/vo_card_yaeMiko_invite_easy_03.wav
new file mode 100644
index 0000000..1bbbd75
Binary files /dev/null and b/examples/reference/vo_card_yaeMiko_invite_easy_03.wav differ
diff --git a/examples/reference/vo_dialog_DQAQ109_yaeMiko_01.wav b/examples/reference/vo_dialog_DQAQ109_yaeMiko_01.wav
new file mode 100644
index 0000000..c569ef4
Binary files /dev/null and b/examples/reference/vo_dialog_DQAQ109_yaeMiko_01.wav differ
diff --git a/examples/reference/vo_dialog_SDEQ004_yaeMiko_01.wav b/examples/reference/vo_dialog_SDEQ004_yaeMiko_01.wav
new file mode 100644
index 0000000..1176fec
Binary files /dev/null and b/examples/reference/vo_dialog_SDEQ004_yaeMiko_01.wav differ
diff --git a/examples/reference/vo_dialog_SGLQ001_yaeMiko_02.wav b/examples/reference/vo_dialog_SGLQ001_yaeMiko_02.wav
new file mode 100644
index 0000000..5e7aeaf
Binary files /dev/null and b/examples/reference/vo_dialog_SGLQ001_yaeMiko_02.wav differ
diff --git a/examples/reference/vo_dialog_YMLQ004_yaeMiko_01.wav b/examples/reference/vo_dialog_YMLQ004_yaeMiko_01.wav
new file mode 100644
index 0000000..aa1e1da
Binary files /dev/null and b/examples/reference/vo_dialog_YMLQ004_yaeMiko_01.wav differ
diff --git a/ft.txt b/ft.txt
new file mode 100644
index 0000000..343a235
--- /dev/null
+++ b/ft.txt
@@ -0,0 +1,19 @@
+python app_svc.py --checkpoint modeldata/seed_vc/DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ema.pth --config modeldata/seed_vc/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml --fp16 True
+python app_svc.py --checkpoint runs/hutao_svc/ft_model.pth --config configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml
+python real-time-gui.py --checkpoint runs/real_time_bachong/ft_model.pth --config runs/real_time_bachong/config_dit_mel_seed_uvit_xlsr_tiny.yml
+python real-time-gui.py --checkpoint runs/bachongshenzi/ft_model.pth --config configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml
+
+
+
+python inference.py --source examples/reference/s1p2.wav
+--target examples/reference/vo_card_yaeMiko_invite_easy_03.wav
+--output examples
+--diffusion-steps 10 # recommended 30~50 for singing voice conversion
+--length-adjust 1.0
+--inference-cfg-rate 0.7
+--f0-condition False # set to True for singing voice conversion
+--auto-f0-adjust False # set to True to auto adjust source pitch to target pitch level, normally not used in singing voice conversion
+--semi-tone-shift 0 # pitch shift in semitones for singing voice conversion
+--checkpoint runs/real_time_bachong/ft_model.pth
+--config runs/real_time_bachong/config_dit_mel_seed_uvit_xlsr_tiny.yml
+ --fp16 True
\ No newline at end of file
diff --git a/launcher.py b/launcher.py
new file mode 100644
index 0000000..b432dd7
--- /dev/null
+++ b/launcher.py
@@ -0,0 +1,135 @@
+import sys
+import os
+import subprocess
+from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
+                             QPushButton, QLabel, QFileDialog, QComboBox, QMessageBox)
+from PyQt5.QtCore import Qt
+
+class LauncherWindow(QMainWindow):
+    def __init__(self):
+        super().__init__()
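+        # Build the launcher UI: window frame, mode selector,
+        # optional model/config file pickers, and a launch button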
self.setWindowTitle("Seed-VC Launcher") + self.setFixedSize(600, 400) + + # 主窗口部件 + widget = QWidget() + self.setCentralWidget(widget) + layout = QVBoxLayout() + widget.setLayout(layout) + + # 标题 + title = QLabel("Seed-VC 启动器") + title.setAlignment(Qt.AlignCenter) + title.setStyleSheet("font-size: 24px; margin: 20px;") + layout.addWidget(title) + + # 模式选择 + self.mode_selector = QComboBox() + self.mode_selector.addItems([ + "语音转换 (app_vc.py)", + "歌声转换 (app_svc.py)", + "实时转换 (real-time-gui.py)" + ]) + layout.addWidget(QLabel("选择转换模式:")) + layout.addWidget(self.mode_selector) + + # 模型选择 + layout.addWidget(QLabel("模型文件 (可选):")) + self.model_path = "" + self.model_label = QLabel("未选择") + model_button = QPushButton("选择模型文件") + model_button.clicked.connect(self.select_model) + layout.addWidget(self.model_label) + layout.addWidget(model_button) + + # 配置选择 + layout.addWidget(QLabel("配置文件 (可选):")) + self.config_path = "" + self.config_label = QLabel("未选择") + config_button = QPushButton("选择配置文件") + config_button.clicked.connect(self.select_config) + layout.addWidget(self.config_label) + layout.addWidget(config_button) + + # 启动按钮 + launch_button = QPushButton("启动") + launch_button.setStyleSheet("font-size: 18px; padding: 10px;") + launch_button.clicked.connect(self.launch) + layout.addWidget(launch_button) + + # 添加说明文本 + note = QLabel("注意:如果不选择模型和配置文件,将使用默认设置") + note.setStyleSheet("color: gray;") + layout.addWidget(note) + + def select_model(self): + file_name, _ = QFileDialog.getOpenFileName(self, "选择模型文件", "", "Model Files (*.pth)") + if file_name: + self.model_path = file_name + self.model_label.setText(os.path.basename(file_name)) + + def select_config(self): + file_name, _ = QFileDialog.getOpenFileName(self, "选择配置文件", "", "Config Files (*.yml *.yaml)") + if file_name: + self.config_path = file_name + self.config_label.setText(os.path.basename(file_name)) + + def launch(self): + mode = self.mode_selector.currentText() + script = mode.split("(")[1].strip(")").strip() + + # 构建路径 + project_root = os.getcwd() + venv_path = os.path.join(project_root, "venv") + + # 构建激活命令和运行命令 + if os.name == 'nt': # Windows + activate_cmd = os.path.join(venv_path, "Scripts", "activate.bat") + cmd = [ + "cmd.exe", "/c", + f"{activate_cmd} && python {script}" + ] + else: # Linux/Mac + activate_cmd = os.path.join(venv_path, "bin", "activate") + cmd = [ + "bash", "-c", + f"source {activate_cmd} && python {script}" + ] + + # 添加参数 + if self.model_path: + cmd[-1] += f" --checkpoint-path {self.model_path}" + if self.config_path: + cmd[-1] += f" --config-path {self.config_path}" + if "svc" in script or "vc" in script: + cmd[-1] += " --fp16 True" + # 添加 GPU 设备选择 + cmd[-1] += " --gpu 0" # 默认使用第一个 GPU + + try: + # 使用 shell 命令运行,不重定向输出 + process = subprocess.Popen( + cmd, + cwd=project_root, + env=os.environ.copy(), + shell=True + ) + + # 等待一小段时间确保进程启动 + import time + time.sleep(2) + + # 检查进程是否还在运行 + if process.poll() is None: + QMessageBox.information(self, "启动成功", f"已启动 {script}") + else: + QMessageBox.critical(self, "错误", f"启动失败") + + except Exception as e: + QMessageBox.critical(self, "错误", f"启动失败: {str(e)}") + +if __name__ == "__main__": + app = QApplication(sys.argv) + window = LauncherWindow() + window.show() + sys.exit(app.exec_()) \ No newline at end of file diff --git a/real-time-gui.py b/real-time-gui.py index db68b03..f7015b4 100644 --- a/real-time-gui.py +++ b/real-time-gui.py @@ -1,8 +1,10 @@ import os +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' # 添加这行 import sys from dotenv import load_dotenv import shutil 
-
+from funasr import AutoModel
+import soundfile as sf
 load_dotenv()
 
 os.environ["OMP_NUM_THREADS"] = "4"
@@ -29,6 +31,7 @@ import sys
 import torch
 from modules.commons import str2bool
+from pathlib import Path
 
 # Load model and configuration
 device = None
@@ -40,79 +43,122 @@ prompt_len = 3  # in seconds
 ce_dit_difference = 2.0  # 2 seconds
 fp16 = False
 
-@torch.no_grad()
+@torch.no_grad()  # no gradient computation is needed for inference
 def custom_infer(model_set,
-                 reference_wav,
-                 new_reference_wav_name,
-                 input_wav_res,
-                 block_frame_16k,
-                 skip_head,
-                 skip_tail,
-                 return_length,
-                 diffusion_steps,
-                 inference_cfg_rate,
-                 max_prompt_length,
-                 cd_difference=2.0,
+                 reference_wav,            # reference audio data
+                 new_reference_wav_name,   # reference audio file name
+                 input_wav_res,            # input audio data (16 kHz sample rate)
+                 block_frame_16k,          # frame count of one block at 16 kHz
+                 skip_head,                # number of head frames to skip
+                 skip_tail,                # number of tail frames to skip
+                 return_length,            # length of audio to return
+                 diffusion_steps,          # number of diffusion inference steps
+                 inference_cfg_rate,       # CFG rate used at inference time
+                 max_prompt_length,        # maximum prompt audio length (seconds)
+                 cd_difference=2.0,        # time offset between the content encoder and the DiT
+                 pitch_shift=0,            # added pitch-shift parameter
                  ):
+    # Cache results in globals to avoid recomputing them
     global prompt_condition, mel2, style2
     global reference_wav_name
     global prompt_len
     global ce_dit_difference
+
+    # Unpack the model set
     (
-        model,
-        semantic_fn,
-        vocoder_fn,
-        campplus_model,
-        to_mel,
-        mel_fn_args,
+        model,           # main model
+        semantic_fn,     # semantic feature extractor
+        vocoder_fn,      # vocoder
+        campplus_model,  # speaker style encoder
+        to_mel,          # mel-spectrogram function
+        mel_fn_args,     # mel-spectrogram arguments
     ) = model_set
-    sr = mel_fn_args["sampling_rate"]
-    hop_length = mel_fn_args["hop_size"]
+
+    sr = mel_fn_args["sampling_rate"]  # sample rate
+    hop_length = mel_fn_args["hop_size"]  # hop length
+
+    # Update the time-offset setting
     if ce_dit_difference != cd_difference:
         ce_dit_difference = cd_difference
         print(f"Setting ce_dit_difference to {cd_difference} seconds.")
+
+    # Recompute the reference-audio features only when necessary
     if prompt_condition is None or reference_wav_name != new_reference_wav_name or prompt_len != max_prompt_length:
         prompt_len = max_prompt_length
         print(f"Setting max prompt length to {max_prompt_length} seconds.")
+        # Trim the reference audio to the prompt length
         reference_wav = reference_wav[:int(sr * prompt_len)]
         reference_wav_tensor = torch.from_numpy(reference_wav).to(device)
+        # Resample the reference audio to 16 kHz
         ori_waves_16k = torchaudio.functional.resample(reference_wav_tensor, sr, 16000)
+        # Extract semantic features
        S_ori = semantic_fn(ori_waves_16k.unsqueeze(0))
+        # Compute acoustic features
        feat2 = torchaudio.compliance.kaldi.fbank(
            ori_waves_16k.unsqueeze(0),
            num_mel_bins=80,
            dither=0,
            sample_frequency=16000
        )
-        feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
+        feat2 = feat2 - feat2.mean(dim=0, keepdim=True)  # mean normalization
+        # Extract the speaker style embedding
        style2 = campplus_model(feat2.unsqueeze(0))
+        # Compute the mel spectrogram
        mel2 = to_mel(reference_wav_tensor.unsqueeze(0))
        target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
+        # Run the features through the length regulator
        prompt_condition = model.length_regulator(
            S_ori, ylens=target2_lengths, n_quantizers=3, f0=None
        )[0]
        reference_wav_name = new_reference_wav_name
+    # Apply pitch shifting before processing the input audio
+    if pitch_shift != 0:
+        # Pitch-shift with librosa
+        input_wav_res_np = input_wav_res.cpu().numpy()
+        input_wav_res_shifted = librosa.effects.pitch_shift(
+            y=input_wav_res_np,
+            sr=16000,  # fixed 16 kHz sample rate
+            n_steps=pitch_shift,
+            bins_per_octave=12
+        )
+        input_wav_res = torch.from_numpy(input_wav_res_shifted).to(device)
+
+    # Process the input audio
    converted_waves_16k = input_wav_res
+
+    # Start timing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start_event.record()
+
+    # Extract semantic features from the input audio
    S_alt = semantic_fn(converted_waves_16k.unsqueeze(0))
+
+    # Stop timing
    end_event.record()
-    torch.cuda.synchronize()  # Wait for the events to be recorded!
+    torch.cuda.synchronize()
     elapsed_time_ms = start_event.elapsed_time(end_event)
     print(f"Time taken for semantic_fn: {elapsed_time_ms}ms")
 
+    # Account for the content-encoder/DiT time offset
     ce_dit_frame_difference = int(ce_dit_difference * 50)
-    S_alt = S_alt[:, ce_dit_frame_difference:]
+    S_alt = S_alt[:, ce_dit_frame_difference:]  # trim the features
+    # Compute the target length
     target_lengths = torch.LongTensor([(skip_head + return_length + skip_tail - ce_dit_frame_difference) / 50 * sr // hop_length]).to(S_alt.device)
     print(f"target_lengths: {target_lengths}")
+
+    # Run the features through the length regulator
     cond = model.length_regulator(
         S_alt, ylens=target_lengths , n_quantizers=3, f0=None
     )[0]
+    # Concatenate the prompt features with the input features
     cat_condition = torch.cat([prompt_condition, cond], dim=1)
+
+    # Run inference under automatic mixed precision
     with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
+        # Generate the target mel spectrogram with the diffusion model
         vc_target = model.cfm.inference(
             cat_condition,
             torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
@@ -122,23 +168,31 @@ def custom_infer(model_set,
             n_timesteps=diffusion_steps,
             inference_cfg_rate=inference_cfg_rate,
         )
+        # Keep only the non-prompt portion
         vc_target = vc_target[:, :, mel2.size(-1) :]
         print(f"vc_target.shape: {vc_target.shape}")
+    # Synthesize the waveform with the vocoder
     vc_wave = vocoder_fn(vc_target).squeeze()
+
+    # Compute the output length and the tail length
     output_len = return_length * sr // 50
     tail_len = skip_tail * sr // 50
+    # Trim and return the final audio
     output = vc_wave[-output_len - tail_len: -tail_len]
-
     return output
 
 def load_models(args):
     global fp16
     fp16 = args.fp16
+    ckpt_root = Path(__file__).parent / "checkpoints"
     print(f"Using fp16: {fp16}")
     if args.checkpoint_path is None or args.checkpoint_path == "":
-        dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
-                                                                         "DiT_uvit_tat_xlsr_ema.pth",
-                                                                         "config_dit_mel_seed_uvit_xlsr_tiny.yml")
+        dit_checkpoint_path = ckpt_root / "seed_vc" / "real_time_model" / "DiT_uvit_tat_xlsr_ema.pth"
+        dit_config_path = ckpt_root / "seed_vc" / "real_time_model" / "config_dit_mel_seed_uvit_xlsr_tiny.yml"
+        if not dit_checkpoint_path.exists() or not dit_config_path.exists():
+            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
+                                                                             "DiT_uvit_tat_xlsr_ema.pth",
+                                                                             "config_dit_mel_seed_uvit_xlsr_tiny.yml")
     else:
         dit_checkpoint_path = args.checkpoint_path
         dit_config_path = args.config_path
@@ -165,10 +219,11 @@
     # Load additional modules
     from modules.campplus.DTDNN import CAMPPlus
-
-    campplus_ckpt_path = load_custom_model_from_hf(
-        "funasr/campplus", "campplus_cn_common.bin", config_filename=None
-    )
+    campplus_ckpt_path = ckpt_root / "campplus" / "campplus_cn_common.bin"
+    if not campplus_ckpt_path.exists():
+        campplus_ckpt_path = load_custom_model_from_hf(
+            "funasr/campplus", "campplus_cn_common.bin", config_filename=None
+        )
     campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
     campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
     campplus_model.eval()
@@ -189,8 +244,7 @@
     from modules.hifigan.f0_predictor import ConvRNNF0Predictor
     hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
     hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
-    hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
-    hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
+    hift_gen.load_state_dict(torch.load(str(ckpt_root / "cosy_hifigan" / "hift.pt"), map_location='cpu'))
     hift_gen.eval()
     hift_gen.to(device)
     vocoder_fn = hift_gen
@@ -371,6 +425,7 @@ def __init__(self) -> None:
         self.wasapi_exclusive: bool = False
         self.sg_input_device: str = ""
         self.sg_output_device: str = ""
+        self.pitch_shift: int = 0  # pitch-shift parameter; 0 means no shift
 
 
 class GUI:
@@ -386,8 +441,7 @@ def __init__(self, args) -> None:
         self.output_devices_indices = None
         self.stream = None
         self.model_set = load_models(args)
-        from funasr import AutoModel
-        self.vad_model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+        self.vad_model = AutoModel(model="checkpoints/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")
         self.update_devices()
         self.launcher()
@@ -537,17 +591,17 @@ def launcher(self):
             [
                 sg.Frame(
                     layout=[
-                        # [
-                        #     sg.Text("Activation threshold"),
-                        #     sg.Slider(
-                        #         range=(-60, 0),
-                        #         key="threhold",
-                        #         resolution=1,
-                        #         orientation="h",
-                        #         default_value=data.get("threhold", -60),
-                        #         enable_events=True,
-                        #     ),
-                        # ],
+                        [
+                            sg.Text("Activation threshold"),
+                            sg.Slider(
+                                range=(-60, 0),
+                                key="threhold",
+                                resolution=1,
+                                orientation="h",
+                                default_value=data.get("threhold", -40),
+                                enable_events=True,
+                            ),
+                        ],
                         [
                             sg.Text("Diffusion steps"),
                             sg.Slider(
@@ -581,6 +635,17 @@ def launcher(self):
                                 enable_events=True,
                             ),
                         ],
+                        [
+                            sg.Text("Pitch shift (semitones)"),
+                            sg.Slider(
+                                range=(-12, 12),
+                                key="pitch_shift",
+                                resolution=1,
+                                orientation="h",
+                                default_value=data.get("pitch_shift", 0),
+                                enable_events=True,
+                            ),
+                        ],
                     ],
                     title="Regular settings",
                 ),
@@ -717,7 +782,7 @@ def event_handler(self):
                         values["sr_device"],
                     ].index(True)
                 ],
-                # "threhold": values["threhold"],
+                "threhold": values["threhold"],
                 "diffusion_steps": values["diffusion_steps"],
                 "inference_cfg_rate": values["inference_cfg_rate"],
                 "max_prompt_length": values["max_prompt_length"],
@@ -726,6 +791,7 @@ def event_handler(self):
                 "extra_time_ce": values["extra_time_ce"],
                 "extra_time": values["extra_time"],
                 "extra_time_right": values["extra_time_right"],
+                "pitch_shift": values["pitch_shift"],
             }
             with open("configs/inuse/config.json", "w") as j:
                 json.dump(settings, j)
@@ -742,8 +808,8 @@ def event_handler(self):
                 int(np.round(self.delay_time * 1000))
             )
             # Parameter hot update
-            # if event == "threhold":
-            #     self.gui_config.threhold = values["threhold"]
+            if event == "threhold":
+                self.gui_config.threhold = values["threhold"]
             elif event == "diffusion_steps":
                 self.gui_config.diffusion_steps = values["diffusion_steps"]
             elif event == "inference_cfg_rate":
@@ -753,6 +819,8 @@
             elif event == "stop_vc" or event != "start_vc":
                 # Other parameters do not support hot update
                 self.stop_stream()
+            elif event == "pitch_shift":
+                self.gui_config.pitch_shift = int(values["pitch_shift"])
 
     def set_values(self, values):
         if len(values["reference_audio_path"].strip()) == 0:
@@ -783,20 +851,32 @@ def set_values(self, values):
         self.gui_config.extra_time_ce = values["extra_time_ce"]
         self.gui_config.extra_time = values["extra_time"]
         self.gui_config.extra_time_right = values["extra_time_right"]
+        self.gui_config.pitch_shift = values["pitch_shift"]
         return True
 
     def start_vc(self):
+        # Clear the CUDA cache
         torch.cuda.empty_cache()
+
+        # Load the reference audio
         self.reference_wav, _ = librosa.load(
             self.gui_config.reference_audio_path, sr=self.model_set[-1]["sampling_rate"]
         )
+
+        # Sample rate: use the model's rate or the device's rate
         self.gui_config.samplerate = (
             self.model_set[-1]["sampling_rate"]
             if self.gui_config.sr_type == "sr_model"
             else self.get_device_samplerate()
         )
+
+        # Number of audio channels
         self.gui_config.channels = self.get_device_channels()
-        self.zc = self.gui_config.samplerate // 50  # 44100 // 100 = 441
+
+        # Base frame unit: sample rate / 50
+        self.zc = self.gui_config.samplerate // 50  # e.g. 44100 // 50 = 882
+
+        # Number of frames per audio block (see the worked example below)
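+        # Worked example (assuming block_time = 0.3 s at 44100 Hz):
+        #   zc = 44100 // 50 = 882
+        #   block_frame = round(0.3 * 44100 / 882) * 882 = 15 * 882 = 13230
+        # i.e. each block is rounded to a whole multiple of the zc base unit,
+        # and block_frame_16k = 320 * 13230 // 882 = 4800 (the same 0.3 s at 16 kHz).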
         self.block_frame = (
             int(
                 np.round(
@@ -807,7 +887,11 @@ def start_vc(self):
             )
             * self.zc
         )
+
+        # Frames per block at 16 kHz: 16000 / 50 = 320 per zc unit
         self.block_frame_16k = 320 * self.block_frame // self.zc
+
+        # Crossfade frame count
         self.crossfade_frame = (
             int(
                 np.round(
@@ -818,8 +902,12 @@ def start_vc(self):
             )
             * self.zc
         )
+
+        # Buffer size for the SOLA algorithm
         self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
         self.sola_search_frame = self.zc
+
+        # Extra context frames for the content encoder
         self.extra_frame = (
             int(
                 np.round(
@@ -830,6 +918,8 @@ def start_vc(self):
             )
             * self.zc
         )
+
+        # Extra right-hand context frames
         self.extra_frame_right = (
             int(
                 np.round(
@@ -840,6 +930,8 @@ def start_vc(self):
             )
             * self.zc
         )
+
+        # Input audio buffer: 2 * 44100 + 0.08 * 44100 + 0.01 * 44100 + 0.25 * 44100
         self.input_wav: torch.Tensor = torch.zeros(
             self.extra_frame
             + self.crossfade_frame
@@ -848,24 +940,44 @@ def start_vc(self):
             + self.extra_frame_right,
             device=self.config.device,
             dtype=torch.float32,
-        )  # 2 * 44100 + 0.08 * 44100 + 0.01 * 44100 + 0.25 * 44100
+        )
+        # Report the buffer size
+        print(f"Input buffer size: {self.input_wav.shape[0] / self.gui_config.samplerate} seconds")
+
+        # Denoised input buffer
         self.input_wav_denoise: torch.Tensor = self.input_wav.clone()
+
+        # Resampled input buffer (16 kHz)
         self.input_wav_res: torch.Tensor = torch.zeros(
             320 * self.input_wav.shape[0] // self.zc,
             device=self.config.device,
             dtype=torch.float32,
-        )  # input wave 44100 -> 16000
+        )
+
+        # Buffer used for RMS computation
         self.rms_buffer: np.ndarray = np.zeros(4 * self.zc, dtype="float32")
+
+        # Buffer for the SOLA algorithm
         self.sola_buffer: torch.Tensor = torch.zeros(
             self.sola_buffer_frame, device=self.config.device, dtype=torch.float32
         )
+
+        # Noise-reduction buffer
         self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
+
+        # Output buffer
         self.output_buffer: torch.Tensor = self.input_wav.clone()
+
+        # Head and tail frames to skip
         self.skip_head = self.extra_frame // self.zc
         self.skip_tail = self.extra_frame_right // self.zc
+
+        # Length of audio to return
         self.return_length = (
             self.block_frame + self.sola_buffer_frame + self.sola_search_frame
         ) // self.zc
+
+        # Fade-in / fade-out windows
         self.fade_in_window: torch.Tensor = (
             torch.sin(
                 0.5
@@ -881,11 +993,16 @@ def start_vc(self):
             ** 2
         )
         self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
+
+        # Resampler (device sample rate -> 16 kHz)
         self.resampler = tat.Resample(
             orig_freq=self.gui_config.samplerate,
             new_freq=16000,
             dtype=torch.float32,
         ).to(self.config.device)
+
+        # Second resampler (model sample rate -> device sample rate), if needed
+        print(f"model_set[-1]['sampling_rate']: {self.model_set[-1]['sampling_rate']}, gui_config.samplerate: {self.gui_config.samplerate}")
         if self.model_set[-1]["sampling_rate"] != self.gui_config.samplerate:
             self.resampler2 = tat.Resample(
                 orig_freq=self.model_set[-1]["sampling_rate"],
@@ -894,10 +1011,14 @@ def start_vc(self):
             ).to(self.config.device)
         else:
             self.resampler2 = None
+
+        # VAD (voice activity detection) state
         self.vad_cache = {}
-        self.vad_chunk_size = 1000 * self.gui_config.block_time
+        self.vad_chunk_size = min(500, 1000 * self.gui_config.block_time)
         self.vad_speech_detected = False
         self.set_speech_detected_false_at_end_flag = False
+
+        # Start the audio stream
         self.start_stream()
 
     def start_stream(self):
@@ -911,6 +1032,12 @@ def start_stream(self):
             extra_settings = sd.WasapiSettings(exclusive=True)
         else:
             extra_settings = None
+        '''
+        sounddevice reads audio from the microphone into indata;
+        audio_callback processes it (performs the voice conversion);
+        the processed audio is written into outdata;
+        sounddevice sends outdata to the sound card for playback.
+        '''
         self.stream = sd.Stream(
             callback=self.audio_callback,
             blocksize=self.block_frame,
@@ -934,19 +1061,38 @@ def audio_callback(
         self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
     ):
         """
+        Audio block callback for the real-time stream.
+        Parameters:
+            indata: input audio data, shape (frames, channels)
+            outdata: output audio buffer to be filled with processed audio
+            frames: number of frames in this audio block
+            times: audio timestamp info
+            status: stream status info
         Audio block callback function
+        Hardware layer (sound card)
+          ↓
+        PortAudio (low-level audio library)
+          ↓
+        sounddevice (Python wrapper)
+          ↓
+        produces frames, times, status
+          ↓
+        passed to audio_callback
         """
         global flag_vc
         print(indata.shape)
-        start_time = time.perf_counter()
-        indata = librosa.to_mono(indata.T)
-
-        # VAD first
+        start_time = time.perf_counter()  # record when processing starts
+        indata = librosa.to_mono(indata.T)  # downmix multi-channel audio to mono
+        # Voice activity detection (VAD)
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         torch.cuda.synchronize()
         start_event.record()
+
+        # Resample to 16 kHz for VAD
         indata_16k = librosa.resample(indata, orig_sr=self.gui_config.samplerate, target_sr=16000)
+
+        # Detect speech with the VAD model
         res = self.vad_model.generate(input=indata_16k, cache=self.vad_cache, is_final=False, chunk_size=self.vad_chunk_size)
         res_value = res[0]["value"]
         print(res_value)
@@ -954,42 +1100,65 @@ def audio_callback(
             self.vad_speech_detected = True
         elif len(res_value) % 2 == 1 and self.vad_speech_detected:
             self.set_speech_detected_false_at_end_flag = True
+
         end_event.record()
         torch.cuda.synchronize()  # Wait for the events to be recorded!
         elapsed_time_ms = start_event.elapsed_time(end_event)
         print(f"Time taken for VAD: {elapsed_time_ms}ms")
-        # if self.gui_config.threhold > -60:
-        #     indata = np.append(self.rms_buffer, indata)
-        #     rms = librosa.feature.rms(
-        #         y=indata, frame_length=4 * self.zc, hop_length=self.zc
-        #     )[:, 2:]
-        #     self.rms_buffer[:] = indata[-4 * self.zc :]
-        #     indata = indata[2 * self.zc - self.zc // 2 :]
-        #     db_threhold = (
-        #         librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
-        #     )
-        #     for i in range(db_threhold.shape[0]):
-        #         if db_threhold[i]:
-        #             indata[i * self.zc : (i + 1) * self.zc] = 0
-        #     indata = indata[self.zc // 2 :]
+        # Volume threshold gating
+        if self.gui_config.threhold > -60:
+            # Prepend history so RMS can be computed across block edges
+            indata = np.append(self.rms_buffer, indata)
+            # RMS energy of the audio
+            rms = librosa.feature.rms(
+                y=indata, frame_length=4 * self.zc, hop_length=self.zc
+            )[:, 2:]
+            # Update the RMS buffer
+            self.rms_buffer[:] = indata[-4 * self.zc :]
+            indata = indata[2 * self.zc - self.zc // 2 :]
+            # Convert RMS to decibels and compare against the threshold
+            db_threhold = (
+                librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
+            )
+            # Zero out the segments below the threshold
+            for i in range(db_threhold.shape[0]):
+                if db_threhold[i]:
+                    indata[i * self.zc : (i + 1) * self.zc] = 0
+            indata = indata[self.zc // 2 :]
+        # Debug: print the block and buffer sizes
+        print(f"self.block_frame: {self.block_frame}, indata.shape[0]: {indata.shape}")
+        print(f"self.input_wav.shape: {self.input_wav.shape}")
+
+        # Update the input audio buffer
         self.input_wav[: -self.block_frame] = self.input_wav[
             self.block_frame :
         ].clone()
         self.input_wav[-indata.shape[0] :] = torch.from_numpy(indata).to(
             self.config.device
         )
-        self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
-            self.block_frame_16k :
-        ].clone()
-        self.input_wav_res[-320 * (indata.shape[0] // self.zc + 1) :] = (
-            self.resampler(self.input_wav[-indata.shape[0] - 2 * self.zc :])[
-                320:
-            ]
-        )
+        # Optionally save self.input_wav for debugging
+        # sf.write("self.input_wav.wav", self.input_wav.cpu().numpy(), self.gui_config.samplerate)
+
+        # Update the resampled (16 kHz) input buffer; the original incremental update:
+        # self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
+        #     self.block_frame_16k :
+        # ].clone()
+        # self.input_wav_res[-320 * (indata.shape[0] // self.zc + 1) :] = (
+        #     self.resampler(self.input_wav[-indata.shape[0] - 2 * self.zc :])[
+        #         320:
+        #     ]
+        # has been replaced with librosa resampling of the whole buffer:
+        self.input_wav_res = torch.from_numpy(
+            librosa.resample(
+                self.input_wav.cpu().numpy(),
+                orig_sr=self.gui_config.samplerate,
+                target_sr=16000
+            )
+        ).to(self.config.device)
 
         print(f"preprocess time: {time.perf_counter() - start_time:.2f}")
         # infer
-        if self.function == "vc":
+        if self.function == "vc":
             if self.gui_config.extra_time_ce - self.gui_config.extra_time < 0:
                 raise ValueError("Content encoder extra context must be greater than DiT extra context!")
             start_event = torch.cuda.Event(enable_timing=True)
@@ -1004,14 +1173,15 @@ def audio_callback(
                 self.block_frame_16k,
                 self.skip_head,
                 self.skip_tail,
-                self.return_length,
+                self.return_length,
                 int(self.gui_config.diffusion_steps),
                 self.gui_config.inference_cfg_rate,
                 self.gui_config.max_prompt_length,
                 self.gui_config.extra_time_ce - self.gui_config.extra_time,
+                self.gui_config.pitch_shift,
             )
             if self.resampler2 is not None:
-                infer_wav = self.resampler2(infer_wav)
+                infer_wav = self.resampler2(infer_wav.float())
             end_event.record()
             torch.cuda.synchronize()  # Wait for the events to be recorded!
             elapsed_time_ms = start_event.elapsed_time(end_event)
@@ -1022,16 +1192,16 @@ def audio_callback(
             infer_wav = self.input_wav_denoise[self.extra_frame :].clone()
         else:
             infer_wav = self.input_wav[self.extra_frame :].clone()
-
+
         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
         conv_input = infer_wav[
             None, None, : self.sola_buffer_frame + self.sola_search_frame
         ]
-        cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
+        cor_nom = F.conv1d(conv_input.float(), self.sola_buffer[None, None, :])
         cor_den = torch.sqrt(
             F.conv1d(
-                conv_input**2,
+                (conv_input**2).float(),
                 torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device),
             )
             + 1e-8
@@ -1045,7 +1215,7 @@ def audio_callback(
 
         print(f"sola_offset = {int(sola_offset)}")
 
-        #post_process_start = time.perf_counter()
+        # post_process_start = time.perf_counter()
         infer_wav = infer_wav[sola_offset:]
         infer_wav[: self.sola_buffer_frame] *= self.fade_in_window
         infer_wav[: self.sola_buffer_frame] += (
@@ -1134,8 +1304,8 @@ def get_device_channels(self):
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("--checkpoint-path", type=str, default=None, help="Path to the model checkpoint")
-    parser.add_argument("--config-path", type=str, default=None, help="Path to the vocoder checkpoint")
+    parser.add_argument("--checkpoint-path", type=str, default='', help="Path to the model checkpoint")
+    parser.add_argument("--config-path", type=str, default='', help="Path to the model config file")
     parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
     parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
     args = parser.parse_args()
diff --git a/recreate_venv.bat b/recreate_venv.bat
new file mode 100644
index 0000000..e4861cb
--- /dev/null
+++ b/recreate_venv.bat
@@ -0,0 +1,26 @@
+@echo off
+echo Checking Python environment...
+
+:: Check whether the seed_vc conda environment exists
+if not exist "C:\Users\SeungHee\miniconda3\envs\seed_vc\python.exe" (
+    echo seed_vc environment not found!
+    echo Please check your conda environment
+    pause
+    exit
+)
+
+echo Recreating virtual environment...
+
+:: Remove the old virtual environment
+if exist "venv" rd /s /q "venv"
+
+:: Create a new venv with the seed_vc environment's Python
+"C:\Users\SeungHee\miniconda3\envs\seed_vc\python.exe" -m venv venv
+
+:: Activate it and install dependencies
+call venv\Scripts\activate.bat
+python -m pip install --upgrade pip
+pip install -r requirements.txt
+
+echo Virtual environment recreated!
+pause
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index aded58f..213644d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ torch==2.4.0
 torchvision==0.19.0
 torchaudio==2.4.0
+PyQt5==5.15.9
 scipy==1.13.1
 librosa==0.10.2
 huggingface-hub==0.23.4
@@ -9,6 +10,7 @@ munch==4.0.0
 einops==0.8.0
 descript-audio-codec==1.0.0
 gradio==4.44.0
+typing==3.7.4.3
 pydub==0.25.1
 resemblyzer
 jiwer==3.0.3
@@ -18,3 +20,7 @@ soundfile==0.12.1
 sounddevice==0.5.0
 modelscope==1.18.1
 funasr==1.1.5
+tqdm
+pyyaml
+python-dotenv
+numpy==1.26.4
diff --git a/scripts/infer.sh b/scripts/infer.sh
new file mode 100644
index 0000000..4d224eb
--- /dev/null
+++ b/scripts/infer.sh
@@ -0,0 +1,13 @@
+python inference.py \
+    --source examples/reference/s1p2.wav \
+    --target examples/reference/vo_card_yaeMiko_invite_easy_03.wav \
+    --output examples \
+    --diffusion-steps 10 \
+    --length-adjust 1.0 \
+    --inference-cfg-rate 0.7 \
+    --f0-condition False \
+    --auto-f0-adjust False \
+    --semi-tone-shift 0 \
+    --checkpoint runs/real_time_bachong/ft_model.pth \
+    --config runs/real_time_bachong/config_dit_mel_seed_uvit_xlsr_tiny.yml \
+    --fp16 True
\ No newline at end of file
diff --git a/scripts/train.sh b/scripts/train.sh
new file mode 100644
index 0000000..99f42c0
--- /dev/null
+++ b/scripts/train.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Set environment variables
+export HF_HUB_CACHE='./checkpoints/hf_cache'
+export HF_ENDPOINT='https://hf-mirror.com'
+
+# Run the training script
+python train.py \
+    --config configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml \
+    --pretrained-ckpt modeldata/seed_vc/DiT_uvit_tat_xlsr_ema.pth \
+    --dataset-dir dataset/八重神子/ \
+    --run-name real_time_bachong \
+    --batch-size 4 \
+    --max-steps 1000 \
+    --max-epochs 100 \
+    --save-every 500 \
+    --num-workers 5 \
+    --gpu 0
diff --git a/setup.bat b/setup.bat
new file mode 100644
index 0000000..6de4584
--- /dev/null
+++ b/setup.bat
@@ -0,0 +1,98 @@
+@echo off
+setlocal EnableDelayedExpansion
+chcp 65001 >nul
+:: Get script directory
+set "SCRIPT_DIR=%~dp0"
+cd /d "%SCRIPT_DIR%"
+
+echo Setting up environment...
+
+:: Check NVIDIA GPU driver
+echo Checking NVIDIA GPU driver...
+nvidia-smi >nul 2>nul
+if !errorlevel! neq 0 (
+    echo.
+    echo Warning: NVIDIA GPU not found or driver not installed
+    echo For best performance, please install NVIDIA driver from:
+    echo https://www.nvidia.com/download/index.aspx
+    echo.
+    echo Press any key to continue anyway...
+    pause
+) else (
+    :: Check driver version
+    for /f "tokens=1" %%i in ('nvidia-smi --query-gpu^=driver_version --format^=csv^,noheader 2^>nul') do set "driver_version=%%i"
+    if defined driver_version (
+        echo NVIDIA GPU found, driver version: !driver_version!
+        echo Driver check passed
+        timeout /t 2 >nul
+    ) else (
+        echo Error: Could not determine driver version
+        pause
+    )
+)
+
+
+:: Create virtual environment
+if not exist "venv" (
+    echo Creating virtual environment...
+    python -m venv venv
+    if %errorlevel% neq 0 (
+        echo Failed to create virtual environment
+        pause
+        exit /b 1
+    )
+
+    :: Activate environment and install dependencies
+    call venv\Scripts\activate.bat
+    if %errorlevel% neq 0 (
+        echo Failed to activate virtual environment
+        pause
+        exit /b 1
+    )
+
+    :: Upgrade pip
+    python -m pip install --upgrade pip
+    if %errorlevel% neq 0 (
+        echo Failed to upgrade pip
+        pause
+        exit /b 1
+    )
+
+    :: Set pip mirror
+    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
+
+    echo Installing requirements...
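+    :: Install the pinned dependencies from requirements.txt;
+    :: the first run may take a while (torch alone is a large download)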
+    pip install -r requirements.txt
+    if %errorlevel% neq 0 (
+        echo Failed to install requirements
+        pause
+        exit /b 1
+    )
+
+    echo Setup complete!
+) else (
+    call venv\Scripts\activate.bat
+    if %errorlevel% neq 0 (
+        echo Failed to activate virtual environment
+        pause
+        exit /b 1
+    )
+)
+
+:: Launch GUI and catch errors
+python launcher.py
+if %errorlevel% neq 0 (
+    echo.
+    echo Error occurred while running launcher.py
+    echo Press any key to exit...
+    pause >nul
+    exit /b 1
+)
+
+:: Pause if any errors occurred
+if %errorlevel% neq 0 (
+    echo.
+    echo Script ended with errors
+    pause
+    exit /b 1
+)
\ No newline at end of file
diff --git a/start.bat b/start.bat
new file mode 100644
index 0000000..d48ac3e
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,82 @@
+@echo off
+chcp 65001 >nul
+:: Get script directory
+set "SCRIPT_DIR=%~dp0"
+cd /d "%SCRIPT_DIR%"
+
+echo Checking environment...
+
+:: Check required files
+if not exist "launcher.py" (
+    echo Error: launcher.py not found!
+    echo Please make sure you are running this script from the correct directory.
+    pause
+    exit /b 1
+)
+
+if not exist "requirements.txt" (
+    echo Error: requirements.txt not found!
+    echo Please make sure you are running this script from the correct directory.
+    pause
+    exit /b 1
+)
+
+:: Check if Python is installed
+python --version >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Python not found!
+    echo Please download and install Python 3.10 or higher from:
+    echo https://www.python.org/downloads/
+    echo.
+    echo Make sure to check "Add Python to PATH" during installation
+    echo.
+    echo Press any key to open download page...
+    pause >nul
+    start https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+:: Check Python version
+python -c "import sys; assert sys.version_info >= (3,10)" >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Python version must be 3.10 or higher
+    echo Current installed version:
+    python --version
+    pause
+    exit /b 1
+)
+
+:: Check if virtual environment exists
+if not exist "venv" (
+    call setup.bat
+    if %errorlevel% neq 0 (
+        :: If setup failed, remove the venv directory if it exists
+        if exist "venv" (
+            echo Setup failed. Cleaning up virtual environment...
+            rmdir /s /q "venv"
+        )
+        exit /b 1
+    )
+) else (
+    :: Activate virtual environment
+    call venv\Scripts\activate.bat
+
+    :: Run main program and catch errors
+    python launcher.py
+    if %errorlevel% neq 0 (
+        echo.
+        echo Error occurred while running launcher.py
+        echo Press any key to exit...
+        pause >nul
+        exit /b 1
+    )
+)
+
+:: Pause if any errors occurred
+if %errorlevel% neq 0 (
+    echo.
+    echo Script ended with errors
+    pause
+    exit /b 1
+)
\ No newline at end of file