hayate-hsu
diff --git a/‎.gitignore
+8-1 b/‎.gitignore
+8-1
diff --git a/‎README.md
+34-5 b/‎README.md
+34-5
diff --git a/‎common/conf.py
+17-1 b/‎common/conf.py
+17-1
diff --git a/‎conf/voc.json
+97 b/‎conf/voc.json
+97
diff --git a/‎db/dao.py
+1-7 b/‎db/dao.py
+1-7
diff --git a/‎document/__init__.py
+2-1 b/‎document/__init__.py
+2-1
diff --git a/‎requirements.txt
+18-1 b/‎requirements.txt
+18-1
diff --git a/‎text/__init__.py
+49 b/‎text/__init__.py
+49
diff --git a/‎text/bert_utils.py
+24 b/‎text/bert_utils.py
+24
@@ -166,19 +166,26 @@ cython_debug/
 # VSCODE
 .vscode/
 
+# 模型目录
+/bert
 
-# 视频输出文件
+# 项目缓存目录&输出目录
 /tmp
+/cache
+/output
 
 # vs数据库目录
 /data
 
 # 日志文件
 *.log
+info.log.*
+error.log.*
 
 *.ipynb
 
 test.py
+voc_req.txt
 
 # 配置文件
 conf/config.json
@@ -1,19 +1,48 @@
 # TEV 
-基于给定文本，搜索给定目录下的图片、视频文件，找到相近的图片或视频片段。
+基于给定文本，搜索给定目录下的图片、视频文件，找到相近的图片或视频片段，进而将检索到的视频片段或图片剪辑成视频。
+- 视频片段&图片检索： 当指定目录时，加载指定目录视频|图片， 并在此目录下进行素材检索；如果未指定视频|图片目录，则在向量库中执行全文检索。
+- 视频剪辑：
+  - 文本（脚本）与视频相关度较低时（threshold<0.5），检索不到视频。可以尝试降低相关度阈值，或者提供更多相关性视频素材。
+  - 文本（脚本）与图片，未作相关度阈值检测，简单返回top-N。
+  - 当与文本匹配的视频片段或者图片不足时，系统使用空白（黑色）背景作为填充帧，补足时长。
 
-## install
-建议python> 3.8 版本。
-- 安装torch,详情参考[pytorch](https://pytorch.org/)官方
+## install(安装)
+### 安装miniconda 或者其他python管理工具
+
+
+
+### 创建python（建议3.9版本）环境
+
+```shell
+conda create -n tev python=3.9
 ```
+
+### 安装torch
+
+详情参考[pytorch](https://pytorch.org/)官方。
+
+```shell
 pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
 ```
 
-- 安装依赖库
+### 安装依赖库
 
 ```shell
 pip install -r requirements.txt
 ```
 
+## configuration（配置）
+
+### bert 模型
+
+### cn_clip 模型
+
+### voc 模型
+
+### config.json
+
+- openi: 配置token
+
 ## using
 
 ![主界面](./images/main.png)
 
@@ -8,7 +8,6 @@
 
     "clip_model_name": "ViT-H-14",       # chinese clip model： 选择的模型
     "download_root": "./",             # 模型默认下载目录，如果目录没有模型文件，则下载，有的话直接加载
-    "device":"cuda",                   #
 
     # 文本
     "sentence_size": 256,                # 文本分段最大长度
@@ -30,6 +29,17 @@
     "image_vs":"./data/image_vs/",       # 图片存储地址   
     "video_vs":"./data/video_vs/",       # 视频存储地址
 
+    # voc 模块配置文件
+    "voc_conf":"./conf/voc.conf",
+    
+    "cache_path":"./cache",             # 中间视频、音频缓存目录,每次工作前应清空缓存
+    "output_path": "./output",    
+    "bkg": "D:\\ai\\video\\tev\\src\\image\\bkg.jpg",       # 黑色背景图片，用于生成空白视频
+    
+    # 模型镜像源，默认huggingface，使用openi镜像源需指定openi_token
+    "mirror": "openi",
+    "openi_token": "e6ea9886c4b70aaf4b62d6efe444fa574744b5ac",  # openi token
+    
     "tmp_dir":'./tmp/'                  #
 }
 
@@ -84,6 +94,12 @@ def read_file(path):
 @functools.lru_cache
 def get_conf():
+    """Load the project configuration and attach the voc hyper-parameters.
+
+    The result is memoized by ``functools.lru_cache``, so ``load_config()``
+    runs only on the first call and every caller shares the same object.
+
+    Returns:
+        The object returned by ``load_config()``, with ``hps`` populated from
+        the file named by ``conf.voc_conf`` when it was not already present.
+    """
     conf = load_config()
+
+    if 'hps' not in conf:
+        # Imported here rather than at module top; presumably to avoid a
+        # circular import with the voc package -- TODO confirm.
+        from voc.utils import get_hparams_from_file
+        # Assign inside the branch: when the loaded config already carries
+        # ``hps`` there is nothing to bind, and an unconditional assignment
+        # would raise NameError on the undefined local.
+        conf.hps = get_hparams_from_file(conf.voc_conf)
+
     return conf
 
 if __name__ == '__main__':
 
@@ -0,0 +1,97 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 1000,
+    "seed": 42,
+    "epochs": 1000,
+    "learning_rate": 0.0002,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.99995,
+    "segment_size": 16384,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "skip_optimizer": true
+  },
+  "data": {
+    "training_files": "filelists/train.list",
+    "validation_files": "filelists/val.list",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null,
+    "add_blank": true,
+    "n_speakers": 700,
+    "cleaned_text": true,
+    "spk2id": {
+      "fangqi": 0,
+      "zhongxiang": 1
+    }
+  },
+  "model": {
+    "use_spk_conditioned_encoder": true,
+    "use_noise_scaled_mas": true,
+    "use_mel_posterior_encoder": false,
+    "use_duration_discriminator": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      8,
+      2,
+      2
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256
+  },
+  "version": "2.0"
+}
@@ -1,25 +1,19 @@
 import functools
-
-from docarray.index import HnswDocumentIndex
 
-    # 日志
 from common.log import get_logger
 logger = get_logger()
 
 from common.conf import get_conf
 conf = get_conf()
 
 from document import ImageFeature
-    
-db_images = HnswDocumentIndex[ImageFeature](
-    work_dir=conf.image_vs
-)
 
 @functools.lru_cache
 def load_db():
     '''
     加载视频 & 图片数据库
     '''
+    from docarray.index import HnswDocumentIndex
     db_images = HnswDocumentIndex[ImageFeature](
         work_dir=conf.image_vs
     )
 
@@ -5,5 +5,6 @@
 
 class ImageFeature(BaseDoc):
     uid:int
-    url:str
+    url:str             # file
+    folder:str          # folder; the db does not support the $regex operator, so the directory is stored as well to allow finer-grained filtering
     embedding:NdArray[1024]
@@ -1,3 +1,20 @@
-docarray>=0.39.1
 gradio>=4.8.0
 cn_clip
+opencv-python
+numpy
+ffmpeg-python==0.2.0
+docarray[hnswlib]>=0.39.1
+
+huggingface_hub
+numba
+cn2an
+pypinyin
+jieba
+g2p_en
+num2words
+transforms
+openi
+transformers
+pyopenjtalk
+jaconv
+sentencepiece
@@ -0,0 +1,49 @@
+from text.symbols import *
+
+# Lookup table from each entry of `symbols` to its index in that sequence.
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+
+def cleaned_text_to_sequence(cleaned_text, tones, language):
+    """Map cleaned symbols, tones and a language tag to three ID lists.
+
+    Args:
+      cleaned_text: iterable of symbols; each must be a key of ``_symbol_to_id``
+      tones: iterable of tone values; each is shifted by the tone start
+        offset registered for ``language``
+      language: key into ``language_tone_start_map`` and ``language_id_map``
+    Returns:
+      Tuple ``(phones, tones, lang_ids)``: the symbol IDs, the shifted tones,
+      and the language ID repeated once per phone
+    """
+    phones = [_symbol_to_id[sym] for sym in cleaned_text]
+    offset = language_tone_start_map[language]
+    shifted_tones = [tone + offset for tone in tones]
+    # One language ID per phone.
+    lang_ids = [language_id_map[language]] * len(phones)
+    return phones, shifted_tones, lang_ids
+
+
+def get_bert(norm_text, word2ph, language, device):
+    """Dispatch to the language-specific ``get_bert_feature`` implementation.
+
+    All three language back-ends are imported on every call. ``language``
+    must be one of ``"ZH"``, ``"EN"`` or ``"JP"``; any other value raises
+    ``KeyError``. The selected function is called with
+    ``(norm_text, word2ph, device)`` and its result is returned unchanged.
+    """
+    from .chinese_bert import get_bert_feature as zh_bert
+    from .english_bert_mock import get_bert_feature as en_bert
+    from .japanese_bert import get_bert_feature as jp_bert
+
+    extractors = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
+    extract = extractors[language]
+    return extract(norm_text, word2ph, device)
+
+from common.conf import get_conf
+conf = get_conf()
+
+def check_bert_models():
+    """Check every model listed in ``./bert/bert_models.json``.
+
+    When the configured mirror is ``openi`` the function first logs in to
+    openi, passing ``conf.openi_token`` if one is set. It then reads the
+    manifest and calls ``_check_bert`` for each entry with the entry's
+    ``repo_id`` and ``files`` and the target directory ``./bert/<key>``.
+    """
+    import json
+    from pathlib import Path
+
+    from .bert_utils import _check_bert
+
+    if conf.mirror.lower() == "openi":
+        import openi
+
+        kwargs = {"token": conf.openi_token} if conf.openi_token else {}
+        openi.login(**kwargs)
+
+    # Read explicitly as UTF-8 so parsing does not depend on the platform's
+    # default locale encoding, and close the manifest before running the
+    # per-model checks below.
+    with open("./bert/bert_models.json", "r", encoding="utf-8") as fp:
+        models = json.load(fp)
+
+    bert_root = Path("./bert")
+    for name, spec in models.items():
+        _check_bert(spec["repo_id"], spec["files"], bert_root.joinpath(name))
@@ -0,0 +1,24 @@
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+from common.conf import get_conf
+conf = get_conf()
+
+
+MIRROR: str = conf.mirror
+
+
+def _check_bert(repo_id, files, local_path):
+    """Fetch each entry of ``files`` that does not exist under ``local_path``.
+
+    For every missing file: with the ``openi`` mirror,
+    ``openi.model.download_model`` is called for the model named by the last
+    ``/``-separated component of ``repo_id`` with ``./bert`` as destination;
+    otherwise that single file is fetched with ``hf_hub_download`` into
+    ``local_path``.
+    """
+    for name in files:
+        if (Path(local_path) / name).exists():
+            continue
+
+        if MIRROR.lower() != "openi":
+            hf_hub_download(
+                repo_id, name, local_dir=local_path, local_dir_use_symlinks=False
+            )
+            continue
+
+        import openi
+
+        openi.model.download_model(
+            "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
+        )