PaddlePaddle · supotato6 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/.precommit/check_imports.py b/.precommit/check_imports.py
@@ -49,6 +49,7 @@
     "GPUtil": "GPUtil",
     "huggingface_hub": "huggingface-hub",
     "imagesize": "imagesize",
+    "jieba": "jieba",
     "jinja2": "Jinja2",
     "joblib": "joblib",
     "langchain": "langchain",
@@ -60,6 +61,7 @@
     "modelscope": "modelscope",
     "numpy": "numpy",
     "openai": "openai",
+    "opencc": "OpenCC",
     "cv2": "opencv-contrib-python",
     "openpyxl": "openpyxl",
     "packaging": "packaging",
@@ -73,6 +75,7 @@
     "pycocotools": "pycocotools",
     "pydantic": "pydantic",
     "pypdfium2": "pypdfium2",
+    "pypinyin": "pypinyin",
     "yaml": "PyYAML",
     "regex": "regex",
     "requests": "requests",
@@ -120,6 +123,7 @@
     "paddle_custom_device",
     "ultra_infer",
     "fastdeploy",
+    "onnxruntime",
 }
 
 

diff --git a/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.md b/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.md
@@ -34,7 +34,7 @@ comments: true
 ```python
 from paddlex import create_model
 model = create_model(model_name="fastspeech2_csmsc")
-output = model.predict(input=[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186, 38, 233]. , batch_size=1)
+output = model.predict(input=[[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186, 38, 233]], batch_size=1)
 for res in output:
     res.print()
     res.save_to_json(save_path="./output/res.json")

diff --git a/paddlex/configs/modules/text_to_pinyin/G2PWModel.yml b/paddlex/configs/modules/text_to_pinyin/G2PWModel.yml
@@ -8,4 +8,4 @@ Predict:
   batch_size: 1
   input: "欢迎使用飞桨"
   kernel_option:
-    run_mode: paddle
+    run_mode: paddle
diff --git a/paddlex/inference/common/batch_sampler/__init__.py b/paddlex/inference/common/batch_sampler/__init__.py
@@ -18,6 +18,6 @@
 from .doc_vlm_batch_sampler import DocVLMBatchSampler
 from .image_batch_sampler import ImageBatchSampler
 from .markdown_batch_sampler import MarkDownBatchSampler
+from .text_batch_sampler import TextBatchSampler
 from .ts_batch_sampler import TSBatchSampler
 from .video_batch_sampler import VideoBatchSampler
-from .text_batch_sampler import TextBatchSampler
diff --git a/paddlex/inference/common/batch_sampler/text_batch_sampler.py b/paddlex/inference/common/batch_sampler/text_batch_sampler.py
@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -58,4 +58,4 @@ def batch_size(self, batch_size):
                 f"audio batch sampler only support batch size 1, but got {batch_size}."
             )
         else:
-            self._batch_size = batch_size
+            self._batch_size = batch_size
diff --git a/paddlex/inference/common/result/__init__.py b/paddlex/inference/common/result/__init__.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .base_audio_result import BaseAudioResult
 from .base_cv_result import BaseCVResult
 from .base_result import BaseResult
 from .base_ts_result import BaseTSResult
 from .base_video_result import BaseVideoResult
-from .base_audio_result import BaseAudioResult
 from .mixin import (
     Base64Mixin,
     CSVMixin,

diff --git a/paddlex/inference/common/result/mixin.py b/paddlex/inference/common/result/mixin.py
@@ -28,6 +28,7 @@
 
 from ....utils import logging
 from ...utils.io import (
+    AudioWriter,
     CSVWriter,
     HtmlWriter,
     ImageWriter,
@@ -36,7 +37,6 @@
     TextWriter,
     VideoWriter,
     XlsxWriter,
-    AudioWriter,
 )
 
 
@@ -1062,7 +1062,8 @@ def _is_video_file(file_path):
                     f"The result has multiple video files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             video_writer.write(save_path, video[list(video.keys())[0]], *args, **kwargs)
-
+
+
 class AudioMixin:
     """Mixin class for adding Audio handling capabilities."""
 
@@ -1107,8 +1108,7 @@ def save_to_audio(self, save_path: str, *args: List, **kwargs: Dict) -> None:
         def _is_audio_file(file_path):
             mime_type, _ = mimetypes.guess_type(file_path)
             return mime_type is not None and mime_type.startswith("audio/")
-
-
+
         audio = self._to_audio()
         if not _is_audio_file(save_path):
             fn = Path(self._get_input_fn())
@@ -1117,13 +1117,18 @@ def _is_audio_file(file_path):
             base_save_path = Path(save_path)
             for key in audio:
                 save_path = base_save_path / f"{stem}_{key}{suffix}"
-                self._audio_writer.write(save_path.as_posix(), audio[key], *args, **kwargs)
+                self._audio_writer.write(
+                    save_path.as_posix(), audio[key], *args, **kwargs
+                )
         else:
             if len(audio) > 1:
                 logging.warning(
                     f"The result has multiple audio files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
-            self._audio_writer.write(save_path, audio[list(audio.keys())[0]], *args, **kwargs)
+            self._audio_writer.write(
+                save_path, audio[list(audio.keys())[0]], *args, **kwargs
+            )
+
 
 class MarkdownMixin:
     """Mixin class for adding Markdown handling capabilities."""

diff --git a/paddlex/inference/models/__init__.py b/paddlex/inference/models/__init__.py
@@ -46,14 +46,15 @@
 from .table_structure_recognition import TablePredictor
 from .text_detection import TextDetPredictor
 from .text_recognition import TextRecPredictor
+from .text_to_pinyin import TextToPinyinPredictor
+from .text_to_speech_acoustic import Fastspeech2Predictor
+from .text_to_speech_vocoder import PwganPredictor
 from .ts_anomaly_detection import TSAdPredictor
 from .ts_classification import TSClsPredictor
 from .ts_forecasting import TSFcPredictor
 from .video_classification import VideoClasPredictor
 from .video_detection import VideoDetPredictor
-from .text_to_speech_acoustic import Fastspeech2Predictor
-from .text_to_speech_vocoder import PwganPredictor
-from .text_to_pinyin import TextToPinyinPredictor
+
 
 def create_predictor(
     model_name: str,

diff --git a/paddlex/inference/models/common/static_infer.py b/paddlex/inference/models/common/static_infer.py
@@ -358,7 +358,8 @@ def _create(
             logging.debug("`device_id` has been set to None")
 
         if (
-            self._option.device_type in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu", "iluvatar_gpu")
+            self._option.device_type
+            in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu", "iluvatar_gpu")
             and self._option.device_id is None
         ):
             self._option.device_id = 0

diff --git a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
@@ -36,8 +36,8 @@
 import numpy as np
 
 from .....utils import logging
-from .....utils.download import download
 from .....utils.cache import CACHE_DIR
+from .....utils.download import download
 
 __all__ = [
     "AddedToken",

diff --git a/paddlex/inference/models/text_to_pinyin/__init__.py b/paddlex/inference/models/text_to_pinyin/__init__.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlex/inference/models/text_to_pinyin/predictor.py b/paddlex/inference/models/text_to_pinyin/predictor.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,14 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-
-from ....utils.func_register import FuncRegister
+from ....modules.text_to_pinyin.model_list import MODELS
 from ...common.batch_sampler import TextBatchSampler
-
 from ..base import BasePredictor
 from .result import TextToPinyinResult
-from ....modules.text_to_pinyin.model_list import MODELS
 
 
 class TextToPinyinPredictor(BasePredictor):
@@ -58,9 +54,7 @@ def _build(self):
         Returns:
             G2PWOnnxConverter: An instance of G2PWOnnxConverter.
         """
-        from .processors import (
-            G2PWOnnxConverter,
-        )
+        from .processors import G2PWOnnxConverter
 
         # build model
         model = G2PWOnnxConverter(
@@ -79,6 +73,4 @@ def process(self, batch_data):
             dict: A dictionary containing the input path and result. The result include the output pinyin dict.
         """
         result = self.model(batch_data[0])
-        return {
-            "result": [result]
-        }
+        return {"result": [result]}
diff --git a/paddlex/inference/models/text_to_pinyin/processors.py b/paddlex/inference/models/text_to_pinyin/processors.py
diff --git a/paddlex/inference/models/text_to_pinyin/result.py b/paddlex/inference/models/text_to_pinyin/result.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from ...common.result import BaseResult
-import copy
+
 
 class TextToPinyinResult(BaseResult):
 

diff --git a/paddlex/inference/models/text_to_speech_acoustic/predictor.py b/paddlex/inference/models/text_to_speech_acoustic/predictor.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlex/inference/models/text_to_speech_acoustic/result.py b/paddlex/inference/models/text_to_speech_acoustic/result.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlex/inference/models/text_to_speech_vocoder/predictor.py b/paddlex/inference/models/text_to_speech_vocoder/predictor.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlex/inference/models/text_to_speech_vocoder/result.py b/paddlex/inference/models/text_to_speech_vocoder/result.py
@@ -1,4 +1,4 @@
-# copyright (c) 2025 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py b/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py
@@ -167,7 +167,7 @@ def generate_prompt(
 
         after_rule = "9. 请在翻译完成后添加特殊标记 <<END>>，确保翻译完整。"
         prompt = f"""{task_description}{rules_str}{after_rule}{output_format}{few_shot_demo_text_content}{few_shot_demo_key_value_list}"""
-        
+
         language_name = language_map.get(language, language)
         task_type = self.task_type
         if task_type == "translate_prompt":

diff --git a/paddlex/inference/pipelines/pp_doctranslation/utils.py b/paddlex/inference/pipelines/pp_doctranslation/utils.py
@@ -173,9 +173,10 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
     Returns:
         None
     """
-    from bs4 import BeautifulSoup
     import copy
 
+    from bs4 import BeautifulSoup
+
     # If the HTML is short and simple, translate directly
     if (
         html_block.count("<") < 5
@@ -203,7 +204,7 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
                 td_batch_nodes.append(parent_td)
                 td_batch_texts.append(td_text)
             td_seen.add(id(parent_td))
-            
+
     # Process <td>/<th> nodes in batches
     batch_size = chunk_size
     i = 0
@@ -212,12 +213,15 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
         batch_nodes = []
         batch_texts = []
         current_length = 0
-        while i < len(td_batch_nodes) and current_length + len(td_batch_texts[i]) <= batch_size:
+        while (
+            i < len(td_batch_nodes)
+            and current_length + len(td_batch_texts[i]) <= batch_size
+        ):
             batch_nodes.append(td_batch_nodes[i])
             batch_texts.append(td_batch_texts[i])
             current_length += len(td_batch_texts[i])
             i += 1
-        
+
         # Translate the batch and reinsert translated content
         placeholder = "__TD__"
         batch_text = placeholder.join(batch_texts)
@@ -230,7 +234,6 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
             for child in frag.contents:
                 td_node.append(copy.deepcopy(child))
 
-
     text_nodes = []
     for node in soup.find_all(string=True, recursive=True):
         if not node.find_parent(["td", "th"]) and node.strip():
@@ -245,7 +248,9 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
         while idx < total:
             node_text = text_nodes[idx].strip()
             if len(node_text) > chunk_size:
-                translated_text = split_text_recursive(node_text, chunk_size, translate_func)
+                translated_text = split_text_recursive(
+                    node_text, chunk_size, translate_func
+                )
                 text_nodes[idx].replace_with(translated_text)
                 idx += 1
                 continue

diff --git a/paddlex/inference/utils/io/__init__.py b/paddlex/inference/utils/io/__init__.py
@@ -24,6 +24,7 @@
     YAMLReader,
 )
 from .writers import (
+    AudioWriter,
     CSVWriter,
     HtmlWriter,
     ImageWriter,
@@ -34,5 +35,4 @@
     WriterType,
     XlsxWriter,
     YAMLWriter,
-    AudioWriter,
 )
diff --git a/paddlex/inference/utils/io/writers.py b/paddlex/inference/utils/io/writers.py
@@ -19,7 +19,6 @@
 
 import numpy as np
 import pandas as pd
-import soundfile as sf
 import yaml
 from PIL import Image
 
@@ -29,6 +28,9 @@
 if is_dep_available("opencv-contrib-python"):
     import cv2
 
+if is_dep_available("soundfile"):
+    import soundfile as sf
+
 
 __all__ = [
     "WriterType",

diff --git a/paddlex/inference/utils/pp_option.py b/paddlex/inference/utils/pp_option.py
@@ -18,7 +18,7 @@
 
 from ...utils import logging
 from ...utils.device import get_default_device, parse_device, set_env_for_device_type
-from ...utils.flags import ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT, DISABLE_DEVICE_FALLBACK
+from ...utils.flags import DISABLE_DEVICE_FALLBACK, ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT
 from .misc import is_mkldnn_available
 from .mkldnn_blocklist import MKLDNN_BLOCKLIST
 from .new_ir_blocklist import NEWIR_BLOCKLIST
@@ -84,7 +84,10 @@ def setdefault_by_model_name(self, model_name):
         if self.device_type == "gpu":
             import paddle
 
-            if not (paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0):
+            if not (
+                paddle.device.is_compiled_with_cuda()
+                and paddle.device.cuda.device_count() > 0
+            ):
                 if DISABLE_DEVICE_FALLBACK:
                     raise RuntimeError(
                         "Device fallback is disabled and the specified device (GPU) is not available. "