Skip to content

Commit

Permalink
Remove spaces after punctuations for TTS (#1666)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Dec 31, 2024
1 parent d353853 commit ebe92e5
Showing 1 changed file with 73 additions and 3 deletions.
76 changes: 73 additions & 3 deletions sherpa-onnx/csrc/jieba-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <fstream>
#include <regex> // NOLINT
#include <unordered_set>
#include <utility>

#include "cppjieba/Jieba.hpp"
Expand All @@ -16,6 +17,14 @@

namespace sherpa_onnx {

// Returns true if `s` is exactly one of the punctuation tokens listed below:
// the ASCII marks , . ! ? : " ' and their full-width / CJK counterparts.
// Any other string, including the empty string, is not punctuation.
//
// The non-ASCII entries are spelled as UTF-8 byte escapes so the table does
// not depend on the encoding of this source file (or on tools that drop
// non-ASCII characters, which would silently turn each entry into "").
static bool IsPunct(const std::string &s) {
  static const std::unordered_set<std::string> puncts = {
      ",",
      ".",
      "!",
      "?",
      ":",
      "\"",
      "'",
      "\xef\xbc\x8c",  // U+FF0C FULLWIDTH COMMA
      "\xe3\x80\x82",  // U+3002 IDEOGRAPHIC FULL STOP
      "\xef\xbc\x81",  // U+FF01 FULLWIDTH EXCLAMATION MARK
      "\xef\xbc\x9f",  // U+FF1F FULLWIDTH QUESTION MARK
      "\xe2\x80\x9c",  // U+201C LEFT DOUBLE QUOTATION MARK
      "\xe2\x80\x9d",  // U+201D RIGHT DOUBLE QUOTATION MARK
      "\xe2\x80\x98",  // U+2018 LEFT SINGLE QUOTATION MARK
      "\xe2\x80\x99",  // U+2019 RIGHT SINGLE QUOTATION MARK
  };
  // count() yields 0 or 1 for a set; compare explicitly instead of relying on
  // an implicit size_t -> bool conversion.
  return puncts.count(s) != 0;
}

class JiebaLexicon::Impl {
public:
Impl(const std::string &lexicon, const std::string &tokens,
Expand Down Expand Up @@ -67,8 +76,13 @@ class JiebaLexicon::Impl {
jieba_->Cut(text, words, is_hmm);

if (debug_) {
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());
#if __OHOS__
SHERPA_ONNX_LOGE("input text:\n%{public}s", text.c_str());
SHERPA_ONNX_LOGE("after replacing punctuations:\n%{public}s", s.c_str());
#else
SHERPA_ONNX_LOGE("input text:\n%s", text.c_str());
SHERPA_ONNX_LOGE("after replacing punctuations:\n%s", s.c_str());
#endif

std::ostringstream os;
std::string sep = "";
Expand All @@ -77,7 +91,52 @@ class JiebaLexicon::Impl {
sep = "_";
}

SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
#if __OHOS__
SHERPA_ONNX_LOGE("after jieba processing:\n%{public}s", os.str().c_str());
#else
SHERPA_ONNX_LOGE("after jieba processing:\n%s", os.str().c_str());
#endif
}

// remove spaces after punctuations
std::vector<std::string> words2 = std::move(words);
words.reserve(words2.size());

for (int32_t i = 0; i < words2.size(); ++i) {
if (i == 0) {
words.push_back(std::move(words2[i]));
} else if (words2[i] == " ") {
if (words.back() == " " || IsPunct(words.back())) {
continue;
} else {
words.push_back(std::move(words2[i]));
}
} else if (IsPunct(words2[i])) {
if (words.back() == " " || IsPunct(words.back())) {
continue;
} else {
words.push_back(std::move(words2[i]));
}
} else {
words.push_back(std::move(words2[i]));
}
}

if (debug_) {
std::ostringstream os;
std::string sep = "";
for (const auto &w : words) {
os << sep << w;
sep = "_";
}

#if __OHOS__
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%{public}s",
os.str().c_str());
#else
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%s",
os.str().c_str());
#endif
}

std::vector<TokenIDs> ans;
Expand All @@ -86,7 +145,11 @@ class JiebaLexicon::Impl {
for (const auto &w : words) {
auto ids = ConvertWordToIds(w);
if (ids.empty()) {
#if __OHOS__
SHERPA_ONNX_LOGE("Ignore OOV '%{public}s'", w.c_str());
#else
SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
#endif
continue;
}

Expand Down Expand Up @@ -173,8 +236,15 @@ class JiebaLexicon::Impl {
ToLowerCase(&word);

if (word2ids_.count(word)) {
#if __OHOS__
SHERPA_ONNX_LOGE(
"Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore "
"it.",
word.c_str(), line_num, line.c_str());
#else
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
word.c_str(), line_num, line.c_str());
#endif
continue;
}

Expand Down

0 comments on commit ebe92e5

Please sign in to comment.