From 9ee0f538d190d6119792a2b0e89864f2987cc716 Mon Sep 17 00:00:00 2001 From: Loc Huynh Date: Sat, 21 Mar 2026 12:26:51 +0700 Subject: [PATCH 1/5] feat: Refactor backspace and UI shortcut logic, add input validation, update module path, and include new tests. --- bamboo.go | 39 +++++++++++++++++++++++++++++---------- bamboo_test.go | 34 ++++++++++++++++++++++++++++++++++ go.mod | 2 +- 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/bamboo.go b/bamboo.go index 27c8ae9..b4fd8f6 100644 --- a/bamboo.go +++ b/bamboo.go @@ -98,6 +98,9 @@ func (e *BambooEngine) GetProcessedString(mode Mode) string { } func (e *BambooEngine) getApplicableRules(key rune) []Rule { + if !e.CanProcessKey(key) { + return nil + } var applicableRules []Rule for _, inputRule := range e.inputMethod.Rules { if inputRule.Key == unicode.ToLower(key) { @@ -122,12 +125,7 @@ func (e *BambooEngine) generateTransformations(composition []*Transformation, lo // transformation fall-backs to an APPENDING one. transformations = generateFallbackTransformations(composition, e.getApplicableRules(lowerKey), lowerKey, isUpperCase) var newComposition = append(composition, transformations...) - - // Implement the uwo+ typing shortcut by creating a virtual - // Mark.HORN rule that targets 'u' or 'o'. - if virtualTrans := e.applyUowShortcut(newComposition); virtualTrans != nil { - transformations = append(transformations, virtualTrans) - } + transformations = append(transformations, e.applyUIShortcuts(newComposition)...) } /** * Sometimes, a tone's position in a previous state must be changed to fit the new state @@ -140,6 +138,16 @@ func (e *BambooEngine) generateTransformations(composition []*Transformation, lo return transformations } +func (e *BambooEngine) applyUIShortcuts(syllable []*Transformation) []*Transformation { + var transformations []*Transformation + // Implement the uwo+ typing shortcut by creating a virtual + // Mark.HORN rule that targets 'u' or 'o'. + if virtualTrans := e.applyUowShortcut(syllable); virtualTrans != nil { + transformations = append(transformations, virtualTrans) + } + return transformations +} + func (e *BambooEngine) newComposition(composition []*Transformation, key rune, isUpperCase bool) []*Transformation { // Just process the key stroke on the last syllable var previousTransformations, lastSyllable = extractLastSyllable(composition) @@ -184,6 +192,10 @@ func (e *BambooEngine) ProcessString(str string, mode Mode) { func (e *BambooEngine) ProcessKey(key rune, mode Mode) { var lowerKey = unicode.ToLower(key) var isUpperCase = unicode.IsUpper(key) + if mode&EnglishMode == 0 && (key == '\b' || key == 0x7f) { + e.handleBackspace() + return + } if mode&EnglishMode != 0 || !e.CanProcessKey(lowerKey) { if mode&InReverseOrder != 0 { e.composition = append([]*Transformation{newAppendingTrans(lowerKey, isUpperCase)}, e.composition...) @@ -217,9 +229,19 @@ func (e *BambooEngine) Reset() { // Find the last APPENDING transformation and all // the transformations that add effects to it. -func (e *BambooEngine) RemoveLastChar(refreshLastToneTarget bool) { +func (e *BambooEngine) RemoveLastChar(refreshLastTone bool) { + e.handleBackspace() + if refreshLastTone { + e.composition = append(e.composition, e.refreshLastToneTarget(e.composition)...) + } +} + +func (e *BambooEngine) handleBackspace() { var lastAppending = findLastAppendingTrans(e.composition) if lastAppending == nil { + if len(e.composition) > 0 { + e.composition = e.composition[:len(e.composition)-1] + } return } if !e.CanProcessKey(lastAppending.Rule.Key) { @@ -234,9 +256,6 @@ func (e *BambooEngine) RemoveLastChar(refreshLastToneTarget bool) { } newComb = append(newComb, t) } - if refreshLastToneTarget { - newComb = append(newComb, e.refreshLastToneTarget(newComb)...) - } e.composition = append(previous, newComb...) } diff --git a/bamboo_test.go b/bamboo_test.go index be10e49..691d4aa 100644 --- a/bamboo_test.go +++ b/bamboo_test.go @@ -646,6 +646,40 @@ func TestDoubleTyping(t *testing.T) { var ng = newStdEngine() +func TestProcessKey_Backspace(t *testing.T) { + e := newStdEngine() + e.ProcessString("chao", VietnameseMode) + if e.GetProcessedString(VietnameseMode) != "chao" { + t.Errorf("Expected chao, got %s", e.GetProcessedString(VietnameseMode)) + } + e.ProcessKey('s', VietnameseMode) + if e.GetProcessedString(VietnameseMode) != "cháo" { + t.Errorf("Expected cháo, got %s", e.GetProcessedString(VietnameseMode)) + } + e.ProcessKey('\b', VietnameseMode) + if e.GetProcessedString(VietnameseMode) != "chá" { + t.Errorf("Expected chá after backspace, got %s", e.GetProcessedString(VietnameseMode)) + } +} + +func TestGetApplicableRules_Invalid(t *testing.T) { + e := newStdEngine().(*BambooEngine) + rules := e.getApplicableRules('😊') + if rules != nil { + t.Errorf("Expected nil rules for emoji, got %v", rules) + } +} + +func TestCanProcessKey_Invalid(t *testing.T) { + e := newStdEngine() + if e.CanProcessKey('😊') { + t.Error("Expected CanProcessKey to return false for emoji") + } + if !e.CanProcessKey('a') { + t.Error("Expected CanProcessKey to return true for 'a'") + } +} + func BenchmarkRemoveLastChar(b *testing.B) { b.ReportAllocs() b.ResetTimer() diff --git a/go.mod b/go.mod index 6fd54b7..890eb48 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/BambooEngine/bamboo-core +module github.com/LotusInputMethod/bamboo-core go 1.18 From b4872ea2c99a2f1f015ce9f23e02f8a590940758 Mon Sep 17 00:00:00 2001 From: Loc Huynh Date: Sat, 21 Mar 2026 12:40:46 +0700 Subject: [PATCH 2/5] refactor: simplify backspace handling logic and update related test expectations. --- bamboo.go | 24 +++++------------------- bamboo_test.go | 36 ++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/bamboo.go b/bamboo.go index b4fd8f6..14e01d7 100644 --- a/bamboo.go +++ b/bamboo.go @@ -139,13 +139,12 @@ func (e *BambooEngine) generateTransformations(composition []*Transformation, lo } func (e *BambooEngine) applyUIShortcuts(syllable []*Transformation) []*Transformation { - var transformations []*Transformation // Implement the uwo+ typing shortcut by creating a virtual // Mark.HORN rule that targets 'u' or 'o'. if virtualTrans := e.applyUowShortcut(syllable); virtualTrans != nil { - transformations = append(transformations, virtualTrans) + return []*Transformation{virtualTrans} } - return transformations + return nil } func (e *BambooEngine) newComposition(composition []*Transformation, key rune, isUpperCase bool) []*Transformation { @@ -237,26 +236,13 @@ func (e *BambooEngine) RemoveLastChar(refreshLastTone bool) { } func (e *BambooEngine) handleBackspace() { - var lastAppending = findLastAppendingTrans(e.composition) - if lastAppending == nil { - if len(e.composition) > 0 { - e.composition = e.composition[:len(e.composition)-1] - } + if len(e.composition) == 0 { return } - if !e.CanProcessKey(lastAppending.Rule.Key) { + e.composition = e.composition[:len(e.composition)-1] + for len(e.composition) > 0 && e.composition[len(e.composition)-1].Rule.Key == 0 { e.composition = e.composition[:len(e.composition)-1] - return - } - var previous, lastComb = extractLastWord(e.composition, e.GetInputMethod().Keys) - var newComb []*Transformation - for _, t := range lastComb { - if t.Target == lastAppending || t == lastAppending { - continue - } - newComb = append(newComb, t) } - e.composition = append(previous, newComb...) } /***** END SIDE-EFFECT METHODS ******/ diff --git a/bamboo_test.go b/bamboo_test.go index 691d4aa..d237cdb 100644 --- a/bamboo_test.go +++ b/bamboo_test.go @@ -75,8 +75,8 @@ func TestProcessThuowString(t *testing.T) { t.Errorf("Process [Thuow], got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thuơ") } ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "Thu" { - t.Errorf("Process [Thuow] and remove last char, got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thu") + if ng.GetProcessedString(VietnameseMode) != "Thuo" { + t.Errorf("Process [Thuow] and remove last char, got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thuo") } } @@ -90,13 +90,13 @@ func TestBambooEngine_RemoveLastChar(t *testing.T) { t.Errorf("Process [loanj], got [%s] expected [loạn]", ng.GetProcessedString(VietnameseMode)) } ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "lọa" { - t.Errorf("Process [loanj-1], got [%s] expected [lọa]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "loan" { + t.Errorf("Process [loanj-1], got [%s] expected [loan]", ng.GetProcessedString(VietnameseMode)) } ng.ProcessString(":", EnglishMode) ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "lọa" { - t.Errorf("Process [loanj-1], got [%s] expected [lọa]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "loan" { + t.Errorf("Process [loanj-1], got [%s] expected [loan]", ng.GetProcessedString(VietnameseMode)) } } @@ -107,12 +107,12 @@ func TestProcessUpperString(t *testing.T) { t.Errorf("Process [VIEETJ], got [%s] expected [VIỆT]", ng.GetProcessedString(VietnameseMode)) } ng.RemoveLastChar(false) - if ng.GetProcessedString(VietnameseMode) != "VIỆ" { - t.Errorf("Process remove last char of upper string, got [%s] expected [VIỆ]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "VIÊT" { + t.Errorf("Process remove last char of upper string, got [%s] expected [VIÊT]", ng.GetProcessedString(VietnameseMode)) } ng.ProcessKey('Q', VietnameseMode) - if ng.GetProcessedString(EnglishMode) != "VIEEJQ" { - t.Errorf("Process remove last char of upper string, got [%s] expected [VIEEJQ]", ng.GetProcessedString(EnglishMode)) + if ng.GetProcessedString(EnglishMode) != "VIEETQ" { + t.Errorf("Process remove last char of upper string, got [%s] expected [VIEETQ]", ng.GetProcessedString(EnglishMode)) } ng.Reset() ng.ProcessString("IB", EnglishMode) @@ -196,8 +196,8 @@ func TestRemoveLastChar(t *testing.T) { ng := newStdEngine() ng.ProcessString("hanhj", VietnameseMode) ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "hạn" { - t.Errorf("Process [hanhj], got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "hạn") + if ng.GetProcessedString(VietnameseMode) != "hanh" { + t.Errorf("Process [hanhj], got [%s] expected [hanh]", ng.GetProcessedString(VietnameseMode)) } ng.Reset() } @@ -300,8 +300,8 @@ func TestProcessRefresh2(t *testing.T) { ng.ProcessString("reff", VietnameseMode) ng.RemoveLastChar(true) ng.ProcessKey('f', VietnameseMode) - if ng.GetProcessedString(VietnameseMode) != "rè" { - t.Errorf("Process reff-1+f, got [%v] expected [rè]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "ref" { + t.Errorf("Process reff-1+f, got [%v] expected [ref]", ng.GetProcessedString(VietnameseMode)) } } @@ -606,8 +606,8 @@ func TestDoubleTyping(t *testing.T) { ng.RemoveLastChar(true) ng.RemoveLastChar(true) // ng.ProcessString("r", VietnameseMode) - if ng.GetProcessedString(VietnameseMode) != "tủ" { - t.Errorf("Process turyen,BS,BS,BS,r, got [%s] expected [tủ]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "tủy" { + t.Errorf("Process turyen,BS,BS,BS,r, got [%s] expected [tủy]", ng.GetProcessedString(VietnameseMode)) } ng.Reset() ng.ProcessString("chuyển", VietnameseMode) @@ -657,8 +657,8 @@ func TestProcessKey_Backspace(t *testing.T) { t.Errorf("Expected cháo, got %s", e.GetProcessedString(VietnameseMode)) } e.ProcessKey('\b', VietnameseMode) - if e.GetProcessedString(VietnameseMode) != "chá" { - t.Errorf("Expected chá after backspace, got %s", e.GetProcessedString(VietnameseMode)) + if e.GetProcessedString(VietnameseMode) != "chao" { + t.Errorf("Expected chao after backspace, got %s", e.GetProcessedString(VietnameseMode)) } } From bd87e7d3b91fa8daad1b544b89e1674168ded10f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hu=E1=BB=B3nh=20Thi=E1=BB=87n=20L=E1=BB=99c?= Date: Sun, 22 Mar 2026 11:42:10 +0700 Subject: [PATCH 3/5] Fix Telex uong shortcut transformation --- include/bamboo/IEngine.h | 29 + src/engine/charset_definition.cpp | 2281 ++++++++++++++++++++++++ src/engine/charset_definition.h | 31 + src/engine/encoder.cpp | 15 + src/engine/encoder.h | 15 + src/engine/engine.cpp | 260 +++ src/engine/engine.h | 46 + src/engine/input_method_definition.cpp | 233 +++ src/engine/input_method_definition.h | 27 + src/engine/rule.h | 57 + src/engine/rules_parser.cpp | 312 ++++ src/engine/rules_parser.h | 15 + src/engine/spelling.cpp | 225 +++ src/engine/spelling.h | 41 + src/engine/transformation_utils.cpp | 448 +++++ src/engine/transformation_utils.h | 42 + tests/engine_parity_smoke.cpp | 78 + 17 files changed, 4155 insertions(+) create mode 100644 include/bamboo/IEngine.h create mode 100644 src/engine/charset_definition.cpp create mode 100644 src/engine/charset_definition.h create mode 100644 src/engine/encoder.cpp create mode 100644 src/engine/encoder.h create mode 100644 src/engine/engine.cpp create mode 100644 src/engine/engine.h create mode 100644 src/engine/input_method_definition.cpp create mode 100644 src/engine/input_method_definition.h create mode 100644 src/engine/rule.h create mode 100644 src/engine/rules_parser.cpp create mode 100644 src/engine/rules_parser.h create mode 100644 src/engine/spelling.cpp create mode 100644 src/engine/spelling.h create mode 100644 src/engine/transformation_utils.cpp create mode 100644 src/engine/transformation_utils.h create mode 100644 tests/engine_parity_smoke.cpp diff --git a/include/bamboo/IEngine.h b/include/bamboo/IEngine.h new file mode 100644 index 0000000..fe2910c --- /dev/null +++ b/include/bamboo/IEngine.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include + +namespace bamboo::api { + +enum class Mode { Vietnamese, English }; + +class IEngine { +public: + virtual ~IEngine() = default; + + virtual void setMode(Mode mode) = 0; + [[nodiscard]] virtual Mode getMode() const = 0; + virtual void reset() = 0; + virtual void processKey(char32_t key) = 0; + virtual void processString(std::string_view str) = 0; + [[nodiscard]] virtual std::string getProcessedString() const = 0; + [[nodiscard]] virtual bool isValid(bool inputIsFullComplete) const = 0; + virtual void removeLastChar(bool refreshLastToneTarget) = 0; + virtual void restoreLastWord(bool toVietnamese) = 0; +}; + +[[nodiscard]] std::unique_ptr createEngine(std::string_view dataDirPath, + std::string_view inputMethod); + +} // namespace bamboo::api diff --git a/src/engine/charset_definition.cpp b/src/engine/charset_definition.cpp new file mode 100644 index 0000000..8d444f4 --- /dev/null +++ b/src/engine/charset_definition.cpp @@ -0,0 +1,2281 @@ +#include "charset_definition.h" +#include +#include + +namespace bamboo::engine { +namespace { + +using MappingView = CharsetDefinition::MappingView; +using DefinitionView = CharsetDefinition::DefinitionView; + +constexpr std::array kCharsetMappings0{{ + {U'À', "µ"}, + {U'Á', "¸"}, + {U'Â', "¢"}, + {U'Ã', "·"}, + {U'È', "Ì"}, + {U'É', "Ð"}, + {U'Ê', "£"}, + {U'Ì', "×"}, + {U'Í', "Ý"}, + {U'Ò', "ß"}, + {U'Ó', "ã"}, + {U'Ô', "¤"}, + {U'Õ', "â"}, + {U'Ù', "ï"}, + {U'Ú', "ó"}, + {U'Ý', "ý"}, + {U'à', "µ"}, + {U'á', "¸"}, + {U'â', "©"}, + {U'ã', "·"}, + {U'è', "Ì"}, + {U'é', "Ð"}, + {U'ê', "ª"}, + {U'ì', "×"}, + {U'í', "Ý"}, + {U'ò', "ß"}, + {U'ó', "ã"}, + {U'ô', "«"}, + {U'õ', "â"}, + {U'ù', "ï"}, + {U'ú', "ó"}, + {U'ý', "ý"}, + {U'Ă', "¡"}, + {U'ă', "¨"}, + {U'Đ', "§"}, + {U'đ', "®"}, + {U'Ĩ', "Ü"}, + {U'ĩ', "Ü"}, + {U'Ũ', "ò"}, + {U'ũ', "ò"}, + {U'Ơ', "¥"}, + {U'ơ', "¬"}, + {U'Ư', "¦"}, + {U'ư', "\\u00ad"}, + {U'Ạ', "¹"}, + {U'ạ', "¹"}, + {U'Ả', "¶"}, + {U'ả', "¶"}, + {U'Ấ', "Ê"}, + {U'ấ', "Ê"}, + {U'Ầ', "Ç"}, + {U'ầ', "Ç"}, + {U'Ẩ', "È"}, + {U'ẩ', "È"}, + {U'Ẫ', "É"}, + {U'ẫ', "É"}, + {U'Ậ', "Ë"}, + {U'ậ', "Ë"}, + {U'Ắ', "¾"}, + {U'ắ', "¾"}, + {U'Ằ', "»"}, + {U'ằ', "»"}, + {U'Ẳ', "¼"}, + {U'ẳ', "¼"}, + {U'Ẵ', "½"}, + {U'ẵ', "½"}, + {U'Ặ', "Æ"}, + {U'ặ', "Æ"}, + {U'Ẹ', "Ñ"}, + {U'ẹ', "Ñ"}, + {U'Ẻ', "Î"}, + {U'ẻ', "Î"}, + {U'Ẽ', "Ï"}, + {U'ẽ', "Ï"}, + {U'Ế', "Õ"}, + {U'ế', "Õ"}, + {U'Ề', "Ò"}, + {U'ề', "Ò"}, + {U'Ể', "Ó"}, + {U'ể', "Ó"}, + {U'Ễ', "Ô"}, + {U'ễ', "Ô"}, + {U'Ệ', "Ö"}, + {U'ệ', "Ö"}, + {U'Ỉ', "Ø"}, + {U'ỉ', "Ø"}, + {U'Ị', "Þ"}, + {U'ị', "Þ"}, + {U'Ọ', "ä"}, + {U'ọ', "ä"}, + {U'Ỏ', "á"}, + {U'ỏ', "á"}, + {U'Ố', "è"}, + {U'ố', "è"}, + {U'Ồ', "å"}, + {U'ồ', "å"}, + {U'Ổ', "æ"}, + {U'ổ', "æ"}, + {U'Ỗ', "ç"}, + {U'ỗ', "ç"}, + {U'Ộ', "é"}, + {U'ộ', "é"}, + {U'Ớ', "í"}, + {U'ớ', "í"}, + {U'Ờ', "ê"}, + {U'ờ', "ê"}, + {U'Ở', "ë"}, + {U'ở', "ë"}, + {U'Ỡ', "ì"}, + {U'ỡ', "ì"}, + {U'Ợ', "î"}, + {U'ợ', "î"}, + {U'Ụ', "ô"}, + {U'ụ', "ô"}, + {U'Ủ', "ñ"}, + {U'ủ', "ñ"}, + {U'Ứ', "ø"}, + {U'ứ', "ø"}, + {U'Ừ', "õ"}, + {U'ừ', "õ"}, + {U'Ử', "ö"}, + {U'ử', "ö"}, + {U'Ữ', "÷"}, + {U'ữ', "÷"}, + {U'Ự', "ù"}, + {U'ự', "ù"}, + {U'Ỳ', "ú"}, + {U'ỳ', "ú"}, + {U'Ỵ', "þ"}, + {U'ỵ', "þ"}, + {U'Ỷ', "û"}, + {U'ỷ', "û"}, + {U'Ỹ', "ü"}, + {U'ỹ', "ü"}, +}}; + +constexpr std::array kCharsetMappings1{{ + {U'À', "AØ"}, + {U'Á', "AÙ"}, + {U'Â', "AÂ"}, + {U'Ã', "AÕ"}, + {U'È', "EØ"}, + {U'É', "EÙ"}, + {U'Ê', "EÂ"}, + {U'Ì', "Ì"}, + {U'Í', "Í"}, + {U'Ò', "OØ"}, + {U'Ó', "OÙ"}, + {U'Ô', "OÂ"}, + {U'Õ', "OÕ"}, + {U'Ù', "UØ"}, + {U'Ú', "UÙ"}, + {U'Ý', "YÙ"}, + {U'à', "aø"}, + {U'á', "aù"}, + {U'â', "aâ"}, + {U'ã', "aõ"}, + {U'è', "eø"}, + {U'é', "eù"}, + {U'ê', "eâ"}, + {U'ì', "ì"}, + {U'í', "í"}, + {U'ò', "oø"}, + {U'ó', "où"}, + {U'ô', "oâ"}, + {U'õ', "oõ"}, + {U'ù', "uø"}, + {U'ú', "uù"}, + {U'ý', "yù"}, + {U'Ă', "AÊ"}, + {U'ă', "aê"}, + {U'Đ', "Ñ"}, + {U'đ', "ñ"}, + {U'Ĩ', "Ó"}, + {U'ĩ', "ó"}, + {U'Ũ', "UÕ"}, + {U'ũ', "uõ"}, + {U'Ơ', "Ô"}, + {U'ơ', "ô"}, + {U'Ư', "Ö"}, + {U'ư', "ö"}, + {U'Ạ', "AÏ"}, + {U'ạ', "aï"}, + {U'Ả', "AÛ"}, + {U'ả', "aû"}, + {U'Ấ', "AÁ"}, + {U'ấ', "aá"}, + {U'Ầ', "AÀ"}, + {U'ầ', "aà"}, + {U'Ẩ', "AÅ"}, + {U'ẩ', "aå"}, + {U'Ẫ', "AÃ"}, + {U'ẫ', "aã"}, + {U'Ậ', "AÄ"}, + {U'ậ', "aä"}, + {U'Ắ', "AÉ"}, + {U'ắ', "aé"}, + {U'Ằ', "AÈ"}, + {U'ằ', "aè"}, + {U'Ẳ', "AÚ"}, + {U'ẳ', "aú"}, + {U'Ẵ', "AÜ"}, + {U'ẵ', "aü"}, + {U'Ặ', "AË"}, + {U'ặ', "aë"}, + {U'Ẹ', "EÏ"}, + {U'ẹ', "eï"}, + {U'Ẻ', "EÛ"}, + {U'ẻ', "eû"}, + {U'Ẽ', "EÕ"}, + {U'ẽ', "eõ"}, + {U'Ế', "EÁ"}, + {U'ế', "eá"}, + {U'Ề', "EÀ"}, + {U'ề', "eà"}, + {U'Ể', "EÅ"}, + {U'ể', "eå"}, + {U'Ễ', "EÃ"}, + {U'ễ', "eã"}, + {U'Ệ', "EÄ"}, + {U'ệ', "eä"}, + {U'Ỉ', "Æ"}, + {U'ỉ', "æ"}, + {U'Ị', "Ò"}, + {U'ị', "ò"}, + {U'Ọ', "OÏ"}, + {U'ọ', "oï"}, + {U'Ỏ', "OÛ"}, + {U'ỏ', "oû"}, + {U'Ố', "OÁ"}, + {U'ố', "oá"}, + {U'Ồ', "OÀ"}, + {U'ồ', "oà"}, + {U'Ổ', "OÅ"}, + {U'ổ', "oå"}, + {U'Ỗ', "OÃ"}, + {U'ỗ', "oã"}, + {U'Ộ', "OÄ"}, + {U'ộ', "oä"}, + {U'Ớ', "ÔÙ"}, + {U'ớ', "ôù"}, + {U'Ờ', "ÔØ"}, + {U'ờ', "ôø"}, + {U'Ở', "ÔÛ"}, + {U'ở', "ôû"}, + {U'Ỡ', "ÔÕ"}, + {U'ỡ', "ôõ"}, + {U'Ợ', "ÔÏ"}, + {U'ợ', "ôï"}, + {U'Ụ', "UÏ"}, + {U'ụ', "uï"}, + {U'Ủ', "UÛ"}, + {U'ủ', "uû"}, + {U'Ứ', "ÖÙ"}, + {U'ứ', "öù"}, + {U'Ừ', "ÖØ"}, + {U'ừ', "öø"}, + {U'Ử', "ÖÛ"}, + {U'ử', "öû"}, + {U'Ữ', "ÖÕ"}, + {U'ữ', "öõ"}, + {U'Ự', "ÖÏ"}, + {U'ự', "öï"}, + {U'Ỳ', "YØ"}, + {U'ỳ', "yø"}, + {U'Ỵ', "Î"}, + {U'ỵ', "î"}, + {U'Ỷ', "YÛ"}, + {U'ỷ', "yû"}, + {U'Ỹ', "YÕ"}, + {U'ỹ', "yõ"}, +}}; + +constexpr std::array kCharsetMappings2{{ + {U'À', "À"}, + {U'Á', "Á"}, + {U'Â', "Â"}, + {U'Ã', "Ã"}, + {U'È', "È"}, + {U'É', "É"}, + {U'Ê', "Ê"}, + {U'Ì', "Ì"}, + {U'Í', "Í"}, + {U'Ò', "Ò"}, + {U'Ó', "Ó"}, + {U'Ô', "Ô"}, + {U'Õ', "Õ"}, + {U'Ù', "Ù"}, + {U'Ú', "Ú"}, + {U'Ý', "Ý"}, + {U'à', "à"}, + {U'á', "á"}, + {U'â', "â"}, + {U'ã', "ã"}, + {U'è', "è"}, + {U'é', "é"}, + {U'ê', "ê"}, + {U'ì', "ì"}, + {U'í', "í"}, + {U'ò', "ò"}, + {U'ó', "ó"}, + {U'ô', "ô"}, + {U'õ', "õ"}, + {U'ù', "ù"}, + {U'ú', "ú"}, + {U'ý', "ý"}, + {U'Ă', "Ă"}, + {U'ă', "ă"}, + {U'Đ', "Đ"}, + {U'đ', "đ"}, + {U'Ĩ', "Ĩ"}, + {U'ĩ', "ĩ"}, + {U'Ũ', "Ũ"}, + {U'ũ', "ũ"}, + {U'Ơ', "Ơ"}, + {U'ơ', "ơ"}, + {U'Ư', "Ư"}, + {U'ư', "ư"}, + {U'Ạ', "Ạ"}, + {U'ạ', "ạ"}, + {U'Ả', "Ả"}, + {U'ả', "ả"}, + {U'Ấ', "Ấ"}, + {U'ấ', "ấ"}, + {U'Ầ', "Ầ"}, + {U'ầ', "ầ"}, + {U'Ẩ', "Ẩ"}, + {U'ẩ', "ẩ"}, + {U'Ẫ', "Ẫ"}, + {U'ẫ', "ẫ"}, + {U'Ậ', "Ậ"}, + {U'ậ', "ậ"}, + {U'Ắ', "Ắ"}, + {U'ắ', "ắ"}, + {U'Ằ', "Ằ"}, + {U'ằ', "ằ"}, + {U'Ẳ', "Ẳ"}, + {U'ẳ', "ẳ"}, + {U'Ẵ', "Ẵ"}, + {U'ẵ', "ẵ"}, + {U'Ặ', "Ặ"}, + {U'ặ', "ặ"}, + {U'Ẹ', "Ẹ"}, + {U'ẹ', "ẹ"}, + {U'Ẻ', "Ẻ"}, + {U'ẻ', "ẻ"}, + {U'Ẽ', "Ẽ"}, + {U'ẽ', "ẽ"}, + {U'Ế', "Ế"}, + {U'ế', "ế"}, + {U'Ề', "Ề"}, + {U'ề', "ề"}, + {U'Ể', "Ể"}, + {U'ể', "ể"}, + {U'Ễ', "Ễ"}, + {U'ễ', "ễ"}, + {U'Ệ', "Ệ"}, + {U'ệ', "ệ"}, + {U'Ỉ', "Ỉ"}, + {U'ỉ', "ỉ"}, + {U'Ị', "Ị"}, + {U'ị', "ị"}, + {U'Ọ', "Ọ"}, + {U'ọ', "ọ"}, + {U'Ỏ', "Ỏ"}, + {U'ỏ', "ỏ"}, + {U'Ố', "Ố"}, + {U'ố', "ố"}, + {U'Ồ', "Ồ"}, + {U'ồ', "ồ"}, + {U'Ổ', "Ổ"}, + {U'ổ', "ổ"}, + {U'Ỗ', "Ỗ"}, + {U'ỗ', "ỗ"}, + {U'Ộ', "Ộ"}, + {U'ộ', "ộ"}, + {U'Ớ', "Ớ"}, + {U'ớ', "ớ"}, + {U'Ờ', "Ờ"}, + {U'ờ', "ờ"}, + {U'Ở', "Ở"}, + {U'ở', "ở"}, + {U'Ỡ', "Ỡ"}, + {U'ỡ', "ỡ"}, + {U'Ợ', "Ợ"}, + {U'ợ', "ợ"}, + {U'Ụ', "Ụ"}, + {U'ụ', "ụ"}, + {U'Ủ', "Ủ"}, + {U'ủ', "ủ"}, + {U'Ứ', "Ứ"}, + {U'ứ', "ứ"}, + {U'Ừ', "Ừ"}, + {U'ừ', "ừ"}, + {U'Ử', "Ử"}, + {U'ử', "ử"}, + {U'Ữ', "Ữ"}, + {U'ữ', "ữ"}, + {U'Ự', "Ự"}, + {U'ự', "ự"}, + {U'Ỳ', "Ỳ"}, + {U'ỳ', "ỳ"}, + {U'Ỵ', "Ỵ"}, + {U'ỵ', "ỵ"}, + {U'Ỷ', "Ỷ"}, + {U'ỷ', "ỷ"}, + {U'Ỹ', "Ỹ"}, + {U'ỹ', "ỹ"}, +}}; + +constexpr std::array kCharsetMappings3{{ + {U'À', "AÌ"}, + {U'Á', "Aì"}, + {U'Â', "Â"}, + {U'Ã', "AÞ"}, + {U'È', "EÌ"}, + {U'É', "Eì"}, + {U'Ê', "Ê"}, + {U'Ì', "IÌ"}, + {U'Í', "Iì"}, + {U'Ò', "OÌ"}, + {U'Ó', "Oì"}, + {U'Ô', "Ô"}, + {U'Õ', "OÞ"}, + {U'Ù', "UÌ"}, + {U'Ú', "Uì"}, + {U'Ý', "Yì"}, + {U'à', "aÌ"}, + {U'á', "aì"}, + {U'â', "â"}, + {U'ã', "aÞ"}, + {U'è', "eÌ"}, + {U'é', "eì"}, + {U'ê', "ê"}, + {U'ì', "iÌ"}, + {U'í', "iì"}, + {U'ò', "oÌ"}, + {U'ó', "oì"}, + {U'ô', "ô"}, + {U'õ', "oÞ"}, + {U'ù', "uÌ"}, + {U'ú', "uì"}, + {U'ý', "yì"}, + {U'Ă', "Ã"}, + {U'ă', "ã"}, + {U'Đ', "Đ"}, + {U'đ', "ð"}, + {U'Ĩ', "IÞ"}, + {U'ĩ', "iÞ"}, + {U'Ũ', "UÞ"}, + {U'ũ', "uÞ"}, + {U'Ơ', "Õ"}, + {U'ơ', "õ"}, + {U'Ư', "Ý"}, + {U'ư', "ý"}, + {U'Ạ', "Aò"}, + {U'ạ', "aò"}, + {U'Ả', "AÒ"}, + {U'ả', "aÒ"}, + {U'Ấ', "Âì"}, + {U'ấ', "âì"}, + {U'Ầ', "ÂÌ"}, + {U'ầ', "âÌ"}, + {U'Ẩ', "ÂÒ"}, + {U'ẩ', "âÒ"}, + {U'Ẫ', "ÂÞ"}, + {U'ẫ', "âÞ"}, + {U'Ậ', "Âò"}, + {U'ậ', "âò"}, + {U'Ắ', "Ãì"}, + {U'ắ', "ãì"}, + {U'Ằ', "ÃÌ"}, + {U'ằ', "ãÌ"}, + {U'Ẳ', "ÃÒ"}, + {U'ẳ', "ãÒ"}, + {U'Ẵ', "ÃÞ"}, + {U'ẵ', "ãÞ"}, + {U'Ặ', "Ãò"}, + {U'ặ', "ãò"}, + {U'Ẹ', "Eò"}, + {U'ẹ', "eò"}, + {U'Ẻ', "EÒ"}, + {U'ẻ', "eÒ"}, + {U'Ẽ', "EÞ"}, + {U'ẽ', "eÞ"}, + {U'Ế', "Êì"}, + {U'ế', "êì"}, + {U'Ề', "ÊÌ"}, + {U'ề', "êÌ"}, + {U'Ể', "ÊÒ"}, + {U'ể', "êÒ"}, + {U'Ễ', "ÊÞ"}, + {U'ễ', "êÞ"}, + {U'Ệ', "Êò"}, + {U'ệ', "êò"}, + {U'Ỉ', "IÒ"}, + {U'ỉ', "iÒ"}, + {U'Ị', "Iò"}, + {U'ị', "iò"}, + {U'Ọ', "Oò"}, + {U'ọ', "oò"}, + {U'Ỏ', "OÒ"}, + {U'ỏ', "oÒ"}, + {U'Ố', "Ôì"}, + {U'ố', "ôì"}, + {U'Ồ', "ÔÌ"}, + {U'ồ', "ôÌ"}, + {U'Ổ', "ÔÒ"}, + {U'ổ', "ôÒ"}, + {U'Ỗ', "ÔÞ"}, + {U'ỗ', "ôÞ"}, + {U'Ộ', "Ôò"}, + {U'ộ', "ôò"}, + {U'Ớ', "Õì"}, + {U'ớ', "õì"}, + {U'Ờ', "ÕÌ"}, + {U'ờ', "õÌ"}, + {U'Ở', "ÕÒ"}, + {U'ở', "õÒ"}, + {U'Ỡ', "ÕÞ"}, + {U'ỡ', "õÞ"}, + {U'Ợ', "Õò"}, + {U'ợ', "õò"}, + {U'Ụ', "Uò"}, + {U'ụ', "uò"}, + {U'Ủ', "UÒ"}, + {U'ủ', "uÒ"}, + {U'Ứ', "=Ýì"}, + {U'ứ', "ýì"}, + {U'Ừ', "ÝÌ"}, + {U'ừ', "ýÌ"}, + {U'Ử', "ÝÒ"}, + {U'ử', "ýÒ"}, + {U'Ữ', "ÝÞ"}, + {U'ữ', "ýÞ"}, + {U'Ự', "Ýò"}, + {U'ự', "ýò"}, + {U'Ỳ', "YÌ"}, + {U'ỳ', "yÌ"}, + {U'Ỵ', "Yò"}, + {U'ỵ', "yò"}, + {U'Ỷ', "YÒ"}, + {U'ỷ', "yÒ"}, + {U'Ỹ', "YÞ"}, + {U'ỹ', "yÞ"}, +}}; + +constexpr std::array kCharsetMappings4{{ + {U'À', "A`"}, + {U'Á', "A'"}, + {U'Â', "A^"}, + {U'Ã', "A~"}, + {U'È', "E`"}, + {U'É', "E'"}, + {U'Ê', "E^"}, + {U'Ì', "I`"}, + {U'Í', "I'"}, + {U'Ò', "O`"}, + {U'Ó', "O'"}, + {U'Ô', "O^"}, + {U'Õ', "O~"}, + {U'Ù', "U`"}, + {U'Ú', "U'"}, + {U'Ý', "Y'"}, + {U'à', "a`"}, + {U'á', "a'"}, + {U'â', "a^"}, + {U'ã', "a~"}, + {U'è', "e`"}, + {U'é', "e'"}, + {U'ê', "e^"}, + {U'ì', "i`"}, + {U'í', "i'"}, + {U'ò', "o`"}, + {U'ó', "o'"}, + {U'ô', "o^"}, + {U'õ', "o~"}, + {U'ù', "u`"}, + {U'ú', "u'"}, + {U'ý', "y'"}, + {U'Ă', "A("}, + {U'ă', "a("}, + {U'Đ', "DD"}, + {U'đ', "dd"}, + {U'Ĩ', "I~"}, + {U'ĩ', "i~"}, + {U'Ũ', "U~"}, + {U'ũ', "u~"}, + {U'Ơ', "O+"}, + {U'ơ', "o+"}, + {U'Ư', "U+"}, + {U'ư', "u+"}, + {U'Ạ', "A."}, + {U'ạ', "a."}, + {U'Ả', "A?"}, + {U'ả', "a?"}, + {U'Ấ', "A^'"}, + {U'ấ', "a^'"}, + {U'Ầ', "A^`"}, + {U'ầ', "a^`"}, + {U'Ẩ', "A^?"}, + {U'ẩ', "a^?"}, + {U'Ẫ', "A^~"}, + {U'ẫ', "a^~"}, + {U'Ậ', "A^."}, + {U'ậ', "a^."}, + {U'Ắ', "A('"}, + {U'ắ', "a('"}, + {U'Ằ', "A(`"}, + {U'ằ', "a(`"}, + {U'Ẳ', "A(?"}, + {U'ẳ', "a(?"}, + {U'Ẵ', "A(~"}, + {U'ẵ', "a(~"}, + {U'Ặ', "A(."}, + {U'ặ', "a(."}, + {U'Ẹ', "E."}, + {U'ẹ', "e."}, + {U'Ẻ', "E?"}, + {U'ẻ', "e?"}, + {U'Ẽ', "E~"}, + {U'ẽ', "e~"}, + {U'Ế', "E^'"}, + {U'ế', "e^'"}, + {U'Ề', "E^`"}, + {U'ề', "e^`"}, + {U'Ể', "E^?"}, + {U'ể', "e^?"}, + {U'Ễ', "E^~"}, + {U'ễ', "e^~"}, + {U'Ệ', "E^."}, + {U'ệ', "e^."}, + {U'Ỉ', "I?"}, + {U'ỉ', "i?"}, + {U'Ị', "I."}, + {U'ị', "i."}, + {U'Ọ', "O."}, + {U'ọ', "o."}, + {U'Ỏ', "O?"}, + {U'ỏ', "o?"}, + {U'Ố', "O^'"}, + {U'ố', "o^'"}, + {U'Ồ', "O^`"}, + {U'ồ', "o^`"}, + {U'Ổ', "O^?"}, + {U'ổ', "o^?"}, + {U'Ỗ', "O^~"}, + {U'ỗ', "o^~"}, + {U'Ộ', "O^."}, + {U'ộ', "o^."}, + {U'Ớ', "O+'"}, + {U'ớ', "o+'"}, + {U'Ờ', "O+`"}, + {U'ờ', "o+`"}, + {U'Ở', "O+?"}, + {U'ở', "o+?"}, + {U'Ỡ', "O+~"}, + {U'ỡ', "o+~"}, + {U'Ợ', "O+."}, + {U'ợ', "o+."}, + {U'Ụ', "U."}, + {U'ụ', "u."}, + {U'Ủ', "U?"}, + {U'ủ', "u?"}, + {U'Ứ', "U+'"}, + {U'ứ', "u+'"}, + {U'Ừ', "U+`"}, + {U'ừ', "u+`"}, + {U'Ử', "U+?"}, + {U'ử', "u+?"}, + {U'Ữ', "U+~"}, + {U'ữ', "u+~"}, + {U'Ự', "U+."}, + {U'ự', "u+."}, + {U'Ỳ', "Y`"}, + {U'ỳ', "y`"}, + {U'Ỵ', "Y."}, + {U'ỵ', "y."}, + {U'Ỷ', "Y?"}, + {U'ỷ', "y?"}, + {U'Ỹ', "Y~"}, + {U'ỹ', "y~"}, +}}; + +constexpr std::array kCharsetMappings5{{ + {U'À', "À"}, + {U'Á', "Á"}, + {U'Â', "Â"}, + {U'Ã', "Ã"}, + {U'È', "È"}, + {U'É', "É"}, + {U'Ê', "Ê"}, + {U'Ì', "Ì"}, + {U'Í', "Í"}, + {U'Ò', "Ò"}, + {U'Ó', "Ó"}, + {U'Ô', "Ô"}, + {U'Õ', "õ"}, + {U'Ù', "Ù"}, + {U'Ú', "Ú"}, + {U'Ý', "Ý"}, + {U'à', "à"}, + {U'á', "á"}, + {U'â', "â"}, + {U'ã', "ã"}, + {U'è', "è"}, + {U'é', "é"}, + {U'ê', "ê"}, + {U'ì', "ì"}, + {U'í', "í"}, + {U'ò', "ò"}, + {U'ó', "ó"}, + {U'ô', "ô"}, + {U'õ', "õ"}, + {U'ù', "ù"}, + {U'ú', "ú"}, + {U'ý', "ý"}, + {U'Ă', "Å"}, + {U'ă', "å"}, + {U'Đ', "Ð"}, + {U'đ', "ð"}, + {U'Ĩ', "Î"}, + {U'ĩ', "î"}, + {U'Ũ', "\\u009d"}, + {U'ũ', "û"}, + {U'Ơ', "´"}, + {U'ơ', "½"}, + {U'Ư', "¿"}, + {U'ư', "ß"}, + {U'Ạ', "€"}, + {U'ạ', "Õ"}, + {U'Ả', "Ä"}, + {U'ả', "ä"}, + {U'Ấ', "„"}, + {U'ấ', "¤"}, + {U'Ầ', "…"}, + {U'ầ', "¥"}, + {U'Ẩ', "†"}, + {U'ẩ', "¦"}, + {U'Ẫ', "ç"}, + {U'ẫ', "ç"}, + {U'Ậ', "‡"}, + {U'ậ', "§"}, + {U'Ắ', "\\u0081"}, + {U'ắ', "¡"}, + {U'Ằ', "‚"}, + {U'ằ', "¢"}, + {U'Ẳ', "Æ"}, + {U'ẳ', "Æ"}, + {U'Ẵ', "Ç"}, + {U'ẵ', "Ç"}, + {U'Ặ', "ƒ"}, + {U'ặ', "£"}, + {U'Ẹ', "‰"}, + {U'ẹ', "©"}, + {U'Ẻ', "Ë"}, + {U'ẻ', "ë"}, + {U'Ẽ', "ˆ"}, + {U'ẽ', "¨"}, + {U'Ế', "Š"}, + {U'ế', "ª"}, + {U'Ề', "‹"}, + {U'ề', "«"}, + {U'Ể', "Œ"}, + {U'ể', "¬"}, + {U'Ễ', "\\u008d"}, + {U'ễ', "\\u00ad"}, + {U'Ệ', "Ž"}, + {U'ệ', "®"}, + {U'Ỉ', "›"}, + {U'ỉ', "ï"}, + {U'Ị', "˜"}, + {U'ị', "¸"}, + {U'Ọ', "š"}, + {U'ọ', "÷"}, + {U'Ỏ', "™"}, + {U'ỏ', "ö"}, + {U'Ố', "\\u008f"}, + {U'ố', "¯"}, + {U'Ồ', "\\u0090"}, + {U'ồ', "°"}, + {U'Ổ', "‘"}, + {U'ổ', "±"}, + {U'Ỗ', "’"}, + {U'ỗ', "²"}, + {U'Ộ', "“"}, + {U'ộ', "µ"}, + {U'Ớ', "•"}, + {U'ớ', "¾"}, + {U'Ờ', "–"}, + {U'ờ', "¶"}, + {U'Ở', "—"}, + {U'ở', "·"}, + {U'Ỡ', "³"}, + {U'ỡ', "Þ"}, + {U'Ợ', "”"}, + {U'ợ', "þ"}, + {U'Ụ', "ž"}, + {U'ụ', "ø"}, + {U'Ủ', "œ"}, + {U'ủ', "ü"}, + {U'Ứ', "º"}, + {U'ứ', "Ñ"}, + {U'Ừ', "»"}, + {U'ừ', "×"}, + {U'Ử', "¼"}, + {U'ử', "Ø"}, + {U'Ữ', "ÿ"}, + {U'ữ', "æ"}, + {U'Ự', "¹"}, + {U'ự', "ñ"}, + {U'Ỳ', "Ÿ"}, + {U'ỳ', "Ï"}, + {U'Ỵ', "Ü"}, + {U'ỵ', "Ü"}, + {U'Ỷ', "Ö"}, + {U'ỷ', "Ö"}, + {U'Ỹ', "Û"}, + {U'ỹ', "Û"}, +}}; + +constexpr std::array kCharsetMappings6{{ + {U'À', "€"}, + {U'Á', "Á"}, + {U'Â', "Â"}, + {U'Ã', "‚"}, + {U'È', "×"}, + {U'É', "É"}, + {U'Ê', "Ê"}, + {U'Ì', "µ"}, + {U'Í', "´"}, + {U'Ò', "¼"}, + {U'Ó', "¹"}, + {U'Ô', "Ô"}, + {U'Õ', "¾"}, + {U'Ù', "¨"}, + {U'Ú', "Ú"}, + {U'Ý', "Ý"}, + {U'à', "à"}, + {U'á', "á"}, + {U'â', "â"}, + {U'ã', "ã"}, + {U'è', "è"}, + {U'é', "é"}, + {U'ê', "ê"}, + {U'ì', "ì"}, + {U'í', "í"}, + {U'ò', "ò"}, + {U'ó', "ó"}, + {U'ô', "ô"}, + {U'õ', "õ"}, + {U'ù', "ù"}, + {U'ú', "ú"}, + {U'ý', "š"}, + {U'Ă', "ˆ"}, + {U'ă', "æ"}, + {U'Đ', "ñ"}, + {U'đ', "Ç"}, + {U'Ĩ', "¸"}, + {U'ĩ', "ï"}, + {U'Ũ', "¬"}, + {U'ũ', "Û"}, + {U'Ơ', "÷"}, + {U'ơ', "Ö"}, + {U'Ư', "Ð"}, + {U'ư', "Ü"}, + {U'Ạ', "å"}, + {U'ạ', "å"}, + {U'Ả', "\\u0081"}, + {U'ả', "ä"}, + {U'Ấ', "ƒ"}, + {U'ấ', "Ã"}, + {U'Ầ', "„"}, + {U'ầ', "À"}, + {U'Ẩ', "…"}, + {U'ẩ', "Ä"}, + {U'Ẫ', "Å"}, + {U'ẫ', "Å"}, + {U'Ậ', "Æ"}, + {U'ậ', "Æ"}, + {U'Ắ', "\\u008d"}, + {U'ắ', "¡"}, + {U'Ằ', "Ž"}, + {U'ằ', "¢"}, + {U'Ẳ', "\\u008f"}, + {U'ẳ', "£"}, + {U'Ẵ', "ð"}, + {U'ẵ', "¤"}, + {U'Ặ', "¥"}, + {U'ặ', "¥"}, + {U'Ẹ', "Ë"}, + {U'ẹ', "Ë"}, + {U'Ẻ', "Þ"}, + {U'ẻ', "È"}, + {U'Ẽ', "þ"}, + {U'ẽ', "ë"}, + {U'Ế', "\\u0090"}, + {U'ế', "‰"}, + {U'Ề', "“"}, + {U'ề', "Š"}, + {U'Ể', "”"}, + {U'ể', "‹"}, + {U'Ễ', "•"}, + {U'ễ', "Í"}, + {U'Ệ', "Œ"}, + {U'ệ', "Œ"}, + {U'Ỉ', "·"}, + {U'ỉ', "Ì"}, + {U'Ị', "Î"}, + {U'ị', "Î"}, + {U'Ọ', "†"}, + {U'ọ', "†"}, + {U'Ỏ', "½"}, + {U'ỏ', "Õ"}, + {U'Ố', "–"}, + {U'ố', "Ó"}, + {U'Ồ', "—"}, + {U'ồ', "Ò"}, + {U'Ổ', "˜"}, + {U'ổ', "°"}, + {U'Ỗ', "™"}, + {U'ỗ', "‡"}, + {U'Ộ', "¶"}, + {U'ộ', "¶"}, + {U'Ớ', "\\u009d"}, + {U'ớ', "§"}, + {U'Ờ', "ž"}, + {U'ờ', "©"}, + {U'Ở', "Ÿ"}, + {U'ở', "ª"}, + {U'Ỡ', "¦"}, + {U'ỡ', "«"}, + {U'Ợ', "®"}, + {U'ợ', "®"}, + {U'Ụ', "ø"}, + {U'ụ', "ø"}, + {U'Ủ', "Ñ"}, + {U'ủ', "û"}, + {U'Ứ', "\\u00ad"}, + {U'ứ', "Ù"}, + {U'Ừ', "¯"}, + {U'ừ', "Ø"}, + {U'Ử', "±"}, + {U'ử', "º"}, + {U'Ữ', "»"}, + {U'ữ', "»"}, + {U'Ự', "¿"}, + {U'ự', "¿"}, + {U'Ỳ', "²"}, + {U'ỳ', "ÿ"}, + {U'Ỵ', "œ"}, + {U'ỵ', "œ"}, + {U'Ỷ', "ý"}, + {U'ỷ', "›"}, + {U'Ỹ', "³"}, + {U'ỹ', "Ï"}, +}}; + +constexpr std::array kCharsetMappings7{{ + {U'À', "AÂ"}, + {U'Á', "AÁ"}, + {U'Â', "Ê"}, + {U'Ã', "AÄ"}, + {U'È', "EÂ"}, + {U'É', "EÁ"}, + {U'Ê', "Ï"}, + {U'Ì', "Ò"}, + {U'Í', "Ñ"}, + {U'Ò', "OÂ"}, + {U'Ó', "OÁ"}, + {U'Ô', "Ö"}, + {U'Õ', "OÄ"}, + {U'Ù', "UÂ"}, + {U'Ú', "UÁ"}, + {U'Ý', "YÁ"}, + {U'à', "aâ"}, + {U'á', "aá"}, + {U'â', "ê"}, + {U'ã', "aä"}, + {U'è', "eâ"}, + {U'é', "eá"}, + {U'ê', "ï"}, + {U'ì', "ò"}, + {U'í', "ñ"}, + {U'ò', "oâ"}, + {U'ó', "oá"}, + {U'ô', "ö"}, + {U'õ', "oä"}, + {U'ù', "uâ"}, + {U'ú', "uá"}, + {U'ý', "yá"}, + {U'Ă', "Ù"}, + {U'ă', "ù"}, + {U'Đ', "À"}, + {U'đ', "à"}, + {U'Ĩ', "Ô"}, + {U'ĩ', "ô"}, + {U'Ũ', "UÄ"}, + {U'ũ', "uä"}, + {U'Ơ', "Ú"}, + {U'ơ', "ú"}, + {U'Ư', "Û"}, + {U'ư', "û"}, + {U'Ạ', "AÅ"}, + {U'ạ', "aå"}, + {U'Ả', "AÃ"}, + {U'ả', "aã"}, + {U'Ấ', "ÊË"}, + {U'ấ', "êë"}, + {U'Ầ', "ÊÌ"}, + {U'ầ', "êì"}, + {U'Ẩ', "ÊÍ"}, + {U'ẩ', "êí"}, + {U'Ẫ', "ÊÎ"}, + {U'ẫ', "êî"}, + {U'Ậ', "ÊÅ"}, + {U'ậ', "êå"}, + {U'Ắ', "ÙÆ"}, + {U'ắ', "ùæ"}, + {U'Ằ', "ÙÇ"}, + {U'ằ', "ùç"}, + {U'Ẳ', "ÙÈ"}, + {U'ẳ', "ùè"}, + {U'Ẵ', "ÙÉ"}, + {U'ẵ', "ùé"}, + {U'Ặ', "ÙÅ"}, + {U'ặ', "ùå"}, + {U'Ẹ', "EÅ"}, + {U'ẹ', "eå"}, + {U'Ẻ', "EÃ"}, + {U'ẻ', "eã"}, + {U'Ẽ', "EÄ"}, + {U'ẽ', "eä"}, + {U'Ế', "ÏË"}, + {U'ế', "ïë"}, + {U'Ề', "ÏÌ"}, + {U'ề', "ïì"}, + {U'Ể', "ÏÍ"}, + {U'ể', "ïí"}, + {U'Ễ', "ÏÎ"}, + {U'ễ', "ïî"}, + {U'Ệ', "Ïå"}, + {U'ệ', "ïå"}, + {U'Ỉ', "Ó"}, + {U'ỉ', "ó"}, + {U'Ị', "Õ"}, + {U'ị', "õ"}, + {U'Ọ', "OÅ"}, + {U'ọ', "oå"}, + {U'Ỏ', "OÃ"}, + {U'ỏ', "oã"}, + {U'Ố', "ÖË"}, + {U'ố', "öë"}, + {U'Ồ', "ÖÌ"}, + {U'ồ', "öì"}, + {U'Ổ', "ÖÍ"}, + {U'ổ', "öí"}, + {U'Ỗ', "ÖÎ"}, + {U'ỗ', "öî"}, + {U'Ộ', "ÖÅ"}, + {U'ộ', "öå"}, + {U'Ớ', "ÚÁ"}, + {U'ớ', "úá"}, + {U'Ờ', "ÚÂ"}, + {U'ờ', "úâ"}, + {U'Ở', "ÚÃ"}, + {U'ở', "úã"}, + {U'Ỡ', "ÚÄ"}, + {U'ỡ', "úä"}, + {U'Ợ', "ÚÅ"}, + {U'ợ', "úå"}, + {U'Ụ', "UÅ"}, + {U'ụ', "uå"}, + {U'Ủ', "UÃ"}, + {U'ủ', "uã"}, + {U'Ứ', "ÛÁ"}, + {U'ứ', "ûá"}, + {U'Ừ', "ÛÂ"}, + {U'ừ', "ûâ"}, + {U'Ử', "ÛÃ"}, + {U'ử', "ûã"}, + {U'Ữ', "ÛÄ"}, + {U'ữ', "ûä"}, + {U'Ự', "ÛÅ"}, + {U'ự', "ûå"}, + {U'Ỳ', "YÂ"}, + {U'ỳ', "yâ"}, + {U'Ỵ', "YÅ"}, + {U'ỵ', "yå"}, + {U'Ỷ', "YÃ"}, + {U'ỷ', "yã"}, + {U'Ỹ', "YÄ"}, + {U'ỹ', "yä"}, +}}; + +constexpr std::array kCharsetMappings8{{ + {U'À', "\\u0081"}, + {U'Á', "€"}, + {U'Â', "Ÿ"}, + {U'Ã', "ƒ"}, + {U'È', "†"}, + {U'É', "…"}, + {U'Ê', "¥"}, + {U'Ì', "‹"}, + {U'Í', "Š"}, + {U'Ò', "\\u0090"}, + {U'Ó', "\\u008f"}, + {U'Ô', "«"}, + {U'Õ', "’"}, + {U'Ù', "•"}, + {U'Ú', "”"}, + {U'Ý', "{"}, + {U'à', "¿"}, + {U'á', "¾"}, + {U'â', "Ý"}, + {U'ã', "Á"}, + {U'è', "Ä"}, + {U'é', "Ã"}, + {U'ê', "ã"}, + {U'ì', "É"}, + {U'í', "È"}, + {U'ò', "Î"}, + {U'ó', "Í"}, + {U'ô', "é"}, + {U'õ', "Ð"}, + {U'ù', "Ó"}, + {U'ú', "Ò"}, + {U'ý', "û"}, + {U'Ă', "™"}, + {U'ă', "×"}, + {U'Đ', "}"}, + {U'đ', "½"}, + {U'Ĩ', "\\u008d"}, + {U'ĩ', "Ë"}, + {U'Ũ', "—"}, + {U'ũ', "Õ"}, + {U'Ơ', "±"}, + {U'ơ', "ï"}, + {U'Ư', "·"}, + {U'ư', "õ"}, + {U'Ạ', "„"}, + {U'ạ', "Â"}, + {U'Ả', "‚"}, + {U'ả', "À"}, + {U'Ấ', "~"}, + {U'ấ', "Þ"}, + {U'Ầ', "¡"}, + {U'ầ', "ß"}, + {U'Ẩ', "¢"}, + {U'ẩ', "à"}, + {U'Ẫ', "£"}, + {U'ẫ', "á"}, + {U'Ậ', "¤"}, + {U'ậ', "â"}, + {U'Ắ', "š"}, + {U'ắ', "Ø"}, + {U'Ằ', "›"}, + {U'ằ', "Ù"}, + {U'Ẳ', "œ"}, + {U'ẳ', "Ú"}, + {U'Ẵ', "\\u009d"}, + {U'ẵ', "Û"}, + {U'Ặ', "˜"}, + {U'ặ', "Ü"}, + {U'Ẹ', "‰"}, + {U'ẹ', "Ç"}, + {U'Ẻ', "‡"}, + {U'ẻ', "Å"}, + {U'Ẽ', "ˆ"}, + {U'ẽ', "Æ"}, + {U'Ế', "¦"}, + {U'ế', "ä"}, + {U'Ề', "§"}, + {U'ề', "å"}, + {U'Ể', "¨"}, + {U'ể', "æ"}, + {U'Ễ', "©"}, + {U'ễ', "ç"}, + {U'Ệ', "ª"}, + {U'ệ', "è"}, + {U'Ỉ', "Œ"}, + {U'ỉ', "Ê"}, + {U'Ị', "Ž"}, + {U'ị', "Ì"}, + {U'Ọ', "“"}, + {U'ọ', "Ñ"}, + {U'Ỏ', "‘"}, + {U'ỏ', "Ï"}, + {U'Ố', "¬"}, + {U'ố', "ê"}, + {U'Ồ', "\\u00ad"}, + {U'ồ', "ë"}, + {U'Ổ', "®"}, + {U'ổ', "ì"}, + {U'Ỗ', "¯"}, + {U'ỗ', "í"}, + {U'Ộ', "°"}, + {U'ộ', "î"}, + {U'Ớ', "²"}, + {U'ớ', "ð"}, + {U'Ờ', "³"}, + {U'ờ', "ñ"}, + {U'Ở', "´"}, + {U'ở', "ò"}, + {U'Ỡ', "µ"}, + {U'ỡ', "ó"}, + {U'Ợ', "¶"}, + {U'ợ', "ô"}, + {U'Ụ', "˜"}, + {U'ụ', "Ö"}, + {U'Ủ', "–"}, + {U'ủ', "Ô"}, + {U'Ứ', "¸"}, + {U'ứ', "ö"}, + {U'Ừ', "¹"}, + {U'ừ', "÷"}, + {U'Ử', "º"}, + {U'ử', "ø"}, + {U'Ữ', "»"}, + {U'ữ', "ù"}, + {U'Ự', "¼"}, + {U'ự', "ú"}, + {U'Ỳ', "^"}, + {U'ỳ', "ü"}, + {U'Ỵ', "Ž"}, + {U'ỵ', "ÿ"}, + {U'Ỷ', "`"}, + {U'ỷ', "ý"}, + {U'Ỹ', "|"}, + {U'ỹ', "þ"}, +}}; + +constexpr std::array kCharsetMappings9{{ + {U'À', "AÌ"}, + {U'Á', "AÏ"}, + {U'Â', "Á"}, + {U'Ã', "AÎ"}, + {U'È', "EÌ"}, + {U'É', "EÏ"}, + {U'Ê', "Ã"}, + {U'Ì', "Ç"}, + {U'Í', "Ê"}, + {U'Ò', "OÌ"}, + {U'Ó', "OÏ"}, + {U'Ô', "Ä"}, + {U'Õ', "OÎ"}, + {U'Ù', "UÌ"}, + {U'Ú', "UÏ"}, + {U'Ý', "YÏ"}, + {U'à', "aì"}, + {U'á', "aï"}, + {U'â', "á"}, + {U'ã', "aî"}, + {U'è', "eì"}, + {U'é', "eï"}, + {U'ê', "ã"}, + {U'ì', "ç"}, + {U'í', "ê"}, + {U'ò', "oì"}, + {U'ó', "oï"}, + {U'ô', "ä"}, + {U'õ', "oî"}, + {U'ù', "uì"}, + {U'ú', "uï"}, + {U'ý', "yï"}, + {U'Ă', "À"}, + {U'ă', "à"}, + {U'Đ', "Â"}, + {U'đ', "â"}, + {U'Ĩ', "É"}, + {U'ĩ', "é"}, + {U'Ũ', "UÎ"}, + {U'ũ', "uî"}, + {U'Ơ', "Å"}, + {U'ơ', "å"}, + {U'Ư', "Æ"}, + {U'ư', "æ"}, + {U'Ạ', "AÛ"}, + {U'ạ', "aû"}, + {U'Ả', "AÍ"}, + {U'ả', "aí"}, + {U'Ấ', "ÁÚ"}, + {U'ấ', "áú"}, + {U'Ầ', "ÁÖ"}, + {U'ầ', "áö"}, + {U'Ẩ', "ÁØ"}, + {U'ẩ', "áø"}, + {U'Ẫ', "ÁÙ"}, + {U'ẫ', "áù"}, + {U'Ậ', "ÁÛ"}, + {U'ậ', "áû"}, + {U'Ắ', "ÀÕ"}, + {U'ắ', "àõ"}, + {U'Ằ', "ÀÒ"}, + {U'ằ', "àò"}, + {U'Ẳ', "ÀÓ"}, + {U'ẳ', "àó"}, + {U'Ẵ', "ÀÔ"}, + {U'ẵ', "àô"}, + {U'Ặ', "ÀÛ"}, + {U'ặ', "àû"}, + {U'Ẹ', "EÛ"}, + {U'ẹ', "eû"}, + {U'Ẻ', "EÍ"}, + {U'ẻ', "eí"}, + {U'Ẽ', "EÎ"}, + {U'ẽ', "eî"}, + {U'Ế', "ÃÚ"}, + {U'ế', "ãú"}, + {U'Ề', "ÃÖ"}, + {U'ề', "ãö"}, + {U'Ể', "ÃØ"}, + {U'ể', "ãø"}, + {U'Ễ', "ÃÙ"}, + {U'ễ', "ãù"}, + {U'Ệ', "ÃÛ"}, + {U'ệ', "ãû"}, + {U'Ỉ', "È"}, + {U'ỉ', "è"}, + {U'Ị', "Ë"}, + {U'ị', "ë"}, + {U'Ọ', "OÜ"}, + {U'ọ', "oü"}, + {U'Ỏ', "OÍ"}, + {U'ỏ', "oí"}, + {U'Ố', "ÄÚ"}, + {U'ố', "äú"}, + {U'Ồ', "ÄÖ"}, + {U'ồ', "äö"}, + {U'Ổ', "ÄØ"}, + {U'ổ', "äø"}, + {U'Ỗ', "ÄÙ"}, + {U'ỗ', "äù"}, + {U'Ộ', "ÄÜ"}, + {U'ộ', "äü"}, + {U'Ớ', "ÅÏ"}, + {U'ớ', "åï"}, + {U'Ờ', "ÅÌ"}, + {U'ờ', "åì"}, + {U'Ở', "ÅÍ"}, + {U'ở', "åí"}, + {U'Ỡ', "ÅÎ"}, + {U'ỡ', "åî"}, + {U'Ợ', "ÅÜ"}, + {U'ợ', "åü"}, + {U'Ụ', "UÛ"}, + {U'ụ', "uû"}, + {U'Ủ', "UÍ"}, + {U'ủ', "uí"}, + {U'Ứ', "ÆÏ"}, + {U'ứ', "æï"}, + {U'Ừ', "ÆÌ"}, + {U'ừ', "æì"}, + {U'Ử', "ÆÍ"}, + {U'ử', "æí"}, + {U'Ữ', "ÆÎ"}, + {U'ữ', "æî"}, + {U'Ự', "ÆÛ"}, + {U'ự', "æû"}, + {U'Ỳ', "YÌ"}, + {U'ỳ', "yì"}, + {U'Ỵ', "YÑ"}, + {U'ỵ', "yñ"}, + {U'Ỷ', "YÍ"}, + {U'ỷ', "yí"}, + {U'Ỹ', "YÎ"}, + {U'ỹ', "yî"}, +}}; + +constexpr std::array kCharsetMappings10{{ + {U'À', "ª"}, + {U'Á', "À"}, + {U'Â', "—"}, + {U'Ã', "º"}, + {U'È', "Ì"}, + {U'É', "Ï"}, + {U'Ê', "™"}, + {U'Ì', "Ø"}, + {U'Í', "Û"}, + {U'Ò', "ß"}, + {U'Ó', "â"}, + {U'Ô', "š"}, + {U'Õ', "á"}, + {U'Ù', "î"}, + {U'Ú', "ò"}, + {U'Ý', "ü"}, + {U'à', "ª"}, + {U'á', "À"}, + {U'â', "¡"}, + {U'ã', "º"}, + {U'è', "Ì"}, + {U'é', "Ï"}, + {U'ê', "£"}, + {U'ì', "Ø"}, + {U'í', "Û"}, + {U'ò', "ß"}, + {U'ó', "â"}, + {U'ô', "¤"}, + {U'õ', "á"}, + {U'ù', "î"}, + {U'ú', "ò"}, + {U'ý', "ü"}, + {U'Ă', "–"}, + {U'ă', "Ÿ"}, + {U'Đ', "˜"}, + {U'đ', "¢"}, + {U'Ĩ', "Ú"}, + {U'ĩ', "Ú"}, + {U'Ũ', "ñ"}, + {U'ũ', "ñ"}, + {U'Ơ', "›"}, + {U'ơ', "¥"}, + {U'Ư', "œ"}, + {U'ư', "§"}, + {U'Ạ', "Á"}, + {U'ạ', "Á"}, + {U'Ả', "¶"}, + {U'ả', "¶"}, + {U'Ấ', "Ê"}, + {U'ấ', "Ê"}, + {U'Ầ', "Ç"}, + {U'ầ', "Ç"}, + {U'Ẩ', "È"}, + {U'ẩ', "È"}, + {U'Ẫ', "É"}, + {U'ẫ', "É"}, + {U'Ậ', "Ë"}, + {U'ậ', "Ë"}, + {U'Ắ', "Å"}, + {U'ắ', "Å"}, + {U'Ằ', "Â"}, + {U'ằ', "Â"}, + {U'Ẳ', "Ã"}, + {U'ẳ', "Ã"}, + {U'Ẵ', "Ä"}, + {U'ẵ', "Ä"}, + {U'Ặ', "Æ"}, + {U'ặ', "Æ"}, + {U'Ẹ', "Ñ"}, + {U'ẹ', "Ñ"}, + {U'Ẻ', "Í"}, + {U'ẻ', "Í"}, + {U'Ẽ', "Î"}, + {U'ẽ', "Î"}, + {U'Ế', "Õ"}, + {U'ế', "Õ"}, + {U'Ề', "Ò"}, + {U'ề', "Ò"}, + {U'Ể', "Ó"}, + {U'ể', "Ó"}, + {U'Ễ', "Ô"}, + {U'ễ', "Ô"}, + {U'Ệ', "Ö"}, + {U'ệ', "Ö"}, + {U'Ỉ', "Ù"}, + {U'ỉ', "Ù"}, + {U'Ị', "Ü"}, + {U'ị', "Ü"}, + {U'Ọ', "ã"}, + {U'ọ', "ã"}, + {U'Ỏ', "à"}, + {U'ỏ', "à"}, + {U'Ố', "ç"}, + {U'ố', "ç"}, + {U'Ồ', "ä"}, + {U'ồ', "ä"}, + {U'Ổ', "å"}, + {U'ổ', "å"}, + {U'Ỗ', "æ"}, + {U'ỗ', "æ"}, + {U'Ộ', "è"}, + {U'ộ', "è"}, + {U'Ớ', "ì"}, + {U'ớ', "ì"}, + {U'Ờ', "é"}, + {U'ờ', "é"}, + {U'Ở', "ê"}, + {U'ở', "ê"}, + {U'Ỡ', "ë"}, + {U'ỡ', "ë"}, + {U'Ợ', "í"}, + {U'ợ', "í"}, + {U'Ụ', "ó"}, + {U'ụ', "ó"}, + {U'Ủ', "ï"}, + {U'ủ', "ï"}, + {U'Ứ', "÷"}, + {U'ứ', "÷"}, + {U'Ừ', "ô"}, + {U'ừ', "ô"}, + {U'Ử', "õ"}, + {U'ử', "õ"}, + {U'Ữ', "ö"}, + {U'ữ', "ö"}, + {U'Ự', "ø"}, + {U'ự', "ø"}, + {U'Ỳ', "ù"}, + {U'ỳ', "ù"}, + {U'Ỵ', "ÿ"}, + {U'ỵ', "ÿ"}, + {U'Ỷ', "ú"}, + {U'ỷ', "ú"}, + {U'Ỹ', "û"}, + {U'ỹ', "û"}, +}}; + +constexpr std::array kCharsetMappings11{{ + {U'À', "À"}, + {U'Á', "Ã\\u0081"}, + {U'Â', "Â"}, + {U'Ã', "Ã"}, + {U'È', "È"}, + {U'É', "É"}, + {U'Ê', "Ê"}, + {U'Ì', "ÃŒ"}, + {U'Í', "Ã\\u008d"}, + {U'Ò', "Ã’"}, + {U'Ó', "Ó"}, + {U'Ô', "Ô"}, + {U'Õ', "Õ"}, + {U'Ù', "Ù"}, + {U'Ú', "Ú"}, + {U'Ý', "Ã\\u009d"}, + {U'à', "à"}, + {U'á', "á"}, + {U'â', "â"}, + {U'ã', "ã"}, + {U'è', "è"}, + {U'é', "é"}, + {U'ê', "ê"}, + {U'ì', "ì"}, + {U'í', "Ã\\u00ad"}, + {U'ò', "ò"}, + {U'ó', "ó"}, + {U'ô', "ô"}, + {U'õ', "õ"}, + {U'ù', "ù"}, + {U'ú', "ú"}, + {U'ý', "ý"}, + {U'Ă', "Ä‚"}, + {U'ă', "ă"}, + {U'Đ', "Ä\\u0090"}, + {U'đ', "Ä‘"}, + {U'Ĩ', "Ĩ"}, + {U'ĩ', "Ä©"}, + {U'Ũ', "Ũ"}, + {U'ũ', "Å©"}, + {U'Ơ', "Æ "}, + {U'ơ', "Æ¡"}, + {U'Ư', "Ư"}, + {U'ư', "ư"}, + {U'Ạ', "Ạ"}, + {U'ạ', "ạ"}, + {U'Ả', "Ả"}, + {U'ả', "ả"}, + {U'Ấ', "Ấ"}, + {U'ấ', "ấ"}, + {U'Ầ', "Ầ"}, + {U'ầ', "ầ"}, + {U'Ẩ', "Ẩ"}, + {U'ẩ', "ẩ"}, + {U'Ẫ', "Ẫ"}, + {U'ẫ', "ẫ"}, + {U'Ậ', "Ậ"}, + {U'ậ', "áº\\u00ad"}, + {U'Ắ', "Ắ"}, + {U'ắ', "ắ"}, + {U'Ằ', "Ằ"}, + {U'ằ', "ằ"}, + {U'Ẳ', "Ẳ"}, + {U'ẳ', "ẳ"}, + {U'Ẵ', "Ẵ"}, + {U'ẵ', "ẵ"}, + {U'Ặ', "Ặ"}, + {U'ặ', "ặ"}, + {U'Ẹ', "Ẹ"}, + {U'ẹ', "ẹ"}, + {U'Ẻ', "Ẻ"}, + {U'ẻ', "ẻ"}, + {U'Ẽ', "Ẽ"}, + {U'ẽ', "ẽ"}, + {U'Ế', "Ế"}, + {U'ế', "ế"}, + {U'Ề', "Ề"}, + {U'ề', "á»\\u0081"}, + {U'Ể', "Ể"}, + {U'ể', "ể"}, + {U'Ễ', "Ễ"}, + {U'ễ', "á»…"}, + {U'Ệ', "Ệ"}, + {U'ệ', "ệ"}, + {U'Ỉ', "Ỉ"}, + {U'ỉ', "ỉ"}, + {U'Ị', "Ị"}, + {U'ị', "ị"}, + {U'Ọ', "Ọ"}, + {U'ọ', "á»\\u008d"}, + {U'Ỏ', "Ỏ"}, + {U'ỏ', "á»\\u008f"}, + {U'Ố', "á»\\u0090"}, + {U'ố', "ố"}, + {U'Ồ', "á»’"}, + {U'ồ', "ồ"}, + {U'Ổ', "á»”"}, + {U'ổ', "ổ"}, + {U'Ỗ', "á»–"}, + {U'ỗ', "á»—"}, + {U'Ộ', "Ộ"}, + {U'ộ', "á»™"}, + {U'Ớ', "Ớ"}, + {U'ớ', "á»›"}, + {U'Ờ', "Ờ"}, + {U'ờ', "á»\\u009d"}, + {U'Ở', "Ở"}, + {U'ở', "ở"}, + {U'Ỡ', "á» "}, + {U'ỡ', "ỡ"}, + {U'Ợ', "Ợ"}, + {U'ợ', "ợ"}, + {U'Ụ', "Ụ"}, + {U'ụ', "ụ"}, + {U'Ủ', "Ủ"}, + {U'ủ', "á»§"}, + {U'Ứ', "Ứ"}, + {U'ứ', "ứ"}, + {U'Ừ', "Ừ"}, + {U'ừ', "ừ"}, + {U'Ử', "Ử"}, + {U'ử', "á»\\u00ad"}, + {U'Ữ', "á»®"}, + {U'ữ', "ữ"}, + {U'Ự', "á»°"}, + {U'ự', "á»±"}, + {U'Ỳ', "Ỳ"}, + {U'ỳ', "ỳ"}, + {U'Ỵ', "á»´"}, + {U'ỵ', "ỵ"}, + {U'Ỷ', "á»¶"}, + {U'ỷ', "á»·"}, + {U'Ỹ', "Ỹ"}, + {U'ỹ', "ỹ"}, +}}; + +constexpr std::array kCharsetMappings12{{ + {U'À', "À"}, + {U'Á', "Á"}, + {U'Â', "Â"}, + {U'Ã', "Ã"}, + {U'È', "È"}, + {U'É', "É"}, + {U'Ê', "Ê"}, + {U'Ì', "Ì"}, + {U'Í', "Í"}, + {U'Ò', "Ò"}, + {U'Ó', "Ó"}, + {U'Ô', "Ô"}, + {U'Õ', "Õ"}, + {U'Ù', "Ù"}, + {U'Ú', "Ú"}, + {U'Ý', "Ý"}, + {U'à', "à"}, + {U'á', "á"}, + {U'â', "â"}, + {U'ã', "ã"}, + {U'è', "è"}, + {U'é', "é"}, + {U'ê', "ê"}, + {U'ì', "ì"}, + {U'í', "í"}, + {U'ò', "ò"}, + {U'ó', "ó"}, + {U'ô', "ô"}, + {U'õ', "õ"}, + {U'ù', "ù"}, + {U'ú', "ú"}, + {U'ý', "ý"}, + {U'Ă', "Ă"}, + {U'ă', "ă"}, + {U'Đ', "Đ"}, + {U'đ', "đ"}, + {U'Ĩ', "Ĩ"}, + {U'ĩ', "ĩ"}, + {U'Ũ', "Ũ"}, + {U'ũ', "ũ"}, + {U'Ơ', "Ơ"}, + {U'ơ', "ơ"}, + {U'Ư', "Ư"}, + {U'ư', "ư"}, + {U'Ạ', "Ạ"}, + {U'ạ', "ạ"}, + {U'Ả', "Ả"}, + {U'ả', "ả"}, + {U'Ấ', "Ấ"}, + {U'ấ', "ấ"}, + {U'Ầ', "Ầ"}, + {U'ầ', "ầ"}, + {U'Ẩ', "Ẩ"}, + {U'ẩ', "ẩ"}, + {U'Ẫ', "Ẫ"}, + {U'ẫ', "ẫ"}, + {U'Ậ', "Ậ"}, + {U'ậ', "ậ"}, + {U'Ắ', "Ắ"}, + {U'ắ', "ắ"}, + {U'Ằ', "Ằ"}, + {U'ằ', "ằ"}, + {U'Ẳ', "Ẳ"}, + {U'ẳ', "ẳ"}, + {U'Ẵ', "Ẵ"}, + {U'ẵ', "ẵ"}, + {U'Ặ', "Ặ"}, + {U'ặ', "ặ"}, + {U'Ẹ', "Ẹ"}, + {U'ẹ', "ẹ"}, + {U'Ẻ', "Ẻ"}, + {U'ẻ', "ẻ"}, + {U'Ẽ', "Ẽ"}, + {U'ẽ', "ẽ"}, + {U'Ế', "Ế"}, + {U'ế', "ế"}, + {U'Ề', "Ề"}, + {U'ề', "ề"}, + {U'Ể', "Ể"}, + {U'ể', "ể"}, + {U'Ễ', "Ễ"}, + {U'ễ', "ễ"}, + {U'Ệ', "Ệ"}, + {U'ệ', "ệ"}, + {U'Ỉ', "Ỉ"}, + {U'ỉ', "ỉ"}, + {U'Ị', "Ị"}, + {U'ị', "ị"}, + {U'Ọ', "Ọ"}, + {U'ọ', "ọ"}, + {U'Ỏ', "Ỏ"}, + {U'ỏ', "ỏ"}, + {U'Ố', "Ố"}, + {U'ố', "ố"}, + {U'Ồ', "Ồ"}, + {U'ồ', "ồ"}, + {U'Ổ', "Ổ"}, + {U'ổ', "ổ"}, + {U'Ỗ', "Ỗ"}, + {U'ỗ', "ỗ"}, + {U'Ộ', "Ộ"}, + {U'ộ', "ộ"}, + {U'Ớ', "Ớ"}, + {U'ớ', "ớ"}, + {U'Ờ', "Ờ"}, + {U'ờ', "ờ"}, + {U'Ở', "Ở"}, + {U'ở', "ở"}, + {U'Ỡ', "Ỡ"}, + {U'ỡ', "ỡ"}, + {U'Ợ', "Ợ"}, + {U'ợ', "ợ"}, + {U'Ụ', "Ụ"}, + {U'ụ', "ụ"}, + {U'Ủ', "Ủ"}, + {U'ủ', "ủ"}, + {U'Ứ', "Ứ"}, + {U'ứ', "ứ"}, + {U'Ừ', "Ừ"}, + {U'ừ', "ừ"}, + {U'Ử', "Ử"}, + {U'ử', "ử"}, + {U'Ữ', "Ữ"}, + {U'ữ', "ữ"}, + {U'Ự', "Ự"}, + {U'ự', "ự"}, + {U'Ỳ', "Ỳ"}, + {U'ỳ', "ỳ"}, + {U'Ỵ', "Ỵ"}, + {U'ỵ', "ỵ"}, + {U'Ỷ', "Ỷ"}, + {U'ỷ', "ỷ"}, + {U'Ỹ', "Ỹ"}, + {U'ỹ', "ỹ"}, +}}; + +constexpr std::array kCharsetMappings13{{ + {U'Ă', "Ă"}, + {U'ă', "ă"}, + {U'Đ', "Đ"}, + {U'đ', "đ"}, + {U'Ĩ', "Ĩ"}, + {U'ĩ', "ĩ"}, + {U'Ũ', "Ũ"}, + {U'ũ', "ũ"}, + {U'Ơ', "Ơ"}, + {U'ơ', "ơ"}, + {U'Ư', "Ư"}, + {U'ư', "ư"}, + {U'Ạ', "Ạ"}, + {U'ạ', "ạ"}, + {U'Ả', "Ả"}, + {U'ả', "ả"}, + {U'Ấ', "Ấ"}, + {U'ấ', "ấ"}, + {U'Ầ', "Ầ"}, + {U'ầ', "ầ"}, + {U'Ẩ', "Ẩ"}, + {U'ẩ', "ẩ"}, + {U'Ẫ', "Ẫ"}, + {U'ẫ', "ẫ"}, + {U'Ậ', "Ậ"}, + {U'ậ', "ậ"}, + {U'Ắ', "Ắ"}, + {U'ắ', "ắ"}, + {U'Ằ', "Ằ"}, + {U'ằ', "ằ"}, + {U'Ẳ', "Ẳ"}, + {U'ẳ', "ẳ"}, + {U'Ẵ', "Ẵ"}, + {U'ẵ', "ẵ"}, + {U'Ặ', "Ặ"}, + {U'ặ', "ặ"}, + {U'Ẹ', "Ẹ"}, + {U'ẹ', "ẹ"}, + {U'Ẻ', "Ẻ"}, + {U'ẻ', "ẻ"}, + {U'Ẽ', "Ẽ"}, + {U'ẽ', "ẽ"}, + {U'Ế', "Ế"}, + {U'ế', "ế"}, + {U'Ề', "Ề"}, + {U'ề', "ề"}, + {U'Ể', "Ể"}, + {U'ể', "ể"}, + {U'Ễ', "Ễ"}, + {U'ễ', "ễ"}, + {U'Ệ', "Ệ"}, + {U'ệ', "ệ"}, + {U'Ỉ', "Ỉ"}, + {U'ỉ', "ỉ"}, + {U'Ị', "Ị"}, + {U'ị', "ị"}, + {U'Ọ', "Ọ"}, + {U'ọ', "ọ"}, + {U'Ỏ', "Ỏ"}, + {U'ỏ', "ỏ"}, + {U'Ố', "Ố"}, + {U'ố', "ố"}, + {U'Ồ', "Ồ"}, + {U'ồ', "ồ"}, + {U'Ổ', "Ổ"}, + {U'ổ', "ổ"}, + {U'Ỗ', "Ỗ"}, + {U'ỗ', "ỗ"}, + {U'Ộ', "Ộ"}, + {U'ộ', "ộ"}, + {U'Ớ', "Ớ"}, + {U'ớ', "ớ"}, + {U'Ờ', "Ờ"}, + {U'ờ', "ờ"}, + {U'Ở', "Ở"}, + {U'ở', "ở"}, + {U'Ỡ', "Ỡ"}, + {U'ỡ', "ỡ"}, + {U'Ợ', "Ợ"}, + {U'ợ', "ợ"}, + {U'Ụ', "Ụ"}, + {U'ụ', "ụ"}, + {U'Ủ', "Ủ"}, + {U'ủ', "ủ"}, + {U'Ứ', "Ứ"}, + {U'ứ', "ứ"}, + {U'Ừ', "Ừ"}, + {U'ừ', "ừ"}, + {U'Ử', "Ử"}, + {U'ử', "ử"}, + {U'Ữ', "Ữ"}, + {U'ữ', "ữ"}, + {U'Ự', "Ự"}, + {U'ự', "ự"}, + {U'Ỳ', "Ỳ"}, + {U'ỳ', "ỳ"}, + {U'Ỵ', "Ỵ"}, + {U'ỵ', "ỵ"}, + {U'Ỷ', "Ỷ"}, + {U'ỷ', "ỷ"}, + {U'Ỹ', "Ỹ"}, + {U'ỹ', "ỹ"}, +}}; + +constexpr std::array kCharsetMappings14{{ + {U'À', "\\xC0"}, + {U'Á', "\\xC1"}, + {U'Â', "\\xC2"}, + {U'Ã', "\\xC3"}, + {U'È', "\\xC8"}, + {U'É', "\\xC9"}, + {U'Ê', "\\xCA"}, + {U'Ì', "\\xCC"}, + {U'Í', "\\xCD"}, + {U'Ò', "\\xD2"}, + {U'Ó', "\\xD3"}, + {U'Ô', "\\xD4"}, + {U'Õ', "\\xD5"}, + {U'Ù', "\\xD9"}, + {U'Ú', "\\xDA"}, + {U'Ý', "\\xDD"}, + {U'à', "\\xE0"}, + {U'á', "\\xE1"}, + {U'â', "\\xE2"}, + {U'ã', "\\xE3"}, + {U'è', "\\xE8"}, + {U'é', "\\xE9"}, + {U'ê', "\\xEA"}, + {U'ì', "\\xEC"}, + {U'í', "\\xED"}, + {U'ò', "\\xF2"}, + {U'ó', "\\xF3"}, + {U'ô', "\\xF4"}, + {U'õ', "\\xF5"}, + {U'ù', "\\xF9"}, + {U'ú', "\\xFA"}, + {U'ý', "\\xFD"}, + {U'Ă', "\\x102"}, + {U'ă', "\\x103"}, + {U'Đ', "\\x110"}, + {U'đ', "\\x111"}, + {U'Ĩ', "\\x128"}, + {U'ĩ', "\\x129"}, + {U'Ũ', "\\x168"}, + {U'ũ', "\\x169"}, + {U'Ơ', "\\x1A0"}, + {U'ơ', "\\x1A1"}, + {U'Ư', "\\x1AF"}, + {U'ư', "\\x1B0"}, + {U'Ạ', "\\x1EA0"}, + {U'ạ', "\\x1EA1"}, + {U'Ả', "\\x1EA2"}, + {U'ả', "\\x1EA3"}, + {U'Ấ', "\\x1EA4"}, + {U'ấ', "\\x1EA5"}, + {U'Ầ', "\\x1EA6"}, + {U'ầ', "\\x1EA7"}, + {U'Ẩ', "\\x1EA8"}, + {U'ẩ', "\\x1EA9"}, + {U'Ẫ', "\\x1EAA"}, + {U'ẫ', "\\x1EAB"}, + {U'Ậ', "\\x1EAC"}, + {U'ậ', "\\x1EAD"}, + {U'Ắ', "\\x1EAE"}, + {U'ắ', "\\x1EAF"}, + {U'Ằ', "\\x1EB0"}, + {U'ằ', "\\x1EB1"}, + {U'Ẳ', "\\x1EB2"}, + {U'ẳ', "\\x1EB3"}, + {U'Ẵ', "\\x1EB4"}, + {U'ẵ', "\\x1EB5"}, + {U'Ặ', "\\x1EB6"}, + {U'ặ', "\\x1EB7"}, + {U'Ẹ', "\\x1EB8"}, + {U'ẹ', "\\x1EB9"}, + {U'Ẻ', "\\x1EBA"}, + {U'ẻ', "\\x1EBB"}, + {U'Ẽ', "\\x1EBC"}, + {U'ẽ', "\\x1EBD"}, + {U'Ế', "\\x1EBE"}, + {U'ế', "\\x1EBF"}, + {U'Ề', "\\x1EC0"}, + {U'ề', "\\x1EC1"}, + {U'Ể', "\\x1EC2"}, + {U'ể', "\\x1EC3"}, + {U'Ễ', "\\x1EC4"}, + {U'ễ', "\\x1EC5"}, + {U'Ệ', "\\x1EC6"}, + {U'ệ', "\\x1EC7"}, + {U'Ỉ', "\\x1EC8"}, + {U'ỉ', "\\x1EC9"}, + {U'Ị', "\\x1ECA"}, + {U'ị', "\\x1ECB"}, + {U'Ọ', "\\x1ECC"}, + {U'ọ', "\\x1ECD"}, + {U'Ỏ', "\\x1ECE"}, + {U'ỏ', "\\x1ECF"}, + {U'Ố', "\\x1ED0"}, + {U'ố', "\\x1ED1"}, + {U'Ồ', "\\x1ED2"}, + {U'ồ', "\\x1ED3"}, + {U'Ổ', "\\x1ED4"}, + {U'ổ', "\\x1ED5"}, + {U'Ỗ', "\\x1ED6"}, + {U'ỗ', "\\x1ED7"}, + {U'Ộ', "\\x1ED8"}, + {U'ộ', "\\x1ED9"}, + {U'Ớ', "\\x1EDA"}, + {U'ớ', "\\x1EDB"}, + {U'Ờ', "\\x1EDC"}, + {U'ờ', "\\x1EDD"}, + {U'Ở', "\\x1EDE"}, + {U'ở', "\\x1EDF"}, + {U'Ỡ', "\\x1EE0"}, + {U'ỡ', "\\x1EE1"}, + {U'Ợ', "\\x1EE2"}, + {U'ợ', "\\x1EE3"}, + {U'Ụ', "\\x1EE4"}, + {U'ụ', "\\x1EE5"}, + {U'Ủ', "\\x1EE6"}, + {U'ủ', "\\x1EE7"}, + {U'Ứ', "\\x1EE8"}, + {U'ứ', "\\x1EE9"}, + {U'Ừ', "\\x1EEA"}, + {U'ừ', "\\x1EEB"}, + {U'Ử', "\\x1EEC"}, + {U'ử', "\\x1EED"}, + {U'Ữ', "\\x1EEE"}, + {U'ữ', "\\x1EEF"}, + {U'Ự', "\\x1EF0"}, + {U'ự', "\\x1EF1"}, + {U'Ỳ', "\\x1EF2"}, + {U'ỳ', "\\x1EF3"}, + {U'Ỵ', "\\x1EF4"}, + {U'ỵ', "\\x1EF5"}, + {U'Ỷ', "\\x1EF6"}, + {U'ỷ', "\\x1EF7"}, + {U'Ỹ', "\\x1EF8"}, + {U'ỹ', "\\x1EF9"}, +}}; + +constexpr std::array kCharsetMappings15{{ + {U'À', "\\u192"}, + {U'Á', "\\u193"}, + {U'Â', "\\u194"}, + {U'Ã', "\\u195"}, + {U'È', "\\u200"}, + {U'É', "\\u201"}, + {U'Ê', "\\u202"}, + {U'Ì', "\\u204"}, + {U'Í', "\\u205"}, + {U'Ò', "\\u210"}, + {U'Ó', "\\u211"}, + {U'Ô', "\\u212"}, + {U'Õ', "\\u213"}, + {U'Ù', "\\u217"}, + {U'Ú', "\\u218"}, + {U'Ý', "\\u221"}, + {U'à', "\\u224"}, + {U'á', "\\u225"}, + {U'â', "\\u226"}, + {U'ã', "\\u227"}, + {U'è', "\\u232"}, + {U'é', "\\u233"}, + {U'ê', "\\u234"}, + {U'ì', "\\u236"}, + {U'í', "\\u237"}, + {U'ò', "\\u242"}, + {U'ó', "\\u243"}, + {U'ô', "\\u244"}, + {U'õ', "\\u245"}, + {U'ù', "\\u249"}, + {U'ú', "\\u250"}, + {U'ý', "\\u253"}, + {U'Ă', "\\u258"}, + {U'ă', "\\u259"}, + {U'Đ', "\\u272"}, + {U'đ', "\\u273"}, + {U'Ĩ', "\\u296"}, + {U'ĩ', "\\u297"}, + {U'Ũ', "\\u360"}, + {U'ũ', "\\u361"}, + {U'Ơ', "\\u416"}, + {U'ơ', "\\u417"}, + {U'Ư', "\\u431"}, + {U'ư', "\\u432"}, + {U'Ạ', "\\u7840"}, + {U'ạ', "\\u7841"}, + {U'Ả', "\\u7842"}, + {U'ả', "\\u7843"}, + {U'Ấ', "\\u7844"}, + {U'ấ', "\\u7845"}, + {U'Ầ', "\\u7846"}, + {U'ầ', "\\u7847"}, + {U'Ẩ', "\\u7848"}, + {U'ẩ', "\\u7849"}, + {U'Ẫ', "\\u7850"}, + {U'ẫ', "\\u7851"}, + {U'Ậ', "\\u7852"}, + {U'ậ', "\\u7853"}, + {U'Ắ', "\\u7854"}, + {U'ắ', "\\u7855"}, + {U'Ằ', "\\u7856"}, + {U'ằ', "\\u7857"}, + {U'Ẳ', "\\u7858"}, + {U'ẳ', "\\u7859"}, + {U'Ẵ', "\\u7860"}, + {U'ẵ', "\\u7861"}, + {U'Ặ', "\\u7862"}, + {U'ặ', "\\u7863"}, + {U'Ẹ', "\\u7864"}, + {U'ẹ', "\\u7865"}, + {U'Ẻ', "\\u7866"}, + {U'ẻ', "\\u7867"}, + {U'Ẽ', "\\u7868"}, + {U'ẽ', "\\u7869"}, + {U'Ế', "\\u7870"}, + {U'ế', "\\u7871"}, + {U'Ề', "\\u7872"}, + {U'ề', "\\u7873"}, + {U'Ể', "\\u7874"}, + {U'ể', "\\u7875"}, + {U'Ễ', "\\u7876"}, + {U'ễ', "\\u7877"}, + {U'Ệ', "\\u7878"}, + {U'ệ', "\\u7879"}, + {U'Ỉ', "\\u7880"}, + {U'ỉ', "\\u7881"}, + {U'Ị', "\\u7882"}, + {U'ị', "\\u7883"}, + {U'Ọ', "\\u7884"}, + {U'ọ', "\\u7885"}, + {U'Ỏ', "\\u7886"}, + {U'ỏ', "\\u7887"}, + {U'Ố', "\\u7888"}, + {U'ố', "\\u7889"}, + {U'Ồ', "\\u7890"}, + {U'ồ', "\\u7891"}, + {U'Ổ', "\\u7892"}, + {U'ổ', "\\u7893"}, + {U'Ỗ', "\\u7894"}, + {U'ỗ', "\\u7895"}, + {U'Ộ', "\\u7896"}, + {U'ộ', "\\u7897"}, + {U'Ớ', "\\u7898"}, + {U'ớ', "\\u7899"}, + {U'Ờ', "\\u7900"}, + {U'ờ', "\\u7901"}, + {U'Ở', "\\u7902"}, + {U'ở', "\\u7903"}, + {U'Ỡ', "\\u7904"}, + {U'ỡ', "\\u7905"}, + {U'Ợ', "\\u7906"}, + {U'ợ', "\\u7907"}, + {U'Ụ', "\\u7908"}, + {U'ụ', "\\u7909"}, + {U'Ủ', "\\u7910"}, + {U'ủ', "\\u7911"}, + {U'Ứ', "\\u7912"}, + {U'ứ', "\\u7913"}, + {U'Ừ', "\\u7914"}, + {U'ừ', "\\u7915"}, + {U'Ử', "\\u7916"}, + {U'ử', "\\u7917"}, + {U'Ữ', "\\u7918"}, + {U'ữ', "\\u7919"}, + {U'Ự', "\\u7920"}, + {U'ự', "\\u7921"}, + {U'Ỳ', "\\u7922"}, + {U'ỳ', "\\u7923"}, + {U'Ỵ', "\\u7924"}, + {U'ỵ', "\\u7925"}, + {U'Ỷ', "\\u7926"}, + {U'ỷ', "\\u7927"}, + {U'Ỹ', "\\u7928"}, + {U'ỹ', "\\u7929"}, +}}; + +constexpr std::array kDefinitions{{ + {"TCVN3 (ABC)", kCharsetMappings0.data(), kCharsetMappings0.size()} , + {"VNI Windows", kCharsetMappings1.data(), kCharsetMappings1.size()} , + {"Unicode tổ hợp", kCharsetMappings2.data(), kCharsetMappings2.size()} , + {"Windows 1258 codepage", kCharsetMappings3.data(), kCharsetMappings3.size()} , + {"VIQR", kCharsetMappings4.data(), kCharsetMappings4.size()} , + {"VISCII", kCharsetMappings5.data(), kCharsetMappings5.size()} , + {"VPS", kCharsetMappings6.data(), kCharsetMappings6.size()} , + {"BKHCM 2", kCharsetMappings7.data(), kCharsetMappings7.size()} , + {"BKHCM 1", kCharsetMappings8.data(), kCharsetMappings8.size()} , + {"Vietware X", kCharsetMappings9.data(), kCharsetMappings9.size()} , + {"Vietware Full", kCharsetMappings10.data(), kCharsetMappings10.size()} , + {"UTF-8", kCharsetMappings11.data(), kCharsetMappings11.size()} , + {"NCR Decimal", kCharsetMappings12.data(), kCharsetMappings12.size()} , + {"NCR Hex", kCharsetMappings13.data(), kCharsetMappings13.size()} , + {"Unicode C string Hex", kCharsetMappings14.data(), kCharsetMappings14.size()} , + {"Unicode C string Decimal", kCharsetMappings15.data(), kCharsetMappings15.size()} , +}}; + +constexpr std::array kNames{ + CharsetDefinition::kUnicode, + "TCVN3 (ABC)", + "VNI Windows", + "Unicode tổ hợp", + "Windows 1258 codepage", + "VIQR", + "VISCII", + "VPS", + "BKHCM 2", + "BKHCM 1", + "Vietware X", + "Vietware Full", + "UTF-8", + "NCR Decimal", + "NCR Hex", + "Unicode C string Hex", + "Unicode C string Decimal", +}; + +[[nodiscard]] const MappingView* findMapping(const DefinitionView& definition, char32_t codePoint) noexcept { + const auto* begin = definition.mappings; + const auto* end = definition.mappings + definition.size; + const auto* it = std::lower_bound(begin, end, codePoint, [](const MappingView& mapping, char32_t value) { + return mapping.source < value; + }); + if (it != end && it->source == codePoint) { + return it; + } + return nullptr; +} + +void appendUtf8(std::string& output, char32_t codePoint) { + if (codePoint <= 0x7F) { output.push_back(static_cast(codePoint)); return; } + if (codePoint <= 0x7FF) { + output.push_back(static_cast(0xC0 | (codePoint >> 6))); + output.push_back(static_cast(0x80 | (codePoint & 0x3F))); + return; + } + if (codePoint <= 0xFFFF) { + output.push_back(static_cast(0xE0 | (codePoint >> 12))); + output.push_back(static_cast(0x80 | ((codePoint >> 6) & 0x3F))); + output.push_back(static_cast(0x80 | (codePoint & 0x3F))); + return; + } + output.push_back(static_cast(0xF0 | (codePoint >> 18))); + output.push_back(static_cast(0x80 | ((codePoint >> 12) & 0x3F))); + output.push_back(static_cast(0x80 | ((codePoint >> 6) & 0x3F))); + output.push_back(static_cast(0x80 | (codePoint & 0x3F))); +} + +} // namespace + +const CharsetDefinition::DefinitionView* CharsetDefinition::find(std::string_view name) noexcept { + for (const auto& definition : kDefinitions) { + if (definition.name == name) { + return &definition; + } + } + return nullptr; +} + +std::string_view CharsetDefinition::lookupEncoded(std::string_view name, char32_t codePoint) noexcept { + const DefinitionView* definition = find(name); + if (definition == nullptr) { + return {}; + } + const MappingView* mapping = findMapping(*definition, codePoint); + return mapping == nullptr ? std::string_view{} : mapping->encoded; +} + +std::string CharsetDefinition::encode(std::string_view name, std::u32string_view input) { + std::string output; + output.reserve(input.size() * 4U); + const DefinitionView* definition = (name == kUnicode) ? nullptr : find(name); + for (const char32_t codePoint : input) { + if (definition != nullptr) { + if (const MappingView* mapping = findMapping(*definition, codePoint); mapping != nullptr) { + output.append(mapping->encoded.data(), mapping->encoded.size()); + continue; + } + } + appendUtf8(output, codePoint); + } + return output; +} + +const std::array& CharsetDefinition::names() noexcept { + return kNames; +} + +} // namespace bamboo::engine diff --git a/src/engine/charset_definition.h b/src/engine/charset_definition.h new file mode 100644 index 0000000..5b8bb8b --- /dev/null +++ b/src/engine/charset_definition.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include + +namespace bamboo::engine { + +class CharsetDefinition final { +public: + static constexpr std::string_view kUnicode = "Unicode"; + + struct MappingView final { + char32_t source; + std::string_view encoded; + }; + + struct DefinitionView final { + std::string_view name; + const MappingView* mappings; + std::size_t size; + }; + + [[nodiscard]] static const DefinitionView* find(std::string_view name) noexcept; + [[nodiscard]] static std::string_view lookupEncoded(std::string_view name, char32_t codePoint) noexcept; + [[nodiscard]] static std::string encode(std::string_view name, std::u32string_view input); + [[nodiscard]] static const std::array& names() noexcept; +}; + +} // namespace bamboo::engine diff --git a/src/engine/encoder.cpp b/src/engine/encoder.cpp new file mode 100644 index 0000000..8e3cdfe --- /dev/null +++ b/src/engine/encoder.cpp @@ -0,0 +1,15 @@ +#include "encoder.h" + +#include "charset_definition.h" + +namespace bamboo::engine { + +std::string Encoder::encode(std::string_view charsetName, std::u32string_view input) { + return CharsetDefinition::encode(charsetName, input); +} + +const std::array& Encoder::charsetNames() noexcept { + return CharsetDefinition::names(); +} + +} // namespace bamboo::engine diff --git a/src/engine/encoder.h b/src/engine/encoder.h new file mode 100644 index 0000000..e9290da --- /dev/null +++ b/src/engine/encoder.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include +#include + +namespace bamboo::engine { + +class Encoder final { +public: + [[nodiscard]] static std::string encode(std::string_view charsetName, std::u32string_view input); + [[nodiscard]] static const std::array& charsetNames() noexcept; +}; + +} // namespace bamboo::engine diff --git a/src/engine/engine.cpp b/src/engine/engine.cpp new file mode 100644 index 0000000..5414057 --- /dev/null +++ b/src/engine/engine.cpp @@ -0,0 +1,260 @@ +#include "engine.h" + +#include "charset_definition.h" +#include "encoder.h" +#include "rules_parser.h" +#include "spelling.h" +#include "transformation_utils.h" + +#include + +namespace bamboo::engine { +namespace { + +[[nodiscard]] std::u32string decodeUtf8(std::string_view input) { + std::u32string output; + output.reserve(input.size()); + for (std::size_t index = 0; index < input.size();) { + const unsigned char byte0 = static_cast(input[index]); + if (byte0 < 0x80) { + output.push_back(static_cast(byte0)); + ++index; + } else if ((byte0 & 0xE0U) == 0xC0U && index + 1 < input.size()) { + const unsigned char byte1 = static_cast(input[index + 1]); + output.push_back(static_cast(((byte0 & 0x1FU) << 6) | (byte1 & 0x3FU))); + index += 2; + } else if ((byte0 & 0xF0U) == 0xE0U && index + 2 < input.size()) { + const unsigned char byte1 = static_cast(input[index + 1]); + const unsigned char byte2 = static_cast(input[index + 2]); + output.push_back(static_cast(((byte0 & 0x0FU) << 12) | ((byte1 & 0x3FU) << 6) | (byte2 & 0x3FU))); + index += 3; + } else { + output.push_back(static_cast(byte0)); + ++index; + } + } + return output; +} + +} // namespace + +Engine::Engine(std::string_view dataDirPath, std::string_view inputMethod) + : dataDirPath_(dataDirPath), + inputMethod_(parseInputMethod(inputMethod.empty() ? std::string_view{"Telex"} : inputMethod)) { + if (inputMethod_.name.empty()) { + inputMethod_ = parseInputMethod("Telex"); + } + encodedCache_.reserve(256); +} + +void Engine::setMode(api::Mode mode) { mode_ = mode; } +api::Mode Engine::getMode() const { return mode_; } + +void Engine::reset() { + composition_.clear(); + encodedCache_.clear(); + encodedCacheDirty_ = true; +} + +std::vector Engine::applicableRules(char32_t key) const { + std::vector rules; + for (const Rule& rule : inputMethod_.rules) { + if (rule.key == key) { + rules.push_back(rule); + } + } + return rules; +} + +bool Engine::canProcessKey(char32_t key) const noexcept { + if ((key >= U'a' && key <= U'z') || (key >= U'A' && key <= U'Z')) { + return true; + } + return std::find(inputMethod_.keys.begin(), inputMethod_.keys.end(), key) != inputMethod_.keys.end(); +} + +void Engine::appendRawKey(char32_t key, bool isUpperCase) { + Transformation trans; + trans.isUpperCase = isUpperCase; + trans.rule.key = key; + trans.rule.effectOn = key; + trans.rule.result = key; + trans.rule.effectType = EffectType::Appending; + composition_.push_back(trans); + encodedCacheDirty_ = true; +} + +void Engine::processKey(char32_t key) { + const char32_t lowerKey = toLowerCodePoint(key); + const bool isUpperCase = key != lowerKey; + + if (mode_ == api::Mode::English || !canProcessKey(lowerKey)) { + appendRawKey(lowerKey, isUpperCase); + return; + } + + const std::vector rules = applicableRules(lowerKey); + if (std::find(inputMethod_.superKeys.begin(), inputMethod_.superKeys.end(), lowerKey) != inputMethod_.superKeys.end()) { + const std::u32string word = currentWord(composition_); + if (word.size() >= 2) { + const std::u32string tail = word.substr(word.size() - 2); + const bool isUoShortcut = tail == U"uo" || tail == U"ưo"; + const bool isUongShortcut = word.size() >= 5 && word.substr(word.size() - 5) == U"uong"; + if (isUoShortcut || isUongShortcut) { + Transformation* uTarget = nullptr; + Transformation* oTarget = nullptr; + for (auto it = composition_.rbegin(); it != composition_.rend(); ++it) { + if (it->rule.effectType != EffectType::Appending) { + continue; + } + if (oTarget == nullptr && it->rule.effectOn == U'o') { + oTarget = &*it; + } else if (uTarget == nullptr && it->rule.effectOn == U'u') { + uTarget = &*it; + } + } + for (const Rule& rule : rules) { + if (isUongShortcut && uTarget != nullptr && oTarget != nullptr && + rule.effectType == EffectType::MarkTransformation) { + if (rule.effectOn == U'u') { + uTarget->rule.effectOn = U'ư'; + uTarget->rule.result = U'ư'; + } else if (rule.effectOn == U'o') { + oTarget->rule.effectOn = U'ơ'; + oTarget->rule.result = U'ơ'; + } + continue; + } + if (rule.effectType == EffectType::MarkTransformation && rule.effectOn == U'o' && oTarget != nullptr) { + Transformation trans; + trans.rule = rule; + trans.target = oTarget; + trans.isUpperCase = isUpperCase; + composition_.push_back(trans); + } + if (((word.size() == 2 && tail == U"uo") || isUongShortcut) && + rule.effectType == EffectType::MarkTransformation && rule.effectOn == U'u' && uTarget != nullptr) { + Transformation trans; + trans.rule = rule; + trans.target = uTarget; + trans.isUpperCase = isUpperCase; + composition_.push_back(trans); + } + } + if (oTarget != nullptr) { + encodedCacheDirty_ = true; + return; + } + } + } + } + + CompositionView syllable = extractLastSyllable(makeCompositionView(composition_)); + std::vector pending; + + if (const PendingTransformation direct = findTarget(syllable, rules); direct.target != nullptr) { + pending.push_back(direct); + } else { + pending = generateUndoTransformations(syllable, rules); + if (!pending.empty()) { + Rule rawRule{}; + rawRule.key = lowerKey; + rawRule.effectOn = lowerKey; + rawRule.result = lowerKey; + rawRule.effectType = EffectType::Appending; + pending.push_back(PendingTransformation{rawRule, nullptr, isUpperCase}); + } else { + pending = generateFallbackTransformations(rules, lowerKey, isUpperCase); + } + } + + for (PendingTransformation& item : pending) { + Transformation trans; + trans.rule = item.rule; + trans.target = item.target; + trans.isUpperCase = item.isUpperCase; + composition_.push_back(trans); + } + + CompositionView updated = extractLastSyllable(makeCompositionView(composition_)); + for (PendingTransformation& item : refreshLastToneTarget(updated)) { + Transformation trans; + trans.rule = item.rule; + trans.target = item.target; + trans.isUpperCase = item.isUpperCase; + composition_.push_back(trans); + } + + encodedCacheDirty_ = true; +} + +void Engine::processString(std::string_view str) { + for (char32_t cp : decodeUtf8(str)) { + processKey(cp); + } +} + +std::string Engine::getProcessedString() const { + if (!encodedCacheDirty_) { + return encodedCache_; + } + encodedCache_ = Encoder::encode(CharsetDefinition::kUnicode, flattenVietnamese(composition_, false)); + encodedCacheDirty_ = false; + return encodedCache_; +} + +bool Engine::isValid(bool inputIsFullComplete) const { + if (mode_ == api::Mode::English) { + return true; + } + const std::u32string word = currentWord(composition_); + if (word.empty()) { + return true; + } + const Segments segments = splitWord(word); + Spelling spelling; + return spelling.isValidCvc(segments.firstConsonant, segments.vowel, segments.lastConsonant, inputIsFullComplete); +} + +void Engine::removeLastChar(bool /*refreshLastToneTarget*/) { + if (!composition_.empty()) { + composition_.pop_back(); + while (!composition_.empty() && composition_.back().rule.key == 0) { + composition_.pop_back(); + } + encodedCacheDirty_ = true; + } +} + +void Engine::restoreLastWord(bool toVietnamese) { + if (toVietnamese) { + mode_ = api::Mode::Vietnamese; + return; + } + + CompositionView syllable = extractLastSyllable(makeCompositionView(composition_)); + std::deque broken = breakComposition(syllable); + while (!composition_.empty()) { + const Transformation& back = composition_.back(); + if (back.rule.effectType == EffectType::Appending && + (back.rule.key == U' ' || back.rule.key == U'\n' || back.rule.key == U'\t')) { + break; + } + composition_.pop_back(); + } + for (Transformation& trans : broken) { + composition_.push_back(trans); + } + mode_ = api::Mode::English; + encodedCacheDirty_ = true; +} + +} // namespace bamboo::engine + +namespace bamboo::api { + +std::unique_ptr createEngine(std::string_view dataDirPath, std::string_view inputMethod) { + return std::make_unique(dataDirPath, inputMethod); +} + +} // namespace bamboo::api diff --git a/src/engine/engine.h b/src/engine/engine.h new file mode 100644 index 0000000..2483cbd --- /dev/null +++ b/src/engine/engine.h @@ -0,0 +1,46 @@ +#pragma once + +#include "bamboo/IEngine.h" +#include "rule.h" + +#include +#include +#include + +namespace bamboo::engine { + +struct Transformation final { + Rule rule; + Transformation* target{nullptr}; + bool isUpperCase{false}; +}; + +class Engine final : public api::IEngine { +public: + Engine(std::string_view dataDirPath, std::string_view inputMethod); + ~Engine() override = default; + + void setMode(api::Mode mode) override; + [[nodiscard]] api::Mode getMode() const override; + void reset() override; + void processKey(char32_t key) override; + void processString(std::string_view str) override; + [[nodiscard]] std::string getProcessedString() const override; + [[nodiscard]] bool isValid(bool inputIsFullComplete) const override; + void removeLastChar(bool refreshLastToneTarget) override; + void restoreLastWord(bool toVietnamese) override; + +private: + [[nodiscard]] std::vector applicableRules(char32_t key) const; + [[nodiscard]] bool canProcessKey(char32_t key) const noexcept; + void appendRawKey(char32_t key, bool isUpperCase); + + api::Mode mode_{api::Mode::Vietnamese}; + std::string dataDirPath_; + InputMethod inputMethod_; + std::deque composition_; + mutable std::string encodedCache_; + mutable bool encodedCacheDirty_{true}; +}; + +} // namespace bamboo::engine diff --git a/src/engine/input_method_definition.cpp b/src/engine/input_method_definition.cpp new file mode 100644 index 0000000..18da8c9 --- /dev/null +++ b/src/engine/input_method_definition.cpp @@ -0,0 +1,233 @@ +#include "input_method_definition.h" + +#include + +namespace bamboo::engine { +namespace { + +using MappingView = InputMethodDefinition::MappingView; +using DefinitionView = InputMethodDefinition::DefinitionView; + +constexpr std::array kTelexMappings{{ + {U'z', "XoaDauThanh"}, + {U's', "DauSac"}, + {U'f', "DauHuyen"}, + {U'r', "DauHoi"}, + {U'x', "DauNga"}, + {U'j', "DauNang"}, + {U'a', "A_Â"}, + {U'e', "E_Ê"}, + {U'o', "O_Ô"}, + {U'w', "UOA_ƯƠĂ"}, + {U'd', "D_Đ"}, +}}; + +constexpr std::array kVniMappings{{ + {U'0', "XoaDauThanh"}, + {U'1', "DauSac"}, + {U'2', "DauHuyen"}, + {U'3', "DauHoi"}, + {U'4', "DauNga"}, + {U'5', "DauNang"}, + {U'6', "AEO_ÂÊÔ"}, + {U'7', "UO_ƯƠ"}, + {U'8', "A_Ă"}, + {U'9', "D_Đ"}, +}}; + +constexpr std::array kViqrMappings{{ + {U'0', "XoaDauThanh"}, + {U'\'', "DauSac"}, + {U'`', "DauHuyen"}, + {U'?', "DauHoi"}, + {U'~', "DauNga"}, + {U'.', "DauNang"}, + {U'^', "AEO_ÂÊÔ"}, + {U'+', "UO_ƯƠ"}, + {U'*', "UO_ƯƠ"}, + {U'(', "A_Ă"}, + {U'd', "D_Đ"}, +}}; + +constexpr std::array kMicrosoftMappings{{ + {U'8', "DauSac"}, + {U'5', "DauHuyen"}, + {U'6', "DauHoi"}, + {U'7', "DauNga"}, + {U'9', "DauNang"}, + {U'1', "__ă"}, + {U'!', "_Ă"}, + {U'2', "__â"}, + {U'@', "_Â"}, + {U'3', "__ê"}, + {U'#', "_Ê"}, + {U'4', "__ô"}, + {U'$', "_Ô"}, + {U'0', "__đ"}, + {U')', "_Đ"}, + {U'[', "__ư"}, + {U'{', "_Ư"}, + {U']', "__ơ"}, + {U'}', "_Ơ"}, +}}; + +constexpr std::array kTelex2Mappings{{ + {U'z', "XoaDauThanh"}, + {U's', "DauSac"}, + {U'f', "DauHuyen"}, + {U'r', "DauHoi"}, + {U'x', "DauNga"}, + {U'j', "DauNang"}, + {U'a', "A_Â"}, + {U'e', "E_Ê"}, + {U'o', "O_Ô"}, + {U'w', "UOA_ƯƠĂ__Ư"}, + {U'd', "D_Đ"}, + {U']', "__ư"}, + {U'[', "__ơ"}, + {U'}', "_Ư"}, + {U'{', "_Ơ"}, +}}; + +constexpr std::array kTelexVniMappings{{ + {U'z', "XoaDauThanh"}, + {U's', "DauSac"}, + {U'f', "DauHuyen"}, + {U'r', "DauHoi"}, + {U'x', "DauNga"}, + {U'j', "DauNang"}, + {U'a', "A_Â"}, + {U'e', "E_Ê"}, + {U'o', "O_Ô"}, + {U'w', "UOA_ƯƠĂ"}, + {U'd', "D_Đ"}, + {U'0', "XoaDauThanh"}, + {U'1', "DauSac"}, + {U'2', "DauHuyen"}, + {U'3', "DauHoi"}, + {U'4', "DauNga"}, + {U'5', "DauNang"}, + {U'6', "AEO_ÂÊÔ"}, + {U'7', "UO_ƯƠ"}, + {U'8', "A_Ă"}, + {U'9', "D_Đ"}, +}}; + +constexpr std::array kTelexVniViqrMappings{{ + {U'z', "XoaDauThanh"}, + {U's', "DauSac"}, + {U'f', "DauHuyen"}, + {U'r', "DauHoi"}, + {U'x', "DauNga"}, + {U'j', "DauNang"}, + {U'a', "A_Â"}, + {U'e', "E_Ê"}, + {U'o', "O_Ô"}, + {U'w', "UOA_ƯƠĂ"}, + {U'd', "D_Đ"}, + {U'0', "XoaDauThanh"}, + {U'1', "DauSac"}, + {U'2', "DauHuyen"}, + {U'3', "DauHoi"}, + {U'4', "DauNga"}, + {U'5', "DauNang"}, + {U'6', "AEO_ÂÊÔ"}, + {U'7', "UO_ƯƠ"}, + {U'8', "A_Ă"}, + {U'9', "D_Đ"}, + {U'\'', "DauSac"}, + {U'`', "DauHuyen"}, + {U'?', "DauHoi"}, + {U'~', "DauNga"}, + {U'.', "DauNang"}, + {U'^', "AEO_ÂÊÔ"}, + {U'+', "UO_ƯƠ"}, + {U'*', "UO_ƯƠ"}, + {U'(', "A_Ă"}, + {U'\\', "D_Đ"}, +}}; + +constexpr std::array kFrenchVniMappings{{ + {U'&', "XoaDauThanh"}, + {U'é', "DauSac"}, + {U'"', "DauHuyen"}, + {U'\'', "DauHoi"}, + {U'(', "DauNga"}, + {U'-', "DauNang"}, + {U'è', "AEO_ÂÊÔ"}, + {U'_', "UO_ƯƠ"}, + {U'ç', "A_Ă"}, + {U'à', "D_Đ"}, +}}; + +constexpr std::array kTelexWMappings{{ + {U'z', "XoaDauThanh"}, + {U's', "DauSac"}, + {U'f', "DauHuyen"}, + {U'r', "DauHoi"}, + {U'x', "DauNga"}, + {U'j', "DauNang"}, + {U'a', "A_Â"}, + {U'e', "E_Ê"}, + {U'o', "O_Ô"}, + {U'w', "UOA_ƯƠĂ__Ư"}, + {U'd', "D_Đ"}, +}}; + +constexpr std::array kDefinitions{{ + {"Telex", kTelexMappings.data(), kTelexMappings.size()}, + {"VNI", kVniMappings.data(), kVniMappings.size()}, + {"VIQR", kViqrMappings.data(), kViqrMappings.size()}, + {"Microsoft layout", kMicrosoftMappings.data(), kMicrosoftMappings.size()}, + {"Telex 2", kTelex2Mappings.data(), kTelex2Mappings.size()}, + {"Telex + VNI", kTelexVniMappings.data(), kTelexVniMappings.size()}, + {"Telex + VNI + VIQR", kTelexVniViqrMappings.data(), kTelexVniViqrMappings.size()}, + {"VNI Bàn phím tiếng Pháp", kFrenchVniMappings.data(), kFrenchVniMappings.size()}, + {"Telex W", kTelexWMappings.data(), kTelexWMappings.size()}, +}}; + +constexpr std::array kNames{{ + "Telex", + "VNI", + "VIQR", + "Microsoft layout", + "Telex 2", + "Telex + VNI", + "Telex + VNI + VIQR", + "VNI Bàn phím tiếng Pháp", + "Telex W", +}}; + +[[nodiscard]] std::string_view lookupInMappings(const DefinitionView& definition, char32_t key) noexcept { + for (std::size_t index = 0; index < definition.size; ++index) { + if (definition.mappings[index].key == key) { + return definition.mappings[index].action; + } + } + return {}; +} + +} // namespace + +const InputMethodDefinition::DefinitionView* InputMethodDefinition::find(std::string_view name) noexcept { + for (const auto& definition : kDefinitions) { + if (definition.name == name) { + return &definition; + } + } + return nullptr; +} + +std::string_view InputMethodDefinition::lookupAction(std::string_view name, char32_t key) noexcept { + const DefinitionView* definition = find(name); + if (definition == nullptr) { + return {}; + } + return lookupInMappings(*definition, key); +} + +const std::array& InputMethodDefinition::names() noexcept { + return kNames; +} + +} // namespace bamboo::engine diff --git a/src/engine/input_method_definition.h b/src/engine/input_method_definition.h new file mode 100644 index 0000000..46106d4 --- /dev/null +++ b/src/engine/input_method_definition.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +namespace bamboo::engine { + +class InputMethodDefinition final { +public: + struct MappingView final { + char32_t key; + std::string_view action; + }; + + struct DefinitionView final { + std::string_view name; + const MappingView* mappings; + std::size_t size; + }; + + [[nodiscard]] static const DefinitionView* find(std::string_view name) noexcept; + [[nodiscard]] static std::string_view lookupAction(std::string_view name, char32_t key) noexcept; + [[nodiscard]] static const std::array& names() noexcept; +}; + +} // namespace bamboo::engine diff --git a/src/engine/rule.h b/src/engine/rule.h new file mode 100644 index 0000000..fdedc99 --- /dev/null +++ b/src/engine/rule.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include + +namespace bamboo::engine { + +enum class EffectType : std::uint8_t { + Appending = 1u << 0, + MarkTransformation = 1u << 1, + ToneTransformation = 1u << 2, + Replacing = 1u << 3, +}; + +enum class Mark : std::uint8_t { + None = 0, + Hat = 1, + Breve = 2, + Horn = 3, + Dash = 4, + Raw = 5, +}; + +enum class Tone : std::uint8_t { + None = 0, + Grave = 1, + Acute = 2, + Hook = 3, + Tilde = 4, + Dot = 5, +}; + +struct Rule final { + char32_t key{0}; + std::uint8_t effect{0}; + EffectType effectType{EffectType::Appending}; + char32_t effectOn{0}; + char32_t result{0}; + std::vector appendedRules; + + void setTone(Tone tone) noexcept { effect = static_cast(tone); } + void setMark(Mark mark) noexcept { effect = static_cast(mark); } + [[nodiscard]] Tone tone() const noexcept { return static_cast(effect); } + [[nodiscard]] Mark mark() const noexcept { return static_cast(effect); } +}; + +struct InputMethod final { + std::string name; + std::vector rules; + std::vector superKeys; + std::vector toneKeys; + std::vector appendingKeys; + std::vector keys; +}; + +} // namespace bamboo::engine diff --git a/src/engine/rules_parser.cpp b/src/engine/rules_parser.cpp new file mode 100644 index 0000000..844a823 --- /dev/null +++ b/src/engine/rules_parser.cpp @@ -0,0 +1,312 @@ +#include "rules_parser.h" + +#include "input_method_definition.h" +#include "spelling.h" + +#include +#include +#include +#include +#include + +namespace bamboo::engine { +namespace { + +constexpr std::array, 6> kTones{{ + {"XoaDauThanh", Tone::None}, + {"DauSac", Tone::Acute}, + {"DauHuyen", Tone::Grave}, + {"DauNga", Tone::Tilde}, + {"DauNang", Tone::Dot}, + {"DauHoi", Tone::Hook}, +}}; + +constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵ"; + +[[nodiscard]] std::u32string decodeUtf8(std::string_view input) { + std::u32string output; + output.reserve(input.size()); + for (std::size_t index = 0; index < input.size();) { + const unsigned char byte0 = static_cast(input[index]); + if (byte0 < 0x80) { + output.push_back(static_cast(byte0)); + ++index; + } else if ((byte0 & 0xE0U) == 0xC0U && index + 1 < input.size()) { + const unsigned char byte1 = static_cast(input[index + 1]); + output.push_back(static_cast(((byte0 & 0x1FU) << 6) | (byte1 & 0x3FU))); + index += 2; + } else if ((byte0 & 0xF0U) == 0xE0U && index + 2 < input.size()) { + const unsigned char byte1 = static_cast(input[index + 1]); + const unsigned char byte2 = static_cast(input[index + 2]); + output.push_back(static_cast(((byte0 & 0x0FU) << 12) | ((byte1 & 0x3FU) << 6) | (byte2 & 0x3FU))); + index += 3; + } else if ((byte0 & 0xF8U) == 0xF0U && index + 3 < input.size()) { + const unsigned char byte1 = static_cast(input[index + 1]); + const unsigned char byte2 = static_cast(input[index + 2]); + const unsigned char byte3 = static_cast(input[index + 3]); + output.push_back(static_cast(((byte0 & 0x07U) << 18) | ((byte1 & 0x3FU) << 12) | + ((byte2 & 0x3FU) << 6) | (byte3 & 0x3FU))); + index += 4; + } else { + output.push_back(static_cast(byte0)); + ++index; + } + } + return output; +} + +[[nodiscard]] char32_t toLowerCodePoint(char32_t codePoint) noexcept { + if (codePoint >= U'A' && codePoint <= U'Z') { + return codePoint + 32; + } + switch (codePoint) { + case U'Đ': return U'đ'; + case U'Â': return U'â'; + case U'Ă': return U'ă'; + case U'Ê': return U'ê'; + case U'Ô': return U'ô'; + case U'Ơ': return U'ơ'; + case U'Ư': return U'ư'; + default: return codePoint; + } +} + +[[nodiscard]] bool isAsciiAlphaString(std::string_view value) noexcept { + return !value.empty() && std::all_of(value.begin(), value.end(), [](unsigned char c) { return std::isalpha(c) != 0; }); +} + +[[nodiscard]] int findVowelPosition(char32_t chr) noexcept { + for (std::size_t index = 0; index < kVowels.size(); ++index) { + if (kVowels[index] == chr) { + return static_cast(index); + } + } + return -1; +} + +[[nodiscard]] bool isVowel(char32_t chr) noexcept { + return findVowelPosition(chr) >= 0; +} + +[[nodiscard]] char32_t addToneToChar(char32_t chr, std::uint8_t tone) noexcept { + const int position = findVowelPosition(chr); + if (position < 0) { + return chr; + } + const int currentTone = position % 6; + return kVowels[static_cast(position + static_cast(tone) - currentTone)]; +} + +[[nodiscard]] std::u32string_view markFamily(char32_t chr) noexcept { + switch (chr) { + case U'a': + case U'â': + case U'ă': + return U"aâă"; + case U'e': + case U'ê': + return U"eê"; + case U'o': + case U'ô': + case U'ơ': + return U"oôơ"; + case U'u': + case U'ư': + return U"uư"; + case U'd': + case U'đ': + return U"dđ"; + default: + return U""; + } +} + +[[nodiscard]] std::optional findMarkFromChar(char32_t chr) noexcept { + switch (chr) { + case U'a': + case U'e': + case U'o': + case U'u': + case U'd': + return Mark::None; + case U'â': + case U'ê': + case U'ô': + return Mark::Hat; + case U'ă': + return Mark::Breve; + case U'ơ': + case U'ư': + return Mark::Horn; + case U'đ': + return Mark::Dash; + default: + return std::nullopt; + } +} + +[[nodiscard]] std::optional getAppendingRule(char32_t key, std::u32string_view value) { + const std::size_t firstUnderscore = value.find(U'_'); + if (firstUnderscore == std::u32string_view::npos || firstUnderscore > 1) { + return std::nullopt; + } + if (firstUnderscore + 1 >= value.size() || value[firstUnderscore + 1] != U'_') { + return std::nullopt; + } + + const std::u32string chars(value.substr(firstUnderscore + 2)); + if (chars.empty()) { + return std::nullopt; + } + + Rule rule; + rule.key = key; + rule.effectType = EffectType::Appending; + rule.effectOn = chars.front(); + rule.result = chars.front(); + for (std::size_t index = 1; index < chars.size(); ++index) { + Rule appended; + appended.key = key; + appended.effectType = EffectType::Appending; + appended.effectOn = chars[index]; + appended.result = chars[index]; + rule.appendedRules.push_back(appended); + } + return rule; +} + +} // namespace + +InputMethod parseInputMethod(std::string_view inputMethodName) { + InputMethod result; + const InputMethodDefinition::DefinitionView* definition = InputMethodDefinition::find(inputMethodName); + if (definition == nullptr) { + return result; + } + + result.name = std::string(definition->name); + result.rules.reserve(definition->size * 4U); + result.keys.reserve(definition->size); + + for (std::size_t index = 0; index < definition->size; ++index) { + const auto& mapping = definition->mappings[index]; + std::vector parsedRules = parseRules(mapping.key, mapping.action); + result.rules.insert(result.rules.end(), parsedRules.begin(), parsedRules.end()); + if (mapping.action.find("uo") != std::string_view::npos || mapping.action.find("UO") != std::string_view::npos) { + result.superKeys.push_back(mapping.key); + } + result.keys.push_back(mapping.key); + } + + for (const Rule& rule : result.rules) { + if (rule.effectType == EffectType::Appending) { + result.appendingKeys.push_back(rule.key); + } + if (rule.effectType == EffectType::ToneTransformation) { + result.toneKeys.push_back(rule.key); + } + } + + return result; +} + +std::vector parseRules(char32_t key, std::string_view line) { + for (const auto& tone : kTones) { + if (tone.first == line) { + Rule rule; + rule.key = key; + rule.effectType = EffectType::ToneTransformation; + rule.setTone(tone.second); + return {rule}; + } + } + return parseTonelessRules(key, line); +} + +std::vector parseTonelessRules(char32_t key, std::string_view line) { + std::vector rules; + + const std::u32string decodedLine = decodeUtf8(line); + std::u32string normalizedLine; + normalizedLine.reserve(decodedLine.size()); + for (const char32_t codePoint : decodedLine) { + normalizedLine.push_back(toLowerCodePoint(codePoint)); + } + + const std::size_t underscore = normalizedLine.find(U'_'); + if (underscore != std::u32string::npos) { + std::string effectiveOnsAscii; + effectiveOnsAscii.reserve(underscore); + bool asciiOnly = true; + for (std::size_t index = 0; index < underscore; ++index) { + const char32_t cp = normalizedLine[index]; + if (cp > 0x7F) { asciiOnly = false; break; } + effectiveOnsAscii.push_back(static_cast(cp)); + } + + if (asciiOnly && isAsciiAlphaString(effectiveOnsAscii)) { + const std::u32string effectiveOns(normalizedLine.begin(), normalizedLine.begin() + static_cast(underscore)); + const std::u32string rest(normalizedLine.begin() + static_cast(underscore + 1), normalizedLine.end()); + if (rest.size() >= effectiveOns.size()) { + const std::u32string results(rest.begin(), rest.begin() + static_cast(effectiveOns.size())); + for (std::size_t index = 0; index < effectiveOns.size(); ++index) { + if (const std::optional effect = findMarkFromChar(results[index]); effect.has_value()) { + std::vector parsed = parseToneLessRule(key, effectiveOns[index], results[index], *effect); + rules.insert(rules.end(), parsed.begin(), parsed.end()); + } + } + + if (rest.size() > effectiveOns.size()) { + if (const std::optional appendRule = getAppendingRule(key, std::u32string_view(rest).substr(effectiveOns.size())); appendRule.has_value()) { + rules.push_back(*appendRule); + } + } + return rules; + } + } + } + + if (const std::optional appendRule = getAppendingRule(key, normalizedLine); appendRule.has_value()) { + rules.push_back(*appendRule); + } + return rules; +} + +std::vector parseToneLessRule(char32_t key, char32_t effectiveOn, char32_t result, Mark effect) { + std::vector rules; + constexpr std::array tones{{Tone::None, Tone::Dot, Tone::Acute, Tone::Grave, Tone::Hook, Tone::Tilde}}; + + const std::u32string_view family = markFamily(effectiveOn); + for (const char32_t chr : family) { + if (chr == result) { + Rule rule; + rule.key = key; + rule.effectType = EffectType::MarkTransformation; + rule.effectOn = result; + rule.result = effectiveOn; + rules.push_back(rule); + } else if (isVowel(chr)) { + for (const Tone tone : tones) { + Rule rule; + rule.key = key; + rule.effectType = EffectType::MarkTransformation; + rule.setMark(effect); + rule.effectOn = addToneToChar(chr, static_cast(tone)); + rule.result = addToneToChar(result, static_cast(tone)); + rules.push_back(rule); + } + } else { + Rule rule; + rule.key = key; + rule.effectType = EffectType::MarkTransformation; + rule.setMark(effect); + rule.effectOn = chr; + rule.result = result; + rules.push_back(rule); + } + } + + return rules; +} + +} // namespace bamboo::engine diff --git a/src/engine/rules_parser.h b/src/engine/rules_parser.h new file mode 100644 index 0000000..bb6adf9 --- /dev/null +++ b/src/engine/rules_parser.h @@ -0,0 +1,15 @@ +#pragma once + +#include "rule.h" + +#include +#include + +namespace bamboo::engine { + +[[nodiscard]] InputMethod parseInputMethod(std::string_view inputMethodName); +[[nodiscard]] std::vector parseRules(char32_t key, std::string_view line); +[[nodiscard]] std::vector parseTonelessRules(char32_t key, std::string_view line); +[[nodiscard]] std::vector parseToneLessRule(char32_t key, char32_t effectiveOn, char32_t result, Mark effect); + +} // namespace bamboo::engine diff --git a/src/engine/spelling.cpp b/src/engine/spelling.cpp new file mode 100644 index 0000000..a38cad4 --- /dev/null +++ b/src/engine/spelling.cpp @@ -0,0 +1,225 @@ +#include "spelling.h" + +#include + +namespace bamboo::engine { +namespace { + +constexpr std::size_t kMaxTokensPerRow = 31; + +struct TokenRow final { + std::array tokens{}; + std::uint8_t size{0}; +}; + +constexpr TokenRow makeRow(std::initializer_list init) { + TokenRow row{}; + row.size = static_cast(init.size()); + std::size_t index = 0; + for (const auto token : init) { + row.tokens[index++] = token; + } + return row; +} + +constexpr std::array kFirstConsonantRows{ + makeRow({U"b", U"d", U"đ", U"g", U"gh", U"m", U"n", U"nh", U"p", U"ph", U"r", U"s", U"t", U"tr", U"v", U"z"}), + makeRow({U"c", U"h", U"k", U"kh", U"kr", U"qu", U"th"}), + makeRow({U"ch", U"gi", U"l", U"ng", U"ngh", U"x"}), + makeRow({U"b", U"đ", U"l"}), + makeRow({U"h"}), +}; + +constexpr std::array kVowelRows{ + makeRow({U"ê", U"i", U"ua", U"uê", U"uy", U"y"}), + makeRow({U"a", U"iê", U"oa", U"uyê", U"yê"}), + makeRow({U"â", U"ă", U"e", U"o", U"oo", U"ô", U"ơ", U"oe", U"u", U"ư", U"uâ", U"uô", U"ươ"}), + makeRow({U"oă"}), + makeRow({U"uơ"}), + makeRow({U"ai", U"ao", U"au", U"âu", U"ay", U"ây", U"eo", U"êu", U"ia", U"iêu", U"iu", U"oai", U"oao", U"oay", U"oeo", U"oi", U"ôi", U"ơi", U"ưa", U"uây", U"ui", U"ưi", U"uôi", U"ươi", U"ươu", U"ưu", U"uya", U"uyu", U"uêu", U"yêu"}), + makeRow({U"ă", U"u"}), + makeRow({U"i"}), +}; + +constexpr std::array kLastConsonantRows{ + makeRow({U"ch", U"nh"}), + makeRow({U"c", U"ng"}), + makeRow({U"m", U"n", U"p", U"t"}), + makeRow({U"k"}), + makeRow({U"c"}), +}; + +constexpr std::array, 5> kCvMatrix{{ + {{0, 1, 2, 5, 0xFF, 0xFF}}, + {{0, 1, 2, 3, 4, 5}}, + {{0, 1, 2, 3, 5, 0xFF}}, + {{6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, + {{7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, +}}; +constexpr std::array kCvMatrixSizes{{4, 6, 5, 1, 1}}; + +constexpr std::array, 8> kVcMatrix{{ + {{0, 2}}, + {{0, 1}}, + {{1, 2}}, + {{1, 2}}, + {{0xFF, 0xFF}}, + {{0xFF, 0xFF}}, + {{3, 0xFF}}, + {{4, 0xFF}}, +}}; +constexpr std::array kVcMatrixSizes{{2, 2, 2, 2, 0, 0, 1, 1}}; + +bool matchesRow(const TokenRow& row, + std::u32string_view input, + bool inputIsFull, + bool inputIsComplete, + std::uint8_t rowIndex, + IndexSet<8>& result) noexcept { + for (std::uint8_t tokenIndex = 0; tokenIndex < row.size; ++tokenIndex) { + const std::u32string_view token = row.tokens[tokenIndex]; + if (token.size() < input.size() || (inputIsFull && token.size() > input.size())) { + continue; + } + + bool isMatch = true; + for (std::size_t i = 0; i < input.size(); ++i) { + if (input[i] == token[i]) { + continue; + } + if (!inputIsComplete && Spelling::normalizeToneless(token[i]) == input[i]) { + continue; + } + isMatch = false; + break; + } + if (isMatch) { + result.push(rowIndex); + return true; + } + } + return false; +} + +template +auto lookupRows(const std::array& rows, + std::u32string_view input, + bool inputIsFull, + bool inputIsComplete) noexcept { + IndexSet<8> result; + for (std::uint8_t rowIndex = 0; rowIndex < rows.size(); ++rowIndex) { + matchesRow(rows[rowIndex], input, inputIsFull, inputIsComplete, rowIndex, result); + } + return result; +} + +} // namespace + +char32_t Spelling::normalizeToneless(char32_t codePoint) noexcept { + switch (codePoint) { + case U'â': + case U'ă': + return U'a'; + case U'ê': + return U'e'; + case U'ô': + case U'ơ': + return U'o'; + case U'ư': + return U'u'; + case U'đ': + return U'd'; + default: + return codePoint; + } +} + +bool Spelling::isValidCv(const RowMatchSet& firstConsonantIndexes, + const RowMatchSet& vowelIndexes) noexcept { + for (std::uint8_t i = 0; i < firstConsonantIndexes.size; ++i) { + const std::uint8_t firstConsonant = firstConsonantIndexes.values[i]; + for (std::uint8_t cvIndex = 0; cvIndex < kCvMatrixSizes[firstConsonant]; ++cvIndex) { + const std::uint8_t allowedVowel = kCvMatrix[firstConsonant][cvIndex]; + for (std::uint8_t j = 0; j < vowelIndexes.size; ++j) { + if (allowedVowel == vowelIndexes.values[j]) { + return true; + } + } + } + } + return false; +} + +bool Spelling::isValidVc(const RowMatchSet& vowelIndexes, + const RowMatchSet& lastConsonantIndexes) noexcept { + for (std::uint8_t i = 0; i < vowelIndexes.size; ++i) { + const std::uint8_t vowel = vowelIndexes.values[i]; + for (std::uint8_t vcIndex = 0; vcIndex < kVcMatrixSizes[vowel]; ++vcIndex) { + const std::uint8_t allowedLastConsonant = kVcMatrix[vowel][vcIndex]; + for (std::uint8_t j = 0; j < lastConsonantIndexes.size; ++j) { + if (allowedLastConsonant == lastConsonantIndexes.values[j]) { + return true; + } + } + } + } + return false; +} + +bool Spelling::isValidCvc(std::u32string_view firstConsonant, + std::u32string_view vowel, + std::u32string_view lastConsonant, + bool inputIsFullComplete) const noexcept { + RowMatchSet firstConsonantIndexes; + RowMatchSet vowelIndexes; + RowMatchSet lastConsonantIndexes; + + if (!firstConsonant.empty()) { + firstConsonantIndexes = lookupRows(kFirstConsonantRows, + firstConsonant, + inputIsFullComplete || !vowel.empty(), + true); + if (firstConsonantIndexes.empty()) { + return false; + } + } + + if (!vowel.empty()) { + vowelIndexes = lookupRows(kVowelRows, + vowel, + inputIsFullComplete || !lastConsonant.empty(), + inputIsFullComplete); + if (vowelIndexes.empty()) { + return false; + } + } + + if (!lastConsonant.empty()) { + lastConsonantIndexes = lookupRows(kLastConsonantRows, + lastConsonant, + inputIsFullComplete, + true); + if (lastConsonantIndexes.empty()) { + return false; + } + } + + if (vowelIndexes.empty()) { + return !firstConsonantIndexes.empty(); + } + + if (!firstConsonantIndexes.empty()) { + const bool validCv = isValidCv(firstConsonantIndexes, vowelIndexes); + if (!validCv || lastConsonantIndexes.empty()) { + return validCv; + } + } + + if (!lastConsonantIndexes.empty()) { + return isValidVc(vowelIndexes, lastConsonantIndexes); + } + + return true; +} + +} // namespace bamboo::engine diff --git a/src/engine/spelling.h b/src/engine/spelling.h new file mode 100644 index 0000000..a244d6e --- /dev/null +++ b/src/engine/spelling.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include +#include + +namespace bamboo::engine { + +template +struct IndexSet final { + std::array values{}; + std::uint8_t size{0}; + + [[nodiscard]] bool empty() const noexcept { return size == 0; } + + void push(std::uint8_t value) noexcept { + if (size < Capacity) { + values[size++] = value; + } + } +}; + +class Spelling final { +public: + [[nodiscard]] bool isValidCvc(std::u32string_view firstConsonant, + std::u32string_view vowel, + std::u32string_view lastConsonant, + bool inputIsFullComplete) const noexcept; + [[nodiscard]] static char32_t normalizeToneless(char32_t codePoint) noexcept; + +private: + using RowMatchSet = IndexSet<8>; + + [[nodiscard]] static bool isValidCv(const RowMatchSet& firstConsonantIndexes, + const RowMatchSet& vowelIndexes) noexcept; + [[nodiscard]] static bool isValidVc(const RowMatchSet& vowelIndexes, + const RowMatchSet& lastConsonantIndexes) noexcept; +}; + +} // namespace bamboo::engine diff --git a/src/engine/transformation_utils.cpp b/src/engine/transformation_utils.cpp new file mode 100644 index 0000000..2c38866 --- /dev/null +++ b/src/engine/transformation_utils.cpp @@ -0,0 +1,448 @@ +#include "transformation_utils.h" +#include "spelling.h" + +#include + +namespace bamboo::engine { +namespace { + +constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵ"; + +[[nodiscard]] char32_t toUpperCodePoint(char32_t codePoint) noexcept { + if (codePoint >= U'a' && codePoint <= U'z') { + return codePoint - 32; + } + switch (codePoint) { + case U'đ': return U'Đ'; + case U'â': return U'Â'; + case U'ă': return U'Ă'; + case U'ê': return U'Ê'; + case U'ô': return U'Ô'; + case U'ơ': return U'Ơ'; + case U'ư': return U'Ư'; + default: return codePoint; + } +} + +[[nodiscard]] int findVowelPosition(char32_t chr) noexcept { + const auto pos = kVowels.find(chr); + return pos == std::u32string_view::npos ? -1 : static_cast(pos); +} + +[[nodiscard]] char32_t addToneToChar(char32_t chr, std::uint8_t tone) noexcept { + const int position = findVowelPosition(chr); + if (position < 0) { + return chr; + } + const int currentTone = position % 6; + return kVowels[static_cast(position + static_cast(tone) - currentTone)]; +} + +[[nodiscard]] char32_t addMarkToTonelessChar(char32_t chr, std::uint8_t mark) noexcept { + switch (chr) { + case U'a': return mark == 1 ? U'â' : (mark == 2 ? U'ă' : U'a'); + case U'â': return mark == 0 ? U'a' : U'â'; + case U'ă': return mark == 0 ? U'a' : U'ă'; + case U'e': return mark == 1 ? U'ê' : U'e'; + case U'ê': return mark == 0 ? U'e' : U'ê'; + case U'o': return mark == 1 ? U'ô' : (mark == 3 ? U'ơ' : U'o'); + case U'ô': return mark == 0 ? U'o' : U'ô'; + case U'ơ': return mark == 0 ? U'o' : U'ơ'; + case U'u': return mark == 3 ? U'ư' : U'u'; + case U'ư': return mark == 0 ? U'u' : U'ư'; + case U'd': return mark == 4 ? U'đ' : U'd'; + case U'đ': return mark == 0 ? U'd' : U'đ'; + default: return chr; + } +} + +[[nodiscard]] char32_t addMarkToChar(char32_t chr, std::uint8_t mark) noexcept { + const std::uint8_t tone = static_cast(findVowelPosition(chr) >= 0 ? findVowelPosition(chr) % 6 : 0); + const char32_t toneless = addToneToChar(chr, 0); + return addToneToChar(addMarkToTonelessChar(toneless, mark), tone); +} + +[[nodiscard]] char32_t stripToneAndMark(char32_t chr) noexcept { + return addMarkToTonelessChar(addToneToChar(chr, 0), 0); +} + + +[[nodiscard]] std::u32string flattenVietnameseView(const CompositionView& composition, bool lowerCase, bool toneLess, bool markLess) { + std::u32string canvas; + canvas.reserve(composition.size()); + for (const Transformation* appending : composition) { + if (appending->rule.effectType != EffectType::Appending || appending->rule.key == 0) { + continue; + } + char32_t chr = appending->rule.effectOn; + for (const Transformation* trans : composition) { + if (trans->target != appending) { + continue; + } + if (trans->rule.effectType == EffectType::MarkTransformation) { + chr = trans->rule.mark() == Mark::Raw ? appending->rule.key : addMarkToChar(chr, trans->rule.effect); + } else if (trans->rule.effectType == EffectType::ToneTransformation) { + chr = addToneToChar(chr, trans->rule.effect); + } + } + if (toneLess) { + chr = addToneToChar(chr, 0); + } + if (markLess) { + chr = addMarkToTonelessChar(chr, 0); + } + if (lowerCase) { + chr = toLowerCodePoint(chr); + } + canvas.push_back(chr); + } + return canvas; +} + +[[nodiscard]] bool validateWord(std::u32string_view word, bool full) { + const Segments segments = splitWord(word); + Spelling spelling; + return spelling.isValidCvc(segments.firstConsonant, segments.vowel, segments.lastConsonant, full); +} + +[[nodiscard]] char32_t renderTargetChar(const CompositionView& composition, const Transformation* target) { + char32_t chr = target->rule.effectOn; + for (const Transformation* trans : composition) { + if (trans->target != target) { + continue; + } + if (trans->rule.effectType == EffectType::MarkTransformation) { + chr = trans->rule.mark() == Mark::Raw ? target->rule.key : addMarkToChar(chr, trans->rule.effect); + } else if (trans->rule.effectType == EffectType::ToneTransformation) { + chr = addToneToChar(chr, trans->rule.effect); + } + } + return chr; +} + +[[nodiscard]] Transformation* findRootTarget(Transformation* target) noexcept { + while (target != nullptr && target->target != nullptr) { + target = target->target; + } + return target; +} + +[[nodiscard]] Transformation* findToneTarget(const CompositionView& composition) noexcept { + std::vector vowels; + vowels.reserve(3); + for (Transformation* trans : composition) { + if (trans->rule.effectType == EffectType::Appending && isVowel(trans->rule.effectOn)) { + vowels.push_back(trans); + } + } + if (vowels.empty()) { + return nullptr; + } + if (vowels.size() == 1) { + return vowels.front(); + } + + std::u32string vowelShape; + vowelShape.reserve(vowels.size()); + for (Transformation* vowel : vowels) { + vowelShape.push_back(stripToneAndMark(renderTargetChar(composition, vowel))); + } + bool hasTrailingConsonant = false; + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + if ((*it)->rule.effectType != EffectType::Appending || (*it)->rule.key == 0) { + continue; + } + hasTrailingConsonant = !isVowel((*it)->rule.effectOn); + break; + } + + if (vowels.size() == 2) { + if (hasTrailingConsonant) { + return vowels[1]; + } + if (vowelShape == U"oa" || vowelShape == U"oe" || vowelShape == U"uy" || vowelShape == U"ue" || vowelShape == U"uo") { + return vowels[1]; + } + return vowels[0]; + } + + if (vowelShape == U"uye") { + return vowels[2]; + } + return vowels[1]; +} + +} // namespace + +char32_t toLowerCodePoint(char32_t codePoint) noexcept { + if (codePoint >= U'A' && codePoint <= U'Z') { + return codePoint + 32; + } + switch (codePoint) { + case U'Đ': return U'đ'; + case U'Â': return U'â'; + case U'Ă': return U'ă'; + case U'Ê': return U'ê'; + case U'Ô': return U'ô'; + case U'Ơ': return U'ơ'; + case U'Ư': return U'ư'; + default: return codePoint; + } +} + +bool isVowel(char32_t chr) noexcept { + return kVowels.find(chr) != std::u32string_view::npos; +} + +CompositionView makeCompositionView(std::deque& composition) { + CompositionView view; + view.reserve(composition.size()); + for (Transformation& trans : composition) { + view.push_back(&trans); + } + return view; +} + +CompositionView extractLastSyllable(const CompositionView& composition) { + std::size_t start = 0; + for (std::size_t index = composition.size(); index > 0; --index) { + const Transformation* trans = composition[index - 1]; + if (trans->rule.effectType == EffectType::Appending && + (trans->rule.key == U' ' || trans->rule.key == U'\n' || trans->rule.key == U'\t')) { + start = index; + break; + } + } + return CompositionView(composition.begin() + static_cast(start), composition.end()); +} + +PendingTransformation findMarkTarget(const CompositionView& composition, + const std::vector& applicableRules) noexcept { + const std::u32string current = flattenVietnameseView(composition, true, false, false); + for (const Rule& rule : applicableRules) { + if (rule.effectType != EffectType::MarkTransformation) { + continue; + } + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + Transformation* trans = *it; + if (trans->rule.effectType != EffectType::Appending || trans->rule.effectOn != rule.effectOn) { + continue; + } + std::u32string mutated = current; + const char32_t rendered = renderTargetChar(composition, trans); + for (std::size_t index = mutated.size(); index > 0; --index) { + if (mutated[index - 1] == rendered) { + mutated[index - 1] = addMarkToChar(mutated[index - 1], rule.effect); + break; + } + } + if (mutated != current && validateWord(mutated, false)) { + return PendingTransformation{rule, findRootTarget(trans), false}; + } + } + } + return {}; +} + +PendingTransformation findTarget(const CompositionView& composition, + const std::vector& applicableRules) noexcept { + const std::u32string current = flattenVietnameseView(composition, true, false, false); + for (const Rule& rule : applicableRules) { + if (rule.effectType == EffectType::ToneTransformation) { + if (Transformation* target = findToneTarget(composition); target != nullptr) { + std::u32string mutated = current; + const char32_t rendered = renderTargetChar(composition, target); + for (std::size_t index = mutated.size(); index > 0; --index) { + if (mutated[index - 1] == rendered) { + mutated[index - 1] = addToneToChar(mutated[index - 1], rule.effect); + break; + } + } + if (mutated != current) { + return PendingTransformation{rule, target, false}; + } + } + } + } + PendingTransformation markTarget = findMarkTarget(composition, applicableRules); + if (markTarget.target != nullptr) { + return markTarget; + } + + const std::u32string normalized = flattenVietnameseView(composition, true, true, true); + if (normalized.find(U"uo") != std::u32string::npos || normalized.find(U"ươ") != std::u32string::npos || normalized.find(U"ưo") != std::u32string::npos) { + Transformation* vowelTarget = nullptr; + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + Transformation* trans = *it; + if (trans->rule.effectType == EffectType::Appending && isVowel(trans->rule.effectOn)) { + vowelTarget = trans; + break; + } + } + if (vowelTarget != nullptr) { + for (const Rule& rule : applicableRules) { + if (rule.effectType == EffectType::MarkTransformation && rule.effectOn == vowelTarget->rule.effectOn) { + return PendingTransformation{rule, vowelTarget, false}; + } + } + } + } + return {}; +} + +std::vector generateUndoTransformations(const CompositionView& composition, + const std::vector& applicableRules) { + std::vector result; + for (const Rule& rule : applicableRules) { + if (rule.effectType == EffectType::ToneTransformation) { + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + if ((*it)->rule.effectType == EffectType::ToneTransformation && (*it)->target != nullptr) { + Rule undoRule{}; + undoRule.effectType = EffectType::ToneTransformation; + undoRule.effect = static_cast(Tone::None); + result.push_back(PendingTransformation{undoRule, (*it)->target, false}); + return result; + } + } + } else if (rule.effectType == EffectType::MarkTransformation) { + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + if ((*it)->rule.effectType == EffectType::MarkTransformation && (*it)->target != nullptr) { + Rule undoRule{}; + undoRule.effectType = EffectType::MarkTransformation; + undoRule.effect = static_cast(Mark::None); + result.push_back(PendingTransformation{undoRule, (*it)->target, false}); + return result; + } + } + } + } + return result; +} + +std::vector generateFallbackTransformations(const std::vector& applicableRules, + char32_t lowerKey, + bool isUpperCase) { + std::vector result; + for (const Rule& rule : applicableRules) { + if (rule.effectType != EffectType::Appending) { + continue; + } + Rule baseRule = rule; + baseRule.effectOn = toLowerCodePoint(baseRule.effectOn); + baseRule.result = baseRule.effectOn; + result.push_back(PendingTransformation{baseRule, nullptr, isUpperCase || rule.effectOn != baseRule.effectOn}); + for (const Rule& appendedRule : rule.appendedRules) { + Rule virtualRule = appendedRule; + virtualRule.key = 0; + virtualRule.effectOn = toLowerCodePoint(virtualRule.effectOn); + virtualRule.result = virtualRule.effectOn; + result.push_back(PendingTransformation{virtualRule, nullptr, isUpperCase || appendedRule.effectOn != virtualRule.effectOn}); + } + return result; + } + + Rule rawRule{}; + rawRule.key = lowerKey; + rawRule.effectOn = lowerKey; + rawRule.result = lowerKey; + rawRule.effectType = EffectType::Appending; + result.push_back(PendingTransformation{rawRule, nullptr, isUpperCase}); + return result; +} + +std::vector refreshLastToneTarget(const CompositionView& composition) { + Transformation* latestTone = nullptr; + for (auto it = composition.rbegin(); it != composition.rend(); ++it) { + if ((*it)->rule.effectType == EffectType::ToneTransformation && (*it)->target != nullptr) { + latestTone = *it; + break; + } + } + if (latestTone == nullptr) { + return {}; + } + Transformation* newTarget = findToneTarget(composition); + if (newTarget == nullptr || newTarget == latestTone->target) { + return {}; + } + Rule undoRule{}; + undoRule.effectType = EffectType::ToneTransformation; + undoRule.effect = static_cast(Tone::None); + Rule overrideRule = latestTone->rule; + overrideRule.key = 0; + return {PendingTransformation{undoRule, latestTone->target, false}, PendingTransformation{overrideRule, newTarget, false}}; +} + +std::u32string flattenVietnamese(const std::deque& composition, bool lowerCase) { + std::u32string canvas; + canvas.reserve(composition.size()); + for (const Transformation& appending : composition) { + if (appending.rule.effectType != EffectType::Appending || appending.rule.key == 0) { + continue; + } + + char32_t chr = appending.rule.effectOn; + for (const Transformation& trans : composition) { + if (trans.target != &appending) { + continue; + } + if (trans.rule.effectType == EffectType::MarkTransformation) { + if (trans.rule.mark() == Mark::Raw) { + chr = appending.rule.key; + } else { + chr = addMarkToChar(chr, trans.rule.effect); + } + } else if (trans.rule.effectType == EffectType::ToneTransformation) { + chr = addToneToChar(chr, trans.rule.effect); + } + } + + if (lowerCase) { + chr = toLowerCodePoint(chr); + } else if (appending.isUpperCase) { + chr = toUpperCodePoint(chr); + } + canvas.push_back(chr); + } + return canvas; +} + +std::u32string currentWord(const std::deque& composition) { + const std::u32string text = flattenVietnamese(composition, true); + const auto pos = text.find_last_of(U" \n\t"); + if (pos == std::u32string::npos) { + return text; + } + return text.substr(pos + 1); +} + +Segments splitWord(std::u32string_view word) noexcept { + std::size_t firstVowelIndex = 0; + while (firstVowelIndex < word.size() && !isVowel(word[firstVowelIndex])) { + ++firstVowelIndex; + } + std::size_t trailingConsonantIndex = word.size(); + while (trailingConsonantIndex > firstVowelIndex && !isVowel(word[trailingConsonantIndex - 1])) { + --trailingConsonantIndex; + } + return {word.substr(0, firstVowelIndex), word.substr(firstVowelIndex, trailingConsonantIndex - firstVowelIndex), word.substr(trailingConsonantIndex)}; +} + +std::deque breakComposition(const CompositionView& composition) { + std::deque result; + for (const Transformation* trans : composition) { + if (trans->rule.key == 0) { + continue; + } + Transformation broken; + broken.isUpperCase = trans->isUpperCase; + broken.rule.key = trans->rule.key; + broken.rule.effectOn = trans->rule.key; + broken.rule.result = trans->rule.key; + broken.rule.effectType = EffectType::Appending; + result.push_back(broken); + } + return result; +} + +} // namespace bamboo::engine diff --git a/src/engine/transformation_utils.h b/src/engine/transformation_utils.h new file mode 100644 index 0000000..c766076 --- /dev/null +++ b/src/engine/transformation_utils.h @@ -0,0 +1,42 @@ +#pragma once + +#include "engine.h" + +#include + +namespace bamboo::engine { + +using CompositionView = std::vector; + +struct PendingTransformation final { + Rule rule; + Transformation* target{nullptr}; + bool isUpperCase{false}; +}; + +struct Segments final { + std::u32string_view firstConsonant; + std::u32string_view vowel; + std::u32string_view lastConsonant; +}; + +[[nodiscard]] CompositionView makeCompositionView(std::deque& composition); +[[nodiscard]] CompositionView extractLastSyllable(const CompositionView& composition); +[[nodiscard]] PendingTransformation findMarkTarget(const CompositionView& composition, + const std::vector& applicableRules) noexcept; +[[nodiscard]] PendingTransformation findTarget(const CompositionView& composition, + const std::vector& applicableRules) noexcept; +[[nodiscard]] std::vector generateUndoTransformations(const CompositionView& composition, + const std::vector& applicableRules); +[[nodiscard]] std::vector generateFallbackTransformations(const std::vector& applicableRules, + char32_t lowerKey, + bool isUpperCase); +[[nodiscard]] std::vector refreshLastToneTarget(const CompositionView& composition); +[[nodiscard]] std::u32string flattenVietnamese(const std::deque& composition, bool lowerCase); +[[nodiscard]] std::u32string currentWord(const std::deque& composition); +[[nodiscard]] Segments splitWord(std::u32string_view word) noexcept; +[[nodiscard]] std::deque breakComposition(const CompositionView& composition); +[[nodiscard]] char32_t toLowerCodePoint(char32_t codePoint) noexcept; +[[nodiscard]] bool isVowel(char32_t chr) noexcept; + +} // namespace bamboo::engine diff --git a/tests/engine_parity_smoke.cpp b/tests/engine_parity_smoke.cpp new file mode 100644 index 0000000..81d0eff --- /dev/null +++ b/tests/engine_parity_smoke.cpp @@ -0,0 +1,78 @@ +#include "bamboo/IEngine.h" + +#include +#include +#include + +namespace { + +void test_telex_aw() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("aw"); + assert(engine->getProcessedString() == std::string("ă")); +} + +void test_telex_as() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("as"); + assert(engine->getProcessedString() == std::string("á")); +} + +void test_telex_dd() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("dd"); + assert(engine->getProcessedString() == std::string("đ")); +} + +void test_telex_sawss() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("sawss"); + assert(engine->getProcessedString() == std::string("săs")); +} + +void test_telex2_uwow() { + auto engine = bamboo::api::createEngine("/tmp", "Telex 2"); + engine->processString("uwow"); + assert(engine->getProcessedString() == std::string("ươ")); +} + +void test_telex2_thuow_and_backspace() { + auto engine = bamboo::api::createEngine("/tmp", "Telex 2"); + engine->processString("Thuow"); + assert(engine->getProcessedString() == std::string("Thuơ")); + engine->removeLastChar(true); + assert(engine->getProcessedString() == std::string("Thuo")); +} + +void test_telex2_choas() { + auto engine = bamboo::api::createEngine("/tmp", "Telex 2"); + engine->processString("choas"); + assert(engine->getProcessedString() == std::string("choá")); +} + +void test_telex_buwoo() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("buwoo"); + assert(engine->getProcessedString() == std::string("buô")); +} + +void test_telex_cuongw() { + auto engine = bamboo::api::createEngine("/tmp", "Telex"); + engine->processString("cuongw"); + assert(engine->getProcessedString() == std::string("cương")); +} + +} // namespace + +int main() { + test_telex_aw(); + test_telex_as(); + test_telex_dd(); + test_telex_sawss(); + test_telex2_uwow(); + test_telex2_thuow_and_backspace(); + test_telex2_choas(); + test_telex_buwoo(); + test_telex_cuongw(); + return 0; +} From 873e46861378229c49ba570ddbf9b93d1d2f3755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hu=E1=BB=B3nh=20Thi=E1=BB=87n=20L=E1=BB=99c?= Date: Sun, 22 Mar 2026 12:31:41 +0700 Subject: [PATCH 4/5] Refactor composition view to use deque slices --- docs/migration-semantics.md | 61 +++++ src/engine/engine.cpp | 172 +++++++++---- src/engine/engine.h | 3 +- src/engine/rule.h | 46 +++- src/engine/rules_parser.cpp | 15 +- src/engine/spelling.cpp | 20 +- src/engine/transformation_utils.cpp | 293 ++++++++++++++------- src/engine/transformation_utils.h | 56 ++++- tests/differential_cases.json | 142 +++++++++++ tools/run_differential.py | 377 ++++++++++++++++++++++++++++ 10 files changed, 1014 insertions(+), 171 deletions(-) create mode 100644 docs/migration-semantics.md create mode 100644 tests/differential_cases.json create mode 100644 tools/run_differential.py diff --git a/docs/migration-semantics.md b/docs/migration-semantics.md new file mode 100644 index 0000000..ef7af87 --- /dev/null +++ b/docs/migration-semantics.md @@ -0,0 +1,61 @@ +# C++ engine migration semantics + +This document freezes the intended semantic contract between the legacy Go engine and the migrated C++ engine. + +## Scope + +- **Reference implementation:** the existing Go engine in the repository root. +- **Target implementation:** the C++ engine behind `include/bamboo/IEngine.h`. +- **Primary goal:** match Go typing behavior for end users before performing deeper performance refactors. + +## Go behavior + +The Go engine exposes a broad flag- and mode-based API: + +- Per-call modes include Vietnamese, English, tone-less, mark-less, lowercase, full-text, punctuation mode, and reverse-order typing. +- Engine flags include free tone marking, standard tone style, and autocorrect. +- `ProcessKey` receives the mode for each call instead of storing mode inside the engine. +- `RemoveLastChar(refreshLastTone)` optionally recomputes the latest tone target after deletion. +- `RestoreLastWord(false)` breaks the latest transformed word back into raw key strokes. +- `RestoreLastWord(true)` rebuilds the latest word into Vietnamese transformations. + +## C++ target behavior + +The C++ API intentionally narrows the public surface: + +- Public mode is stateful and only exposes `Vietnamese` and `English`. +- `processKey(char32_t)` and `processString(std::string_view)` operate against the current mode set via `setMode`. +- `removeLastChar(bool refreshLastToneTarget)` must preserve the Go deletion semantics. +- `restoreLastWord(bool toVietnamese)` must preserve the Go restore semantics: + - `false`: break the latest transformed word back to raw key strokes and switch to English mode. + - `true`: rebuild the latest word as Vietnamese composition and switch to Vietnamese mode. +- Backspace (`'\b'` and `0x7f`) in Vietnamese mode is treated as deletion, not as ordinary input. + +## Required parity areas + +The following behaviors are treated as **must-match** relative to Go: + +1. Input-method transformations for Telex, VNI, and VIQR. +2. Tone placement and tone retargeting after edits. +3. `removeLastChar(...)` behavior, including optional tone refresh. +4. `restoreLastWord(...)` behavior in both directions. +5. Backspace handling while in Vietnamese mode. + +## Intentional deviations + +The following differences are currently intentional and must be documented rather than treated as regressions: + +1. The C++ public API does **not** expose Go's per-call mode bitmask directly. +2. Go flags are currently treated as internal policy rather than stable public API in C++. +3. Punctuation/full-text/reverse-order/tone-less/mark-less output modes are not yet part of `IEngine`. + +When one of these items needs to be supported in `fcitx5-lotus`, it should first be added to this document before changing the public contract. + +## Migration rule + +Before optimizing the hot path further, every semantic change must satisfy one of these conditions: + +- it is covered by a Go-vs-C++ differential test, or +- it is listed in **Intentional deviations** above. + +If neither condition is true, the change is considered unsafe. diff --git a/src/engine/engine.cpp b/src/engine/engine.cpp index 5414057..51a0595 100644 --- a/src/engine/engine.cpp +++ b/src/engine/engine.cpp @@ -36,6 +36,62 @@ namespace { return output; } +[[nodiscard]] int findVowelPosition(char32_t chr) noexcept { + static constexpr std::u32string_view kVowels = + U"aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵ"; + const auto pos = kVowels.find(chr); + return pos == std::u32string_view::npos ? -1 : static_cast(pos); +} + +[[nodiscard]] char32_t stripTone(char32_t chr) noexcept { + const int position = findVowelPosition(chr); + if (position < 0) { + return chr; + } + return static_cast( + U"aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵ" + [static_cast(position - (position % 6))]); +} + +[[nodiscard]] std::u32string stripTone(std::u32string_view input) { + std::u32string output; + output.reserve(input.size()); + for (char32_t chr : input) { + output.push_back(stripTone(chr)); + } + return output; +} + +[[nodiscard]] bool isBackspaceKey(char32_t key) noexcept { + return key == U'\b' || key == 0x7f; +} + +[[nodiscard]] char32_t toUpperCodePoint(char32_t codePoint) noexcept { + if (codePoint >= U'a' && codePoint <= U'z') { + return codePoint - 32; + } + switch (codePoint) { + case U'đ': return U'Đ'; + case U'â': return U'Â'; + case U'ă': return U'Ă'; + case U'ê': return U'Ê'; + case U'ô': return U'Ô'; + case U'ơ': return U'Ơ'; + case U'ư': return U'Ư'; + default: return codePoint; + } +} + +void appendPending(std::deque& composition, const PendingTransformationList& pending) { + for (const PendingTransformation& item : pending) { + Transformation trans; + trans.rule = item.rule; + trans.target = item.target; + trans.isUpperCase = item.isUpperCase; + composition.push_back(trans); + } +} + } // namespace Engine::Engine(std::string_view dataDirPath, std::string_view inputMethod) @@ -56,15 +112,7 @@ void Engine::reset() { encodedCacheDirty_ = true; } -std::vector Engine::applicableRules(char32_t key) const { - std::vector rules; - for (const Rule& rule : inputMethod_.rules) { - if (rule.key == key) { - rules.push_back(rule); - } - } - return rules; -} +RuleSpan Engine::applicableRules(char32_t key) const noexcept { return inputMethod_.rulesFor(key); } bool Engine::canProcessKey(char32_t key) const noexcept { if ((key >= U'a' && key <= U'z') || (key >= U'A' && key <= U'Z')) { @@ -84,7 +132,23 @@ void Engine::appendRawKey(char32_t key, bool isUpperCase) { encodedCacheDirty_ = true; } +void Engine::handleBackspace() { + if (composition_.empty()) { + return; + } + composition_.pop_back(); + while (!composition_.empty() && composition_.back().rule.key == 0) { + composition_.pop_back(); + } + encodedCacheDirty_ = true; +} + void Engine::processKey(char32_t key) { + if (mode_ != api::Mode::English && isBackspaceKey(key)) { + handleBackspace(); + return; + } + const char32_t lowerKey = toLowerCodePoint(key); const bool isUpperCase = key != lowerKey; @@ -93,7 +157,7 @@ void Engine::processKey(char32_t key) { return; } - const std::vector rules = applicableRules(lowerKey); + const RuleSpan rules = applicableRules(lowerKey); if (std::find(inputMethod_.superKeys.begin(), inputMethod_.superKeys.end(), lowerKey) != inputMethod_.superKeys.end()) { const std::u32string word = currentWord(composition_); if (word.size() >= 2) { @@ -101,6 +165,11 @@ void Engine::processKey(char32_t key) { const bool isUoShortcut = tail == U"uo" || tail == U"ưo"; const bool isUongShortcut = word.size() >= 5 && word.substr(word.size() - 5) == U"uong"; if (isUoShortcut || isUongShortcut) { + auto isValidShortcutWord = [](std::u32string_view candidate) { + Spelling spelling; + const Segments segments = splitWord(candidate); + return spelling.isValidCvc(segments.firstConsonant, segments.vowel, segments.lastConsonant, true); + }; Transformation* uTarget = nullptr; Transformation* oTarget = nullptr; for (auto it = composition_.rbegin(); it != composition_.rend(); ++it) { @@ -132,13 +201,20 @@ void Engine::processKey(char32_t key) { trans.isUpperCase = isUpperCase; composition_.push_back(trans); } - if (((word.size() == 2 && tail == U"uo") || isUongShortcut) && + if ((isUoShortcut || isUongShortcut) && rule.effectType == EffectType::MarkTransformation && rule.effectOn == U'u' && uTarget != nullptr) { - Transformation trans; - trans.rule = rule; - trans.target = uTarget; - trans.isUpperCase = isUpperCase; - composition_.push_back(trans); + std::u32string mutated = word; + if (!mutated.empty() && mutated.back() == U'o') { + mutated.back() = U'ơ'; + } + if (!isValidShortcutWord(mutated) || isUongShortcut) { + Transformation trans; + trans.rule = rule; + trans.rule.key = 0; + trans.target = uTarget; + trans.isUpperCase = isUpperCase; + composition_.push_back(trans); + } } } if (oTarget != nullptr) { @@ -150,7 +226,7 @@ void Engine::processKey(char32_t key) { } CompositionView syllable = extractLastSyllable(makeCompositionView(composition_)); - std::vector pending; + PendingTransformationList pending; if (const PendingTransformation direct = findTarget(syllable, rules); direct.target != nullptr) { pending.push_back(direct); @@ -168,22 +244,10 @@ void Engine::processKey(char32_t key) { } } - for (PendingTransformation& item : pending) { - Transformation trans; - trans.rule = item.rule; - trans.target = item.target; - trans.isUpperCase = item.isUpperCase; - composition_.push_back(trans); - } + appendPending(composition_, pending); CompositionView updated = extractLastSyllable(makeCompositionView(composition_)); - for (PendingTransformation& item : refreshLastToneTarget(updated)) { - Transformation trans; - trans.rule = item.rule; - trans.target = item.target; - trans.isUpperCase = item.isUpperCase; - composition_.push_back(trans); - } + appendPending(composition_, refreshLastToneTarget(updated)); encodedCacheDirty_ = true; } @@ -204,35 +268,34 @@ std::string Engine::getProcessedString() const { } bool Engine::isValid(bool inputIsFullComplete) const { - if (mode_ == api::Mode::English) { - return true; - } const std::u32string word = currentWord(composition_); if (word.empty()) { return true; } - const Segments segments = splitWord(word); + const std::u32string toneLessWord = stripTone(word); + const Segments segments = splitWord(toneLessWord); Spelling spelling; return spelling.isValidCvc(segments.firstConsonant, segments.vowel, segments.lastConsonant, inputIsFullComplete); } -void Engine::removeLastChar(bool /*refreshLastToneTarget*/) { - if (!composition_.empty()) { - composition_.pop_back(); - while (!composition_.empty() && composition_.back().rule.key == 0) { - composition_.pop_back(); - } - encodedCacheDirty_ = true; +void Engine::removeLastChar(bool refreshLastToneTargetFlag) { + handleBackspace(); + if (!refreshLastToneTargetFlag) { + return; } + + CompositionView updated = extractLastSyllable(makeCompositionView(composition_)); + appendPending(composition_, refreshLastToneTarget(updated)); + encodedCacheDirty_ = true; } void Engine::restoreLastWord(bool toVietnamese) { - if (toVietnamese) { - mode_ = api::Mode::Vietnamese; + CompositionView syllable = extractLastSyllable(makeCompositionView(composition_)); + if (syllable.empty()) { + mode_ = toVietnamese ? api::Mode::Vietnamese : api::Mode::English; return; } - CompositionView syllable = extractLastSyllable(makeCompositionView(composition_)); std::deque broken = breakComposition(syllable); while (!composition_.empty()) { const Transformation& back = composition_.back(); @@ -242,10 +305,23 @@ void Engine::restoreLastWord(bool toVietnamese) { } composition_.pop_back(); } - for (Transformation& trans : broken) { - composition_.push_back(trans); + + if (!toVietnamese) { + for (Transformation& trans : broken) { + composition_.push_back(trans); + } + mode_ = api::Mode::English; + encodedCacheDirty_ = true; + return; + } + + const api::Mode previousMode = mode_; + mode_ = api::Mode::Vietnamese; + for (const Transformation& trans : broken) { + const char32_t key = trans.isUpperCase ? toUpperCodePoint(trans.rule.key) : trans.rule.key; + processKey(key); } - mode_ = api::Mode::English; + mode_ = previousMode == api::Mode::English ? api::Mode::Vietnamese : previousMode; encodedCacheDirty_ = true; } diff --git a/src/engine/engine.h b/src/engine/engine.h index 2483cbd..815172a 100644 --- a/src/engine/engine.h +++ b/src/engine/engine.h @@ -31,9 +31,10 @@ class Engine final : public api::IEngine { void restoreLastWord(bool toVietnamese) override; private: - [[nodiscard]] std::vector applicableRules(char32_t key) const; + [[nodiscard]] RuleSpan applicableRules(char32_t key) const noexcept; [[nodiscard]] bool canProcessKey(char32_t key) const noexcept; void appendRawKey(char32_t key, bool isUpperCase); + void handleBackspace(); api::Mode mode_{api::Mode::Vietnamese}; std::string dataDirPath_; diff --git a/src/engine/rule.h b/src/engine/rule.h index fdedc99..237866c 100644 --- a/src/engine/rule.h +++ b/src/engine/rule.h @@ -1,11 +1,16 @@ #pragma once +#include #include +#include #include #include namespace bamboo::engine { +constexpr std::size_t kMaxAppendedChars = 4; +constexpr std::size_t kMaxRuleIndexEntries = 32; + enum class EffectType : std::uint8_t { Appending = 1u << 0, MarkTransformation = 1u << 1, @@ -31,27 +36,66 @@ enum class Tone : std::uint8_t { Dot = 5, }; +struct AppendedChar final { + char32_t effectOn{0}; + char32_t result{0}; +}; + struct Rule final { char32_t key{0}; std::uint8_t effect{0}; EffectType effectType{EffectType::Appending}; char32_t effectOn{0}; char32_t result{0}; - std::vector appendedRules; + std::array appendedChars{}; + std::uint8_t appendedCount{0}; void setTone(Tone tone) noexcept { effect = static_cast(tone); } void setMark(Mark mark) noexcept { effect = static_cast(mark); } [[nodiscard]] Tone tone() const noexcept { return static_cast(effect); } [[nodiscard]] Mark mark() const noexcept { return static_cast(effect); } + + void appendChar(char32_t appendedEffectOn, char32_t appendedResult) noexcept { + if (appendedCount < appendedChars.size()) { + appendedChars[appendedCount++] = AppendedChar{appendedEffectOn, appendedResult}; + } + } +}; + +struct RuleSpan final { + const Rule* data{nullptr}; + std::size_t size{0}; + + [[nodiscard]] const Rule* begin() const noexcept { return data; } + [[nodiscard]] const Rule* end() const noexcept { return data + size; } + [[nodiscard]] bool empty() const noexcept { return size == 0; } + [[nodiscard]] const Rule& operator[](std::size_t index) const noexcept { return data[index]; } +}; + +struct KeyRuleIndex final { + char32_t key{0}; + std::uint16_t begin{0}; + std::uint16_t size{0}; }; struct InputMethod final { std::string name; std::vector rules; + std::array ruleIndex{}; + std::uint8_t ruleIndexSize{0}; std::vector superKeys; std::vector toneKeys; std::vector appendingKeys; std::vector keys; + + [[nodiscard]] RuleSpan rulesFor(char32_t key) const noexcept { + for (std::size_t index = 0; index < ruleIndexSize; ++index) { + if (ruleIndex[index].key == key) { + return RuleSpan{rules.data() + ruleIndex[index].begin, ruleIndex[index].size}; + } + } + return {}; + } }; } // namespace bamboo::engine diff --git a/src/engine/rules_parser.cpp b/src/engine/rules_parser.cpp index 844a823..ce5898e 100644 --- a/src/engine/rules_parser.cpp +++ b/src/engine/rules_parser.cpp @@ -165,12 +165,7 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ rule.effectOn = chars.front(); rule.result = chars.front(); for (std::size_t index = 1; index < chars.size(); ++index) { - Rule appended; - appended.key = key; - appended.effectType = EffectType::Appending; - appended.effectOn = chars[index]; - appended.result = chars[index]; - rule.appendedRules.push_back(appended); + rule.appendChar(chars[index], chars[index]); } return rule; } @@ -190,8 +185,16 @@ InputMethod parseInputMethod(std::string_view inputMethodName) { for (std::size_t index = 0; index < definition->size; ++index) { const auto& mapping = definition->mappings[index]; + const std::size_t begin = result.rules.size(); std::vector parsedRules = parseRules(mapping.key, mapping.action); result.rules.insert(result.rules.end(), parsedRules.begin(), parsedRules.end()); + if (result.ruleIndexSize < result.ruleIndex.size()) { + result.ruleIndex[result.ruleIndexSize++] = KeyRuleIndex{ + mapping.key, + static_cast(begin), + static_cast(result.rules.size() - begin), + }; + } if (mapping.action.find("uo") != std::string_view::npos || mapping.action.find("UO") != std::string_view::npos) { result.superKeys.push_back(mapping.key); } diff --git a/src/engine/spelling.cpp b/src/engine/spelling.cpp index a38cad4..7083654 100644 --- a/src/engine/spelling.cpp +++ b/src/engine/spelling.cpp @@ -58,17 +58,17 @@ constexpr std::array, 5> kCvMatrix{{ }}; constexpr std::array kCvMatrixSizes{{4, 6, 5, 1, 1}}; -constexpr std::array, 8> kVcMatrix{{ - {{0, 2}}, - {{0, 1}}, - {{1, 2}}, - {{1, 2}}, - {{0xFF, 0xFF}}, - {{0xFF, 0xFF}}, - {{3, 0xFF}}, - {{4, 0xFF}}, +constexpr std::array, 8> kVcMatrix{{ + {{0, 2, 0xFF}}, + {{0, 1, 2}}, + {{1, 2, 0xFF}}, + {{1, 2, 0xFF}}, + {{0xFF, 0xFF, 0xFF}}, + {{0xFF, 0xFF, 0xFF}}, + {{3, 0xFF, 0xFF}}, + {{4, 0xFF, 0xFF}}, }}; -constexpr std::array kVcMatrixSizes{{2, 2, 2, 2, 0, 0, 1, 1}}; +constexpr std::array kVcMatrixSizes{{2, 3, 2, 2, 0, 0, 1, 1}}; bool matchesRow(const TokenRow& row, std::u32string_view input, diff --git a/src/engine/transformation_utils.cpp b/src/engine/transformation_utils.cpp index 2c38866..e5d5c81 100644 --- a/src/engine/transformation_utils.cpp +++ b/src/engine/transformation_utils.cpp @@ -40,19 +40,25 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ [[nodiscard]] char32_t addMarkToTonelessChar(char32_t chr, std::uint8_t mark) noexcept { switch (chr) { - case U'a': return mark == 1 ? U'â' : (mark == 2 ? U'ă' : U'a'); - case U'â': return mark == 0 ? U'a' : U'â'; - case U'ă': return mark == 0 ? U'a' : U'ă'; - case U'e': return mark == 1 ? U'ê' : U'e'; - case U'ê': return mark == 0 ? U'e' : U'ê'; - case U'o': return mark == 1 ? U'ô' : (mark == 3 ? U'ơ' : U'o'); - case U'ô': return mark == 0 ? U'o' : U'ô'; - case U'ơ': return mark == 0 ? U'o' : U'ơ'; - case U'u': return mark == 3 ? U'ư' : U'u'; - case U'ư': return mark == 0 ? U'u' : U'ư'; - case U'd': return mark == 4 ? U'đ' : U'd'; - case U'đ': return mark == 0 ? U'd' : U'đ'; - default: return chr; + case U'a': + case U'â': + case U'ă': + return mark == 1 ? U'â' : (mark == 2 ? U'ă' : U'a'); + case U'e': + case U'ê': + return mark == 1 ? U'ê' : U'e'; + case U'o': + case U'ô': + case U'ơ': + return mark == 1 ? U'ô' : (mark == 3 ? U'ơ' : U'o'); + case U'u': + case U'ư': + return mark == 3 ? U'ư' : U'u'; + case U'd': + case U'đ': + return mark == 4 ? U'đ' : U'd'; + default: + return chr; } } @@ -67,15 +73,21 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ } -[[nodiscard]] std::u32string flattenVietnameseView(const CompositionView& composition, bool lowerCase, bool toneLess, bool markLess) { +[[nodiscard]] std::u32string flattenVietnameseView(const CompositionView& composition, + bool lowerCase, + bool toneLess, + bool markLess, + const Transformation* extra = nullptr) { std::u32string canvas; - canvas.reserve(composition.size()); - for (const Transformation* appending : composition) { + canvas.reserve(composition.size() + (extra != nullptr ? 1U : 0U)); + for (std::size_t index = 0; index < composition.size(); ++index) { + const Transformation* appending = composition[index]; if (appending->rule.effectType != EffectType::Appending || appending->rule.key == 0) { continue; } char32_t chr = appending->rule.effectOn; - for (const Transformation* trans : composition) { + for (std::size_t transIndex = 0; transIndex < composition.size(); ++transIndex) { + const Transformation* trans = composition[transIndex]; if (trans->target != appending) { continue; } @@ -85,6 +97,13 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ chr = addToneToChar(chr, trans->rule.effect); } } + if (extra != nullptr && extra->target == appending) { + if (extra->rule.effectType == EffectType::MarkTransformation) { + chr = extra->rule.mark() == Mark::Raw ? appending->rule.key : addMarkToChar(chr, extra->rule.effect); + } else if (extra->rule.effectType == EffectType::ToneTransformation) { + chr = addToneToChar(chr, extra->rule.effect); + } + } if (toneLess) { chr = addToneToChar(chr, 0); } @@ -105,9 +124,28 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ return spelling.isValidCvc(segments.firstConsonant, segments.vowel, segments.lastConsonant, full); } -[[nodiscard]] char32_t renderTargetChar(const CompositionView& composition, const Transformation* target) { +[[nodiscard]] bool hasValidTone(const CompositionView& composition, Tone tone); + +[[nodiscard]] bool hasValidTone(const CompositionView& composition, Tone tone) { + if (tone == Tone::None || tone == Tone::Acute || tone == Tone::Dot) { + return true; + } + const std::u32string word = flattenVietnameseView(composition, true, false, false); + const Segments segments = splitWord(word); + const std::u32string_view lastConsonants = segments.lastConsonant; + if (lastConsonants.empty()) { + return true; + } + return !(lastConsonants == U"c" || lastConsonants == U"k" || lastConsonants == U"p" || + lastConsonants == U"t" || lastConsonants == U"ch"); +} + +[[nodiscard]] char32_t renderTargetChar(const CompositionView& composition, + const Transformation* target, + const Transformation* extra = nullptr) { char32_t chr = target->rule.effectOn; - for (const Transformation* trans : composition) { + for (std::size_t index = 0; index < composition.size(); ++index) { + const Transformation* trans = composition[index]; if (trans->target != target) { continue; } @@ -117,6 +155,13 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ chr = addToneToChar(chr, trans->rule.effect); } } + if (extra != nullptr && extra->target == target) { + if (extra->rule.effectType == EffectType::MarkTransformation) { + chr = extra->rule.mark() == Mark::Raw ? target->rule.key : addMarkToChar(chr, extra->rule.effect); + } else if (extra->rule.effectType == EffectType::ToneTransformation) { + chr = addToneToChar(chr, extra->rule.effect); + } + } return chr; } @@ -128,9 +173,9 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ } [[nodiscard]] Transformation* findToneTarget(const CompositionView& composition) noexcept { - std::vector vowels; - vowels.reserve(3); - for (Transformation* trans : composition) { + SmallList vowels; + for (std::size_t index = 0; index < composition.size(); ++index) { + Transformation* trans = composition[index]; if (trans->rule.effectType == EffectType::Appending && isVowel(trans->rule.effectOn)) { vowels.push_back(trans); } @@ -148,11 +193,12 @@ constexpr std::u32string_view kVowels = U"aàáảãạăằắẳẵặâầấ vowelShape.push_back(stripToneAndMark(renderTargetChar(composition, vowel))); } bool hasTrailingConsonant = false; - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - if ((*it)->rule.effectType != EffectType::Appending || (*it)->rule.key == 0) { + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; + if (trans->rule.effectType != EffectType::Appending || trans->rule.key == 0) { continue; } - hasTrailingConsonant = !isVowel((*it)->rule.effectOn); + hasTrailingConsonant = !isVowel(trans->rule.effectOn); break; } @@ -195,12 +241,7 @@ bool isVowel(char32_t chr) noexcept { } CompositionView makeCompositionView(std::deque& composition) { - CompositionView view; - view.reserve(composition.size()); - for (Transformation& trans : composition) { - view.push_back(&trans); - } - return view; + return CompositionView{&composition, 0, composition.size()}; } CompositionView extractLastSyllable(const CompositionView& composition) { @@ -213,31 +254,35 @@ CompositionView extractLastSyllable(const CompositionView& composition) { break; } } - return CompositionView(composition.begin() + static_cast(start), composition.end()); + return CompositionView{composition.composition, composition.begin + start, composition.end}; } PendingTransformation findMarkTarget(const CompositionView& composition, - const std::vector& applicableRules) noexcept { + RuleSpan applicableRules) noexcept { const std::u32string current = flattenVietnameseView(composition, true, false, false); - for (const Rule& rule : applicableRules) { - if (rule.effectType != EffectType::MarkTransformation) { - continue; + Transformation* lastAppending = nullptr; + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; + if (trans->rule.effectType == EffectType::Appending && trans->rule.key != 0) { + lastAppending = trans; + break; } - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - Transformation* trans = *it; - if (trans->rule.effectType != EffectType::Appending || trans->rule.effectOn != rule.effectOn) { + } + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; + for (const Rule& rule : applicableRules) { + if (rule.effectType != EffectType::MarkTransformation || rule.effect == 0 || trans->rule.result != rule.effectOn) { continue; } - std::u32string mutated = current; - const char32_t rendered = renderTargetChar(composition, trans); - for (std::size_t index = mutated.size(); index > 0; --index) { - if (mutated[index - 1] == rendered) { - mutated[index - 1] = addMarkToChar(mutated[index - 1], rule.effect); - break; - } + Transformation probe; + probe.rule = rule; + probe.target = findRootTarget(trans); + const std::u32string mutated = flattenVietnameseView(composition, true, false, false, &probe); + if (mutated == current) { + continue; } - if (mutated != current && validateWord(mutated, false)) { - return PendingTransformation{rule, findRootTarget(trans), false}; + if (validateWord(mutated, false) || probe.target == lastAppending) { + return PendingTransformation{rule, probe.target, false}; } } } @@ -245,20 +290,18 @@ PendingTransformation findMarkTarget(const CompositionView& composition, } PendingTransformation findTarget(const CompositionView& composition, - const std::vector& applicableRules) noexcept { + RuleSpan applicableRules) noexcept { const std::u32string current = flattenVietnameseView(composition, true, false, false); for (const Rule& rule : applicableRules) { if (rule.effectType == EffectType::ToneTransformation) { + if (!hasValidTone(composition, rule.tone())) { + continue; + } if (Transformation* target = findToneTarget(composition); target != nullptr) { - std::u32string mutated = current; - const char32_t rendered = renderTargetChar(composition, target); - for (std::size_t index = mutated.size(); index > 0; --index) { - if (mutated[index - 1] == rendered) { - mutated[index - 1] = addToneToChar(mutated[index - 1], rule.effect); - break; - } - } - if (mutated != current) { + Transformation probe; + probe.rule = rule; + probe.target = target; + if (flattenVietnameseView(composition, true, false, false, &probe) != current) { return PendingTransformation{rule, target, false}; } } @@ -272,8 +315,8 @@ PendingTransformation findTarget(const CompositionView& composition, const std::u32string normalized = flattenVietnameseView(composition, true, true, true); if (normalized.find(U"uo") != std::u32string::npos || normalized.find(U"ươ") != std::u32string::npos || normalized.find(U"ưo") != std::u32string::npos) { Transformation* vowelTarget = nullptr; - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - Transformation* trans = *it; + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; if (trans->rule.effectType == EffectType::Appending && isVowel(trans->rule.effectOn)) { vowelTarget = trans; break; @@ -290,27 +333,44 @@ PendingTransformation findTarget(const CompositionView& composition, return {}; } -std::vector generateUndoTransformations(const CompositionView& composition, - const std::vector& applicableRules) { - std::vector result; +PendingTransformationList generateUndoTransformations(const CompositionView& composition, + RuleSpan applicableRules) { + PendingTransformationList result; + const std::u32string current = flattenVietnameseView(composition, true, true, false); for (const Rule& rule : applicableRules) { if (rule.effectType == EffectType::ToneTransformation) { - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - if ((*it)->rule.effectType == EffectType::ToneTransformation && (*it)->target != nullptr) { - Rule undoRule{}; - undoRule.effectType = EffectType::ToneTransformation; - undoRule.effect = static_cast(Tone::None); - result.push_back(PendingTransformation{undoRule, (*it)->target, false}); - return result; - } + if (!hasValidTone(composition, rule.tone())) { + continue; + } + Transformation* target = findToneTarget(composition); + if (target == nullptr) { + continue; + } + Rule undoRule{}; + undoRule.effectType = EffectType::ToneTransformation; + undoRule.effect = static_cast(Tone::None); + Transformation probe; + probe.rule = undoRule; + probe.target = target; + if (flattenVietnameseView(composition, true, true, false, &probe) != current) { + result.push_back(PendingTransformation{undoRule, target, false}); + return result; } } else if (rule.effectType == EffectType::MarkTransformation) { - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - if ((*it)->rule.effectType == EffectType::MarkTransformation && (*it)->target != nullptr) { + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; + if (trans->rule.result == rule.effectOn) { + Transformation* target = findRootTarget(trans); Rule undoRule{}; undoRule.effectType = EffectType::MarkTransformation; undoRule.effect = static_cast(Mark::None); - result.push_back(PendingTransformation{undoRule, (*it)->target, false}); + Transformation probe; + probe.rule = undoRule; + probe.target = target; + if (flattenVietnameseView(composition, true, true, false, &probe) == current) { + continue; + } + result.push_back(PendingTransformation{undoRule, target, false}); return result; } } @@ -319,10 +379,10 @@ std::vector generateUndoTransformations(const Composition return result; } -std::vector generateFallbackTransformations(const std::vector& applicableRules, - char32_t lowerKey, - bool isUpperCase) { - std::vector result; +PendingTransformationList generateFallbackTransformations(RuleSpan applicableRules, + char32_t lowerKey, + bool isUpperCase) { + PendingTransformationList result; for (const Rule& rule : applicableRules) { if (rule.effectType != EffectType::Appending) { continue; @@ -331,12 +391,17 @@ std::vector generateFallbackTransformations(const std::ve baseRule.effectOn = toLowerCodePoint(baseRule.effectOn); baseRule.result = baseRule.effectOn; result.push_back(PendingTransformation{baseRule, nullptr, isUpperCase || rule.effectOn != baseRule.effectOn}); - for (const Rule& appendedRule : rule.appendedRules) { - Rule virtualRule = appendedRule; + for (std::size_t index = 0; index < rule.appendedCount; ++index) { + Rule virtualRule{}; virtualRule.key = 0; - virtualRule.effectOn = toLowerCodePoint(virtualRule.effectOn); + virtualRule.effectType = EffectType::Appending; + virtualRule.effectOn = toLowerCodePoint(rule.appendedChars[index].effectOn); virtualRule.result = virtualRule.effectOn; - result.push_back(PendingTransformation{virtualRule, nullptr, isUpperCase || appendedRule.effectOn != virtualRule.effectOn}); + result.push_back(PendingTransformation{ + virtualRule, + nullptr, + isUpperCase || rule.appendedChars[index].effectOn != virtualRule.effectOn, + }); } return result; } @@ -350,11 +415,12 @@ std::vector generateFallbackTransformations(const std::ve return result; } -std::vector refreshLastToneTarget(const CompositionView& composition) { +PendingTransformationList refreshLastToneTarget(const CompositionView& composition) { Transformation* latestTone = nullptr; - for (auto it = composition.rbegin(); it != composition.rend(); ++it) { - if ((*it)->rule.effectType == EffectType::ToneTransformation && (*it)->target != nullptr) { - latestTone = *it; + for (std::size_t index = composition.size(); index > 0; --index) { + Transformation* trans = composition[index - 1]; + if (trans->rule.effectType == EffectType::ToneTransformation && trans->target != nullptr) { + latestTone = trans; break; } } @@ -370,7 +436,10 @@ std::vector refreshLastToneTarget(const CompositionView& undoRule.effect = static_cast(Tone::None); Rule overrideRule = latestTone->rule; overrideRule.key = 0; - return {PendingTransformation{undoRule, latestTone->target, false}, PendingTransformation{overrideRule, newTarget, false}}; + PendingTransformationList result; + result.push_back(PendingTransformation{undoRule, latestTone->target, false}); + result.push_back(PendingTransformation{overrideRule, newTarget, false}); + return result; } std::u32string flattenVietnamese(const std::deque& composition, bool lowerCase) { @@ -417,20 +486,54 @@ std::u32string currentWord(const std::deque& composition) { } Segments splitWord(std::u32string_view word) noexcept { - std::size_t firstVowelIndex = 0; - while (firstVowelIndex < word.size() && !isVowel(word[firstVowelIndex])) { - ++firstVowelIndex; + if (word.empty()) { + return {}; + } + + std::u32string_view head = word; + std::u32string_view lastConsonant; + if (!isVowel(word.back())) { + std::size_t begin = word.size() - 1; + while (begin > 0 && !isVowel(word[begin - 1])) { + --begin; + } + head = word.substr(0, begin); + lastConsonant = word.substr(begin); + } + + std::u32string_view firstConsonant = head; + std::u32string_view vowel; + if (!head.empty() && isVowel(head.back())) { + std::size_t begin = head.size() - 1; + while (begin > 0 && isVowel(head[begin - 1])) { + --begin; + } + firstConsonant = head.substr(0, begin); + vowel = head.substr(begin); + } + + if (head.empty() && !lastConsonant.empty()) { + return {lastConsonant, {}, {}}; } - std::size_t trailingConsonantIndex = word.size(); - while (trailingConsonantIndex > firstVowelIndex && !isVowel(word[trailingConsonantIndex - 1])) { - --trailingConsonantIndex; + + if (firstConsonant.size() == 1 && !vowel.empty()) { + const bool isGi = + firstConsonant[0] == U'g' && vowel[0] == U'i' && vowel.size() > 1 && + !(vowel.size() > 1 && vowel[1] == U'e' && !lastConsonant.empty()); + const bool isQu = firstConsonant[0] == U'q' && vowel[0] == U'u'; + if (isGi || isQu) { + firstConsonant = head.substr(0, firstConsonant.size() + 1); + vowel = head.substr(firstConsonant.size()); + } } - return {word.substr(0, firstVowelIndex), word.substr(firstVowelIndex, trailingConsonantIndex - firstVowelIndex), word.substr(trailingConsonantIndex)}; + + return {firstConsonant, vowel, lastConsonant}; } std::deque breakComposition(const CompositionView& composition) { std::deque result; - for (const Transformation* trans : composition) { + for (std::size_t index = 0; index < composition.size(); ++index) { + const Transformation* trans = composition[index]; if (trans->rule.key == 0) { continue; } diff --git a/src/engine/transformation_utils.h b/src/engine/transformation_utils.h index c766076..4535f97 100644 --- a/src/engine/transformation_utils.h +++ b/src/engine/transformation_utils.h @@ -2,11 +2,45 @@ #include "engine.h" -#include +#include namespace bamboo::engine { -using CompositionView = std::vector; +struct CompositionView final { + std::deque* composition{nullptr}; + std::size_t begin{0}; + std::size_t end{0}; + + [[nodiscard]] bool empty() const noexcept { return composition == nullptr || begin >= end; } + [[nodiscard]] std::size_t size() const noexcept { return empty() ? 0 : end - begin; } + [[nodiscard]] Transformation* operator[](std::size_t index) const noexcept { + return &(*composition)[begin + index]; + } + [[nodiscard]] Transformation* front() const noexcept { return (*this)[0]; } +}; + +template +struct SmallList final { + std::array items{}; + std::size_t count{0}; + + [[nodiscard]] bool empty() const noexcept { return count == 0; } + [[nodiscard]] std::size_t size() const noexcept { return count; } + [[nodiscard]] const T* begin() const noexcept { return items.data(); } + [[nodiscard]] const T* end() const noexcept { return items.data() + count; } + [[nodiscard]] T* begin() noexcept { return items.data(); } + [[nodiscard]] T* end() noexcept { return items.data() + count; } + [[nodiscard]] const T& operator[](std::size_t index) const noexcept { return items[index]; } + [[nodiscard]] T& operator[](std::size_t index) noexcept { return items[index]; } + [[nodiscard]] const T& front() const noexcept { return items[0]; } + [[nodiscard]] T& front() noexcept { return items[0]; } + + void push_back(const T& value) noexcept { + if (count < Capacity) { + items[count++] = value; + } + } +}; struct PendingTransformation final { Rule rule; @@ -14,6 +48,8 @@ struct PendingTransformation final { bool isUpperCase{false}; }; +using PendingTransformationList = SmallList; + struct Segments final { std::u32string_view firstConsonant; std::u32string_view vowel; @@ -23,15 +59,15 @@ struct Segments final { [[nodiscard]] CompositionView makeCompositionView(std::deque& composition); [[nodiscard]] CompositionView extractLastSyllable(const CompositionView& composition); [[nodiscard]] PendingTransformation findMarkTarget(const CompositionView& composition, - const std::vector& applicableRules) noexcept; + RuleSpan applicableRules) noexcept; [[nodiscard]] PendingTransformation findTarget(const CompositionView& composition, - const std::vector& applicableRules) noexcept; -[[nodiscard]] std::vector generateUndoTransformations(const CompositionView& composition, - const std::vector& applicableRules); -[[nodiscard]] std::vector generateFallbackTransformations(const std::vector& applicableRules, - char32_t lowerKey, - bool isUpperCase); -[[nodiscard]] std::vector refreshLastToneTarget(const CompositionView& composition); + RuleSpan applicableRules) noexcept; +[[nodiscard]] PendingTransformationList generateUndoTransformations(const CompositionView& composition, + RuleSpan applicableRules); +[[nodiscard]] PendingTransformationList generateFallbackTransformations(RuleSpan applicableRules, + char32_t lowerKey, + bool isUpperCase); +[[nodiscard]] PendingTransformationList refreshLastToneTarget(const CompositionView& composition); [[nodiscard]] std::u32string flattenVietnamese(const std::deque& composition, bool lowerCase); [[nodiscard]] std::u32string currentWord(const std::deque& composition); [[nodiscard]] Segments splitWord(std::u32string_view word) noexcept; diff --git a/tests/differential_cases.json b/tests/differential_cases.json new file mode 100644 index 0000000..4a84b6b --- /dev/null +++ b/tests/differential_cases.json @@ -0,0 +1,142 @@ +[ + { + "id": "telex2_aw", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "aw" } + ] + }, + { + "id": "telex2_uwow", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "uwow" } + ] + }, + { + "id": "telex2_chuaarn", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "chuaarn" } + ] + }, + { + "id": "telex2_giamaf", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "giamaf" } + ] + }, + { + "id": "telex2_ddafi", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "ddafi" } + ] + }, + { + "id": "telex2_mootj", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "mootj" } + ] + }, + { + "id": "telex2_loanj_remove", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "loanj" }, + { "op": "remove_last_char", "refresh": true } + ] + }, + { + "id": "telex2_toowi", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "toowi" } + ] + }, + { + "id": "telex2_aloo", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "aloo" } + ] + }, + { + "id": "telex2_catr", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "catr" } + ] + }, + { + "id": "telex_tieesng_vieetj", + "input_method": "Telex", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "tieesng Vieetj" } + ] + }, + { + "id": "telex_chuyeern", + "input_method": "Telex", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "chuyeern" } + ] + }, + { + "id": "telex_nguowif", + "input_method": "Telex", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "nguowif" } + ] + }, + { + "id": "telex_ddawng", + "input_method": "Telex", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "ddawng" } + ] + }, + { + "id": "backspace_refresh", + "input_method": "Telex 2", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "Thuow" }, + { "op": "remove_last_char", "refresh": true } + ] + }, + { + "id": "restore_last_word_roundtrip", + "input_method": "Telex", + "initial_mode": "Vietnamese", + "steps": [ + { "op": "process_string", "value": "nguowif" }, + { "op": "restore_last_word", "mode": "English" }, + { "op": "restore_last_word", "mode": "Vietnamese" } + ] + }, + { + "id": "english_passthrough", + "input_method": "Telex", + "initial_mode": "English", + "steps": [ + { "op": "process_string", "value": "IBus" } + ] + } +] diff --git a/tools/run_differential.py b/tools/run_differential.py new file mode 100644 index 0000000..89c34e2 --- /dev/null +++ b/tools/run_differential.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +"""Run a lightweight Go-vs-C++ differential harness for the migrated engine.""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import tempfile +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] +CASES_PATH = REPO_ROOT / "tests" / "differential_cases.json" + + +def cpp_string_literal(value: str) -> str: + return json.dumps(value) + + +def emit_cpp_cases(cases: list[dict]) -> str: + lines: list[str] = [] + for case in cases: + lines.append(" cases.push_back(TestCase{") + lines.append(f" {cpp_string_literal(case['id'])},") + lines.append(f" {cpp_string_literal(case['input_method'])},") + lines.append(f" parseModeName({cpp_string_literal(case['initial_mode'])}),") + lines.append(" {") + for step in case["steps"]: + lines.append(" Step{") + lines.append(f" {cpp_string_literal(step['op'])},") + lines.append(f" {cpp_string_literal(step.get('value', ''))},") + lines.append(f" {cpp_string_literal(step.get('mode', ''))},") + lines.append(f" {'true' if step.get('refresh', False) else 'false'}") + lines.append(" },") + lines.append(" }") + lines.append(" });") + return "\n".join(lines) + + +def generate_cpp_runner(cases: list[dict]) -> str: + emitted_cases = emit_cpp_cases(cases) + return f"""#include "bamboo/IEngine.h" + +#include +#include +#include +#include +#include +#include +#include + +using bamboo::api::IEngine; +using bamboo::api::Mode; + +struct Step {{ + std::string op; + std::string value; + std::string mode; + bool refresh; +}}; + +struct TestCase {{ + std::string id; + std::string input_method; + Mode initial_mode; + std::vector steps; +}}; + +struct Snapshot {{ + std::string mode; + std::string text; + bool valid_partial; + bool valid_full; +}}; + +Mode parseModeName(const std::string& mode) {{ + if (mode == "English") {{ + return Mode::English; + }} + return Mode::Vietnamese; +}} + +std::string modeName(Mode mode) {{ + return mode == Mode::English ? "English" : "Vietnamese"; +}} + +std::string escapeJson(const std::string& input) {{ + std::ostringstream out; + for (unsigned char ch : input) {{ + switch (ch) {{ + case '\\\\': out << "\\\\\\\\"; break; + case '"': out << "\\\\\\""; break; + case '\\n': out << "\\\\n"; break; + case '\\r': out << "\\\\r"; break; + case '\\t': out << "\\\\t"; break; + default: + if (ch < 0x20) {{ + out << "\\\\u"; + const char* digits = "0123456789abcdef"; + out << '0' << '0' << digits[(ch >> 4) & 0xF] << digits[ch & 0xF]; + }} else {{ + out << static_cast(ch); + }} + }} + }} + return out.str(); +}} + +char32_t decodeSingleCodePoint(const std::string& input) {{ + if (input.empty()) {{ + return 0; + }} + const unsigned char byte0 = static_cast(input[0]); + if (byte0 < 0x80) {{ + return static_cast(byte0); + }} + if ((byte0 & 0xE0U) == 0xC0U && input.size() >= 2) {{ + const unsigned char byte1 = static_cast(input[1]); + return static_cast(((byte0 & 0x1FU) << 6) | (byte1 & 0x3FU)); + }} + if ((byte0 & 0xF0U) == 0xE0U && input.size() >= 3) {{ + const unsigned char byte1 = static_cast(input[1]); + const unsigned char byte2 = static_cast(input[2]); + return static_cast(((byte0 & 0x0FU) << 12) | ((byte1 & 0x3FU) << 6) | (byte2 & 0x3FU)); + }} + if ((byte0 & 0xF8U) == 0xF0U && input.size() >= 4) {{ + const unsigned char byte1 = static_cast(input[1]); + const unsigned char byte2 = static_cast(input[2]); + const unsigned char byte3 = static_cast(input[3]); + return static_cast(((byte0 & 0x07U) << 18) | ((byte1 & 0x3FU) << 12) | + ((byte2 & 0x3FU) << 6) | (byte3 & 0x3FU)); + }} + return static_cast(byte0); +}} + +Snapshot snapshot(const IEngine& engine) {{ + return Snapshot{{modeName(engine.getMode()), engine.getProcessedString(), engine.isValid(false), engine.isValid(true)}}; +}} + +int main() {{ + std::vector cases; +{emitted_cases} + + std::ostringstream out; + out << "["; + for (std::size_t case_index = 0; case_index < cases.size(); ++case_index) {{ + const TestCase& test_case = cases[case_index]; + std::unique_ptr engine = bamboo::api::createEngine("", test_case.input_method); + engine->setMode(test_case.initial_mode); + + out << "{{\\"id\\":\\"" << escapeJson(test_case.id) << "\\",\\"snapshots\\":["; + for (std::size_t step_index = 0; step_index < test_case.steps.size(); ++step_index) {{ + const Step& step = test_case.steps[step_index]; + if (step.op == "set_mode") {{ + engine->setMode(parseModeName(step.mode)); + }} else if (step.op == "process_string") {{ + engine->processString(step.value); + }} else if (step.op == "process_key") {{ + engine->processKey(decodeSingleCodePoint(step.value)); + }} else if (step.op == "remove_last_char") {{ + engine->removeLastChar(step.refresh); + }} else if (step.op == "restore_last_word") {{ + engine->restoreLastWord(step.mode == "Vietnamese"); + }} else if (step.op == "reset") {{ + engine->reset(); + }} else {{ + throw std::runtime_error("unknown op: " + step.op); + }} + + const Snapshot state = snapshot(*engine); + if (step_index != 0) {{ + out << ","; + }} + out << "{{\\"mode\\":\\"" << state.mode + << "\\",\\"text\\":\\"" << escapeJson(state.text) + << "\\",\\"valid_partial\\":" << (state.valid_partial ? "true" : "false") + << ",\\"valid_full\\":" << (state.valid_full ? "true" : "false") << "}}"; + }} + out << "]}}"; + if (case_index + 1 != cases.size()) {{ + out << ","; + }} + }} + out << "]"; + std::cout << out.str(); + return 0; +}} +""" + + +def generate_go_runner(cases: list[dict]) -> str: + cases_json = json.dumps(cases, ensure_ascii=False) + return f"""package main + +import ( + "encoding/json" + "fmt" + bamboo "github.com/LotusInputMethod/bamboo-core" +) + +type Step struct {{ + Op string `json:"op"` + Value string `json:"value"` + Mode string `json:"mode"` + Refresh bool `json:"refresh"` +}} + +type TestCase struct {{ + ID string `json:"id"` + InputMethod string `json:"input_method"` + InitialMode string `json:"initial_mode"` + Steps []Step `json:"steps"` +}} + +type Snapshot struct {{ + Mode string `json:"mode"` + Text string `json:"text"` + ValidPartial bool `json:"valid_partial"` + ValidFull bool `json:"valid_full"` +}} + +type Result struct {{ + ID string `json:"id"` + Snapshots []Snapshot `json:"snapshots"` +}} + +func modeFromName(name string) bamboo.Mode {{ + if name == "English" {{ + return bamboo.EnglishMode + }} + return bamboo.VietnameseMode +}} + +func modeName(mode bamboo.Mode) string {{ + if mode == bamboo.EnglishMode {{ + return "English" + }} + return "Vietnamese" +}} + +func textMode(mode bamboo.Mode) bamboo.Mode {{ + if mode == bamboo.EnglishMode {{ + return bamboo.EnglishMode | bamboo.FullText + }} + return bamboo.VietnameseMode | bamboo.FullText +}} + +func main() {{ + var cases []TestCase + if err := json.Unmarshal([]byte({json.dumps(cases_json, ensure_ascii=False)}), &cases); err != nil {{ + panic(err) + }} + + results := make([]Result, 0, len(cases)) + for _, tc := range cases {{ + im := bamboo.ParseInputMethod(bamboo.InputMethodDefinitions, tc.InputMethod) + eng := bamboo.NewEngine(im, bamboo.EstdFlags) + currentMode := modeFromName(tc.InitialMode) + snapshots := make([]Snapshot, 0, len(tc.Steps)) + + for _, step := range tc.Steps {{ + switch step.Op {{ + case "set_mode": + currentMode = modeFromName(step.Mode) + case "process_string": + eng.ProcessString(step.Value, currentMode) + case "process_key": + runes := []rune(step.Value) + if len(runes) != 1 {{ + panic("process_key requires exactly one rune") + }} + eng.ProcessKey(runes[0], currentMode) + case "remove_last_char": + eng.RemoveLastChar(step.Refresh) + case "restore_last_word": + toVietnamese := step.Mode == "Vietnamese" + eng.RestoreLastWord(toVietnamese) + currentMode = modeFromName(step.Mode) + case "reset": + eng.Reset() + default: + panic("unknown op: " + step.Op) + }} + + snapshots = append(snapshots, Snapshot{{ + Mode: modeName(currentMode), + Text: eng.GetProcessedString(textMode(currentMode)), + ValidPartial: eng.IsValid(false), + ValidFull: eng.IsValid(true), + }}) + }} + + results = append(results, Result{{ID: tc.ID, Snapshots: snapshots}}) + }} + + data, err := json.Marshal(results) + if err != nil {{ + panic(err) + }} + fmt.Print(string(data)) +}} +""" + + +def run_command(cmd: list[str], cwd: Path) -> subprocess.CompletedProcess[str]: + return subprocess.run(cmd, cwd=cwd, text=True, capture_output=True, check=False) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--cases", default=str(CASES_PATH)) + args = parser.parse_args() + + with open(args.cases, "r", encoding="utf-8") as fh: + cases = json.load(fh) + + with tempfile.TemporaryDirectory(prefix="bamboo-diff-") as tmp: + tmpdir = Path(tmp) + cpp_src = tmpdir / "runner.cpp" + cpp_bin = tmpdir / "runner_cpp" + go_src = tmpdir / "runner.go" + + cpp_src.write_text(generate_cpp_runner(cases), encoding="utf-8") + go_src.write_text(generate_go_runner(cases), encoding="utf-8") + + cpp_cmd = [ + "g++", + "-std=c++17", + "-Iinclude", + str(cpp_src), + "src/engine/engine.cpp", + "src/engine/spelling.cpp", + "src/engine/encoder.cpp", + "src/engine/charset_definition.cpp", + "src/engine/input_method_definition.cpp", + "src/engine/rules_parser.cpp", + "src/engine/transformation_utils.cpp", + "-o", + str(cpp_bin), + ] + cpp_build = run_command(cpp_cmd, REPO_ROOT) + if cpp_build.returncode != 0: + sys.stderr.write(cpp_build.stdout) + sys.stderr.write(cpp_build.stderr) + return cpp_build.returncode + + go_run = run_command(["go", "run", str(go_src)], REPO_ROOT) + if go_run.returncode != 0: + sys.stderr.write(go_run.stdout) + sys.stderr.write(go_run.stderr) + return go_run.returncode + + cpp_run = run_command([str(cpp_bin)], REPO_ROOT) + if cpp_run.returncode != 0: + sys.stderr.write(cpp_run.stdout) + sys.stderr.write(cpp_run.stderr) + return cpp_run.returncode + + go_results = json.loads(go_run.stdout) + cpp_results = json.loads(cpp_run.stdout) + if go_results != cpp_results: + sys.stderr.write("Differential mismatch detected.\\n") + sys.stderr.write("Go results:\\n") + sys.stderr.write(json.dumps(go_results, ensure_ascii=False, indent=2) + "\\n") + sys.stderr.write("C++ results:\\n") + sys.stderr.write(json.dumps(cpp_results, ensure_ascii=False, indent=2) + "\\n") + return 1 + + print(json.dumps(cpp_results, ensure_ascii=False, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From cf7a1bc8b8467fb4dbc40beb500caca59ad2014f Mon Sep 17 00:00:00 2001 From: Loc Huynh Date: Sun, 22 Mar 2026 12:36:45 +0700 Subject: [PATCH 5/5] Revert go files to master state --- bamboo.go | 51 ++++++++++++++++++-------------------- bamboo_test.go | 66 ++++++++++++-------------------------------------- 2 files changed, 39 insertions(+), 78 deletions(-) diff --git a/bamboo.go b/bamboo.go index 14e01d7..27c8ae9 100644 --- a/bamboo.go +++ b/bamboo.go @@ -98,9 +98,6 @@ func (e *BambooEngine) GetProcessedString(mode Mode) string { } func (e *BambooEngine) getApplicableRules(key rune) []Rule { - if !e.CanProcessKey(key) { - return nil - } var applicableRules []Rule for _, inputRule := range e.inputMethod.Rules { if inputRule.Key == unicode.ToLower(key) { @@ -125,7 +122,12 @@ func (e *BambooEngine) generateTransformations(composition []*Transformation, lo // transformation fall-backs to an APPENDING one. transformations = generateFallbackTransformations(composition, e.getApplicableRules(lowerKey), lowerKey, isUpperCase) var newComposition = append(composition, transformations...) - transformations = append(transformations, e.applyUIShortcuts(newComposition)...) + + // Implement the uwo+ typing shortcut by creating a virtual + // Mark.HORN rule that targets 'u' or 'o'. + if virtualTrans := e.applyUowShortcut(newComposition); virtualTrans != nil { + transformations = append(transformations, virtualTrans) + } } /** * Sometimes, a tone's position in a previous state must be changed to fit the new state @@ -138,15 +140,6 @@ func (e *BambooEngine) generateTransformations(composition []*Transformation, lo return transformations } -func (e *BambooEngine) applyUIShortcuts(syllable []*Transformation) []*Transformation { - // Implement the uwo+ typing shortcut by creating a virtual - // Mark.HORN rule that targets 'u' or 'o'. - if virtualTrans := e.applyUowShortcut(syllable); virtualTrans != nil { - return []*Transformation{virtualTrans} - } - return nil -} - func (e *BambooEngine) newComposition(composition []*Transformation, key rune, isUpperCase bool) []*Transformation { // Just process the key stroke on the last syllable var previousTransformations, lastSyllable = extractLastSyllable(composition) @@ -191,10 +184,6 @@ func (e *BambooEngine) ProcessString(str string, mode Mode) { func (e *BambooEngine) ProcessKey(key rune, mode Mode) { var lowerKey = unicode.ToLower(key) var isUpperCase = unicode.IsUpper(key) - if mode&EnglishMode == 0 && (key == '\b' || key == 0x7f) { - e.handleBackspace() - return - } if mode&EnglishMode != 0 || !e.CanProcessKey(lowerKey) { if mode&InReverseOrder != 0 { e.composition = append([]*Transformation{newAppendingTrans(lowerKey, isUpperCase)}, e.composition...) @@ -228,21 +217,27 @@ func (e *BambooEngine) Reset() { // Find the last APPENDING transformation and all // the transformations that add effects to it. -func (e *BambooEngine) RemoveLastChar(refreshLastTone bool) { - e.handleBackspace() - if refreshLastTone { - e.composition = append(e.composition, e.refreshLastToneTarget(e.composition)...) - } -} - -func (e *BambooEngine) handleBackspace() { - if len(e.composition) == 0 { +func (e *BambooEngine) RemoveLastChar(refreshLastToneTarget bool) { + var lastAppending = findLastAppendingTrans(e.composition) + if lastAppending == nil { return } - e.composition = e.composition[:len(e.composition)-1] - for len(e.composition) > 0 && e.composition[len(e.composition)-1].Rule.Key == 0 { + if !e.CanProcessKey(lastAppending.Rule.Key) { e.composition = e.composition[:len(e.composition)-1] + return + } + var previous, lastComb = extractLastWord(e.composition, e.GetInputMethod().Keys) + var newComb []*Transformation + for _, t := range lastComb { + if t.Target == lastAppending || t == lastAppending { + continue + } + newComb = append(newComb, t) + } + if refreshLastToneTarget { + newComb = append(newComb, e.refreshLastToneTarget(newComb)...) } + e.composition = append(previous, newComb...) } /***** END SIDE-EFFECT METHODS ******/ diff --git a/bamboo_test.go b/bamboo_test.go index d237cdb..be10e49 100644 --- a/bamboo_test.go +++ b/bamboo_test.go @@ -75,8 +75,8 @@ func TestProcessThuowString(t *testing.T) { t.Errorf("Process [Thuow], got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thuơ") } ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "Thuo" { - t.Errorf("Process [Thuow] and remove last char, got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thuo") + if ng.GetProcessedString(VietnameseMode) != "Thu" { + t.Errorf("Process [Thuow] and remove last char, got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "Thu") } } @@ -90,13 +90,13 @@ func TestBambooEngine_RemoveLastChar(t *testing.T) { t.Errorf("Process [loanj], got [%s] expected [loạn]", ng.GetProcessedString(VietnameseMode)) } ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "loan" { - t.Errorf("Process [loanj-1], got [%s] expected [loan]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "lọa" { + t.Errorf("Process [loanj-1], got [%s] expected [lọa]", ng.GetProcessedString(VietnameseMode)) } ng.ProcessString(":", EnglishMode) ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "loan" { - t.Errorf("Process [loanj-1], got [%s] expected [loan]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "lọa" { + t.Errorf("Process [loanj-1], got [%s] expected [lọa]", ng.GetProcessedString(VietnameseMode)) } } @@ -107,12 +107,12 @@ func TestProcessUpperString(t *testing.T) { t.Errorf("Process [VIEETJ], got [%s] expected [VIỆT]", ng.GetProcessedString(VietnameseMode)) } ng.RemoveLastChar(false) - if ng.GetProcessedString(VietnameseMode) != "VIÊT" { - t.Errorf("Process remove last char of upper string, got [%s] expected [VIÊT]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "VIỆ" { + t.Errorf("Process remove last char of upper string, got [%s] expected [VIỆ]", ng.GetProcessedString(VietnameseMode)) } ng.ProcessKey('Q', VietnameseMode) - if ng.GetProcessedString(EnglishMode) != "VIEETQ" { - t.Errorf("Process remove last char of upper string, got [%s] expected [VIEETQ]", ng.GetProcessedString(EnglishMode)) + if ng.GetProcessedString(EnglishMode) != "VIEEJQ" { + t.Errorf("Process remove last char of upper string, got [%s] expected [VIEEJQ]", ng.GetProcessedString(EnglishMode)) } ng.Reset() ng.ProcessString("IB", EnglishMode) @@ -196,8 +196,8 @@ func TestRemoveLastChar(t *testing.T) { ng := newStdEngine() ng.ProcessString("hanhj", VietnameseMode) ng.RemoveLastChar(true) - if ng.GetProcessedString(VietnameseMode) != "hanh" { - t.Errorf("Process [hanhj], got [%s] expected [hanh]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "hạn" { + t.Errorf("Process [hanhj], got [%s] expected [%s]", ng.GetProcessedString(VietnameseMode), "hạn") } ng.Reset() } @@ -300,8 +300,8 @@ func TestProcessRefresh2(t *testing.T) { ng.ProcessString("reff", VietnameseMode) ng.RemoveLastChar(true) ng.ProcessKey('f', VietnameseMode) - if ng.GetProcessedString(VietnameseMode) != "ref" { - t.Errorf("Process reff-1+f, got [%v] expected [ref]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "rè" { + t.Errorf("Process reff-1+f, got [%v] expected [rè]", ng.GetProcessedString(VietnameseMode)) } } @@ -606,8 +606,8 @@ func TestDoubleTyping(t *testing.T) { ng.RemoveLastChar(true) ng.RemoveLastChar(true) // ng.ProcessString("r", VietnameseMode) - if ng.GetProcessedString(VietnameseMode) != "tủy" { - t.Errorf("Process turyen,BS,BS,BS,r, got [%s] expected [tủy]", ng.GetProcessedString(VietnameseMode)) + if ng.GetProcessedString(VietnameseMode) != "tủ" { + t.Errorf("Process turyen,BS,BS,BS,r, got [%s] expected [tủ]", ng.GetProcessedString(VietnameseMode)) } ng.Reset() ng.ProcessString("chuyển", VietnameseMode) @@ -646,40 +646,6 @@ func TestDoubleTyping(t *testing.T) { var ng = newStdEngine() -func TestProcessKey_Backspace(t *testing.T) { - e := newStdEngine() - e.ProcessString("chao", VietnameseMode) - if e.GetProcessedString(VietnameseMode) != "chao" { - t.Errorf("Expected chao, got %s", e.GetProcessedString(VietnameseMode)) - } - e.ProcessKey('s', VietnameseMode) - if e.GetProcessedString(VietnameseMode) != "cháo" { - t.Errorf("Expected cháo, got %s", e.GetProcessedString(VietnameseMode)) - } - e.ProcessKey('\b', VietnameseMode) - if e.GetProcessedString(VietnameseMode) != "chao" { - t.Errorf("Expected chao after backspace, got %s", e.GetProcessedString(VietnameseMode)) - } -} - -func TestGetApplicableRules_Invalid(t *testing.T) { - e := newStdEngine().(*BambooEngine) - rules := e.getApplicableRules('😊') - if rules != nil { - t.Errorf("Expected nil rules for emoji, got %v", rules) - } -} - -func TestCanProcessKey_Invalid(t *testing.T) { - e := newStdEngine() - if e.CanProcessKey('😊') { - t.Error("Expected CanProcessKey to return false for emoji") - } - if !e.CanProcessKey('a') { - t.Error("Expected CanProcessKey to return true for 'a'") - } -} - func BenchmarkRemoveLastChar(b *testing.B) { b.ReportAllocs() b.ResetTimer()