From 5e44fe4c552b92e5e7b8735cda190025f7172b87 Mon Sep 17 00:00:00 2001 From: Alec Deitloff Date: Wed, 19 Oct 2022 20:50:57 -0700 Subject: [PATCH] =?UTF-8?q?Rework=20kanji/reading=20association=20using=20?= =?UTF-8?q?RegEx=20to=20address=20=E6=81=AF=E6=8A=9C=E3=81=8D=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- reading.py | 111 ++++++++++++++++++++++++------------------- test/test_reading.py | 4 +- 2 files changed, 64 insertions(+), 51 deletions(-) diff --git a/reading.py b/reading.py index 76ca5e1..b3b535c 100644 --- a/reading.py +++ b/reading.py @@ -100,12 +100,18 @@ def __iter__(self): def convertToHiragana(expr: str) -> str: return expr.translate(translator) -# Determines if two strings have the same phonetic reading, regardless -# of which script they're currently in (eg, 'は' == 'は' and 'ハ' == 'は') -def areKanaEqual(a: str, b: str) -> bool: - hiraA = convertToHiragana(a) - hiraB = convertToHiragana(b) - return hiraA == hiraB +def isKana(char: str) -> bool: + code = ord(char) + + # Hiragana + if code >= UNICODE_HIRAGANA_START and code <= UNICODE_HIRAGANA_END: + return True + + # Katakana + if code >= UNICODE_KATAKANA_START and code <= UNICODE_KATAKANA_END: + return True + + return False # Mecab @@ -133,6 +139,44 @@ def format(self, useRubyTags: bool) -> str: else: return '%s[%s]' % (self.text, self.reading) +class RegexDefinition: + def __init__(self, text: str, regexGroupIndex: Optional[int]): + self.text = text + self.regexGroupIndex = regexGroupIndex + +def kanjiToRegex(kanji: str): + regexPieces: list[str] = [] + definitions: list[RegexDefinition] = [] + numCaptureGroups = 0 + index = 0 + while index < len(kanji): + # Hiragana and Katakana characters are inlined into the Regex + if isKana(kanji[index]): + # The reading variable is ALWAYS in hiragana only + regexPieces.append(convertToHiragana(kanji[index])) + + # Use kanji[index] here to retain original katakana/hiragana + # (We convert to hiragana just to match against reading) + definitions.append(RegexDefinition(kanji[index], None)) + + # Advance to the next character + index += 1 + continue + + # We have a kanji character, which will become a lazy capture group + # in our Regex. First, absorb all sequential kanji characters into a + # single capture group + captureGroup = "" + while index < len(kanji) and not isKana(kanji[index]): + captureGroup += kanji[index] + index += 1 + + regexPieces.append("(.+?)") + definitions.append(RegexDefinition(captureGroup, numCaptureGroups)) + numCaptureGroups += 1 + + return ("^%s$" % ''.join(regexPieces), definitions) + class MecabController(object): def __init__(self): @@ -188,49 +232,18 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False): nodes.append(ReadingNode(kanji, None)) continue - # iterate through the reading and the kanji, and only produce furigana - # for the characters that differ between the two (only give furigana to - # the kanji, not the kana) - # INVARIANT: reading is always at least as long as the kanji/word - indexKanji = 0 - indexReading = 0 - while indexReading < len(reading): - # If the reading and the kanji have the same value, the current - # character must be kana. Continue reading until we find the next - # difference - if areKanaEqual(kanji[indexKanji], reading[indexReading]): - indexStart = indexKanji - while indexReading < len(reading) and \ - indexKanji < len(kanji) and \ - areKanaEqual(kanji[indexKanji], reading[indexReading]): - indexReading += 1 - indexKanji += 1 - nodes.append(ReadingNode(kanji[indexStart:indexKanji], None)) - continue - - # The current characters are different, which must mean that we're - # at the start of a kanji. Make a node with furigana that contains - # all of the reading until we have a match up again between kanji string - # and reading - indexStartReading = indexReading - indexReading += 1 # Ensure we start our checks on the NEXT kana after triggering - while indexReading < len(reading): - # Check to see if the current reading kana is found in the string. - # This implements a lazy algorithm w.r.t. furigana length - try: - indexEnd = kanji.index(reading[indexReading], indexKanji) - nodes.append(ReadingNode(kanji[indexKanji:indexEnd], reading[indexStartReading:indexReading])) - indexKanji = indexEnd - break - except ValueError: - pass - - indexReading += 1 - - if indexReading == len(reading): - # We made it to the end of the reading, which should mean that the entire remaining - # kanji has the entire remaining reading - nodes.append(ReadingNode(kanji[indexKanji:], reading[indexStartReading:])) + # Convert the kanji variable into a Regex pattern where non-kana are + # turned into Regex capture groups, and then apply it to the reading + # to figure out (using lazy matching) what the smallest furigana readings + # are for the kanji + (regexPattern, regexDefinitions) = kanjiToRegex(kanji) + match = re.search(regexPattern, reading) + for definition in regexDefinitions: + if definition.regexGroupIndex is None: + nodes.append(ReadingNode(definition.text, None)) + else: + groupReading = match.group(definition.regexGroupIndex + 1) + nodes.append(ReadingNode(definition.text, groupReading)) # Combine our nodes together into a single sentece fin = ''.join(node.format(useRubyTags) for node in nodes) diff --git a/test/test_reading.py b/test/test_reading.py index b09dfa6..ab3fcc6 100644 --- a/test/test_reading.py +++ b/test/test_reading.py @@ -41,8 +41,8 @@ def testKanjiNumber(self): # ensure that verbs with okurigana don't produce furigana for the kana portions def testOkurigana(self): - actual = reading.mecab.reading("口走る") - self.assertEqual(actual, "口走[くちばし]る") + self.assertEqual(reading.mecab.reading("口走る"), "口走[くちばし]る") + self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト勉強[べんきょう]の息抜[いきぬ]きとか どうしてんの") # ensure that a single word that has plain kana appearing before the kanji in # the word do not have attached furigana