From 02f6fddfc16b3cd7802252954856c53f0a37670d Mon Sep 17 00:00:00 2001
From: eroux
Date: Thu, 16 Jun 2022 17:33:58 +0200
Subject: [PATCH] implement char reorder for #17

Add TibPattFilter.ReorderFilter, a PatternReplaceCharFilter that rewrites
every match of rReorder (a run of vowel signs followed by a run of
subscripts) with "$2$1", i.e. subscripts first, then vowels. The static
helper ReorderFilter.normalizeR(String) applies the same replacement to a
plain string.
See https://github.com/buda-base/lucene-bo/issues/17

Tests: add reorderTest, which feeds both character orders through
ReorderFilter and expects identical tokens, and update one expected token
in an existing expected-token list.
---
 src/main/java/io/bdrc/lucene/bo/TibPattFilter.java | 14 ++++++++++++++
 .../io/bdrc/lucene/bo/TibetanAnalyzerTest.java     | 13 ++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/main/java/io/bdrc/lucene/bo/TibPattFilter.java b/src/main/java/io/bdrc/lucene/bo/TibPattFilter.java
index 703a072..26c8960 100644
--- a/src/main/java/io/bdrc/lucene/bo/TibPattFilter.java
+++ b/src/main/java/io/bdrc/lucene/bo/TibPattFilter.java
@@ -80,6 +80,20 @@ public final static String normalizeR(final String in) {
         }
     }
 
+    public static class ReorderFilter extends PatternReplaceCharFilter {
+        public ReorderFilter(Reader in) {
+            super(rReorder, repl, in);
+        }
+        // https://github.com/buda-base/lucene-bo/issues/17
+        // reorder vowel + subscript into subscript + vowel
+        public static final Pattern rReorder = Pattern.compile("([ཱ-྇]+)([ྍ-ྼ]+)");
+        public static final String repl = "$2$1";
+        public final static String normalizeR(final String in) {
+            final Matcher matcher = rReorder.matcher(in);
+            return matcher.replaceAll(repl);
+        }
+    }
+
     public static Reader plugFilters(Reader in) {
         in = new MergedSylFilter1(in);
         in = new MergedSylFilter2(in);
diff --git a/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java b/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
index 1ac30e5..f81ca5b 100644
--- a/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
+++ b/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
@@ -223,7 +223,7 @@ public void syllableLemaTest() throws IOException {
         System.out.println("Testing TibPattFilter() for Old Tibetan");
         String input = "བཀྲ་ཤིས་བདེ་ལེགས་དགར་ཁོར་ཁྲོ་ཁྲོས་འཐུ་གདུ་གདུམ་ཐེབ་ཐེབས";
         Reader reader = new StringReader(input);
-        List expected = Arrays.asList("བཀྲ", "ཤིས", "བདེ", "ལེགས", "དགར", "དགར", "ཁྲོ", "ཁྲོ", "འཐུ", "འཐུ", "གདུམ", "ཐེབས", "ཐེབས");
+        List expected = Arrays.asList("བཀྲ", "ཤིས", "བདེ", "ལེགས", "དགར", "དགར", "ཁྲོ", "ཁྲོ", "འཐུ", "གདུ", "གདུམ", "ཐེབས", "ཐེབས");
         System.out.print(input + " => ");
         TokenStream res = tokenize(reader, new TibSyllableTokenizer());
         final TokenFilter resF = new TibSyllableLemmatizer(res);
@@ -241,6 +241,17 @@ public void pattFilterTest() throws IOException {
         assertTokenStream(res, expected);
     }
 
+    @Test
+    public void reorderTest() throws IOException {
+        System.out.println("Testing TibPattFilter.ReorderFilter()");
+        String input = "ཀྲི ཀིྲ";
+        Reader reader = new StringReader(input);
+        List expected = Arrays.asList("ཀྲི", "ཀྲི");
+        System.out.print(input + " => ");
+        TokenStream res = tokenize(new TibPattFilter.ReorderFilter(reader), new TibSyllableTokenizer());
+        assertTokenStream(res, expected);
+    }
+
     @Test
     public void ewtsOffsetBug() throws IOException {
         System.out.println("Testing TibEwtsFilter() offsets");