Skip to content

Commit

Permalink
implement char reorder for #17
Browse files Browse the repository at this point in the history
  • Loading branch information
eroux committed Jun 16, 2022
1 parent 8d90080 commit 02f6fdd
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
14 changes: 14 additions & 0 deletions src/main/java/io/bdrc/lucene/bo/TibPattFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,20 @@ public final static String normalizeR(final String in) {
}
}

/**
 * Char filter that rewrites "vowel + subscript" sequences as
 * "subscript + vowel".
 *
 * See https://github.com/buda-base/lucene-bo/issues/17
 */
public static class ReorderFilter extends PatternReplaceCharFilter {
    /**
     * Two adjacent capture groups: a run of characters from the first range
     * (vowels, per the issue above) followed by a run from the second range
     * (subscripts).
     */
    public static final Pattern rReorder = Pattern.compile("([ཱ-྇]+)([ྍ-ྼ]+)");

    /** Replacement template that emits the second group ahead of the first. */
    public static final String repl = "$2$1";

    /**
     * Wraps {@code in} so that the reordering is applied to its characters.
     *
     * @param in the reader to filter
     */
    public ReorderFilter(Reader in) {
        super(rReorder, repl, in);
    }

    /**
     * Applies the same reordering to a plain string.
     *
     * @param in the text to normalize
     * @return {@code in} with every match of {@link #rReorder} replaced using {@link #repl}
     */
    public static final String normalizeR(final String in) {
        return rReorder.matcher(in).replaceAll(repl);
    }
}

public static Reader plugFilters(Reader in) {
in = new MergedSylFilter1(in);
in = new MergedSylFilter2(in);
Expand Down
13 changes: 12 additions & 1 deletion src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ public void syllableLemaTest() throws IOException {
System.out.println("Testing TibPattFilter() for Old Tibetan");
String input = "བཀྲ་ཤིས་བདེ་ལེགས་དགར་ཁོར་ཁྲོ་ཁྲོས་འཐུ་གདུ་གདུམ་ཐེབ་ཐེབས";
Reader reader = new StringReader(input);
List<String> expected = Arrays.asList("བཀྲ", "ཤིས", "བདེ", "ལེགས", "དགར", "དགར", "ཁྲོ", "ཁྲོ", "འཐུ", "འཐུ", "གདུམ", "ཐེབས", "ཐེབས");
List<String> expected = Arrays.asList("བཀྲ", "ཤིས", "བདེ", "ལེགས", "དགར", "དགར", "ཁྲོ", "ཁྲོ", "འཐུ", "གདུ", "གདུམ", "ཐེབས", "ཐེབས");
System.out.print(input + " => ");
TokenStream res = tokenize(reader, new TibSyllableTokenizer());
final TokenFilter resF = new TibSyllableLemmatizer(res);
Expand All @@ -241,6 +241,17 @@ public void pattFilterTest() throws IOException {
assertTokenStream(res, expected);
}

/**
 * Feeds the same syllable in two spellings through
 * {@code TibPattFilter.ReorderFilter} and expects both to produce the same
 * token after tokenization.
 */
@Test
public void reorderTest() throws IOException {
    // Name the component actually under test; this test runs no lemmatizer.
    System.out.println("Testing TibPattFilter.ReorderFilter()");
    // Two space-separated spellings of one syllable; the second is expected
    // to be rewritten by the filter so that it matches the first.
    String input = "ཀྲི ཀིྲ";
    Reader reader = new StringReader(input);
    List<String> expected = Arrays.asList("ཀྲི", "ཀྲི");
    System.out.print(input + " => ");
    // The char filter wraps the reader, so reordering happens before tokenization.
    TokenStream res = tokenize(new TibPattFilter.ReorderFilter(reader), new TibSyllableTokenizer());
    assertTokenStream(res, expected);
}

@Test
public void ewtsOffsetBug() throws IOException {
System.out.println("Testing TibEwtsFilter() offsets");
Expand Down

0 comments on commit 02f6fdd

Please sign in to comment.