opensearch-project · adnapibar · Nov 8, 2022
@@ -32,12 +32,14 @@
 
 package org.opensearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.env.Environment;
 import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.Analysis;
 import org.xml.sax.InputSource;
 
 import java.io.InputStream;
@@ -61,13 +63,15 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
             throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
         }
 
-        Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);
+        Path hyphenationPatternsFile = Analysis.resolveAnalyzerPath(env, hyphenationPatternsPath);
 
         try {
             InputStream in = Files.newInputStream(hyphenationPatternsFile);
             hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
         } catch (Exception e) {
-            throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
+            LogManager.getLogger(HyphenationCompoundWordTokenFilterFactory.class)
+                .error("Exception while reading hyphenation_patterns_path ", e);
+            throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.");
         }
     }
 

@@ -39,6 +39,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractCharFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.NormalizingCharFilterFactory;
 
 import java.io.Reader;
@@ -53,13 +54,13 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
     MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name);
 
-        List<String> rules = Analysis.getWordList(env, settings, "mappings");
+        List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
         if (rules == null) {
             throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
         }
 
         NormalizeCharMap.Builder normMapBuilder = new NormalizeCharMap.Builder();
-        parseRules(rules, normMapBuilder);
+        rules.forEach(rule -> normMapBuilder.add(rule.getLeft(), rule.getRight()));
         normMap = normMapBuilder.build();
     }
 
@@ -71,18 +72,13 @@ public Reader create(Reader tokenStream) {
     // source => target
     private static Pattern rulePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
 
-    /**
-     * parses a list of MappingCharFilter style rules into a normalize char map
-     */
-    private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
-        for (String rule : rules) {
-            Matcher m = rulePattern.matcher(rule);
-            if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
-            String lhs = parseString(m.group(1).trim());
-            String rhs = parseString(m.group(2).trim());
-            if (lhs == null || rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
-            map.add(lhs, rhs);
-        }
+    private MappingRule<String, String> parse(String rule) {
+        Matcher m = rulePattern.matcher(rule);
+        if (!m.find()) throw new RuntimeException("Invalid mapping rule : [" + rule + "]");
+        String lhs = parseString(m.group(1).trim());
+        String rhs = parseString(m.group(2).trim());
+        if (lhs == null || rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal mapping.");
+        return new MappingRule<>(lhs, rhs);
     }
 
     char[] out = new char[256];

@@ -40,24 +40,31 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 
 public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final String MAPPING_SEPARATOR = "=>";
     private final StemmerOverrideMap overrideMap;
 
     StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
         super(indexSettings, name, settings);
 
-        List<String> rules = Analysis.getWordList(env, settings, "rules");
+        List<MappingRule<List<String>, String>> rules = Analysis.parseWordList(env, settings, "rules", this::parse);
         if (rules == null) {
             throw new IllegalArgumentException("stemmer override filter requires either `rules` or `rules_path` to be configured");
         }
 
         StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
-        parseRules(rules, builder, "=>");
+        for (MappingRule<List<String>, String> rule : rules) {
+            for (String key : rule.getLeft()) {
+                builder.add(key, rule.getRight());
+            }
+        }
         overrideMap = builder.build();
 
     }
@@ -67,27 +74,26 @@ public TokenStream create(TokenStream tokenStream) {
         return new StemmerOverrideFilter(tokenStream, overrideMap);
     }
 
-    static void parseRules(List<String> rules, StemmerOverrideFilter.Builder builder, String mappingSep) {
-        for (String rule : rules) {
-            String[] sides = rule.split(mappingSep, -1);
-            if (sides.length != 2) {
-                throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-            }
+    private MappingRule<List<String>, String> parse(String rule) {
+        String[] sides = rule.split(MAPPING_SEPARATOR, -1);
+        if (sides.length != 2) {
+            throw new RuntimeException("Invalid keyword override rule: " + rule);
+        }
 
-            String[] keys = sides[0].split(",", -1);
-            String override = sides[1].trim();
-            if (override.isEmpty() || override.indexOf(',') != -1) {
-                throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-            }
+        String[] keys = sides[0].split(",", -1);
+        String override = sides[1].trim();
+        if (override.isEmpty() || override.indexOf(',') != -1) {
+            throw new RuntimeException("Invalid keyword override rule: " + rule);
+        }
 
-            for (String key : keys) {
-                String trimmedKey = key.trim();
-                if (trimmedKey.isEmpty()) {
-                    throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-                }
-                builder.add(trimmedKey, override);
+        List<String> trimmedKeys = new ArrayList<>();
+        for (String key : keys) {
+            String trimmedKey = key.trim();
+            if (trimmedKey.isEmpty()) {
+                throw new RuntimeException("Invalid keyword override rule: " + rule);
             }
+            trimmedKeys.add(trimmedKey);
         }
+        return new MappingRule<>(trimmedKeys, override);
     }
-
 }
@@ -32,6 +32,7 @@
 
 package org.opensearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
@@ -155,14 +156,15 @@ SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
             }
             return parser.build();
         } catch (Exception e) {
-            throw new IllegalArgumentException("failed to build synonyms", e);
+            LogManager.getLogger(SynonymTokenFilterFactory.class).error("Failed to build synonyms: ", e);
+            throw new IllegalArgumentException("Failed to build synonyms");
         }
     }
 
     Reader getRulesFromSettings(Environment env) {
         Reader rulesReader;
         if (settings.getAsList("synonyms", null) != null) {
-            List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
+            List<String> rulesList = Analysis.parseWordList(env, settings, "synonyms", s -> s);
             StringBuilder sb = new StringBuilder();
             for (String line : rulesList) {
                 sb.append(line).append(System.lineSeparator());

@@ -41,6 +41,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.TokenFilterFactory;
 
 import java.util.List;
@@ -73,7 +74,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
+            env,
+            settings,
+            "type_table",
+            WordDelimiterTokenFilterFactory::parse
+        );
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {

@@ -41,6 +41,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.TokenFilterFactory;
 
 import java.util.Collection;
@@ -76,7 +77,12 @@ public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
+            env,
+            settings,
+            "type_table",
+            WordDelimiterTokenFilterFactory::parse
+        );
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {
@@ -127,19 +133,23 @@ public int getFlag(int flag, Settings settings, String key, boolean defaultValue
     // source => type
     private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
 
+    static MappingRule<Character, Byte> parse(String rule) {
+        Matcher m = typePattern.matcher(rule);
+        if (!m.find()) throw new RuntimeException("Invalid mapping rule: [" + rule + "]");
+        String lhs = parseString(m.group(1).trim());
+        Byte rhs = parseType(m.group(2).trim());
+        if (lhs.length() != 1) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Only a single character is allowed.");
+        if (rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal type.");
+        return new MappingRule<>(lhs.charAt(0), rhs);
+    }
+
     /**
      * parses a list of MappingCharFilter style rules into a custom byte[] type table
      */
-    static byte[] parseTypes(Collection<String> rules) {
+    static byte[] parseTypes(Collection<MappingRule<Character, Byte>> rules) {
         SortedMap<Character, Byte> typeMap = new TreeMap<>();
-        for (String rule : rules) {
-            Matcher m = typePattern.matcher(rule);
-            if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
-            String lhs = parseString(m.group(1).trim());
-            Byte rhs = parseType(m.group(2).trim());
-            if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
-            if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
-            typeMap.put(lhs.charAt(0), rhs);
+        for (MappingRule<Character, Byte> rule : rules) {
+            typeMap.put(rule.getLeft(), rule.getRight());
         }
 
         // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance

@@ -195,4 +195,24 @@ public void testStemEnglishPossessive() throws IOException {
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }
+
+    private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .putList("index.analysis.filter.my_word_delimiter.type_table", rules)
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+        analysis.tokenFilter.get("my_word_delimiter");
+    }
+
+    public void testTypeTableParsingError() {
+        String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
+        RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules));
+        assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
+    }
 }
@@ -0,0 +1,70 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.CharFilter;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.analysis.AnalysisTestsHelper;
+import org.opensearch.index.analysis.CharFilterFactory;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+public class MappingCharFilterFactoryTests extends OpenSearchTestCase {
+    public static CharFilterFactory create(String... rules) throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .put("index.analysis.analyzer.my_analyzer.char_filter", "my_mappings_char_filter")
+                .put("index.analysis.char_filter.my_mappings_char_filter.type", "mapping")
+                .putList("index.analysis.char_filter.my_mappings_char_filter.mappings", rules)
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        return analysis.charFilter.get("my_mappings_char_filter");
+    }
+
+    public void testRulesOk() throws IOException {
+        MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create(
+            "# This is a comment",
+            ":) => _happy_",
+            ":( => _sad_"
+        );
+        CharFilter inputReader = (CharFilter) mappingCharFilterFactory.create(new StringReader("I'm so :)"));
+        char[] tempBuff = new char[14];
+        StringBuilder output = new StringBuilder();
+        while (true) {
+            int length = inputReader.read(tempBuff);
+            if (length == -1) break;
+            output.append(tempBuff, 0, length);
+        }
+        assertEquals("I'm so _happy_", output.toString());
+    }
+
+    public void testRuleError() {
+        for (String rule : Arrays.asList(
+            "",        // empty
+            "a",       // no arrow
+            "a:>b"     // invalid delimiter
+        )) {
+            RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule));
+            assertEquals("Line [1]: Invalid mapping rule : [" + rule + "]", ex.getMessage());
+        }
+    }
+
+    public void testRulePartError() {
+        RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# This is a comment", ":) => _happy_", "a:b"));
+        assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage());
+    }
+}
@@ -46,7 +46,6 @@
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.Arrays;
-import java.util.Locale;
 
 public class StemmerOverrideTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
     @Rule
@@ -76,11 +75,8 @@ public void testRuleError() {
             "=>a",     // no keys
             "a,=>b"    // empty key
         )) {
-            expectThrows(
-                RuntimeException.class,
-                String.format(Locale.ROOT, "Should fail for invalid rule: '%s'", rule),
-                () -> create(rule)
-            );
+            RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule));
+            assertEquals("Line [1]: Invalid keyword override rule: " + rule, ex.getMessage());
         }
     }
 
@@ -90,4 +86,9 @@ public void testRulesOk() throws IOException {
         tokenizer.setReader(new StringReader("a b c"));
         assertTokenStreamContents(tokenFilterFactory.create(tokenizer), new String[] { "1", "2", "2" });
     }
+
+    public void testRulePartError() {
+        RuntimeException ex = expectThrows(RuntimeException.class, () -> create("a => 1", "b,c => 2", "# This is a comment", "=>a=>b"));
+        assertEquals("Line [4]: Invalid keyword override rule: =>a=>b", ex.getMessage());
+    }
 }