From 184b337175c29df2802cc29ca0176f8990816c6f Mon Sep 17 00:00:00 2001
From: Rabi Panda
Date: Tue, 8 Nov 2022 01:49:06 +0000
Subject: [PATCH] [Backport] Fix error handling while reading analyzer mapping rules

Add new parseWordList method that takes a parser as a parameter. It
reads custom rules from settings or a file, parses and handles errors.
Make error messages less verbose for rules files outside config
directory.

Signed-off-by: Rabi Panda
---
 ...enationCompoundWordTokenFilterFactory.java |   8 +-
 .../common/MappingCharFilterFactory.java      |  24 ++--
 .../StemmerOverrideTokenFilterFactory.java    |  46 +++----
 .../common/SynonymTokenFilterFactory.java     |   6 +-
 .../WordDelimiterGraphTokenFilterFactory.java |   8 +-
 .../WordDelimiterTokenFilterFactory.java      |  30 +++--
 ...rdDelimiterTokenFilterFactoryTestCase.java |  20 ++++
 .../common/MappingCharFilterFactoryTests.java |  70 +++++++++++
 ...temmerOverrideTokenFilterFactoryTests.java |  13 +-
 .../common/SynonymsAnalysisTests.java         |   4 +-
 .../IcuCollationTokenFilterFactory.java       |   8 +-
 .../KuromojiPartOfSpeechFilterFactory.java    |   2 +-
 .../analysis/KuromojiTokenizerFactory.java    |  38 +++---
 .../index/analysis/KuromojiAnalysisTests.java |   4 +-
 .../index/analysis/NoriAnalyzerProvider.java  |   2 +-
 .../NoriPartOfSpeechStopFilterFactory.java    |   2 +-
 .../index/analysis/NoriTokenizerFactory.java  |   8 +-
 .../opensearch.release-notes-2.4.0.md         |   2 +-
 .../opensearch/index/analysis/Analysis.java   | 113 ++++++++++++------
 .../analysis/CustomMappingRuleParser.java     |  21 ++++
 .../index/analysis/MappingRule.java           |  30 +++++
 .../indices/analysis/HunspellService.java     |   3 +-
 .../index/analysis/AnalysisTests.java         |  63 ++++++----
 .../indices/analysis/AnalysisModuleTests.java |  11 +-
 .../indices/analyze/HunspellServiceTests.java |  12 +-
 25 files changed, 393 insertions(+), 155 deletions(-)
 create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java
 create mode 100644 server/src/main/java/org/opensearch/index/analysis/CustomMappingRuleParser.java
 create mode 100644 server/src/main/java/org/opensearch/index/analysis/MappingRule.java

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java
index 875c5261f8387..25bf58409928e 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java
@@ -32,12 +32,14 @@

 package org.opensearch.analysis.common;

+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.env.Environment;
 import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.Analysis;
 import org.xml.sax.InputSource;

 import java.io.InputStream;
@@ -61,13 +63,15 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
             throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
         }

-        Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);
+        Path hyphenationPatternsFile = Analysis.resolveAnalyzerPath(env, hyphenationPatternsPath);

         try {
             InputStream in = Files.newInputStream(hyphenationPatternsFile);
             hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
         } catch (Exception e) {
-            throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
+            LogManager.getLogger(HyphenationCompoundWordTokenFilterFactory.class)
+                .error("Exception while reading hyphenation_patterns_path ", e);
+            throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.");
         }
     }

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
index 7200b69135a30..d6d9f8975f2fc 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MappingCharFilterFactory.java
@@ -39,6 +39,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractCharFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.NormalizingCharFilterFactory;

 import java.io.Reader;
@@ -53,13 +54,13 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
     MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name);

-        List<String> rules = Analysis.getWordList(env, settings, "mappings");
+        List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
         if (rules == null) {
             throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
         }

         NormalizeCharMap.Builder normMapBuilder = new NormalizeCharMap.Builder();
-        parseRules(rules, normMapBuilder);
+        rules.forEach(rule -> normMapBuilder.add(rule.getLeft(), rule.getRight()));
         normMap = normMapBuilder.build();
     }

@@ -71,18 +72,13 @@ public Reader create(Reader tokenStream) {
     // source => target
     private static Pattern rulePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

-    /**
-     * parses a list of MappingCharFilter style rules into a normalize char map
-     */
-    private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
-        for (String rule : rules) {
-            Matcher m = rulePattern.matcher(rule);
-            if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
-            String lhs = parseString(m.group(1).trim());
-            String rhs = parseString(m.group(2).trim());
-            if (lhs == null || rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
-            map.add(lhs, rhs);
-        }
+    private MappingRule<String, String> parse(String rule) {
+        Matcher m = rulePattern.matcher(rule);
+        if (!m.find()) throw new RuntimeException("Invalid mapping rule : [" + rule + "]");
+        String lhs = parseString(m.group(1).trim());
+        String rhs = parseString(m.group(2).trim());
+        if (lhs == null || rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal mapping.");
+        return new MappingRule<>(lhs, rhs);
     }

     char[] out = new char[256];

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactory.java
index 89f0766542296..bdd6e01261443 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactory.java
@@ -40,24 +40,31 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;

 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;

 public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactory {
+    private static final String MAPPING_SEPARATOR = "=>";
     private final StemmerOverrideMap overrideMap;

     StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
         super(indexSettings, name, settings);

-        List<String> rules = Analysis.getWordList(env, settings, "rules");
+        List<MappingRule<List<String>, String>> rules = Analysis.parseWordList(env, settings, "rules", this::parse);
         if (rules == null) {
             throw new IllegalArgumentException("stemmer override filter requires either `rules` or `rules_path` to be configured");
         }

         StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
-        parseRules(rules, builder, "=>");
+        for (MappingRule<List<String>, String> rule : rules) {
+            for (String key : rule.getLeft()) {
+                builder.add(key, rule.getRight());
+            }
+        }
         overrideMap = builder.build();
     }

@@ -67,27 +74,26 @@ public TokenStream create(TokenStream tokenStream) {
         return new StemmerOverrideFilter(tokenStream, overrideMap);
     }

-    static void parseRules(List<String> rules, StemmerOverrideFilter.Builder builder, String mappingSep) {
-        for (String rule : rules) {
-            String[] sides = rule.split(mappingSep, -1);
-            if (sides.length != 2) {
-                throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-            }
+    private MappingRule<List<String>, String> parse(String rule) {
+        String[] sides = rule.split(MAPPING_SEPARATOR, -1);
+        if (sides.length != 2) {
+            throw new RuntimeException("Invalid keyword override rule: " + rule);
+        }

-            String[] keys = sides[0].split(",", -1);
-            String override = sides[1].trim();
-            if (override.isEmpty() || override.indexOf(',') != -1) {
-                throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-            }
+        String[] keys = sides[0].split(",", -1);
+        String override = sides[1].trim();
+        if (override.isEmpty() || override.indexOf(',') != -1) {
+            throw new RuntimeException("Invalid keyword override rule: " + rule);
+        }

-            for (String key : keys) {
-                String trimmedKey = key.trim();
-                if (trimmedKey.isEmpty()) {
-                    throw new RuntimeException("Invalid Keyword override Rule:" + rule);
-                }
-                builder.add(trimmedKey, override);
+        List<String> trimmedKeys = new ArrayList<>();
+        for (String key : keys) {
+            String trimmedKey = key.trim();
+            if (trimmedKey.isEmpty()) {
+                throw new RuntimeException("Invalid keyword override rule: " + rule);
             }
+            trimmedKeys.add(trimmedKey);
         }
+        return new MappingRule<>(trimmedKeys, override);
     }
-
 }

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
index dc6b5b2dd8b7b..01a65e87d7466 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
@@ -32,6 +32,7 @@

 package org.opensearch.analysis.common;

+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
@@ -155,14 +156,15 @@ SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
             }
             return parser.build();
         } catch (Exception e) {
-            throw new IllegalArgumentException("failed to build synonyms", e);
+            LogManager.getLogger(SynonymTokenFilterFactory.class).error("Failed to build synonyms: ", e);
+            throw new IllegalArgumentException("Failed to build synonyms");
         }
     }

     Reader getRulesFromSettings(Environment env) {
         Reader rulesReader;
         if (settings.getAsList("synonyms", null) != null) {
-            List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
+            List<String> rulesList = Analysis.parseWordList(env, settings, "synonyms", s -> s);
             StringBuilder sb = new StringBuilder();
             for (String line : rulesList) {
                 sb.append(line).append(System.lineSeparator());

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
index 31d52d030cb71..51ac3141fd465 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
@@ -41,6 +41,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.TokenFilterFactory;

 import java.util.List;
@@ -73,7 +74,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
+            env,
+            settings,
+            "type_table",
+            WordDelimiterTokenFilterFactory::parse
+        );
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {

diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterTokenFilterFactory.java
index d40acfa05dd21..96e50206fb53d 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/WordDelimiterTokenFilterFactory.java
@@ -41,6 +41,7 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
 import org.opensearch.index.analysis.Analysis;
+import org.opensearch.index.analysis.MappingRule;
 import org.opensearch.index.analysis.TokenFilterFactory;

 import java.util.Collection;
@@ -76,7 +77,12 @@ public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
+            env,
+            settings,
+            "type_table",
+            WordDelimiterTokenFilterFactory::parse
+        );
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {
@@ -127,19 +133,23 @@ public int getFlag(int flag, Settings settings, String key, boolean defaultValue
     // source => type
     private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

+    static MappingRule<Character, Byte> parse(String rule) {
+        Matcher m = typePattern.matcher(rule);
+        if (!m.find()) throw new RuntimeException("Invalid mapping rule: [" + rule + "]");
+        String lhs = parseString(m.group(1).trim());
+        Byte rhs = parseType(m.group(2).trim());
+        if (lhs.length() != 1) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Only a single character is allowed.");
+        if (rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal type.");
+        return new MappingRule<>(lhs.charAt(0), rhs);
+    }
+
     /**
      * parses a list of MappingCharFilter style rules into a custom byte[] type table
      */
-    static byte[] parseTypes(Collection<String> rules) {
+    static byte[] parseTypes(Collection<MappingRule<Character, Byte>> rules) {
         SortedMap<Character, Byte> typeMap = new TreeMap<>();
-        for (String rule : rules) {
-            Matcher m = typePattern.matcher(rule);
-            if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
-            String lhs = parseString(m.group(1).trim());
-            Byte rhs = parseType(m.group(2).trim());
-            if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
-            if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
Illegal type."); - typeMap.put(lhs.charAt(0), rhs); + for (MappingRule rule : rules) { + typeMap.put(rule.getLeft(), rule.getRight()); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java index 9d54776755766..2c3864a36fd22 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -195,4 +195,24 @@ public void testStemEnglishPossessive() throws IOException { tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } + + private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOException { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .putList("index.analysis.filter.my_word_delimiter.type_table", rules) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build(), + new CommonAnalysisPlugin() + ); + analysis.tokenFilter.get("my_word_delimiter"); + } + + public void testTypeTableParsingError() { + String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" }; + RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules)); + assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage()); + } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java new file mode 100644 index 0000000000000..bdc452f8863d4 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MappingCharFilterFactoryTests.java @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.CharFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.analysis.AnalysisTestsHelper; +import org.opensearch.index.analysis.CharFilterFactory; +import org.opensearch.test.OpenSearchTestCase; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Arrays; + +public class MappingCharFilterFactoryTests extends OpenSearchTestCase { + public static CharFilterFactory create(String... 
rules) throws IOException { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .put("index.analysis.analyzer.my_analyzer.char_filter", "my_mappings_char_filter") + .put("index.analysis.char_filter.my_mappings_char_filter.type", "mapping") + .putList("index.analysis.char_filter.my_mappings_char_filter.mappings", rules) + .build(), + new CommonAnalysisPlugin() + ); + + return analysis.charFilter.get("my_mappings_char_filter"); + } + + public void testRulesOk() throws IOException { + MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create( + "# This is a comment", + ":) => _happy_", + ":( => _sad_" + ); + CharFilter inputReader = (CharFilter) mappingCharFilterFactory.create(new StringReader("I'm so :)")); + char[] tempBuff = new char[14]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = inputReader.read(tempBuff); + if (length == -1) break; + output.append(tempBuff, 0, length); + } + assertEquals("I'm so _happy_", output.toString()); + } + + public void testRuleError() { + for (String rule : Arrays.asList( + "", // empty + "a", // no arrow + "a:>b" // invalid delimiter + )) { + RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule)); + assertEquals("Line [1]: Invalid mapping rule : [" + rule + "]", ex.getMessage()); + } + } + + public void testRulePartError() { + RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# This is a comment", ":) => _happy_", "a:b")); + assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage()); + } +} diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactoryTests.java index 96e05efa97768..9e3345aa30dca 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerOverrideTokenFilterFactoryTests.java @@ -46,7 +46,6 @@ import java.io.IOException; import java.io.StringReader; import java.util.Arrays; -import java.util.Locale; public class StemmerOverrideTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase { @Rule @@ -76,11 +75,8 @@ public void testRuleError() { "=>a", // no keys "a,=>b" // empty key )) { - expectThrows( - RuntimeException.class, - String.format(Locale.ROOT, "Should fail for invalid rule: '%s'", rule), - () -> create(rule) - ); + RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule)); + assertEquals("Line [1]: Invalid keyword override rule: " + rule, ex.getMessage()); } } @@ -90,4 +86,9 @@ public void testRulesOk() throws IOException { tokenizer.setReader(new StringReader("a b c")); assertTokenStreamContents(tokenFilterFactory.create(tokenizer), new String[] { "1", "2", "2" }); } + + public void testRulePartError() { + RuntimeException ex = expectThrows(RuntimeException.class, () -> create("a => 1", "b,c => 2", "# This is a comment", "=>a=>b")); + assertEquals("Line [4]: Invalid keyword override rule: =>a=>b", ex.getMessage()); + } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java 
index 8094e24b9adc8..03404a284020b 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
@@ -117,7 +117,7 @@ public void testSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
         }
     }

@@ -138,7 +138,7 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
         }
     }

diff --git a/plugins/analysis-icu/src/main/java/org/opensearch/index/analysis/IcuCollationTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/opensearch/index/analysis/IcuCollationTokenFilterFactory.java
index 757a55487a162..af564bcc9d535 100644
--- a/plugins/analysis-icu/src/main/java/org/opensearch/index/analysis/IcuCollationTokenFilterFactory.java
+++ b/plugins/analysis-icu/src/main/java/org/opensearch/index/analysis/IcuCollationTokenFilterFactory.java
@@ -37,6 +37,7 @@
 import java.nio.file.Files;
 import java.nio.file.InvalidPathException;

+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.opensearch.common.io.Streams;
 import org.opensearch.common.settings.Settings;
@@ -80,9 +81,12 @@ public IcuCollationTokenFilterFactory(IndexSettings indexSettings, Environment e
                     collator = new RuleBasedCollator(rules);
                 } catch (Exception e) {
                     if (failureToResolve != null) {
-                        throw new IllegalArgumentException("Failed to resolve collation rules location", failureToResolve);
+                        LogManager.getLogger(IcuCollationTokenFilterFactory.class)
+                            .error("Failed to resolve collation rules location", failureToResolve);
+                        throw new IllegalArgumentException("Failed to resolve collation rules location");
                     } else {
-                        throw new IllegalArgumentException("Failed to parse collation rules", e);
+                        LogManager.getLogger(IcuCollationTokenFilterFactory.class).error("Failed to parse collation rules", e);
+                        throw new IllegalArgumentException("Failed to parse collation rules");
                     }
                 }
             } else {

diff --git a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
index fef8d06c466b9..8e9c209ae421d 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
@@ -49,7 +49,7 @@ public class KuromojiPartOfSpeechFilterFactory extends AbstractTokenFilterFactor
     public KuromojiPartOfSpeechFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
-        List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
+        List<String> wordList = Analysis.parseWordList(env, settings, "stoptags", s -> s);
         if (wordList != null) {
             stopTags.addAll(wordList);
         } else {

diff --git a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiTokenizerFactory.java b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiTokenizerFactory.java
index b5e718eaa6fa0..ac4def8ac9a11 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiTokenizerFactory.java
@@ -32,6 +32,8 @@

 package org.opensearch.index.analysis;

+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
@@ -50,6 +52,7 @@

 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
+    private static final Logger LOGGER = LogManager.getLogger(KuromojiTokenizerFactory.class);
     private static final String USER_DICT_PATH_OPTION = "user_dictionary";
     private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
     private static final String NBEST_COST = "nbest_cost";
@@ -74,6 +77,14 @@ public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, St
         discardCompoundToken = settings.getAsBoolean(DISCARD_COMPOUND_TOKEN, false);
     }

+    private static String parse(String rule, Set<String> dup) {
+        String[] values = CSVUtil.parse(rule);
+        if (dup.add(values[0]) == false) {
+            throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary.");
+        }
+        return rule;
+    }
+
     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
         if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
             throw new IllegalArgumentException(
@@ -81,31 +92,26 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
             );
         }
         try {
-            List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
+            Set<String> dup = new HashSet<>();
+            List<String> ruleList = Analysis.parseWordList(
+                env,
+                settings,
+                USER_DICT_PATH_OPTION,
+                USER_DICT_RULES_OPTION,
+                s -> parse(s, dup)
+            );
             if (ruleList == null || ruleList.isEmpty()) {
                 return null;
             }
-            Set<String> dup = new HashSet<>();
-            int lineNum = 0;
-            for (String line : ruleList) {
-                // ignore comments
-                if (line.startsWith("#") == false) {
-                    String[] values = CSVUtil.parse(line);
-                    if (dup.add(values[0]) == false) {
-                        throw new IllegalArgumentException(
-                            "Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
-                        );
-                    }
-                }
-                ++lineNum;
-            }
+
             StringBuilder sb = new StringBuilder();
             for (String line : ruleList) {
                 sb.append(line).append(System.lineSeparator());
             }
             return UserDictionary.open(new StringReader(sb.toString()));
         } catch (IOException e) {
-            throw new OpenSearchException("failed to load kuromoji user dictionary", e);
+            LOGGER.error("Failed to load kuromoji user dictionary", e);
+            throw new OpenSearchException("Failed to load kuromoji user dictionary");
         }
     }

diff --git a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java
index e17658d83a085..23e6ef9fea059 100644
--- a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java
@@ -390,8 +390,8 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
                 "制限スピード,制限スピード,セイゲンスピード,テスト名詞"
             )
             .build();
-        IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
-        assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
+        RuntimeException exc = expectThrows(RuntimeException.class, () -> createTestAnalysis(settings));
+        assertThat(exc.getMessage(), equalTo("Line [4]: Found duplicate term [制限スピード] in user dictionary."));
     }

     public void testDiscardCompoundToken() throws Exception {

diff --git a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriAnalyzerProvider.java b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriAnalyzerProvider.java
index 3dee606185429..e3b1cef6aee8a 100644
--- a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriAnalyzerProvider.java
+++ b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriAnalyzerProvider.java
@@ -52,7 +52,7 @@ public NoriAnalyzerProvider(IndexSettings indexSettings, Environment env, String
         super(indexSettings, name, settings);
         final KoreanTokenizer.DecompoundMode mode = NoriTokenizerFactory.getMode(settings);
         final UserDictionary userDictionary = NoriTokenizerFactory.getUserDictionary(env, settings);
-        final List<String> tagList = Analysis.getWordList(env, settings, "stoptags");
+        final List<String> tagList = Analysis.parseWordList(env, settings, "stoptags", s -> s);
         final Set<POS.Tag> stopTags = tagList != null ? resolvePOSList(tagList) : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
         analyzer = new KoreanAnalyzer(userDictionary, mode, stopTags, false);
     }

diff --git a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java
index 18cbc3c7c153d..5023db50422fc 100644
--- a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java
+++ b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java
@@ -48,7 +48,7 @@ public class NoriPartOfSpeechStopFilterFactory extends AbstractTokenFilterFactor
     public NoriPartOfSpeechStopFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
-        List<String> tagList = Analysis.getWordList(env, settings, "stoptags");
+        List<String> tagList = Analysis.parseWordList(env, settings, "stoptags", s -> s);
         this.stopTags = tagList != null ? resolvePOSList(tagList) : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
     }

diff --git a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriTokenizerFactory.java
index 5136277611e3a..9f3183194cdae 100644
--- a/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriTokenizerFactory.java
+++ b/plugins/analysis-nori/src/main/java/org/opensearch/index/analysis/NoriTokenizerFactory.java
@@ -32,6 +32,8 @@

 package org.opensearch.index.analysis;

+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ko.KoreanTokenizer;
 import org.apache.lucene.analysis.ko.dict.UserDictionary;
@@ -47,6 +49,7 @@
 import java.util.Locale;

 public class NoriTokenizerFactory extends AbstractTokenizerFactory {
+    private static final Logger LOGGER = LogManager.getLogger(NoriTokenizerFactory.class);
     private static final String USER_DICT_PATH_OPTION = "user_dictionary";
     private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";

@@ -67,7 +70,7 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
                 "It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
             );
         }
-        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
+        List<String> ruleList = Analysis.parseWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, s -> s);
         StringBuilder sb = new StringBuilder();
         if (ruleList == null || ruleList.isEmpty()) {
             return null;
@@ -78,7 +81,8 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
         try (Reader rulesReader = new StringReader(sb.toString())) {
             return UserDictionary.open(rulesReader);
         } catch (IOException e) {
-            throw new OpenSearchException("failed to load nori user dictionary", e);
+            LOGGER.error("Failed to load nori user dictionary", e);
+            throw new OpenSearchException("Failed to load nori user dictionary");
         }
     }

diff --git a/release-notes/opensearch.release-notes-2.4.0.md b/release-notes/opensearch.release-notes-2.4.0.md
index a2c9311f4159b..a2919798a358a 100644
--- a/release-notes/opensearch.release-notes-2.4.0.md
+++ b/release-notes/opensearch.release-notes-2.4.0.md
@@ -95,6 +95,6 @@
 - Fix message "No OpenSearchException found" when detailed_error disabled by return meaningful messages ([#4708](https://github.com/opensearch-project/OpenSearch/pull/4708))
 - Add fix for auto expand replica validation ([#4994](https://github.com/opensearch-project/OpenSearch/pull/4994))
 - Fix build failures on the Windows platform ([#4924](https://github.com/opensearch-project/OpenSearch/issues/4924))
-
+- Fix error handling while reading analyzer mapping rules ([6d20423](https://github.com/opensearch-project/OpenSearch/commit/6d20423f5920745463b1abc5f1daf6a786c41aa0))

 ### Security
 - CVE-2022-25857 org.yaml:snakeyaml DOS vulnerability ([#4341](https://github.com/opensearch-project/OpenSearch/pull/4341))

diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java
index 9fe5abcb3ea9d..f2a9caebb9c03 100644
--- a/server/src/main/java/org/opensearch/index/analysis/Analysis.java
+++ b/server/src/main/java/org/opensearch/index/analysis/Analysis.java
@@ -32,6 +32,8 @@

 package org.opensearch.index.analysis;

+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
@@ -94,6 +96,7 @@
  * @opensearch.internal
  */
 public class Analysis {
+    private static final Logger LOGGER = LogManager.getLogger(Analysis.class);

     public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
         String value = settings.get("stem_exclusion");
@@ -166,7 +169,7 @@ public static CharArraySet parseWords(
                 return resolveNamedWords(settings.getAsList(name), namedWords, ignoreCase);
             }
         }
-        List<String> pathLoadedWords = getWordList(env, settings, name);
+        List<String> pathLoadedWords = parseWordList(env, settings, name, s -> s);
         if (pathLoadedWords != null) {
             return resolveNamedWords(pathLoadedWords, namedWords, ignoreCase);
         }
@@ -207,7 +210,7 @@ private static CharArraySet resolveNamedWords(Collection<String> words, Map<St
     }

     public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) {
-        List<String> wordList = getWordList(env, settings, settingsPrefix);
+        List<String> wordList = parseWordList(env, settings, settingsPrefix, s -> s);
         if (wordList == null) {
             return null;
         }
@@ -215,15 +218,48 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String
         boolean ignoreCase = settings.getAsBoolean(settingsPrefix + "_case", false);
         return new CharArraySet(wordList, ignoreCase);
     }

+    public static <T> List<T> parseWordList(Environment env, Settings settings, String settingPrefix, CustomMappingRuleParser<T> parser) {
+        return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser);
+    }
+
     /**
-     * Fetches a list of words from the specified settings file. The list should either be available at the key
-     * specified by settingsPrefix or in a file specified by settingsPrefix + _path.
+     * Parses a list of words from the specified settings or from a file, with the given parser.
      *
      * @throws IllegalArgumentException
      *             If the word list cannot be found at either key.
+     * @throws RuntimeException
+     *             If there is an error parsing the words
      */
-    public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
-        return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true);
+    public static <T> List<T> parseWordList(
+        Environment env,
+        Settings settings,
+        String settingPath,
+        String settingList,
+        CustomMappingRuleParser<T> parser
+    ) {
+        List<String> words = getWordList(env, settings, settingPath, settingList);
+        if (words == null) {
+            return null;
+        }
+        List<T> rules = new ArrayList<>();
+        int lineNum = 0;
+        for (String word : words) {
+            lineNum++;
+            if (word.startsWith("#") == false) {
+                try {
+                    rules.add(parser.apply(word));
+                } catch (RuntimeException ex) {
+                    String wordListPath = settings.get(settingPath, null);
+                    if (wordListPath == null || isUnderConfig(env, wordListPath)) {
+                        throw new RuntimeException("Line [" + lineNum + "]: " + ex.getMessage());
+                    } else {
+                        LOGGER.error("Line [{}]: {}", lineNum, ex);
+                        throw new RuntimeException("Line [" + lineNum + "]: " + "Invalid rule");
+                    }
+                }
+            }
+        }
+        return rules;
     }

     /**
@@ -233,43 +269,33 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
      * @throws IllegalArgumentException
      *             If the word list cannot be found at either key.
      */
-    public static List<String> getWordList(
-        Environment env,
-        Settings settings,
-        String settingPath,
-        String settingList,
-        boolean removeComments
-    ) {
+    private static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
         String wordListPath = settings.get(settingPath, null);

         if (wordListPath == null) {
-            List<String> explicitWordList = settings.getAsList(settingList, null);
-            if (explicitWordList == null) {
-                return null;
-            } else {
-                return explicitWordList;
-            }
+            return settings.getAsList(settingList, null);
         }

-        final Path path = env.configFile().resolve(wordListPath);
+        final Path path = resolveAnalyzerPath(env, wordListPath);

         try {
-            return loadWordList(path, removeComments);
+            return loadWordList(path);
         } catch (CharacterCodingException ex) {
             String message = String.format(
                 Locale.ROOT,
-                "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
-                settingPath,
-                path.toString()
+                "Unsupported character encoding detected while reading %s: files must be UTF-8 encoded",
+                settingPath
             );
-            throw new IllegalArgumentException(message, ex);
+            LOGGER.error("{}: from file: {}, exception is: {}", message, path.toString(), ex);
+            throw new IllegalArgumentException(message);
         } catch (IOException ioe) {
-            String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString());
-            throw new IllegalArgumentException(message, ioe);
+            String message = String.format(Locale.ROOT, "IOException while reading %s: file not readable", settingPath);
+            LOGGER.error("{}, from file: {}, exception is: {}", message, path.toString(), ioe);
+            throw new IllegalArgumentException(message);
        }
     }

-    private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
+    private static List<String> loadWordList(Path path) throws IOException {
         final List<String> result = new ArrayList<>();
         try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
             String word;
@@ -277,9 +303,7 @@ private static List<String> loadWordList(Path path, boolean removeComments) thro
                 if (Strings.hasText(word) == false) {
                     continue;
                 }
-                if (removeComments == false || word.startsWith("#") == false) {
-                    result.add(word.trim());
-                }
+                result.add(word.trim());
             }
         }
         return result;
@@ -296,21 +320,34 @@ public static Reader getReaderFromFile(Environment env, Settings settings, Strin
         if (filePath == null) {
             return null;
         }
-        final Path path = env.configFile().resolve(filePath);
+        final Path path = resolveAnalyzerPath(env, filePath);
         try {
             return Files.newBufferedReader(path, StandardCharsets.UTF_8);
         } catch (CharacterCodingException ex) {
             String message = String.format(
                 Locale.ROOT,
-                "Unsupported character encoding detected while reading %s_path: %s files must be UTF-8 encoded",
-                settingPrefix,
-                path.toString()
+                "Unsupported character encoding detected while reading %s_path: files must be UTF-8 encoded",
+                settingPrefix
             );
-            throw new IllegalArgumentException(message, ex);
+            LOGGER.error("{}: from file: {}, exception is: {}", message, path.toString(), ex);
+            throw new IllegalArgumentException(message);
         } catch (IOException ioe) {
-            String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString());
-            throw new IllegalArgumentException(message, ioe);
+            String message = String.format(Locale.ROOT, "IOException while reading %s_path: file not readable", settingPrefix);
+            LOGGER.error("{}, from file: {}, exception is: {}", message, path.toString(), ioe);
+            throw new IllegalArgumentException(message);
         }
     }

+    public static Path resolveAnalyzerPath(Environment env, String wordListPath) {
+        return env.configFile().resolve(wordListPath).normalize();
+    }
+
+    private static boolean isUnderConfig(Environment env, String wordListPath) {
+        try {
+            final Path path = env.configFile().resolve(wordListPath).normalize();
+            return path.startsWith(env.configFile().toAbsolutePath());
+        } catch (Exception ex) {
+            return false;
+        }
+    }
 }

diff --git a/server/src/main/java/org/opensearch/index/analysis/CustomMappingRuleParser.java b/server/src/main/java/org/opensearch/index/analysis/CustomMappingRuleParser.java
new file mode 100644
index 0000000000000..075c6ca1b585d
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/analysis/CustomMappingRuleParser.java
@@ -0,0 +1,21 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.analysis;
+
+import java.util.function.Function;
+
+/**
+ * A parser that takes a raw string and returns the parsed data of type T.
+ *
+ * @param <T> type of parsed data
+ */
+@FunctionalInterface
+public interface CustomMappingRuleParser<T> extends Function<String, T> {
+
+}

diff --git a/server/src/main/java/org/opensearch/index/analysis/MappingRule.java b/server/src/main/java/org/opensearch/index/analysis/MappingRule.java
new file mode 100644
index 0000000000000..92c9d2a17dc1e
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/analysis/MappingRule.java
@@ -0,0 +1,30 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.analysis;
+
+/**
+ * Represents a mapping between two objects.
+ */
+public class MappingRule<L, R> {
+    private final L left;
+    private final R right;
+
+    public MappingRule(L left, R right) {
+        this.left = left;
+        this.right = right;
+    }
+
+    public L getLeft() {
+        return left;
+    }
+
+    public R getRight() {
+        return right;
+    }
+}

diff --git a/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java b/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java
index 61191a1e49f63..20b7ec23a778e 100644
--- a/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java
+++ b/server/src/main/java/org/opensearch/indices/analysis/HunspellService.java
@@ -123,7 +123,8 @@ public HunspellService(final Settings settings, final Environment env, final Map
             try {
                 return loadDictionary(locale, settings, env);
             } catch (Exception e) {
-                throw new IllegalStateException("failed to load hunspell dictionary for locale: " + locale, e);
+                logger.error("Failed to load hunspell dictionary for locale: " + locale, e);
+                throw new IllegalStateException("Failed to load hunspell dictionary for locale: " + locale);
             }
         };
         if (!HUNSPELL_LAZY_LOAD.get(settings)) {

diff --git a/server/src/test/java/org/opensearch/index/analysis/AnalysisTests.java b/server/src/test/java/org/opensearch/index/analysis/AnalysisTests.java
index d6f0cb194f222..01281ea323e60 100644
--- a/server/src/test/java/org/opensearch/index/analysis/AnalysisTests.java
+++ b/server/src/test/java/org/opensearch/index/analysis/AnalysisTests.java
@@ -39,14 +39,10 @@
 import org.opensearch.test.OpenSearchTestCase;

 import java.io.BufferedWriter;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.OutputStream;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.MalformedInputException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
-import java.nio.file.NoSuchFileException;
 import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.List;
@@ -79,13 +75,10 @@ public void testParseNonExistingFile() {
         Environment env = TestEnvironment.newEnvironment(nodeSettings);
         IllegalArgumentException ex = expectThrows(
             IllegalArgumentException.class,
-            () -> Analysis.getWordList(env, nodeSettings, "foo.bar")
-        );
-        assertEquals("IOException while reading foo.bar_path: " + tempDir.resolve("foo.dict").toString(), ex.getMessage());
-        assertTrue(
-            ex.getCause().toString(),
-            ex.getCause() instanceof FileNotFoundException || ex.getCause() instanceof NoSuchFileException
+            () -> Analysis.parseWordList(env, nodeSettings, "foo.bar", s -> s)
         );
+        assertEquals("IOException while reading foo.bar_path: file not readable", ex.getMessage());
+        assertNull(ex.getCause());
     }

     public void testParseFalseEncodedFile() throws IOException {
@@ -99,18 +92,10 @@ public void testParseFalseEncodedFile() throws IOException {
         Environment env = TestEnvironment.newEnvironment(nodeSettings);
         IllegalArgumentException ex = expectThrows(
             IllegalArgumentException.class,
-            () -> Analysis.getWordList(env, nodeSettings, "foo.bar")
-        );
-        assertEquals(
-            "Unsupported character encoding detected while reading foo.bar_path: "
-                + tempDir.resolve("foo.dict").toString()
-                + " - files must be UTF-8 encoded",
-            ex.getMessage()
-        );
-        assertTrue(
-            ex.getCause().toString(),
-            ex.getCause() instanceof MalformedInputException || ex.getCause() instanceof CharacterCodingException
+            () -> Analysis.parseWordList(env, nodeSettings, "foo.bar", s -> s)
         );
+        assertEquals("Unsupported character encoding detected while reading foo.bar_path: files must be UTF-8 encoded", ex.getMessage());
+        assertNull(ex.getCause());
     }

     public void testParseWordList() throws IOException {
@@ -124,8 +109,42 @@ public void testParseWordList() throws IOException {
             writer.write('\n');
         }
         Environment env = TestEnvironment.newEnvironment(nodeSettings);
-        List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.bar");
+        List<String> wordList = Analysis.parseWordList(env, nodeSettings, "foo.bar", s -> s);
         assertEquals(Arrays.asList("hello", "world"), wordList);
+    }
+
+    public void testParseWordListError() throws IOException {
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        Path dict = config.resolve("foo.dict");
+        Settings nodeSettings = Settings.builder().put("foo.bar_path", dict).put(Environment.PATH_HOME_SETTING.getKey(), home).build();
+        try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
+            writer.write("abcd");
+            writer.write('\n');
+        }
+        Environment env = TestEnvironment.newEnvironment(nodeSettings);
+        RuntimeException ex = expectThrows(
+            RuntimeException.class,
+            () -> Analysis.parseWordList(
+                env,
+                nodeSettings,
+                "foo.bar",
+                s -> { throw new RuntimeException("Error while parsing rule = " + s); }
+            )
+        );
+        assertEquals("Line [1]: Error while parsing rule = abcd", ex.getMessage());
+    }
+
+    public void testParseWordListOutsideConfigDirError() {
+        Path home = createTempDir();
+        Path dict = home.resolve("/etc/os-release");
+        Settings nodeSettings = Settings.builder().put("foo.bar_path", dict).put(Environment.PATH_HOME_SETTING.getKey(), home).build();
+        Environment env = TestEnvironment.newEnvironment(nodeSettings);
+        RuntimeException ex = expectThrows(
+            RuntimeException.class,
+            () -> Analysis.parseWordList(env, nodeSettings, "foo.bar", s -> { throw new RuntimeException("Error while parsing"); })
+        );
+        assertEquals("Line [1]: Invalid rule", ex.getMessage());
     }
 }

diff --git a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java
index efec81e803f1c..e2faa078134af 100644
--- a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java
+++ b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java
@@ -168,11 +168,12 @@ private void testSimpleConfiguration(Settings settings) throws IOException {
     }

     public void testWordListPath() throws Exception {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
+        Path home = createTempDir();
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), home.toString()).build();
         Environment env = TestEnvironment.newEnvironment(settings);
         String[] words = new String[] { "donau", "dampf", "schiff", "spargel", "creme", "suppe" };

-        Path wordListFile = generateWordList(words);
+        Path wordListFile = generateWordList(home, words);
         settings = Settings.builder()
             .loadFromSource("index: \n  word_list_path: " + wordListFile.toAbsolutePath(), XContentType.YAML)
             .build();
@@ -183,8 +184,10 @@ public void testWordListPath() throws Exception {
         Files.delete(wordListFile);
     }

-    private Path generateWordList(String[] words) throws Exception {
-        Path wordListFile = createTempDir().resolve("wordlist.txt");
+    private Path generateWordList(Path home, String[] words) throws Exception {
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        Path wordListFile = config.resolve("wordlist.txt");
         try (BufferedWriter writer = Files.newBufferedWriter(wordListFile, StandardCharsets.UTF_8)) {
             for (String word : words) {
                 writer.write(word);

diff --git a/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java b/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java
index 28732b099ea30..f66045898f4a3 100644
--- a/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java
+++ b/server/src/test/java/org/opensearch/indices/analyze/HunspellServiceTests.java
@@ -42,8 +42,6 @@
 import static java.util.Collections.emptyMap;
 import static org.opensearch.indices.analysis.HunspellService.HUNSPELL_IGNORE_CASE;
 import static org.opensearch.indices.analysis.HunspellService.HUNSPELL_LAZY_LOAD;
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.hasToString;
 import static org.hamcrest.Matchers.notNullValue;

 public class HunspellServiceTests extends OpenSearchTestCase {
@@ -91,11 +89,11 @@ public void testDicWithNoAff() throws Exception {
             final Environment environment = new Environment(settings, getDataPath("/indices/analyze/no_aff_conf_dir"));
             new HunspellService(settings, environment, emptyMap()).getDictionary("en_US");
         });
-        assertEquals("failed to load hunspell dictionary for locale: en_US", e.getMessage());
-        assertThat(e.getCause(), hasToString(containsString("Missing affix file")));
+        assertEquals("Failed to load hunspell dictionary for locale: en_US", e.getMessage());
+        assertNull(e.getCause());
     }

-    public void testDicWithTwoAffs() throws Exception {
+    public void testDicWithTwoAffs() {
         Settings settings = Settings.builder()
             .put(HUNSPELL_LAZY_LOAD.getKey(), randomBoolean())
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
@@ -105,7 +103,7 @@ public void testDicWithTwoAffs() throws Exception {
             final Environment environment = new Environment(settings, getDataPath("/indices/analyze/two_aff_conf_dir"));
             new HunspellService(settings, environment, emptyMap()).getDictionary("en_US");
         });
-        assertEquals("failed to load hunspell dictionary for locale: en_US", e.getMessage());
-        assertThat(e.getCause(), hasToString(containsString("Too many affix files")));
+        assertEquals("Failed to load hunspell dictionary for locale: en_US", e.getMessage());
+        assertNull(e.getCause());
     }
 }
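
For reviewers, a minimal sketch of how an analysis factory consumes the new API after this change. HypotheticalRuleFilterFactory and its "rules"/"rules_path" settings are illustrative names, not part of this patch; only the Analysis.parseWordList, CustomMappingRuleParser, and MappingRule signatures introduced above are assumed.

    import org.opensearch.common.settings.Settings;
    import org.opensearch.env.Environment;
    import org.opensearch.index.analysis.Analysis;
    import org.opensearch.index.analysis.MappingRule;

    import java.util.List;

    public class HypotheticalRuleFilterFactory {
        private final List<MappingRule<String, String>> rules;

        HypotheticalRuleFilterFactory(Environment env, Settings settings) {
            // parseWordList reads "rules" (an inline settings list) or "rules_path"
            // (a file resolved against the config directory), skips '#' comment
            // lines, and applies the per-line parser to everything else.
            rules = Analysis.parseWordList(env, settings, "rules", this::parse);
            if (rules == null) {
                throw new IllegalArgumentException("requires either `rules` or `rules_path` to be configured");
            }
        }

        // Any RuntimeException thrown here is rethrown by parseWordList as
        // "Line [n]: <message>", or as the terse "Line [n]: Invalid rule" when
        // the rules file lives outside the config directory.
        private MappingRule<String, String> parse(String rule) {
            String[] sides = rule.split("=>", -1);
            if (sides.length != 2) {
                throw new RuntimeException("Invalid rule: " + rule);
            }
            return new MappingRule<>(sides[0].trim(), sides[1].trim());
        }
    }

This mirrors the pattern the patch applies to MappingCharFilterFactory and StemmerOverrideTokenFilterFactory: the factory supplies only the per-line parse logic, while line counting, comment skipping, and error wrapping live once in Analysis.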
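The terse-versus-verbose error decision rests on the path containment check in the private isUnderConfig helper added to Analysis. A self-contained approximation using only java.nio.file is shown below; the paths in main are illustrative, and the real helper additionally catches resolution exceptions and returns false.

    import java.nio.file.Path;

    public class ConfigDirCheck {
        // True when the resolved rules file stays under the config directory,
        // in which case its contents may safely be echoed back in error messages.
        static boolean isUnderConfig(Path configDir, String wordListPath) {
            Path resolved = configDir.resolve(wordListPath).normalize();
            return resolved.startsWith(configDir.toAbsolutePath());
        }

        public static void main(String[] args) {
            Path config = Path.of("/etc/opensearch").toAbsolutePath();
            System.out.println(isUnderConfig(config, "analysis/rules.txt"));   // true
            System.out.println(isUnderConfig(config, "../../etc/os-release")); // false
        }
    }

Normalizing before the startsWith check is what keeps "../" traversals from slipping outside the config directory while still allowing nested relative paths, which is why resolveAnalyzerPath also normalizes.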