Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Fix error handling while reading analyzer mapping rules #5125

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@

package org.opensearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.Analysis;
import org.xml.sax.InputSource;

import java.io.InputStream;
Expand All @@ -61,13 +63,15 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
}

Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);
Path hyphenationPatternsFile = Analysis.resolveAnalyzerPath(env, hyphenationPatternsPath);

try {
InputStream in = Files.newInputStream(hyphenationPatternsFile);
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
} catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
LogManager.getLogger(HyphenationCompoundWordTokenFilterFactory.class)
.error("Exception while reading hyphenation_patterns_path ", e);
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractCharFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.NormalizingCharFilterFactory;

import java.io.Reader;
Expand All @@ -53,13 +54,13 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name);

List<String> rules = Analysis.getWordList(env, settings, "mappings");
List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
if (rules == null) {
throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
}

NormalizeCharMap.Builder normMapBuilder = new NormalizeCharMap.Builder();
parseRules(rules, normMapBuilder);
rules.forEach(rule -> normMapBuilder.add(rule.getLeft(), rule.getRight()));
normMap = normMapBuilder.build();
}

Expand All @@ -71,18 +72,13 @@ public Reader create(Reader tokenStream) {
// source => target
private static Pattern rulePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

/**
* parses a list of MappingCharFilter style rules into a normalize char map
*/
private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
for (String rule : rules) {
Matcher m = rulePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
String rhs = parseString(m.group(2).trim());
if (lhs == null || rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
map.add(lhs, rhs);
}
private MappingRule<String, String> parse(String rule) {
Matcher m = rulePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid mapping rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
String rhs = parseString(m.group(2).trim());
if (lhs == null || rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal mapping.");
return new MappingRule<>(lhs, rhs);
}

char[] out = new char[256];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,31 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactory {

private static final String MAPPING_SEPARATOR = "=>";
private final StemmerOverrideMap overrideMap;

StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);

List<String> rules = Analysis.getWordList(env, settings, "rules");
List<MappingRule<List<String>, String>> rules = Analysis.parseWordList(env, settings, "rules", this::parse);
if (rules == null) {
throw new IllegalArgumentException("stemmer override filter requires either `rules` or `rules_path` to be configured");
}

StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
parseRules(rules, builder, "=>");
for (MappingRule<List<String>, String> rule : rules) {
for (String key : rule.getLeft()) {
builder.add(key, rule.getRight());
}
}
overrideMap = builder.build();

}
Expand All @@ -67,27 +74,26 @@ public TokenStream create(TokenStream tokenStream) {
return new StemmerOverrideFilter(tokenStream, overrideMap);
}

static void parseRules(List<String> rules, StemmerOverrideFilter.Builder builder, String mappingSep) {
for (String rule : rules) {
String[] sides = rule.split(mappingSep, -1);
if (sides.length != 2) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
private MappingRule<List<String>, String> parse(String rule) {
String[] sides = rule.split(MAPPING_SEPARATOR, -1);
if (sides.length != 2) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}

String[] keys = sides[0].split(",", -1);
String override = sides[1].trim();
if (override.isEmpty() || override.indexOf(',') != -1) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
String[] keys = sides[0].split(",", -1);
String override = sides[1].trim();
if (override.isEmpty() || override.indexOf(',') != -1) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}

for (String key : keys) {
String trimmedKey = key.trim();
if (trimmedKey.isEmpty()) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
builder.add(trimmedKey, override);
List<String> trimmedKeys = new ArrayList<>();
for (String key : keys) {
String trimmedKey = key.trim();
if (trimmedKey.isEmpty()) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}
trimmedKeys.add(trimmedKey);
}
return new MappingRule<>(trimmedKeys, override);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

package org.opensearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
Expand Down Expand Up @@ -155,14 +156,15 @@ SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
}
return parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
LogManager.getLogger(SynonymTokenFilterFactory.class).error("Failed to build synonyms: ", e);
throw new IllegalArgumentException("Failed to build synonyms");
}
}

Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsList("synonyms", null) != null) {
List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
List<String> rulesList = Analysis.parseWordList(env, settings, "synonyms", s -> s);
StringBuilder sb = new StringBuilder();
for (String line : rulesList) {
sb.append(line).append(System.lineSeparator());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.TokenFilterFactory;

import java.util.List;
Expand Down Expand Up @@ -73,7 +74,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
env,
settings,
"type_table",
WordDelimiterTokenFilterFactory::parse
);
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.TokenFilterFactory;

import java.util.Collection;
Expand Down Expand Up @@ -76,7 +77,12 @@ public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
env,
settings,
"type_table",
WordDelimiterTokenFilterFactory::parse
);
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
Expand Down Expand Up @@ -127,19 +133,23 @@ public int getFlag(int flag, Settings settings, String key, boolean defaultValue
// source => type
private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

static MappingRule<Character, Byte> parse(String rule) {
Matcher m = typePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid mapping rule: [" + rule + "]");
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Only a single character is allowed.");
if (rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal type.");
return new MappingRule<>(lhs.charAt(0), rhs);
}

/**
* parses a list of MappingCharFilter style rules into a custom byte[] type table
*/
static byte[] parseTypes(Collection<String> rules) {
static byte[] parseTypes(Collection<MappingRule<Character, Byte>> rules) {
SortedMap<Character, Byte> typeMap = new TreeMap<>();
for (String rule : rules) {
Matcher m = typePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
for (MappingRule<Character, Byte> rule : rules) {
typeMap.put(rule.getLeft(), rule.getRight());
}

// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,24 @@ public void testStemEnglishPossessive() throws IOException {
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOException {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.putList("index.analysis.filter.my_word_delimiter.type_table", rules)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin()
);
analysis.tokenFilter.get("my_word_delimiter");
}

public void testTypeTableParsingError() {
String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules));
assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.CharFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.analysis.AnalysisTestsHelper;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.test.OpenSearchTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

public class MappingCharFilterFactoryTests extends OpenSearchTestCase {
public static CharFilterFactory create(String... rules) throws IOException {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
.put("index.analysis.analyzer.my_analyzer.char_filter", "my_mappings_char_filter")
.put("index.analysis.char_filter.my_mappings_char_filter.type", "mapping")
.putList("index.analysis.char_filter.my_mappings_char_filter.mappings", rules)
.build(),
new CommonAnalysisPlugin()
);

return analysis.charFilter.get("my_mappings_char_filter");
}

public void testRulesOk() throws IOException {
MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create(
"# This is a comment",
":) => _happy_",
":( => _sad_"
);
CharFilter inputReader = (CharFilter) mappingCharFilterFactory.create(new StringReader("I'm so :)"));
char[] tempBuff = new char[14];
StringBuilder output = new StringBuilder();
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
}
assertEquals("I'm so _happy_", output.toString());
}

public void testRuleError() {
for (String rule : Arrays.asList(
"", // empty
"a", // no arrow
"a:>b" // invalid delimiter
)) {
RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule));
assertEquals("Line [1]: Invalid mapping rule : [" + rule + "]", ex.getMessage());
}
}

public void testRulePartError() {
RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# This is a comment", ":) => _happy_", "a:b"));
assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;

public class StemmerOverrideTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
@Rule
Expand Down Expand Up @@ -76,11 +75,8 @@ public void testRuleError() {
"=>a", // no keys
"a,=>b" // empty key
)) {
expectThrows(
RuntimeException.class,
String.format(Locale.ROOT, "Should fail for invalid rule: '%s'", rule),
() -> create(rule)
);
RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule));
assertEquals("Line [1]: Invalid keyword override rule: " + rule, ex.getMessage());
}
}

Expand All @@ -90,4 +86,9 @@ public void testRulesOk() throws IOException {
tokenizer.setReader(new StringReader("a b c"));
assertTokenStreamContents(tokenFilterFactory.create(tokenizer), new String[] { "1", "2", "2" });
}

public void testRulePartError() {
RuntimeException ex = expectThrows(RuntimeException.class, () -> create("a => 1", "b,c => 2", "# This is a comment", "=>a=>b"));
assertEquals("Line [4]: Invalid keyword override rule: =>a=>b", ex.getMessage());
}
}
Loading