From 4e45c9ed68d7a4ba77c8c3406453c05bede170e2 Mon Sep 17 00:00:00 2001 From: ebraminio Date: Tue, 23 Jul 2024 00:55:43 +0330 Subject: [PATCH] Use Lucene provided Persian stem (#14847) Lucene provided Persian stem apparently isn't hooked yet and this change is doing that based on what is done for Arabic stem support. Signed-off-by: Ebrahim Byagowi Signed-off-by: Daniel (dB.) Doubrovkine Co-authored-by: Daniel (dB.) Doubrovkine --- CHANGELOG.md | 1 + .../common/CommonAnalysisModulePlugin.java | 3 ++ .../common/PersianStemTokenFilterFactory.java | 52 +++++++++++++++++++ .../common/StemmerTokenFilterFactory.java | 3 ++ .../common/CommonAnalysisFactoryTests.java | 2 + .../test/analysis-common/40_token_filters.yml | 31 +++++++++++ .../analysis/AnalysisFactoryTestCase.java | 2 +- 7 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java diff --git a/CHANGELOG.md b/CHANGELOG.md index fdce3a5e24342..66322087a73c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Optimize TransportNodesAction to not send DiscoveryNodes for NodeStats, NodesInfo and ClusterStats call ([14749](https://github.com/opensearch-project/OpenSearch/pull/14749)) - Reduce logging in DEBUG for MasterService:run ([#14795](https://github.com/opensearch-project/OpenSearch/pull/14795)) - Enabling term version check on local state for all ClusterManager Read Transport Actions ([#14273](https://github.com/opensearch-project/OpenSearch/pull/14273)) +- Add persian_stem filter (([#14847](https://github.com/opensearch-project/OpenSearch/pull/14847))) ### Dependencies - Bump `org.gradle.test-retry` from 1.5.8 to 1.5.9 ([#13442](https://github.com/opensearch-project/OpenSearch/pull/13442)) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index cf2736a8583d2..f14e499081ce9 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -75,6 +75,7 @@ import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.fa.PersianStemFilter; import org.apache.lucene.analysis.fi.FinnishAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.ga.IrishAnalyzer; @@ -315,6 +316,7 @@ public Map> getTokenFilters() { filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("persian_stem", PersianStemTokenFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); filters.put( "predicate_token_filter", @@ -558,6 +560,7 @@ public List getPreConfiguredTokenFilters() { ); })); filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("persian_stem", true, PersianStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java new file mode 100644 index 0000000000000..afe8058343e17 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fa.PersianStemFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory { + + PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new PersianStemFilter(tokenStream); + } +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java index 5506626e40da0..e81f3c6cc09cc 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java @@ -47,6 +47,7 @@ import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.es.SpanishLightStemFilter; +import org.apache.lucene.analysis.fa.PersianStemFilter; import org.apache.lucene.analysis.fi.FinnishLightStemFilter; import org.apache.lucene.analysis.fr.FrenchLightStemFilter; import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; @@ -239,6 +240,8 @@ public TokenStream create(TokenStream tokenStream) { return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK); } else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) { return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK); + } else if ("persian".equalsIgnoreCase(language)) { + return new PersianStemFilter(tokenStream); // Portuguese stemmers } else if ("portuguese".equalsIgnoreCase(language)) { diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 11713f52f5b18..7e3140f8bcba3 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -158,6 +158,7 @@ protected Map> getTokenFilters() { filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class); filters.put("czechstem", CzechStemTokenFilterFactory.class); filters.put("germanstem", GermanStemTokenFilterFactory.class); + filters.put("persianstem", PersianStemTokenFilterFactory.class); filters.put("telugunormalization", TeluguNormalizationFilterFactory.class); filters.put("telugustem", TeluguStemFilterFactory.class); // this filter is not exposed and should only be used internally @@ -220,6 +221,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("ngram", null); filters.put("nGram", null); filters.put("persian_normalization", null); + filters.put("persian_stem", null); filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); filters.put("russian_stem", SnowballPorterFilterFactory.class); diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 802c79c780689..c6b075571f221 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1781,6 +1781,37 @@ - length: { tokens: 1 } - match: { tokens.0.token: abschliess } +--- +"persian_stem": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_persian_stem: + type: persian_stem + - do: + indices.analyze: + index: test + body: + text: جامدات + tokenizer: keyword + filter: [my_persian_stem] + - length: { tokens: 1 } + - match: { tokens.0.token: جامد } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: جامدات + tokenizer: keyword + filter: [persian_stem] + - length: { tokens: 1 } + - match: { tokens.0.token: جامد } + --- "russian_stem": - do: diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index 5231fe095f0f0..23cf4d47a49d9 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -139,6 +139,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("patterncapturegroup", MovedToAnalysisCommon.class) .put("patternreplace", MovedToAnalysisCommon.class) .put("persiannormalization", MovedToAnalysisCommon.class) + .put("persianstem", MovedToAnalysisCommon.class) .put("porterstem", MovedToAnalysisCommon.class) .put("portuguesestem", MovedToAnalysisCommon.class) .put("portugueselightstem", MovedToAnalysisCommon.class) @@ -219,7 +220,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("spanishpluralstem", Void.class) // LUCENE-10352 .put("daitchmokotoffsoundex", Void.class) - .put("persianstem", Void.class) // https://github.com/apache/lucene/pull/12169 .put("word2vecsynonym", Void.class) // https://github.com/apache/lucene/pull/12915