From 10858262d792927ed0514f7d78212eb3c5d8060a Mon Sep 17 00:00:00 2001
From: zzzxl <33418555+zzzxl1993@users.noreply.github.com>
Date: Tue, 30 Apr 2024 16:21:24 +0800
Subject: [PATCH] =?UTF-8?q?[opt](standard95)=20the=20=E2=80=98standard95?=
 =?UTF-8?q?=E2=80=99=20tokenizer=20does=20not=20include=20stop=20words=20b?=
 =?UTF-8?q?y=20default.=20(#209)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/core/CLucene/analysis/AnalysisHeader.h   | 10 ++++++++++
 .../analysis/standard95/StandardAnalyzer.h   | 16 +++++++---------
 .../analysis/standard95/StandardTokenizer.h  | 14 +++++++-------
 src/test/analysis/TestStandard95.cpp         |  4 +++-
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h
index 578d8e0061d..74aca5a5b6d 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -11,6 +11,8 @@
 #include "CLucene/util/VoidList.h"
 #include "CLucene/LuceneThreads.h"
 
+#include <unordered_set>
+
 CL_CLASS_DEF(util,Reader)
 CL_CLASS_DEF(util,IReader)
 
@@ -297,6 +299,11 @@ class CLUCENE_EXPORT Analyzer{
     virtual void set_lowercase(bool lowercase) {
         _lowercase = lowercase;
     }
+
+    virtual void set_stopwords(std::unordered_set<std::string_view>* stopwords) {
+        _stopwords = stopwords;
+    }
+
 private:
     DEFINE_MUTEX(THIS_LOCK)
 
@@ -313,7 +320,9 @@ class CLUCENE_EXPORT Analyzer{
     * to save a TokenStream for later re-use by the same
     * thread. */
     virtual void setPreviousTokenStream(TokenStream* obj);
+
     bool _lowercase = false;
+    std::unordered_set<std::string_view>* _stopwords = nullptr;
 
 public:
     /**
@@ -350,6 +359,7 @@ class CLUCENE_EXPORT Tokenizer:public TokenStream {
     /** The text source for this Tokenizer. */
     CL_NS(util)::Reader* input;
     bool lowercase = false;
+    std::unordered_set<std::string_view>* stopwords = nullptr;
 
 public:
     /** Construct a tokenizer with null input. */
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index 7460c8119f2..ccfd1030e15 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -6,18 +6,22 @@ namespace lucene::analysis::standard95 {
 
 class StandardAnalyzer : public Analyzer {
  public:
-  StandardAnalyzer() : Analyzer() { _lowercase = true; }
+  StandardAnalyzer() : Analyzer() {
+    _lowercase = true;
+    _stopwords = nullptr;
+  }
+
   bool isSDocOpt() override { return true; }
 
   TokenStream* tokenStream(const TCHAR* fieldName,
                            lucene::util::Reader* reader) override {
-    return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
+    return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords);
   }
 
   TokenStream* reusableTokenStream(const TCHAR* fieldName,
                                    lucene::util::Reader* reader) override {
     if (tokenizer_ == nullptr) {
-      tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
+      tokenizer_ = new StandardTokenizer(reader, _lowercase, _stopwords);
     } else {
       tokenizer_->reset(reader);
     }
@@ -31,13 +35,7 @@ class StandardAnalyzer : public Analyzer {
     }
   }
 
-  void useStopWords(bool useStopWords) {
-    useStopWords_ = useStopWords;
-  }
-
  private:
-  bool useStopWords_ = true;
-
   StandardTokenizer* tokenizer_ = nullptr;
 };
 
diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
index 1aac86716de..431673f00e6 100644
--- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
+++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -19,15 +19,17 @@ static std::unordered_set<std::string_view> stop_words = {
 class StandardTokenizer : public Tokenizer {
  public:
-  StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
-      : Tokenizer(in), useStopWords_(useStopWords) {
+  StandardTokenizer(lucene::util::Reader* in)
+      : Tokenizer(in) {
     scanner_ = std::make_unique<StandardTokenizerImpl>(in);
     Tokenizer::lowercase = true;
+    Tokenizer::stopwords = nullptr;
   }
 
-  StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase)
-      : Tokenizer(in), useStopWords_(useStopWords) {
+  StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords)
+      : Tokenizer(in) {
     scanner_ = std::make_unique<StandardTokenizerImpl>(in);
     Tokenizer::lowercase = lowercase;
+    Tokenizer::stopwords = stopwords;
   }
 
   Token* next(Token* token) override {
@@ -47,7 +49,7 @@ class StandardTokenizer : public Tokenizer {
           std::transform(term.begin(), term.end(), const_cast<char*>(term.data()),
                          [](char c) { return to_lower(c); });
         }
-        if (useStopWords_ && stop_words.count(term)) {
+        if (stopwords && stopwords->count(term)) {
           skippedPositions++;
           continue;
         }
@@ -70,8 +72,6 @@ class StandardTokenizer : public Tokenizer {
   };
 
  private:
-  bool useStopWords_ = true;
-
   std::unique_ptr<StandardTokenizerImpl> scanner_;
 
   int32_t skippedPositions = 0;
diff --git a/src/test/analysis/TestStandard95.cpp b/src/test/analysis/TestStandard95.cpp
index 9c839ddbab6..80f3ba88240 100644
--- a/src/test/analysis/TestStandard95.cpp
+++ b/src/test/analysis/TestStandard95.cpp
@@ -3,11 +3,13 @@
 
 #include "CLucene/_ApiHeader.h"
 #include "CLucene/analysis/standard95/StandardAnalyzer.h"
+#include "CLucene/analysis/standard95/StandardTokenizer.h"
 #include "test.h"
 
 void testCut(const std::string &str, std::vector<std::string> &tokens) {
   auto standard =
       std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
+  standard->set_stopwords(&lucene::analysis::standard95::stop_words);
   auto tokenizer = static_cast<lucene::analysis::standard95::StandardTokenizer*>(
       standard->tokenStream(L"name", nullptr));
 
@@ -28,7 +30,7 @@ void testCut(const std::string &str, std::vector<std::string> &tokens) {
 void testCutLines(std::vector<std::string>& datas, std::vector<std::string> &tokens) {
   auto standard =
       std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
-  standard->useStopWords(false);
+  standard->set_stopwords(nullptr);
   auto tokenizer = static_cast<lucene::analysis::standard95::StandardTokenizer*>(
       standard->tokenStream(L"name", nullptr));
 
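
For reference, a minimal usage sketch of the new opt-in API (not part of the patch), based on the calls exercised in TestStandard95.cpp; the helper function name is illustrative:

#include <memory>

#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "CLucene/analysis/standard95/StandardTokenizer.h"

// Hypothetical helper: after this change, StandardAnalyzer removes no stop
// words unless a set is supplied via set_stopwords().
void makeAnalyzers() {
  using lucene::analysis::standard95::StandardAnalyzer;

  // Default: stop words such as "the" stay in the token stream.
  auto plain = std::make_unique<StandardAnalyzer>();

  // Opt in to filtering with the built-in list from StandardTokenizer.h.
  auto filtered = std::make_unique<StandardAnalyzer>();
  filtered->set_stopwords(&lucene::analysis::standard95::stop_words);

  // Passing nullptr (the default) switches filtering off again, which is what
  // testCutLines() now does instead of calling the removed useStopWords(false).
  filtered->set_stopwords(nullptr);
}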