From 10858262d792927ed0514f7d78212eb3c5d8060a Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:21:24 +0800 Subject: [PATCH 1/2] =?UTF-8?q?[opt](standard95)=20the=20=E2=80=98standard?= =?UTF-8?q?95=E2=80=99=20tokenizer=20does=20not=20include=20stop=20words?= =?UTF-8?q?=20by=20default.=20(#209)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/CLucene/analysis/AnalysisHeader.h | 10 ++++++++++ .../analysis/standard95/StandardAnalyzer.h | 16 +++++++--------- .../analysis/standard95/StandardTokenizer.h | 14 +++++++------- src/test/analysis/TestStandard95.cpp | 4 +++- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h index 578d8e0061d..74aca5a5b6d 100644 --- a/src/core/CLucene/analysis/AnalysisHeader.h +++ b/src/core/CLucene/analysis/AnalysisHeader.h @@ -11,6 +11,8 @@ #include "CLucene/util/VoidList.h" #include "CLucene/LuceneThreads.h" +#include + CL_CLASS_DEF(util,Reader) CL_CLASS_DEF(util,IReader) @@ -297,6 +299,11 @@ class CLUCENE_EXPORT Analyzer{ virtual void set_lowercase(bool lowercase) { _lowercase = lowercase; } + + virtual void set_stopwords(std::unordered_set* stopwords) { + _stopwords = stopwords; + } + private: DEFINE_MUTEX(THIS_LOCK) @@ -313,7 +320,9 @@ class CLUCENE_EXPORT Analyzer{ * to save a TokenStream for later re-use by the same * thread. */ virtual void setPreviousTokenStream(TokenStream* obj); + bool _lowercase = false; + std::unordered_set* _stopwords = nullptr; public: /** @@ -350,6 +359,7 @@ class CLUCENE_EXPORT Tokenizer:public TokenStream { /** The text source for this Tokenizer. */ CL_NS(util)::Reader* input; bool lowercase = false; + std::unordered_set* stopwords = nullptr; public: /** Construct a tokenizer with null input. */ diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h index 7460c8119f2..ccfd1030e15 100644 --- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h +++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h @@ -6,18 +6,22 @@ namespace lucene::analysis::standard95 { class StandardAnalyzer : public Analyzer { public: - StandardAnalyzer() : Analyzer() { _lowercase = true; } + StandardAnalyzer() : Analyzer() { + _lowercase = true; + _stopwords = nullptr; + } + bool isSDocOpt() override { return true; } TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase); + return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords); } TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { if (tokenizer_ == nullptr) { - tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase); + tokenizer_ = new StandardTokenizer(reader, _lowercase, _stopwords); } else { tokenizer_->reset(reader); } @@ -31,13 +35,7 @@ class StandardAnalyzer : public Analyzer { } } - void useStopWords(bool useStopWords) { - useStopWords_ = useStopWords; - } - private: - bool useStopWords_ = true; - StandardTokenizer* tokenizer_ = nullptr; }; diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h index 1aac86716de..431673f00e6 100644 --- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h +++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h @@ -19,15 +19,17 @@ static std::unordered_set stop_words = { class StandardTokenizer : public Tokenizer { public: - StandardTokenizer(lucene::util::Reader* in, bool useStopWords) - : Tokenizer(in), useStopWords_(useStopWords) { + StandardTokenizer(lucene::util::Reader* in) + : Tokenizer(in) { scanner_ = std::make_unique(in); Tokenizer::lowercase = true; + Tokenizer::stopwords = nullptr; } - StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase) - : Tokenizer(in), useStopWords_(useStopWords) { + StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set* stopwords) + : Tokenizer(in) { scanner_ = std::make_unique(in); Tokenizer::lowercase = lowercase; + Tokenizer::stopwords = stopwords; } Token* next(Token* token) override { @@ -47,7 +49,7 @@ class StandardTokenizer : public Tokenizer { std::transform(term.begin(), term.end(), const_cast(term.data()), [](char c) { return to_lower(c); }); } - if (useStopWords_ && stop_words.count(term)) { + if (stopwords && stopwords->count(term)) { skippedPositions++; continue; } @@ -70,8 +72,6 @@ class StandardTokenizer : public Tokenizer { }; private: - bool useStopWords_ = true; - std::unique_ptr scanner_; int32_t skippedPositions = 0; diff --git a/src/test/analysis/TestStandard95.cpp b/src/test/analysis/TestStandard95.cpp index 9c839ddbab6..80f3ba88240 100644 --- a/src/test/analysis/TestStandard95.cpp +++ b/src/test/analysis/TestStandard95.cpp @@ -3,11 +3,13 @@ #include "CLucene/_ApiHeader.h" #include "CLucene/analysis/standard95/StandardAnalyzer.h" +#include "CLucene/analysis/standard95/StandardTokenizer.h" #include "test.h" void testCut(const std::string &str, std::vector &tokens) { auto standard = std::make_unique(); + standard->set_stopwords(&lucene::analysis::standard95::stop_words); auto tokenizer = static_cast( standard->tokenStream(L"name", nullptr)); @@ -28,7 +30,7 @@ void testCut(const std::string &str, std::vector &tokens) { void testCutLines(std::vector& datas, std::vector &tokens) { auto standard = std::make_unique(); - standard->useStopWords(false); + standard->set_stopwords(nullptr); auto tokenizer = static_cast( standard->tokenStream(L"name", nullptr)); From 12da98ba277939291b1e7787a12921b1421a9a83 Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:25:32 +0800 Subject: [PATCH 2/2] [fix](inverted index) special characters cause buffer overflow in Unicode tokenization. (#210) --- .../standard95/StandardTokenizerImpl.cpp | 52 ++++--------------- src/core/CLucene/util/stringUtil.h | 4 +- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp b/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp index e8ce8f77687..0dfb116557f 100644 --- a/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp +++ b/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp @@ -58,7 +58,7 @@ const std::vector StandardTokenizerImpl::ZZ_ERROR_MSG = { "Error: pushback value was too large"}; StandardTokenizerImpl::StandardTokenizerImpl(lucene::util::Reader* reader) - : zzBuffer(ZZ_BUFFERSIZE), zzReader(reader) {} + : zzReader(reader), zzBuffer((reader == nullptr) ? 0 : reader->size()) {} std::string_view StandardTokenizerImpl::getText() { return std::string_view(zzBuffer.data() + zzStartRead, @@ -67,53 +67,20 @@ std::string_view StandardTokenizerImpl::getText() { bool StandardTokenizerImpl::zzRefill() { if (zzStartRead > 0) { - zzEndRead += zzFinalHighSurrogate; - zzFinalHighSurrogate = 0; - std::copy_n(zzBuffer.begin() + zzStartRead, zzEndRead - zzStartRead, - zzBuffer.begin()); - - zzEndRead -= zzStartRead; - zzCurrentPos -= zzStartRead; - zzMarkedPos -= zzStartRead; - zzStartRead = 0; - } - - int32_t requested = zzBuffer.size() - zzEndRead - zzFinalHighSurrogate; - if (requested == 0) { - return true; + return true; } - int32_t numRead = zzReader->readCopy(zzBuffer.data(), zzEndRead, requested); - if (numRead == 0) { - _CLTHROWA(CL_ERR_Runtime, - "Reader returned 0 characters. See JFlex examples/zero-reader " - "for a workaround."); - } + int32_t numRead = zzReader->readCopy(zzBuffer.data(), 0, zzBuffer.size()); if (numRead > 0) { - zzEndRead += numRead; - - int32_t n = - StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzEndRead)); - if (n == -1) { - yyResetPosition(); - return true; - } + assert(zzBuffer.size() == numRead); + zzEndRead += numRead; - if (n != 0) { - if (numRead == requested) { - zzEndRead -= n; - zzFinalHighSurrogate = n; - } else { - int32_t c = zzReader->read(); - if (c == -1) { + int32_t n = StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzBuffer.size())); + if (n != 0) { return true; - } else { - _CLTHROWA(CL_ERR_Runtime, "Why did you come here"); - } } - } - return false; + return false; } return true; @@ -126,6 +93,7 @@ void StandardTokenizerImpl::yyclose() { void StandardTokenizerImpl::yyreset(lucene::util::Reader* reader) { zzReader = reader; + zzBuffer.resize(reader->size()); yyResetPosition(); zzLexicalState = YYINITIAL; } @@ -181,7 +149,7 @@ int32_t StandardTokenizerImpl::getNextToken() { { while (true) { - if (zzCurrentPosL < zzEndReadL) { + if (zzCurrentPosL < zzEndReadL && (zzCurrentPosL - zzStartRead) < ZZ_BUFFERSIZE) { size_t len = 0; zzInput = decodeUtf8ToCodepoint( std::string_view(zzBufferL.data() + zzCurrentPosL, zzEndReadL), diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h index 4a022e3e24c..6fa29822d46 100644 --- a/src/core/CLucene/util/stringUtil.h +++ b/src/core/CLucene/util/stringUtil.h @@ -296,10 +296,10 @@ class StringUtil { } else { if ((c & 0xC0) != 0x80) return -1; codepoint = (codepoint << 6) | (c & 0x3F); - if (!is_valid_codepoint(codepoint)) { + bytes_in_char--; + if (bytes_in_char == 0 && !is_valid_codepoint(codepoint)) { return -1; } - bytes_in_char--; surplus_bytes++; } }