[Fix](analyzer) add reader ownership for chinese and standard analyzer
airborne12 committed Jun 13, 2024
1 parent 2532463 commit a23a45e
Showing 5 changed files with 10 additions and 5 deletions.
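
At a glance, the commit threads a new ownReader flag from the analyzers into their tokenizers so that a token stream can take ownership of the Reader it wraps and release it when the stream is torn down, instead of always leaving that to the caller. A minimal caller-side sketch of the intended contract (hypothetical helper; only the four-argument ChineseTokenizer constructor added below comes from the commit, while the include path and the cleanup convention are assumptions):

    // Sketch only: the ownership contract this commit enables.
    #include "CLucene/analysis/jieba/ChineseTokenizer.h"  // assumed include path

    lucene::analysis::TokenStream* make_stream(lucene::util::Reader* reader, AnalyzerMode mode) {
        // ownReader = true: the tokenizer now owns `reader` and is expected to
        // release it when the stream is closed, so the caller must not free it.
        return _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(
                reader, mode, /*lowercase=*/true, /*ownReader=*/true);
    }
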
2 changes: 1 addition & 1 deletion src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -121,7 +121,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *
     if (_tcscmp(lang, _T("cjk")) == 0) {
         ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
     } else if (_tcscmp(lang, _T("chinese")) == 0) {
-        ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
+        ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase, Analyzer::_ownReader);
     } else {
         CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
 
4 changes: 3 additions & 1 deletion src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -10,11 +10,13 @@ CL_NS_USE(util)
 ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) {
     reset(reader);
     Tokenizer::lowercase = false;
+    Tokenizer::ownReader = false;
 }
 
-ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase) : Tokenizer(reader), mode(m) {
+ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase, bool ownReader) : Tokenizer(reader), mode(m) {
     reset(reader);
     Tokenizer::lowercase = lowercase;
+    Tokenizer::ownReader = ownReader;
 }
 
 void ChineseTokenizer::init(const ChineseDict* chineseDict) {
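
The constructors above only record the flag; the base Tokenizer is not part of this diff. Presumably the flag is consumed when the stream is torn down, roughly along these lines (illustrative sketch only, not the actual CLucene base class; `input` stands for the Reader member that the Tokenizer(reader) base constructor stores):

    // Illustrative only: how a base Tokenizer could honour ownReader on close().
    void Tokenizer::close() {
        if (input != nullptr) {
            if (ownReader) {
                _CLDELETE(input);   // the tokenizer owns the reader, so it frees it
            }
            input = nullptr;        // in either case, drop the reference
        }
    }
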
2 changes: 1 addition & 1 deletion src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -56,7 +56,7 @@ class ChineseTokenizer : public lucene::analysis::Tokenizer {
 public:
     // Constructor
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
-    explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase);
+    explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase, bool ownReader=false);
     static void init(const ChineseDict* chineseDict);
 
     // Destructor
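
Because the new parameter defaults to false, existing call sites compile unchanged and keep the old non-owning behaviour; only callers that explicitly pass true hand the reader over. A small sketch against the declaration above (reader and mode are assumed to be in scope):

    auto* nonOwning = _CLNEW ChineseTokenizer(reader, mode, /*lowercase=*/true);        // caller still owns reader
    auto* owning    = _CLNEW ChineseTokenizer(reader, mode, /*lowercase=*/true, true);  // tokenizer takes ownership
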
3 changes: 2 additions & 1 deletion src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -8,14 +8,15 @@ class StandardAnalyzer : public Analyzer {
  public:
   StandardAnalyzer() : Analyzer() {
     _lowercase = true;
+    _ownReader = false;
     _stopwords = nullptr;
   }
 
   bool isSDocOpt() override { return true; }
 
   TokenStream* tokenStream(const TCHAR* fieldName,
                            lucene::util::Reader* reader) override {
-    return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords);
+    return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords, _ownReader);
   }
 
   TokenStream* reusableTokenStream(const TCHAR* fieldName,
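
StandardAnalyzer keeps the conservative default (_ownReader = false) and simply forwards the flag to every StandardTokenizer it creates. Since the first hunk in this commit reads Analyzer::_ownReader from a derived analyzer, the member appears to be visible to subclasses, so an analyzer that wants owning streams could flip it; a hypothetical sketch (the subclass and the standard95 namespace qualification are assumptions, not part of the commit):

    // Hypothetical subclass: every tokenStream() it returns will own and release
    // the Reader passed to it, because _ownReader is forwarded to StandardTokenizer.
    class OwningStandardAnalyzer : public lucene::analysis::standard95::StandardAnalyzer {
    public:
        OwningStandardAnalyzer() { _ownReader = true; }
    };
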
4 changes: 3 additions & 1 deletion src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -23,13 +23,15 @@ class StandardTokenizer : public Tokenizer {
       : Tokenizer(in) {
     scanner_ = std::make_unique<StandardTokenizerImpl>(in);
     Tokenizer::lowercase = true;
+    Tokenizer::ownReader = false;
     Tokenizer::stopwords = nullptr;
   }
-  StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords)
+  StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords, bool ownReader=false)
       : Tokenizer(in) {
     scanner_ = std::make_unique<StandardTokenizerImpl>(in);
     Tokenizer::lowercase = lowercase;
     Tokenizer::stopwords = stopwords;
+    Tokenizer::ownReader = ownReader;
   }
 
   Token* next(Token* token) override {
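
As with the jieba tokenizer, the ownReader=false default keeps direct constructions source-compatible; passing true opts a stream into owning its reader. A sketch using only the constructor declared above (reader is assumed to be a heap-allocated lucene::util::Reader*, and the standard95 namespace qualification is assumed):

    std::unordered_set<std::string_view> stopwords = {"the", "and"};
    // The tokenizer owns `reader` and is expected to release it when closed.
    auto* ts = _CLNEW lucene::analysis::standard95::StandardTokenizer(
            reader, /*lowercase=*/true, &stopwords, /*ownReader=*/true);
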
