Skip to content

Commit

Permalink
[opt](standard95) the ‘standard95’ tokenizer does not include stop words by default.
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzxl1993 committed Apr 22, 2024
1 parent 9f849a4 commit 0a142a4
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 16 deletions.
7 changes: 7 additions & 0 deletions src/core/CLucene/analysis/AnalysisHeader.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,11 @@ class CLUCENE_EXPORT Analyzer{
virtual void set_lowercase(bool lowercase) {
_lowercase = lowercase;
}

virtual void use_stopwords(bool use_stopwords) {
_use_stopwords = use_stopwords;
}

private:

DEFINE_MUTEX(THIS_LOCK)
Expand All @@ -314,6 +319,7 @@ class CLUCENE_EXPORT Analyzer{
* thread. */
virtual void setPreviousTokenStream(TokenStream* obj);
bool _lowercase = false;
bool _use_stopwords = true;

public:
/**
Expand Down Expand Up @@ -350,6 +356,7 @@ class CLUCENE_EXPORT Tokenizer:public TokenStream {
/** The text source for this Tokenizer. */
CL_NS(util)::Reader* input;
bool lowercase = false;
bool use_stopwords = true;

public:
/** Construct a tokenizer with null input. */
Expand Down
16 changes: 7 additions & 9 deletions src/core/CLucene/analysis/standard95/StandardAnalyzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,22 @@ namespace lucene::analysis::standard95 {

class StandardAnalyzer : public Analyzer {
public:
StandardAnalyzer() : Analyzer() { _lowercase = true; }
StandardAnalyzer() : Analyzer() {
_lowercase = true;
_use_stopwords = true;
}

// Always answers true for this analyzer.
// NOTE(review): what "SDocOpt" switches on is decided by code outside this
// view - confirm against the base Analyzer before relying on it.
bool isSDocOpt() override {
    return true;
}

TokenStream* tokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
return _CLNEW StandardTokenizer(reader, _lowercase, _use_stopwords);
}

TokenStream* reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
if (tokenizer_ == nullptr) {
tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
tokenizer_ = new StandardTokenizer(reader, _lowercase, _use_stopwords);
} else {
tokenizer_->reset(reader);
}
Expand All @@ -31,13 +35,7 @@ class StandardAnalyzer : public Analyzer {
}
}

void useStopWords(bool useStopWords) {
useStopWords_ = useStopWords;
}

private:
bool useStopWords_ = true;

StandardTokenizer* tokenizer_ = nullptr;
};

Expand Down
14 changes: 7 additions & 7 deletions src/core/CLucene/analysis/standard95/StandardTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,17 @@ static std::unordered_set<std::string_view> stop_words = {

class StandardTokenizer : public Tokenizer {
public:
StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
: Tokenizer(in), useStopWords_(useStopWords) {
StandardTokenizer(lucene::util::Reader* in)
: Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = true;
Tokenizer::use_stopwords = true;
}
StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase)
: Tokenizer(in), useStopWords_(useStopWords) {
StandardTokenizer(lucene::util::Reader* in, bool lowercase, bool use_stopwords)
: Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = lowercase;
Tokenizer::use_stopwords = use_stopwords;
}

Token* next(Token* token) override {
Expand All @@ -47,7 +49,7 @@ class StandardTokenizer : public Tokenizer {
std::transform(term.begin(), term.end(), const_cast<char*>(term.data()),
[](char c) { return to_lower(c); });
}
if (useStopWords_ && stop_words.count(term)) {
if (Tokenizer::use_stopwords && stop_words.count(term)) {
skippedPositions++;
continue;
}
Expand All @@ -70,8 +72,6 @@ class StandardTokenizer : public Tokenizer {
};

private:
bool useStopWords_ = true;

std::unique_ptr<StandardTokenizerImpl> scanner_;

int32_t skippedPositions = 0;
Expand Down

0 comments on commit 0a142a4

Please sign in to comment.