Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xapian_wrap.cpp: query verbatim for terms using fullwidth form #4782

Merged
merged 1 commit into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!perl
use Cassandane::Tiny;

# Index one email whose body mixes CJK and Latin text, then check that
# Email/query body filters find it for the Latin part alone, the complete
# string, a CJK+Latin prefix, and the CJK prefix alone.
sub test_email_query_cjk_fullwidth
    :min_version_3_9 :needs_component_jmap :JMAPExtensions
{
    my ($self) = @_;
    my $jmap = $self->{jmap};

    # The string literals below contain non-ASCII characters.
    use utf8;

    my $res = $jmap->CallMethods([
        ['Email/set', {
            create => {
                email1 => {
                    mailboxIds => {
                        '$inbox' => JSON::true
                    },
                    from => [{ email => 'foo@local' }],
                    to => [{ email => 'bar@local' }],
                    # A fixed subject: the search below only looks at the body,
                    # and $_ is not set to anything meaningful in this sub.
                    subject => 'test',
                    bodyStructure => {
                        type => 'text/plain',
                        partId => 'part1',
                    },
                    bodyValues => {
                        part1 => {
                            value => <<'EOF'
三菱UFJファクター株式会社
EOF
                        },
                    },
                },
            },
        }, 'R1'],
    ]);
    my $email1Id = $res->[0][1]{created}{email1}{id};
    $self->assert_not_null($email1Id);

    xlog $self, "run squatter";
    $self->{instance}->run_command({cyrus => 1}, 'squatter');

    # Reuse $res instead of redeclaring it: a second "my $res" in the same
    # scope masks the earlier declaration and warns.
    $res = $jmap->CallMethods([
        # Latin part only
        ['Email/query', {
            filter => {
                body => "UFJ",
            },
        }, 'R1'],
        # complete body text
        ['Email/query', {
            filter => {
                body => "三菱UFJファクター株式会社",
            },
        }, 'R2'],
        # CJK prefix followed by the Latin part
        ['Email/query', {
            filter => {
                body => "三菱UFJ",
            },
        }, 'R3'],
        # CJK prefix only
        ['Email/query', {
            filter => {
                body => "三菱",
            },
        }, 'R4'],
    ]);

    # Every query must match exactly the one email created above.
    $self->assert_deep_equals([$email1Id], $res->[0][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[1][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[2][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[3][1]{ids});

    no utf8;
}
81 changes: 62 additions & 19 deletions imap/xapian_wrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,24 +720,30 @@ struct xapian_dbw
std::vector<std::string> *subjects;
};


static int xapian_dbw_init(xapian_dbw_t *dbw)
static Xapian::TermGenerator *new_term_generator(void)
{
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
dbw->default_stopper = get_stopper("en");
dbw->term_generator = new Xapian::TermGenerator;
dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
auto termgen = new Xapian::TermGenerator;
termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
/* Always enable CJK word tokenization */
#if defined(USE_XAPIAN_WORD_BREAKS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
termgen->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
#elif defined(USE_XAPIAN_CJK_WORDS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
~Xapian::TermGenerator::FLAG_CJK_WORDS);
#else
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
~Xapian::TermGenerator::FLAG_CJK_NGRAM);
#endif
return termgen;
}


static int xapian_dbw_init(xapian_dbw_t *dbw)
{
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
dbw->default_stopper = get_stopper("en");
dbw->term_generator = new_term_generator();
dbw->doclangs = new std::set<std::string>;
dbw->subjects = new std::vector<std::string>;
return 0;
Expand Down Expand Up @@ -1868,6 +1874,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused
return new Xapian::Query(q);
}

/*
 * Build a query for `str` under the term prefix `prefix`, parsed by the
 * database's query parser with the CJK flag selected at compile time
 * (FLAG_WORD_BREAKS, FLAG_CJK_WORDS or FLAG_CJK_NGRAM).
 *
 * Returns a query allocated with `new`. The allocation happens only after
 * all fallible work is done, so nothing is leaked if Xapian throws.
 */
static Xapian::Query* xapian_query_new_match_cjk(const xapian_db_t *db, const char *str, const char *prefix)
{
#if defined(USE_XAPIAN_WORD_BREAKS)
    const unsigned parser_flags = Xapian::QueryParser::FLAG_WORD_BREAKS;
#elif defined(USE_XAPIAN_CJK_WORDS)
    const unsigned parser_flags = Xapian::QueryParser::FLAG_CJK_WORDS;
#else
    const unsigned parser_flags = Xapian::QueryParser::FLAG_CJK_NGRAM;
#endif

    // Assemble the query by value; it is copied to the heap at the very end.
    Xapian::Query query = db->parser->parse_query(str, parser_flags, prefix);

#if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS)
    // There is a bug in Xapian v1.5 and CJK word segmentation, in which
    // a term starting with a fullwidth Latin character such as U+FF21
    // is indexed with capitalization, but queried in small letter form
    // when parsed with parse_query.
    //
    // As a workaround, the following code checks if the query string
    // contains any characters in the Halfwidth and Fullwidth Forms
    // Unicode block. If so, it queries for either the query as
    // generated by the query parser, or the terms as generated during
    // indexing.
    //
    // XXX this should better be handled by normalizing characters in
    // that Unicode block during indexing.
    bool has_fullwidth_form = false;
    for (Xapian::Utf8Iterator it(str); it != Xapian::Utf8Iterator(); ++it) {
        const unsigned codepoint = *it;
        if (codepoint >= 0xff00 && codepoint <= 0xffef) {
            // the first occurrence of a full/half-width character is enough
            has_fullwidth_form = true;
            break;
        }
    }

    if (has_fullwidth_form) {
        // new_term_generator() returns an owning raw pointer. Release it on
        // every exit path, including when indexing or query construction
        // throws.
        struct TermGeneratorGuard {
            Xapian::TermGenerator *ptr;
            ~TermGeneratorGuard() { delete ptr; }
        } termgen = { new_term_generator() };

        // Run the query string through the indexing pipeline to obtain the
        // terms exactly as they would have been written to the index.
        Xapian::Document doc;
        termgen.ptr->set_document(doc);
        termgen.ptr->index_text(str, 1, prefix);

        const Xapian::Query indexed_terms{Xapian::Query::OP_AND,
                                          doc.termlist_begin(),
                                          doc.termlist_end()};
        if (indexed_terms.get_length()) {
            query |= indexed_terms;
        }
    }
#endif

    return new Xapian::Query(query);
}

EXPORTED Xapian::Query *
xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
{
Expand Down Expand Up @@ -1898,16 +1950,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *
// Don't stem queries for Thaana codepage (0780) or higher.
for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
if (*p > 221) //has highbit
return new Xapian::Query {db->parser->parse_query(
str,
#if defined(USE_XAPIAN_WORD_BREAKS)
Xapian::QueryParser::FLAG_WORD_BREAKS,
#elif defined(USE_XAPIAN_CJK_WORDS)
Xapian::QueryParser::FLAG_CJK_WORDS,
#else
Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif
prefix)};
return xapian_query_new_match_cjk(db, str, prefix);
}

// Stemable codepage.
Expand Down