Skip to content

Commit

Permalink
Merge pull request #4782 from cyrusimap/xapian_query_fullwidth_halfwidth
Browse files Browse the repository at this point in the history
xapian_wrap.cpp: query verbatim for terms using fullwidth form
  • Loading branch information
rsto authored Mar 12, 2024
2 parents c6f3b56 + 6c0fc9b commit 803e081
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 19 deletions.
70 changes: 70 additions & 0 deletions cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!perl
use Cassandane::Tiny;

# Regression test for body search on text that mixes CJK characters with
# Latin letters.
#
# Creates one email whose plain-text body is a single CJK company name,
# indexes it with squatter, then issues four Email/query calls filtering
# on the body: the Latin part alone, the complete string, a CJK+Latin
# prefix, and the CJK prefix alone. Every query must return exactly the
# created email.
sub test_email_query_cjk_fullwidth
    :min_version_3_9 :needs_component_jmap :JMAPExtensions
{
    my ($self) = @_;
    my $jmap = $self->{jmap};

    # The literals below contain non-ASCII characters.
    use utf8;

    my $res = $jmap->CallMethods([
        ['Email/set', {
            create => {
                email1 => {
                    mailboxIds => {
                        '$inbox' => JSON::true
                    },
                    from => [{ email => 'foo@local' }],
                    to => [{ email => 'bar@local' }],
                    # Fixed subject: nothing in this test sets $_, so the
                    # subject must not be derived from it.
                    subject => 'test',
                    bodyStructure => {
                        type => 'text/plain',
                        partId => 'part1',
                    },
                    bodyValues => {
                        part1 => {
                            value => <<'EOF'
三菱UFJファクター株式会社
EOF
                        },
                    },
                },
            },
        }, 'R1'],
    ]);
    my $email1Id = $res->[0][1]{created}{email1}{id};
    $self->assert_not_null($email1Id);

    xlog $self, "run squatter";
    $self->{instance}->run_command({cyrus => 1}, 'squatter');

    # Reuse $res rather than redeclaring it in the same scope.
    $res = $jmap->CallMethods([
        ['Email/query', {
            filter => {
                body => "UFJ",
            },
        }, 'R1'],
        ['Email/query', {
            filter => {
                body => "三菱UFJファクター株式会社",
            },
        }, 'R2'],
        ['Email/query', {
            filter => {
                body => "三菱UFJ",
            },
        }, 'R3'],
        ['Email/query', {
            filter => {
                body => "三菱",
            },
        }, 'R4'],
    ]);

    # Each of the four queries must find the one indexed email.
    $self->assert_deep_equals([$email1Id], $res->[0][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[1][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[2][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[3][1]{ids});

    no utf8;
}
81 changes: 62 additions & 19 deletions imap/xapian_wrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,24 +718,30 @@ struct xapian_dbw
std::vector<std::string> *subjects;
};


static int xapian_dbw_init(xapian_dbw_t *dbw)
/*
 * Allocate a Xapian::TermGenerator configured with the word-length limit
 * and the CJK tokenization flag.
 *
 * The `termgen` statements set the maximum word length to
 * XAPIAN_MAX_TERM_LENGTH and call set_flags() for exactly one CJK flag,
 * chosen at compile time: FLAG_WORD_BREAKS when USE_XAPIAN_WORD_BREAKS is
 * defined, FLAG_CJK_WORDS when USE_XAPIAN_CJK_WORDS is defined, and
 * FLAG_CJK_NGRAM otherwise. They set neither a stemmer nor a stopper on
 * the generator.
 *
 * Returns a pointer obtained with `new`. This function does not free it,
 * so ownership passes to the caller.
 *
 * NOTE(review): this listing looks like a rendered diff without +/-
 * markers. The `dbw->...` statements below appear to be removed lines
 * interleaved with the added `termgen->...` lines -- confirm against the
 * actual file before relying on them.
 */
static Xapian::TermGenerator *new_term_generator(void)
{
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
dbw->default_stopper = get_stopper("en");
dbw->term_generator = new Xapian::TermGenerator;
dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
auto termgen = new Xapian::TermGenerator;
termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
/* Always enable CJK word tokenization */
#if defined(USE_XAPIAN_WORD_BREAKS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
termgen->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
#elif defined(USE_XAPIAN_CJK_WORDS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
~Xapian::TermGenerator::FLAG_CJK_WORDS);
#else
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
~Xapian::TermGenerator::FLAG_CJK_NGRAM);
#endif
return termgen;
}


static int xapian_dbw_init(xapian_dbw_t *dbw)
{
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
dbw->default_stopper = get_stopper("en");
dbw->term_generator = new_term_generator();
dbw->doclangs = new std::set<std::string>;
dbw->subjects = new std::vector<std::string>;
return 0;
Expand Down Expand Up @@ -1866,6 +1872,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused
return new Xapian::Query(q);
}

/*
 * Build the query for a search string using CJK tokenization.
 *
 * The string is parsed by db->parser with the CJK flag selected at compile
 * time (FLAG_WORD_BREAKS, FLAG_CJK_WORDS or FLAG_CJK_NGRAM). With word
 * segmentation enabled, the result is additionally OR-ed with the terms a
 * term generator produces for the same string if the string contains a
 * character from the Halfwidth and Fullwidth Forms block.
 *
 * db:     database handle; its `parser` member parses the string
 * str:    NUL-terminated UTF-8 search string
 * prefix: term prefix handed to both the parser and the term generator
 *
 * Returns a Xapian::Query allocated with `new`; it is not freed here.
 * Exceptions thrown by Xapian propagate to the caller without leaking the
 * temporary term generator or a partially built query.
 */
static Xapian::Query* xapian_query_new_match_cjk(const xapian_db_t *db, const char *str, const char *prefix)
{
    // Hold the query by value until the final return, so that nothing
    // heap-allocated is pending if a later step throws.
    Xapian::Query query = db->parser->parse_query(
        str,
#if defined(USE_XAPIAN_WORD_BREAKS)
        Xapian::QueryParser::FLAG_WORD_BREAKS,
#elif defined(USE_XAPIAN_CJK_WORDS)
        Xapian::QueryParser::FLAG_CJK_WORDS,
#else
        Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif
        prefix);

#if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS)
    // There is a bug in Xapian v1.5 and CJK word segmentation, in which
    // a term starting with a fullwidth Latin character such as U+FF21
    // is indexed with capitalization, but queried in small letter form
    // when parsed with parse_query.
    //
    // As a workaround, the following code checks if the query string
    // contains any characters in the Halfwidth and Fullwidth Forms
    // Unicode block. If so, it queries for either the query as
    // generated by the query parser, or the terms as generated during
    // indexing.
    //
    // XXX this should better be handled by normalizing characters in
    // that Unicode block during indexing.
    for (auto it = Xapian::Utf8Iterator(str); it != Xapian::Utf8Iterator(); ++it) {
        if (*it < 0xff00 || *it > 0xffef)
            continue;

        Xapian::TermGenerator *termgen = new_term_generator();
        try {
            // Index the query string into a scratch document to obtain
            // the terms exactly as the indexer would generate them.
            Xapian::Document doc;
            termgen->set_document(doc);
            termgen->index_text(str, 1, prefix);
            Xapian::Query qq{Xapian::Query::OP_AND, doc.termlist_begin(), doc.termlist_end()};
            if (qq.get_length()) {
                query |= qq;
            }
        }
        catch (...) {
            // Do not leak the generator if Xapian throws.
            delete termgen;
            throw;
        }
        delete termgen;
        break; // stop at first occurrence of full/half-width character
    }
#endif

    return new Xapian::Query(query);
}

EXPORTED Xapian::Query *
xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
{
Expand Down Expand Up @@ -1896,16 +1948,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *
// Don't stem queries for Thaana codepage (0780) or higher.
for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
if (*p > 221) //has highbit
return new Xapian::Query {db->parser->parse_query(
str,
#if defined(USE_XAPIAN_WORD_BREAKS)
Xapian::QueryParser::FLAG_WORD_BREAKS,
#elif defined(USE_XAPIAN_CJK_WORDS)
Xapian::QueryParser::FLAG_CJK_WORDS,
#else
Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif
prefix)};
return xapian_query_new_match_cjk(db, str, prefix);
}

// Stemable codepage.
Expand Down

0 comments on commit 803e081

Please sign in to comment.