diff --git a/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth b/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth new file mode 100644 index 0000000000..b1529f637b --- /dev/null +++ b/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth @@ -0,0 +1,70 @@ +#!perl +use Cassandane::Tiny; + +sub test_email_query_cjk_fullwidth + :min_version_3_9 :needs_component_jmap :JMAPExtensions +{ + my ($self) = @_; + my $jmap = $self->{jmap}; + +use utf8; + my $res = $jmap->CallMethods([ + ['Email/set', { + create => { + email1 => { + mailboxIds => { + '$inbox' => JSON::true + }, + from => [{ email => 'foo@local' }], + to => [{ email => 'bar@local' }], + subject => $_->{id}, + bodyStructure => { + type => 'text/plain', + partId => 'part1', + }, + bodyValues => { + part1 => { + value => <<'EOF' +三菱UFJファクター株式会社 +EOF + }, + }, + }, + }, + }, 'R1'], + ]); + my $email1Id = $res->[0][1]{created}{email1}{id}; + $self->assert_not_null($email1Id); + + xlog $self, "run squatter"; + $self->{instance}->run_command({cyrus => 1}, 'squatter'); + + my $res = $jmap->CallMethods([ + ['Email/query', { + filter => { + body => "UFJ", + }, + }, 'R1'], + ['Email/query', { + filter => { + body => "三菱UFJファクター株式会社", + }, + }, 'R2'], + ['Email/query', { + filter => { + body => "三菱UFJ", + }, + }, 'R3'], + ['Email/query', { + filter => { + body => "三菱", + }, + }, 'R4'], + ]); + $self->assert_deep_equals([$email1Id], $res->[0][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[1][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[2][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[3][1]{ids}); + +no utf8; +} diff --git a/imap/xapian_wrap.cpp b/imap/xapian_wrap.cpp index eae11b563d..010184bc62 100644 --- a/imap/xapian_wrap.cpp +++ b/imap/xapian_wrap.cpp @@ -720,24 +720,30 @@ struct xapian_dbw std::vector *subjects; }; - -static int xapian_dbw_init(xapian_dbw_t *dbw) +static Xapian::TermGenerator *new_term_generator(void) { - dbw->default_stemmer = new Xapian::Stem(new 
CyrusSearchStemmer); - dbw->default_stopper = get_stopper("en"); - dbw->term_generator = new Xapian::TermGenerator; - dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH); + auto termgen = new Xapian::TermGenerator; + termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH); /* Always enable CJK word tokenization */ #if defined(USE_XAPIAN_WORD_BREAKS) - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS, + termgen->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS, ~Xapian::TermGenerator::FLAG_WORD_BREAKS); #elif defined(USE_XAPIAN_CJK_WORDS) - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS, + termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS, ~Xapian::TermGenerator::FLAG_CJK_WORDS); #else - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM, + termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM, ~Xapian::TermGenerator::FLAG_CJK_NGRAM); #endif + return termgen; +} + + +static int xapian_dbw_init(xapian_dbw_t *dbw) +{ + dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer); + dbw->default_stopper = get_stopper("en"); + dbw->term_generator = new_term_generator(); dbw->doclangs = new std::set; dbw->subjects = new std::vector; return 0; @@ -1868,6 +1874,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused return new Xapian::Query(q); } +static Xapian::Query* xapian_query_new_match_cjk(const xapian_db_t *db, const char *str, const char *prefix) +{ + Xapian::Query *q = new Xapian::Query {db->parser->parse_query( + str, +#if defined(USE_XAPIAN_WORD_BREAKS) + Xapian::QueryParser::FLAG_WORD_BREAKS, +#elif defined(USE_XAPIAN_CJK_WORDS) + Xapian::QueryParser::FLAG_CJK_WORDS, +#else + Xapian::QueryParser::FLAG_CJK_NGRAM, +#endif + prefix)}; + +#if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS) + // There is a bug in Xapian v1.5 and CJK word segmentation, in which + // a term starting with a fullwidth Latin character such as U+FF21 + // 
is indexed with capitalization, but queried in small letter form + // when parsed with parse_query. + // + // As a workaround, the following code checks if the query string + // contains any characters in the Halfwidth and Fullwidth Forms + // Unicode block. If so, it queries for either the query as + // generated by the query parser, or the terms as generated during + // indexing. + // + // XXX this should better be handled by normalizing characters in + // that Unicode block during indexing. + for (auto it = Xapian::Utf8Iterator(str); it != Xapian::Utf8Iterator(); ++it) { + if (*it >= 0xff00 && *it <= 0xffef) { + Xapian::TermGenerator *termgen = new_term_generator(); + Xapian::Document doc; + termgen->set_document(doc); + termgen->index_text(str, 1, prefix); + Xapian::Query qq{Xapian::Query::OP_AND, doc.termlist_begin(), doc.termlist_end()}; + if (qq.get_length()) { + *q |= qq; + } + delete termgen; + break; // stop at first occurrence of full/half-width character + } + } +#endif + + return q; +} + EXPORTED Xapian::Query * xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str) { @@ -1898,16 +1950,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char * // Don't stem queries for Thaana codepage (0780) or higher. for (const unsigned char *p = (const unsigned char *)str; *p; p++) { if (*p > 221) //has highbit - return new Xapian::Query {db->parser->parse_query( - str, -#if defined(USE_XAPIAN_WORD_BREAKS) - Xapian::QueryParser::FLAG_WORD_BREAKS, -#elif defined(USE_XAPIAN_CJK_WORDS) - Xapian::QueryParser::FLAG_CJK_WORDS, -#else - Xapian::QueryParser::FLAG_CJK_NGRAM, -#endif - prefix)}; + return xapian_query_new_match_cjk(db, str, prefix); } // Stemable codepage.