From 6c0fc9b84ff4cd9c9d75084a2922d727a609b4d6 Mon Sep 17 00:00:00 2001 From: Robert Stepanek Date: Thu, 4 Jan 2024 18:12:26 +0100 Subject: [PATCH] xapian_wrap.cpp: query verbatim for terms using fullwidth form This works around an issue with Xapian 1.5 when indexing and querying for text containing codepoints in the Halfwidth and Fullwidth Forms Unicode block. It does so by querying for both the terms as generated by the query parser and the form as generated by the term generator during indexing. This is only applied if the query actually contains at least one codepoint in that Unicode block. The issue has been reported to the Xapian mailing list at https://lists.xapian.org/pipermail/xapian-devel/2024-January/003410.html Signed-off-by: Robert Stepanek --- .../JMAPEmail/email_query_cjk_fullwidth | 70 ++++++++++++++++ imap/xapian_wrap.cpp | 81 ++++++++++++++----- 2 files changed, 132 insertions(+), 19 deletions(-) create mode 100644 cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth diff --git a/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth b/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth new file mode 100644 index 0000000000..b1529f637b --- /dev/null +++ b/cassandane/tiny-tests/JMAPEmail/email_query_cjk_fullwidth @@ -0,0 +1,70 @@ +#!perl +use Cassandane::Tiny; + +sub test_email_query_cjk_fullwidth + :min_version_3_9 :needs_component_jmap :JMAPExtensions +{ + my ($self) = @_; + my $jmap = $self->{jmap}; + +use utf8; + my $res = $jmap->CallMethods([ + ['Email/set', { + create => { + email1 => { + mailboxIds => { + '$inbox' => JSON::true + }, + from => [{ email => 'foo@local' }], + to => [{ email => 'bar@local' }], + subject => $_->{id}, + bodyStructure => { + type => 'text/plain', + partId => 'part1', + }, + bodyValues => { + part1 => { + value => <<'EOF' +三菱UFJファクター株式会社 +EOF + }, + }, + }, + }, + }, 'R1'], + ]); + my $email1Id = $res->[0][1]{created}{email1}{id}; + $self->assert_not_null($email1Id); + + xlog $self, "run squatter"; + 
$self->{instance}->run_command({cyrus => 1}, 'squatter'); + + my $res = $jmap->CallMethods([ + ['Email/query', { + filter => { + body => "UFJ", + }, + }, 'R1'], + ['Email/query', { + filter => { + body => "三菱UFJファクター株式会社", + }, + }, 'R2'], + ['Email/query', { + filter => { + body => "三菱UFJ", + }, + }, 'R3'], + ['Email/query', { + filter => { + body => "三菱", + }, + }, 'R4'], + ]); + $self->assert_deep_equals([$email1Id], $res->[0][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[1][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[2][1]{ids}); + $self->assert_deep_equals([$email1Id], $res->[3][1]{ids}); + +no utf8; +} diff --git a/imap/xapian_wrap.cpp b/imap/xapian_wrap.cpp index eae11b563d..010184bc62 100644 --- a/imap/xapian_wrap.cpp +++ b/imap/xapian_wrap.cpp @@ -720,24 +720,30 @@ struct xapian_dbw std::vector *subjects; }; - -static int xapian_dbw_init(xapian_dbw_t *dbw) +static Xapian::TermGenerator *new_term_generator(void) { - dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer); - dbw->default_stopper = get_stopper("en"); - dbw->term_generator = new Xapian::TermGenerator; - dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH); + auto termgen = new Xapian::TermGenerator; + termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH); /* Always enable CJK word tokenization */ #if defined(USE_XAPIAN_WORD_BREAKS) - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS, + termgen->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS, ~Xapian::TermGenerator::FLAG_WORD_BREAKS); #elif defined(USE_XAPIAN_CJK_WORDS) - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS, + termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS, ~Xapian::TermGenerator::FLAG_CJK_WORDS); #else - dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM, + termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM, ~Xapian::TermGenerator::FLAG_CJK_NGRAM); #endif + return termgen; +} + + +static int 
xapian_dbw_init(xapian_dbw_t *dbw) +{ + dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer); + dbw->default_stopper = get_stopper("en"); + dbw->term_generator = new_term_generator(); dbw->doclangs = new std::set; dbw->subjects = new std::vector; return 0; @@ -1868,6 +1874,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused return new Xapian::Query(q); } +static Xapian::Query* xapian_query_new_match_cjk(const xapian_db_t *db, const char *str, const char *prefix) +{ + Xapian::Query *q = new Xapian::Query {db->parser->parse_query( + str, +#if defined(USE_XAPIAN_WORD_BREAKS) + Xapian::QueryParser::FLAG_WORD_BREAKS, +#elif defined(USE_XAPIAN_CJK_WORDS) + Xapian::QueryParser::FLAG_CJK_WORDS, +#else + Xapian::QueryParser::FLAG_CJK_NGRAM, +#endif + prefix)}; + +#if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS) + // There is a bug in Xapian v1.5 and CJK word segmentation, in which + // a term starting with a fullwidth Latin character such as U+FF21 + // is indexed with capitalization, but queried in small letter form + // when parsed with parse_query. + // + // As a workaround, the following code checks if the query string + // contains any characters in the Halfwidth and Fullwidth Forms + // Unicode block. If so, it queries for either the query as + // generated by the query parser, or the terms as generated during + // indexing. + // + // XXX this should better be handled by normalizing characters in + // that Unicode block during indexing. 
+ for (auto it = Xapian::Utf8Iterator(str); it != Xapian::Utf8Iterator(); ++it) { + if (*it >= 0xff00 && *it <= 0xffef) { + Xapian::TermGenerator *termgen = new_term_generator(); + Xapian::Document doc; + termgen->set_document(doc); + termgen->index_text(str, 1, prefix); + Xapian::Query qq{Xapian::Query::OP_AND, doc.termlist_begin(), doc.termlist_end()}; + if (qq.get_length()) { + *q |= qq; + } + delete termgen; + break; // stop at first occurence of full/half-width character + } + } +#endif + + return q; +} + EXPORTED Xapian::Query * xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str) { @@ -1898,16 +1950,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char * // Don't stem queries for Thaana codepage (0780) or higher. for (const unsigned char *p = (const unsigned char *)str; *p; p++) { if (*p > 221) //has highbit - return new Xapian::Query {db->parser->parse_query( - str, -#if defined(USE_XAPIAN_WORD_BREAKS) - Xapian::QueryParser::FLAG_WORD_BREAKS, -#elif defined(USE_XAPIAN_CJK_WORDS) - Xapian::QueryParser::FLAG_CJK_WORDS, -#else - Xapian::QueryParser::FLAG_CJK_NGRAM, -#endif - prefix)}; + return xapian_query_new_match_cjk(db, str, prefix); } // Stemable codepage.