Skip to content

Commit 990b350

Browse files
committed
xapian_wrap.cpp: query verbatim for terms using fullwidth form
This works around an issue with Xapian 1.5 when indexing and querying for text containing codepoints in the Halfwidth and Fullwidth Forms Unicode block. It does so by querying for both the terms as generated by the query parser and the form as generated by the term generator during indexing. This is only applied if the query actually contains at least one codepoint in that Unicode block. The issue has been reported to the Xapian mailing list at https://lists.xapian.org/pipermail/xapian-devel/2024-January/003410.html Signed-off-by: Robert Stepanek <[email protected]>
1 parent c78ab7b commit 990b350

File tree

2 files changed

+145
-20
lines changed

2 files changed

+145
-20
lines changed
+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!perl
2+
use Cassandane::Tiny;
3+
4+
# Regression test: an email whose body mixes CJK text with Latin letters
# must be found by Email/query `body` filters for the Latin run alone, the
# full string, a CJK+Latin prefix and the CJK prefix alone.
#
# NOTE(review): the commit this test belongs to targets codepoints in the
# Halfwidth and Fullwidth Forms block; confirm that the Latin letters in the
# fixture and in the queries below are stored as the intended forms.
sub test_email_query_cjk_fullwidth
    :min_version_3_9 :needs_component_jmap :JMAPExtensions
{
    my ($self) = @_;
    my $jmap = $self->{jmap};

    # The string literals below contain non-ASCII characters.
    use utf8;

    my $res = $jmap->CallMethods([
        ['Email/set', {
            create => {
                email1 => {
                    mailboxIds => {
                        '$inbox' => JSON::true
                    },
                    from => [{ email => 'foo@local' }],
                    to => [{ email => 'bar@local' }],
                    # A fixed subject: the previous `$_->{id}` read whatever
                    # the global $_ happened to hold. No body query below
                    # matches this value.
                    subject => 'email1',
                    bodyStructure => {
                        type => 'text/plain',
                        partId => 'part1',
                    },
                    bodyValues => {
                        part1 => {
                            value => <<'EOF'
三菱UFJファクター株式会社
EOF
                        },
                    },
                },
            },
        }, 'R1'],
    ]);
    my $email1Id = $res->[0][1]{created}{email1}{id};
    $self->assert_not_null($email1Id);

    xlog $self, "run squatter";
    $self->{instance}->run_command({cyrus => 1}, 'squatter');

    # Reuse $res instead of redeclaring it with a second `my`, which would
    # mask the earlier declaration in the same scope.
    $res = $jmap->CallMethods([
        ['Email/query', {
            filter => {
                body => "UFJ",
            },
        }, 'R1'],
        ['Email/query', {
            filter => {
                body => "三菱UFJファクター株式会社",
            },
        }, 'R2'],
        ['Email/query', {
            filter => {
                body => "三菱UFJ",
            },
        }, 'R3'],
        ['Email/query', {
            filter => {
                body => "三菱",
            },
        }, 'R4'],
    ]);

    # Every query must return exactly the one email created above.
    $self->assert_deep_equals([$email1Id], $res->[0][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[1][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[2][1]{ids});
    $self->assert_deep_equals([$email1Id], $res->[3][1]{ids});

    no utf8;
}

Diff for: imap/xapian_wrap.cpp

+75-20
Original file line numberDiff line numberDiff line change
@@ -720,21 +720,30 @@ struct xapian_dbw
720720
std::vector<std::string> *subjects;
721721
};
722722

723-
724-
static int xapian_dbw_init(xapian_dbw_t *dbw)
723+
/* Allocate a Xapian::TermGenerator, set its maximum word length to
 * XAPIAN_MAX_TERM_LENGTH and enable one CJK tokenization flag chosen at
 * build time: FLAG_WORD_BREAKS if USE_XAPIAN_WORD_BREAKS is defined,
 * otherwise FLAG_CJK_WORDS if USE_XAPIAN_CJK_WORDS is defined, otherwise
 * FLAG_CJK_NGRAM.
 *
 * The generator is created with new and returned as a raw pointer, so the
 * caller is responsible for deleting it.
 *
 * NOTE(review): this span is a rendered diff hunk; a line that follows a
 * marker ending in '-' appears to be a removed line of the previous
 * xapian_dbw_init() body rather than part of this function. */
static Xapian::TermGenerator *new_term_generator(void)
725724
{
726-
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
727-
dbw->default_stopper = get_stopper("en");
728-
dbw->term_generator = new Xapian::TermGenerator;
729-
dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
725+
auto termgen = new Xapian::TermGenerator;
726+
termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
730727
/* Always enable CJK word tokenization */
731-
#ifdef USE_XAPIAN_CJK_WORDS
732-
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
728+
#if defined(USE_XAPIAN_WORD_BREAKS)
729+
termgen->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
730+
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
731+
#elif defined(USE_XAPIAN_CJK_WORDS)
732+
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
733733
~Xapian::TermGenerator::FLAG_CJK_WORDS);
734734
#else
735-
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
735+
termgen->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
736736
~Xapian::TermGenerator::FLAG_CJK_NGRAM);
737737
#endif
738+
/* Ownership of the generator passes to the caller. */
739+
return termgen;
740+
}
740+
741+
742+
static int xapian_dbw_init(xapian_dbw_t *dbw)
743+
{
744+
dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
745+
dbw->default_stopper = get_stopper("en");
746+
dbw->term_generator = new_term_generator();
738747
dbw->doclangs = new std::set<std::string>;
739748
dbw->subjects = new std::vector<std::string>;
740749
return 0;
@@ -1865,6 +1874,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused
18651874
return new Xapian::Query(q);
18661875
}
18671876

1877+
/*
 * Build the match query for a search string that is parsed with CJK
 * tokenization instead of stemming.
 *
 * db     - database handle; its query parser is used to parse str.
 * str    - NUL-terminated UTF-8 search string.
 * prefix - term prefix passed to both the query parser and the term
 *          generator.
 *
 * Returns a heap-allocated Xapian::Query that the caller must delete.
 * Exceptions raised by Xapian propagate to the caller; no allocation made
 * in this function is left behind when that happens.
 */
static Xapian::Query* xapian_query_new_match_cjk(const xapian_db_t *db, const char *str, const char *prefix)
{
    // Keep the query by value until the very end, so that an exception
    // thrown further down cannot leak a heap-allocated query.
    Xapian::Query query = db->parser->parse_query(
        str,
#if defined(USE_XAPIAN_WORD_BREAKS)
        Xapian::QueryParser::FLAG_WORD_BREAKS,
#elif defined(USE_XAPIAN_CJK_WORDS)
        Xapian::QueryParser::FLAG_CJK_WORDS,
#else
        Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif
        prefix);

#if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS)
    // There is a bug in Xapian v1.5 and CJK word segmentation, in which
    // a term starting with a fullwidth Latin character such as U+FF21
    // is indexed with capitalization, but queried in small letter form
    // when parsed with parse_query.
    //
    // As a workaround, the following code checks if the query string
    // contains any characters in the Halfwidth and Fullwidth Forms
    // Unicode block. If so, it queries for either the query as
    // generated by the query parser, or the terms as generated during
    // indexing.
    //
    // XXX this should better be handled by normalizing characters in
    // that Unicode block during indexing.
    bool has_fullwidth_form = false;
    for (auto it = Xapian::Utf8Iterator(str); it != Xapian::Utf8Iterator(); ++it) {
        if (*it >= 0xff00 && *it <= 0xffef) {
            // One occurrence is enough to trigger the workaround.
            has_fullwidth_form = true;
            break;
        }
    }

    if (has_fullwidth_form) {
        Xapian::TermGenerator *termgen = new_term_generator();
        try {
            // Generate the terms exactly as indexing would, then require
            // all of them (OP_AND) as an alternative to the parsed query.
            Xapian::Document doc;
            termgen->set_document(doc);
            termgen->index_text(str, 1, prefix);
            Xapian::Query qq{Xapian::Query::OP_AND, doc.termlist_begin(), doc.termlist_end()};
            if (qq.get_length()) {
                query |= qq;
            }
        }
        catch (...) {
            // Release the generator before passing the error on.
            delete termgen;
            throw;
        }
        delete termgen;
    }
#endif

    return new Xapian::Query(query);
}
1922+
18681923
EXPORTED Xapian::Query *
18691924
xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
18701925
{
@@ -1895,14 +1950,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *
18951950
// Don't stem queries for Thaana codepage (0780) or higher.
18961951
for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
18971952
if (*p > 221) //has highbit
1898-
return new Xapian::Query {db->parser->parse_query(
1899-
str,
1900-
#ifdef USE_XAPIAN_CJK_WORDS
1901-
Xapian::QueryParser::FLAG_CJK_WORDS,
1902-
#else
1903-
Xapian::QueryParser::FLAG_CJK_NGRAM,
1904-
#endif
1905-
prefix)};
1953+
return xapian_query_new_match_cjk(db, str, prefix);
19061954
}
19071955

19081956
// Stemable codepage.
@@ -2182,7 +2230,10 @@ static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapia
21822230
if (snipgen->loose_terms) {
21832231
/* Add loose query terms */
21842232
term_generator.set_stemmer(stemmer);
2185-
#ifdef USE_XAPIAN_CJK_WORDS
2233+
#if defined(USE_XAPIAN_WORD_BREAKS)
2234+
term_generator.set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
2235+
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
2236+
#elif defined(USE_XAPIAN_CJK_WORDS)
21862237
term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
21872238
~Xapian::TermGenerator::FLAG_CJK_WORDS);
21882239
#else
@@ -2203,7 +2254,9 @@ static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapia
22032254
/* Add phrase queries */
22042255
unsigned flags = Xapian::QueryParser::FLAG_PHRASE|
22052256
Xapian::QueryParser::FLAG_WILDCARD|
2206-
#ifdef USE_XAPIAN_CJK_WORDS
2257+
#if defined(USE_XAPIAN_WORD_BREAKS)
2258+
Xapian::QueryParser::FLAG_WORD_BREAKS;
2259+
#elif defined(USE_XAPIAN_CJK_WORDS)
22072260
Xapian::QueryParser::FLAG_CJK_WORDS;
22082261
#else
22092262
Xapian::QueryParser::FLAG_CJK_NGRAM;
@@ -2267,7 +2320,9 @@ EXPORTED int xapian_snipgen_make_snippet(xapian_snipgen_t *snipgen,
22672320

22682321
unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE |
22692322
Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
2270-
#ifdef USE_XAPIAN_CJK_WORDS
2323+
#if defined(USE_XAPIAN_WORD_BREAKS)
2324+
flags |= Xapian::MSet::SNIPPET_WORD_BREAKS;
2325+
#elif defined(USE_XAPIAN_CJK_WORDS)
22712326
flags |= Xapian::MSet::SNIPPET_CJK_WORDS;
22722327
#endif
22732328

0 commit comments

Comments
 (0)