Skip to content

Commit

Permalink
xapian_wrap.cpp: adapt word break flags to Xapian API change
Browse files Browse the repository at this point in the history
Xapian development version 1.5 changed the flag names of CJK word
break segmentation. This patch adapts our Xapian wrapper to use the
new names, but still falls back to the previous ones for
older installations.

Note that this only matters when using any of the 1.5 development
versions; the flag names are not part of any stable Xapian API so
far.

See xapian/xapian@13295e9

Signed-off-by: Robert Stepanek <[email protected]>
  • Loading branch information
rsto committed Jan 5, 2024
1 parent c78ab7b commit 272d10e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 9 deletions.
23 changes: 19 additions & 4 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -668,16 +668,31 @@ if test "x$enable_xapian" != xno ; then
AC_LINK_IFELSE(
[AC_LANG_PROGRAM(
[[#include <xapian.h>]],
[[unsigned cjk_flags = Xapian::TermGenerator::FLAG_CJK_WORDS | Xapian::QueryParser::FLAG_CJK_WORDS | Xapian::MSet::SNIPPET_CJK_WORDS; (void) cjk_flags; ]])],
[[unsigned cjk_flags = Xapian::TermGenerator::FLAG_WORD_BREAKS | Xapian::QueryParser::FLAG_WORD_BREAKS | Xapian::MSet::SNIPPET_WORD_BREAKS; (void) cjk_flags; ]])],
[xapian_cjkwords="yes"],
[xapian_cjkwords="no"])
AC_MSG_RESULT($xapian_cjkwords)
if test $xapian_cjkwords = yes; then
AC_DEFINE([USE_XAPIAN_CJK_WORDS], [], [Use Xapian CJK word tokenizer, rather than n-grams?])
AC_DEFINE([USE_XAPIAN_WORD_BREAKS], [], [Use Xapian CJK word tokenizer, rather than n-grams?])
xapian_cjk_tokens=words
else
AC_MSG_NOTICE([Your Xapian does not support CJK word tokenization. CJK ngram tokenization will be used instead.])
xapian_cjk_tokens=ngrams
dnl Xapian upstream version 1.5 used different flag names to enable
dnl word break tokenization until March 2023.
dnl See https://github.com/xapian/xapian/commit/13295e9142f56911d4876fb92271df348759c34b
AC_LINK_IFELSE(
[AC_LANG_PROGRAM(
[[#include <xapian.h>]],
[[unsigned cjk_flags = Xapian::TermGenerator::FLAG_CJK_WORDS | Xapian::QueryParser::FLAG_CJK_WORDS | Xapian::MSet::SNIPPET_CJK_WORDS; (void) cjk_flags; ]])],
[xapian_cjkwords="yes"],
[xapian_cjkwords="no"])
AC_MSG_RESULT($xapian_cjkwords)
if test $xapian_cjkwords = yes; then
AC_DEFINE([USE_XAPIAN_CJK_WORDS], [], [Use Xapian CJK word tokenizer, rather than n-grams?])
xapian_cjk_tokens=words
else
AC_MSG_NOTICE([Your Xapian does not support CJK word tokenization. CJK ngram tokenization will be used instead.])
xapian_cjk_tokens=ngrams
fi
fi
LDFLAGS=$ORIG_LDFLAGS
CXXFLAGS=$ORIG_CXXFLAGS
Expand Down
22 changes: 17 additions & 5 deletions imap/xapian_wrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,10 @@ static int xapian_dbw_init(xapian_dbw_t *dbw)
dbw->term_generator = new Xapian::TermGenerator;
dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
/* Always enable CJK word tokenization */
#ifdef USE_XAPIAN_CJK_WORDS
#if defined(USE_XAPIAN_WORD_BREAKS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
#elif defined(USE_XAPIAN_CJK_WORDS)
dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
~Xapian::TermGenerator::FLAG_CJK_WORDS);
#else
Expand Down Expand Up @@ -1897,7 +1900,9 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *
if (*p > 221) //has highbit
return new Xapian::Query {db->parser->parse_query(
str,
#ifdef USE_XAPIAN_CJK_WORDS
#if defined(USE_XAPIAN_WORD_BREAKS)
Xapian::QueryParser::FLAG_WORD_BREAKS,
#elif defined(USE_XAPIAN_CJK_WORDS)
Xapian::QueryParser::FLAG_CJK_WORDS,
#else
Xapian::QueryParser::FLAG_CJK_NGRAM,
Expand Down Expand Up @@ -2182,7 +2187,10 @@ static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapia
if (snipgen->loose_terms) {
/* Add loose query terms */
term_generator.set_stemmer(stemmer);
#ifdef USE_XAPIAN_CJK_WORDS
#if defined(USE_XAPIAN_WORD_BREAKS)
term_generator.set_flags(Xapian::TermGenerator::FLAG_WORD_BREAKS,
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
#elif defined(USE_XAPIAN_CJK_WORDS)
term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
~Xapian::TermGenerator::FLAG_CJK_WORDS);
#else
Expand All @@ -2203,7 +2211,9 @@ static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapia
/* Add phrase queries */
unsigned flags = Xapian::QueryParser::FLAG_PHRASE|
Xapian::QueryParser::FLAG_WILDCARD|
#ifdef USE_XAPIAN_CJK_WORDS
#if defined(USE_XAPIAN_WORD_BREAKS)
Xapian::QueryParser::FLAG_WORD_BREAKS;
#elif defined(USE_XAPIAN_CJK_WORDS)
Xapian::QueryParser::FLAG_CJK_WORDS;
#else
Xapian::QueryParser::FLAG_CJK_NGRAM;
Expand Down Expand Up @@ -2267,7 +2277,9 @@ EXPORTED int xapian_snipgen_make_snippet(xapian_snipgen_t *snipgen,

unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE |
Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
#ifdef USE_XAPIAN_CJK_WORDS
#if defined(USE_XAPIAN_WORD_BREAKS)
flags |= Xapian::MSet::SNIPPET_WORD_BREAKS;
#elif defined(USE_XAPIAN_CJK_WORDS)
flags |= Xapian::MSet::SNIPPET_CJK_WORDS;
#endif

Expand Down

0 comments on commit 272d10e

Please sign in to comment.