@@ -720,24 +720,30 @@ struct xapian_dbw
720
720
std::vector<std::string> *subjects;
721
721
};
722
722
723
-
724
- static int xapian_dbw_init (xapian_dbw_t *dbw)
723
+ static Xapian::TermGenerator *new_term_generator (void )
725
724
{
726
- dbw->default_stemmer = new Xapian::Stem (new CyrusSearchStemmer);
727
- dbw->default_stopper = get_stopper (" en" );
728
- dbw->term_generator = new Xapian::TermGenerator;
729
- dbw->term_generator ->set_max_word_length (XAPIAN_MAX_TERM_LENGTH);
725
+ auto termgen = new Xapian::TermGenerator;
726
+ termgen->set_max_word_length (XAPIAN_MAX_TERM_LENGTH);
730
727
/* Always enable CJK word tokenization */
731
728
#if defined(USE_XAPIAN_WORD_BREAKS)
732
- dbw-> term_generator ->set_flags (Xapian::TermGenerator::FLAG_WORD_BREAKS,
729
+ termgen ->set_flags (Xapian::TermGenerator::FLAG_WORD_BREAKS,
733
730
~Xapian::TermGenerator::FLAG_WORD_BREAKS);
734
731
#elif defined(USE_XAPIAN_CJK_WORDS)
735
- dbw-> term_generator ->set_flags (Xapian::TermGenerator::FLAG_CJK_WORDS,
732
+ termgen ->set_flags (Xapian::TermGenerator::FLAG_CJK_WORDS,
736
733
~Xapian::TermGenerator::FLAG_CJK_WORDS);
737
734
#else
738
- dbw-> term_generator ->set_flags (Xapian::TermGenerator::FLAG_CJK_NGRAM,
735
+ termgen ->set_flags (Xapian::TermGenerator::FLAG_CJK_NGRAM,
739
736
~Xapian::TermGenerator::FLAG_CJK_NGRAM);
740
737
#endif
738
+ return termgen;
739
+ }
740
+
741
+
742
+ static int xapian_dbw_init (xapian_dbw_t *dbw)
743
+ {
744
+ dbw->default_stemmer = new Xapian::Stem (new CyrusSearchStemmer);
745
+ dbw->default_stopper = get_stopper (" en" );
746
+ dbw->term_generator = new_term_generator ();
741
747
dbw->doclangs = new std::set<std::string>;
742
748
dbw->subjects = new std::vector<std::string>;
743
749
return 0 ;
@@ -1868,6 +1874,52 @@ static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused
1868
1874
return new Xapian::Query (q);
1869
1875
}
1870
1876
1877
+ static Xapian::Query* xapian_query_new_match_cjk (const xapian_db_t *db, const char *str, const char *prefix)
1878
+ {
1879
+ Xapian::Query *q = new Xapian::Query {db->parser ->parse_query (
1880
+ str,
1881
+ #if defined(USE_XAPIAN_WORD_BREAKS)
1882
+ Xapian::QueryParser::FLAG_WORD_BREAKS,
1883
+ #elif defined(USE_XAPIAN_CJK_WORDS)
1884
+ Xapian::QueryParser::FLAG_CJK_WORDS,
1885
+ #else
1886
+ Xapian::QueryParser::FLAG_CJK_NGRAM,
1887
+ #endif
1888
+ prefix)};
1889
+
1890
+ #if defined(USE_XAPIAN_WORD_BREAKS) || defined(USE_XAPIAN_CJK_WORDS)
1891
+ // There is a bug in Xapian v1.5 and CJK word segmentation, in which
1892
+ // a term starting with a fullwidth Latin character such as U+FF21
1893
+ // is indexed with capitalization, but queried in small letter form
1894
+ // when parsed with parse_query.
1895
+ //
1896
+ // As a workaround, the following code checks if the query string
1897
+ // contains any characters in the Halfwidth and Fullwidth Forms
1898
+ // Unicode block. If so, it queries for either the query as
1899
+ // generated by the query parser, or the terms as generated during
1900
+ // indexing.
1901
+ //
1902
+ // XXX this should better be handled by normalizing characters in
1903
+ // that Unicode block during indexing.
1904
+ for (auto it = Xapian::Utf8Iterator (str); it != Xapian::Utf8Iterator (); ++it) {
1905
+ if (*it >= 0xff00 && *it <= 0xffef ) {
1906
+ Xapian::TermGenerator *termgen = new_term_generator ();
1907
+ Xapian::Document doc;
1908
+ termgen->set_document (doc);
1909
+ termgen->index_text (str, 1 , prefix);
1910
+ Xapian::Query qq{Xapian::Query::OP_AND, doc.termlist_begin (), doc.termlist_end ()};
1911
+ if (qq.get_length ()) {
1912
+ *q |= qq;
1913
+ }
1914
+ delete termgen;
1915
+ break ; // stop at first occurence of full/half-width character
1916
+ }
1917
+ }
1918
+ #endif
1919
+
1920
+ return q;
1921
+ }
1922
+
1871
1923
EXPORTED Xapian::Query *
1872
1924
xapian_query_new_match_internal (const xapian_db_t *db, int partnum, const char *str)
1873
1925
{
@@ -1898,16 +1950,7 @@ xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *
1898
1950
// Don't stem queries for Thaana codepage (0780) or higher.
1899
1951
for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
1900
1952
if (*p > 221 ) // has highbit
1901
- return new Xapian::Query {db->parser ->parse_query (
1902
- str,
1903
- #if defined(USE_XAPIAN_WORD_BREAKS)
1904
- Xapian::QueryParser::FLAG_WORD_BREAKS,
1905
- #elif defined(USE_XAPIAN_CJK_WORDS)
1906
- Xapian::QueryParser::FLAG_CJK_WORDS,
1907
- #else
1908
- Xapian::QueryParser::FLAG_CJK_NGRAM,
1909
- #endif
1910
- prefix)};
1953
+ return xapian_query_new_match_cjk (db, str, prefix);
1911
1954
}
1912
1955
1913
1956
// Stemable codepage.
0 commit comments