From 3958ef4eb97dd2899ac634a11a458e9e4301b052 Mon Sep 17 00:00:00 2001
From: Matthew Steedman
Date: Sun, 15 Oct 2023 12:23:10 +0900
Subject: [PATCH] Add basic support for wildcard queries

---
 lib/index.js | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 test/test.js | 46 ++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)

diff --git a/lib/index.js b/lib/index.js
index e31213e..f36f1f6 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -205,6 +205,56 @@ exports.search = utils.toPromise(function (opts, callback) {
   if (typeof stale === 'string') {
     queryOpts.stale = stale;
   }
+
+  // Wildcard queries
+  //
+  // A term containing '*' cannot be looked up by exact key, so the `keys`
+  // restriction is dropped, the index is queried without it and each row's
+  // key is tested against the pattern by matchWildcard(). Supported shapes:
+  // *oobar (leading), fooba* (trailing), foo*ar (centre), *ooba* (both ends).
+  var wildcardTerms = queryTerms.filter(function (queryTerm) {
+    var sections = queryTerm.split('*');
+    // needs at least one '*' and at least one literal character
+    return sections.length > 1 && sections.some(function (section) {
+      return section.length > 0;
+    });
+  });
+
+  if (wildcardTerms.length > 0) {
+    // TODO: only the first wildcard term is applied; all other query terms
+    // are ignored for now.
+    var wildcardTerm = wildcardTerms[0];
+    var wildcardSections = wildcardTerm.split('*');
+    delete queryOpts.keys;
+    // For v0 let's not bother setting a limit.
+    return pouch._search_query(mapFun, queryOpts).then(function (res) {
+      return res.rows.filter(function (row) {
+        // the first character of the key is skipped before matching
+        // NOTE(review): presumably the index's type prefix - confirm
+        return matchWildcard(wildcardTerm, wildcardSections, row.key.substring(1));
+      });
+    // Copied from step 3 below.
+    }).then(function (rows) {
+      total_rows = rows.length;
+      // filter before fetching docs or applying highlighting
+      // for a slight optimization, since for now we've only fetched ids/scores
+      return (typeof limit === 'number' && limit >= 0) ?
+        rows.slice(skip, skip + limit) : skip > 0 ? rows.slice(skip) : rows;
+    }).then(function (rows) {
+      if (includeDocs) {
+        return applyIncludeDocs(pouch, rows);
+      }
+      return rows;
+    }).then(function (rows) {
+      if (highlighting) {
+        return applyHighlighting(pouch, opts, rows, fieldBoosts, docIdsToFieldsToQueryTerms);
+      }
+      return rows;
+    }).then(function (rows) {
+      callback(null, {total_rows: total_rows, rows: rows});
+    // without this a rejection anywhere in the chain never reaches the caller
+    }).catch(callback);
+  }

   // search algorithm, basically classic TF-IDF
   //
@@ -453,3 +503,39 @@ function isFiltered(doc, filter, db) {
 if (typeof PouchDB !== 'undefined') {
   PouchDB.plugin(exports);
 }
+
+/**
+ * Tests whether `text` matches a simple wildcard pattern.
+ *
+ * Supported pattern shapes are *oobar (suffix match), fooba* (prefix match),
+ * foo*ar (prefix and suffix match) and *ooba* (substring match). A pattern
+ * without any '*' only matches the identical text; any other arrangement of
+ * wildcards is unsupported and never matches.
+ *
+ * @param {string} term - the full pattern, e.g. 'foo*ar'
+ * @param {string[]} sections - the pattern split on '*', i.e. term.split('*')
+ * @param {string} text - the candidate string
+ * @returns {boolean} whether `text` matches the pattern
+ */
+function matchWildcard(term, sections, text) {
+  if (sections.length === 1) {
+    // no wildcard at all: plain comparison
+    return text === term;
+  }
+  if (sections.length === 3 && term.charAt(0) === '*' &&
+      term.charAt(term.length - 1) === '*') {
+    // *ooba*: the literal may sit anywhere in the text, including its very end
+    return text.indexOf(sections[1]) !== -1;
+  }
+  if (sections.length !== 2) {
+    return false;
+  }
+  var front = sections[0];
+  var back = sections[1];
+  // the prefix and the suffix must not overlap inside the text
+  if (text.length < front.length + back.length) {
+    return false;
+  }
+  return text.substring(0, front.length) === front &&
+    text.substring(text.length - back.length) === back;
+}
diff --git a/test/test.js b/test/test.js
index e03b1cc..f40d99a 100644
--- a/test/test.js
+++ b/test/test.js
@@ -49,5 +49,51 @@ function tests(dbName, dbType) {

   var db;

+  describe('wildcard matching', function () {
+    this.timeout(30000);
+
+    beforeEach(function () {
+      db = new Pouch(dbName);
+      return db;
+    });
+    afterEach(function () {
+      return db.destroy();
+    });
+
+    // Indexes one doc holding `text`, searches it with `query` and checks
+    // whether the doc is found. The promise is returned so that mocha waits.
+    var wildcardSearch = function (params, shouldFind) {
+      return db.bulkDocs({docs: [{text: params.text}]}).then(function () {
+        var opts = {
+          fields: ['text'],
+          query: params.query
+        };
+        return db.search(opts);
+      }).then(function (res) {
+        res.rows.length.should.equal(shouldFind ? 1 : 0);
+      });
+    };
+
+    // One `it` per case, so each case runs between its own beforeEach and
+    // afterEach and never sees a doc indexed by another case.
+    [
+      {query: '*oobar', text: 'foobar', shouldFind: true},
+      {query: '*bar', text: 'foobar', shouldFind: true},
+      {query: 'fooba*', text: 'foobar', shouldFind: true},
+      {query: 'foo*', text: 'foobar', shouldFind: true},
+      {query: 'foo*ar', text: 'foobar', shouldFind: true},
+      {query: 'foo*ar', text: 'foo', shouldFind: false},
+      {query: 'foo*oo', text: 'foo', shouldFind: false},
+      {query: '*ooba*', text: 'foobar', shouldFind: true},
+      {query: '*oob*', text: 'foobar', shouldFind: true},
+      {query: '*bar*', text: 'foobar', shouldFind: true}
+    ].forEach(function (testCase) {
+      var verb = testCase.shouldFind ? 'matches' : 'does not match';
+      it(testCase.query + ' ' + verb + ' ' + testCase.text, function () {
+        return wildcardSearch(testCase, testCase.shouldFind);
+      });
+    });
+  });
+
   describe(dbType + ': search test suite', function () {
     this.timeout(30000);