From a65bac9e65f767967a1444364052ac847cd4d726 Mon Sep 17 00:00:00 2001 From: Luca Ongaro Date: Tue, 16 Sep 2025 14:24:58 +0200 Subject: [PATCH] Relax the return type of extractField To address https://github.com/lucaong/minisearch/issues/302 Also, in order to maintain type safety for indexed fields, and to allow more customization options, add a new `stringifyField` option to control how field values are turned into strings for indexing. --- CHANGELOG.md | 7 +++ src/MiniSearch.test.js | 100 ++++++++++++++++++++++++++++------------- src/MiniSearch.ts | 46 ++++++++++++++++--- 3 files changed, 117 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4419bc76..abfe59f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ `MiniSearch` follows [semantic versioning](https://semver.org/spec/v2.0.0.html). +## Upcoming + + - [fix] Relax the return type of `extractField` to allow non-string values + (when a field is stored but not indexed, it can be any type) + - Add `stringifyField` option to customize how field values are turned into strings + for indexing + ## v7.1.2 - [fix] Correctly specify that MiniSearch targets ES9 (ES2018), not ES6 diff --git a/src/MiniSearch.test.js b/src/MiniSearch.test.js index a675069b..5ed0b896 100644 --- a/src/MiniSearch.test.js +++ b/src/MiniSearch.test.js @@ -86,6 +86,30 @@ describe('MiniSearch', () => { expect(tokenize).toHaveBeenCalledWith('true', 'isBlinky') }) + it('turns the field to string before tokenization using a custom stringifyField function, if given', () => { + const tokenize = jest.fn(x => x.split(/\W+/)) + const stringifyField = jest.fn((value, fieldName) => { + if (fieldName === 'tags') { + return value.join('|') + } else if (typeof value === 'boolean') { + return value ? 
'T' : 'F' + } + return value.toString() + }) + const ms = new MiniSearch({ fields: ['id', 'tags', 'isBlinky'], tokenize, stringifyField }) + expect(() => { + ms.add({ id: 123, tags: ['foo', 'bar'], isBlinky: false }) + ms.add({ id: 321, isBlinky: true }) + }).not.toThrowError() + + expect(tokenize).toHaveBeenCalledWith('123', 'id') + expect(tokenize).toHaveBeenCalledWith('foo|bar', 'tags') + expect(tokenize).toHaveBeenCalledWith('F', 'isBlinky') + + expect(tokenize).toHaveBeenCalledWith('321', 'id') + expect(tokenize).toHaveBeenCalledWith('T', 'isBlinky') + }) + it('passes document and field name to the field extractor', () => { const extractField = jest.fn((document, fieldName) => { if (fieldName === 'pubDate') { @@ -290,39 +314,47 @@ describe('MiniSearch', () => { expect(ms.search('bar')).toHaveLength(0) }) - describe('when using custom per-field extraction/tokenizer/processing', () => { + describe('when using custom per-field extraction/stringification/tokenizer/processing', () => { const documents = [ - { id: 1, title: 'Divina Commedia', tags: 'dante,virgilio', author: { name: 'Dante Alighieri' } }, - { id: 2, title: 'I Promessi Sposi', tags: 'renzo,lucia', author: { name: 'Alessandro Manzoni' } }, - { id: 3, title: 'Vita Nova', author: { name: 'Dante Alighieri' } } + { id: 1, title: 'Divina Commedia', tags: ['dante', 'virgilio'], author: { name: 'Dante Alighieri' }, available: true }, + { id: 2, title: 'I Promessi Sposi', tags: ['renzo', 'lucia'], author: { name: 'Alessandro Manzoni' }, available: false }, + { id: 3, title: 'Vita Nova', tags: ['dante'], author: { name: 'Dante Alighieri' }, available: true } ] + const options = { + fields: ['title', 'tags', 'authorName', 'available'], + extractField: (doc, fieldName) => { + if (fieldName === 'authorName') { + return doc.author.name + } else { + return doc[fieldName] + } + }, + stringifyField: (fieldValue, fieldName) => { + if (fieldName === 'available') { + return fieldValue ? 
'yes' : 'no' + } else { + return fieldValue.toString() + } + }, + tokenize: (field, fieldName) => { + if (fieldName === 'tags') { + return field.split(',') + } else { + return field.split(/\s+/) + } + }, + processTerm: (term, fieldName) => { + if (fieldName === 'tags') { + return term.toUpperCase() + } else { + return term.toLowerCase() + } + } + } let ms, _warn beforeEach(() => { - ms = new MiniSearch({ - fields: ['title', 'tags', 'authorName'], - extractField: (doc, fieldName) => { - if (fieldName === 'authorName') { - return doc.author.name - } else { - return doc[fieldName] - } - }, - tokenize: (field, fieldName) => { - if (fieldName === 'tags') { - return field.split(',') - } else { - return field.split(/\s+/) - } - }, - processTerm: (term, fieldName) => { - if (fieldName === 'tags') { - return term.toUpperCase() - } else { - return term.toLowerCase() - } - } - }) + ms = new MiniSearch(options) ms.addAll(documents) _warn = console.warn console.warn = jest.fn() @@ -332,12 +364,20 @@ describe('MiniSearch', () => { console.warn = _warn }) - it('removes the document from the index', () => { + it('removes the document and its terms from the index', () => { expect(ms.documentCount).toEqual(3) + expect(ms.search('commedia').map(({ id }) => id)).toEqual([1]) + expect(ms.search('DANTE').map(({ id }) => id)).toEqual([1, 3]) + expect(ms.search('vita').map(({ id }) => id)).toEqual([3]) + expect(ms.search('yes').map(({ id }) => id)).toEqual([1, 3]) + ms.remove(documents[0]) + expect(ms.documentCount).toEqual(2) - expect(ms.search('commedia').length).toEqual(0) + expect(ms.search('commedia').map(({ id }) => id)).toEqual([]) + expect(ms.search('DANTE').map(({ id }) => id)).toEqual([3]) expect(ms.search('vita').map(({ id }) => id)).toEqual([3]) + expect(ms.search('yes').map(({ id }) => id)).toEqual([3]) expect(console.warn).not.toHaveBeenCalled() }) }) diff --git a/src/MiniSearch.ts b/src/MiniSearch.ts index 1d7930dc..20a9ed03 100644 --- a/src/MiniSearch.ts +++ 
b/src/MiniSearch.ts @@ -222,7 +222,38 @@ export type Options = { * The returned string is fed into the `tokenize` function to split it up * into tokens. */ - extractField?: (document: T, fieldName: string) => string, + extractField?: (document: T, fieldName: string) => any, + + /** + * Function used to turn field values into strings for indexing + * + * The function takes as arguments the field value, and the name of the field + * to stringify, so that its logic can be customized on specific fields. By + * default, it simply calls `toString()` on the field value (which in many + * cases is already a string). + * + * ### Example: + * + * ```javascript + * // Custom stringifier that formats dates as "Tuesday, September 16, 2025" + * const miniSearch = new MiniSearch({ + * fields: ['title', 'date'], + * stringifyField: (fieldValue, _fieldName) => { + * if (fieldValue instanceof Date) { + * return fieldValue.toLocaleDateString('en-US', { + * weekday: 'long', + * year: 'numeric', + * month: 'long', + * day: 'numeric' + * }) + * } else { + * return fieldValue.toString() + * } + * } + * }) + * ``` + */ + stringifyField?: (fieldValue: any, fieldName: string) => string, /** * Function used to split a field value into individual terms to be indexed.
@@ -322,7 +353,9 @@ type OptionsWithDefaults = Options & { idField: string - extractField: (document: T, fieldName: string) => string + extractField: (document: T, fieldName: string) => any + + stringifyField: (fieldValue: any, fieldName: string) => string tokenize: (text: string, fieldName: string) => string[] @@ -711,7 +744,7 @@ export default class MiniSearch { * @param document The document to be indexed */ add (document: T): void { - const { extractField, tokenize, processTerm, fields, idField } = this._options + const { extractField, stringifyField, tokenize, processTerm, fields, idField } = this._options const id = extractField(document, idField) if (id == null) { throw new Error(`MiniSearch: document does not have ID field "${idField}"`) @@ -728,7 +761,7 @@ export default class MiniSearch { const fieldValue = extractField(document, field) if (fieldValue == null) continue - const tokens = tokenize(fieldValue.toString(), field) + const tokens = tokenize(stringifyField(fieldValue, field), field) const fieldId = this._fieldIds[field] const uniqueTerms = new Set(tokens).size @@ -803,7 +836,7 @@ export default class MiniSearch { * @param document The document to be removed */ remove (document: T): void { - const { tokenize, processTerm, extractField, fields, idField } = this._options + const { tokenize, processTerm, extractField, stringifyField, fields, idField } = this._options const id = extractField(document, idField) if (id == null) { @@ -820,7 +853,7 @@ export default class MiniSearch { const fieldValue = extractField(document, field) if (fieldValue == null) continue - const tokens = tokenize(fieldValue.toString(), field) + const tokens = tokenize(stringifyField(fieldValue, field), field) const fieldId = this._fieldIds[field] const uniqueTerms = new Set(tokens).size @@ -2143,6 +2176,7 @@ const termToQuerySpec = (options: SearchOptions) => (term: string, i: number, te const defaultOptions = { idField: 'id', extractField: (document: any, fieldName: string) => 
document[fieldName], + stringifyField: (fieldValue: any, fieldName: string) => fieldValue.toString(), tokenize: (text: string) => text.split(SPACE_OR_PUNCTUATION), processTerm: (term: string) => term.toLowerCase(), fields: undefined,