Relax the return type of extractField (#303)

lucaong · web-flow · commit ca1b170d3cf9 · 2025-09-16T14:30:18.000+02:00
Addresses #302. Also, in order to maintain type safety for indexed fields and to allow more customization options, add a new `stringifyField` option to control how field values are turned into strings for indexing.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 `MiniSearch` follows [semantic versioning](https://semver.org/spec/v2.0.0.html).
 
+## Upcoming
+
+  - [fix] Relax the return type of `extractField` to allow non-string values
+    (when a field is stored but not indexed, it can be any type)
+  - Add `stringifyField` option to customize how field values are turned into strings
+    for indexing
+
 ## v7.1.2
 
   - [fix] Correctly specify that MiniSearch targets ES9 (ES2018), not ES6
diff --git a/src/MiniSearch.test.js b/src/MiniSearch.test.js
@@ -86,6 +86,30 @@ describe('MiniSearch', () => {
       expect(tokenize).toHaveBeenCalledWith('true', 'isBlinky')
     })
 
+    it('turns the field to string before tokenization using a custom stringifyField function, if given', () => {
+      const tokenize = jest.fn(x => x.split(/\W+/))
+      const stringifyField = jest.fn((value, fieldName) => {
+        if (fieldName === 'tags') {
+          return value.join('|')
+        } else if (typeof value === 'boolean') {
+          return value ? 'T' : 'F'
+        }
+        return value.toString()
+      })
+      const ms = new MiniSearch({ fields: ['id', 'tags', 'isBlinky'], tokenize, stringifyField })
+      expect(() => {
+        ms.add({ id: 123, tags: ['foo', 'bar'], isBlinky: false })
+        ms.add({ id: 321, isBlinky: true })
+      }).not.toThrowError()
+
+      expect(tokenize).toHaveBeenCalledWith('123', 'id')
+      expect(tokenize).toHaveBeenCalledWith('foo|bar', 'tags')
+      expect(tokenize).toHaveBeenCalledWith('F', 'isBlinky')
+
+      expect(tokenize).toHaveBeenCalledWith('321', 'id')
+      expect(tokenize).toHaveBeenCalledWith('T', 'isBlinky')
+    })
+
     it('passes document and field name to the field extractor', () => {
       const extractField = jest.fn((document, fieldName) => {
         if (fieldName === 'pubDate') {
@@ -290,39 +314,47 @@ describe('MiniSearch', () => {
       expect(ms.search('bar')).toHaveLength(0)
     })
 
-    describe('when using custom per-field extraction/tokenizer/processing', () => {
+    describe('when using custom per-field extraction/stringification/tokenizer/processing', () => {
       const documents = [
-        { id: 1, title: 'Divina Commedia', tags: 'dante,virgilio', author: { name: 'Dante Alighieri' } },
-        { id: 2, title: 'I Promessi Sposi', tags: 'renzo,lucia', author: { name: 'Alessandro Manzoni' } },
-        { id: 3, title: 'Vita Nova', author: { name: 'Dante Alighieri' } }
+        { id: 1, title: 'Divina Commedia', tags: ['dante', 'virgilio'], author: { name: 'Dante Alighieri' }, available: true },
+        { id: 2, title: 'I Promessi Sposi', tags: ['renzo', 'lucia'], author: { name: 'Alessandro Manzoni' }, available: false },
+        { id: 3, title: 'Vita Nova', tags: ['dante'], author: { name: 'Dante Alighieri' }, available: true }
       ]
+      const options = {
+        fields: ['title', 'tags', 'authorName', 'available'],
+        extractField: (doc, fieldName) => {
+          if (fieldName === 'authorName') {
+            return doc.author.name
+          } else {
+            return doc[fieldName]
+          }
+        },
+        stringifyField: (fieldValue, fieldName) => {
+          if (fieldName === 'available') {
+            return fieldValue ? 'yes' : 'no'
+          } else {
+            return fieldValue.toString()
+          }
+        },
+        tokenize: (field, fieldName) => {
+          if (fieldName === 'tags') {
+            return field.split(',')
+          } else {
+            return field.split(/\s+/)
+          }
+        },
+        processTerm: (term, fieldName) => {
+          if (fieldName === 'tags') {
+            return term.toUpperCase()
+          } else {
+            return term.toLowerCase()
+          }
+        }
+      }
 
       let ms, _warn
       beforeEach(() => {
-        ms = new MiniSearch({
-          fields: ['title', 'tags', 'authorName'],
-          extractField: (doc, fieldName) => {
-            if (fieldName === 'authorName') {
-              return doc.author.name
-            } else {
-              return doc[fieldName]
-            }
-          },
-          tokenize: (field, fieldName) => {
-            if (fieldName === 'tags') {
-              return field.split(',')
-            } else {
-              return field.split(/\s+/)
-            }
-          },
-          processTerm: (term, fieldName) => {
-            if (fieldName === 'tags') {
-              return term.toUpperCase()
-            } else {
-              return term.toLowerCase()
-            }
-          }
-        })
+        ms = new MiniSearch(options)
         ms.addAll(documents)
         _warn = console.warn
         console.warn = jest.fn()
@@ -332,12 +364,20 @@ describe('MiniSearch', () => {
         console.warn = _warn
       })
 
-      it('removes the document from the index', () => {
+      it('removes the document and its terms from the index', () => {
         expect(ms.documentCount).toEqual(3)
+        expect(ms.search('commedia').map(({ id }) => id)).toEqual([1])
+        expect(ms.search('DANTE').map(({ id }) => id)).toEqual([1, 3])
+        expect(ms.search('vita').map(({ id }) => id)).toEqual([3])
+        expect(ms.search('yes').map(({ id }) => id)).toEqual([1, 3])
+
         ms.remove(documents[0])
+
         expect(ms.documentCount).toEqual(2)
-        expect(ms.search('commedia').length).toEqual(0)
+        expect(ms.search('commedia').map(({ id }) => id)).toEqual([])
+        expect(ms.search('DANTE').map(({ id }) => id)).toEqual([3])
         expect(ms.search('vita').map(({ id }) => id)).toEqual([3])
+        expect(ms.search('yes').map(({ id }) => id)).toEqual([3])
         expect(console.warn).not.toHaveBeenCalled()
       })
     })
diff --git a/src/MiniSearch.ts b/src/MiniSearch.ts
@@ -222,7 +222,38 @@ export type Options<T = any> = {
     * The returned value is turned into a string by `stringifyField`, and the
     * result is fed into the `tokenize` function to split it up into tokens.
     */
-  extractField?: (document: T, fieldName: string) => string,
+  extractField?: (document: T, fieldName: string) => any,
+
+  /**
+   * Function used to turn field values into strings for indexing
+   *
+   * The function takes as arguments the field value, and the name of the field
+   * to stringify, so that its logic can be customized on specific fields. By
+   * default, it simply calls `toString()` on the field value (which in many
+   * cases is already a string).
+   *
+   * ### Example:
+   *
+   * ```javascript
+   * // Custom stringifier that formats dates as "Tuesday, September 16, 2025"
+   * const miniSearch = new MiniSearch({
+   *   fields: ['title', 'date'],
+   *   stringifyField: (fieldValue, _fieldName) => {
+   *     if (fieldValue instanceof Date) {
+   *       return fieldValue.toLocaleDateString('en-US', {
+   *         weekday: 'long',
+   *         year: 'numeric',
+   *         month: 'long',
+   *         day: 'numeric'
+   *       })
+   *     } else {
+   *       return fieldValue.toString()
+   *     }
+   *   }
+   * })
+   * ```
+   */
+  stringifyField?: (fieldValue: any, fieldName: string) => string,
 
    /**
     * Function used to split a field value into individual terms to be indexed.
@@ -322,7 +353,9 @@ type OptionsWithDefaults<T = any> = Options<T> & {
 
   idField: string
 
-  extractField: (document: T, fieldName: string) => string
+  extractField: (document: T, fieldName: string) => any
+
+  stringifyField: (fieldValue: any, fieldName: string) => string
 
   tokenize: (text: string, fieldName: string) => string[]
 
@@ -711,7 +744,7 @@ export default class MiniSearch<T = any> {
    * @param document  The document to be indexed
    */
   add (document: T): void {
-    const { extractField, tokenize, processTerm, fields, idField } = this._options
+    const { extractField, stringifyField, tokenize, processTerm, fields, idField } = this._options
     const id = extractField(document, idField)
     if (id == null) {
       throw new Error(`MiniSearch: document does not have ID field "${idField}"`)
@@ -728,7 +761,7 @@ export default class MiniSearch<T = any> {
       const fieldValue = extractField(document, field)
       if (fieldValue == null) continue
 
-      const tokens = tokenize(fieldValue.toString(), field)
+      const tokens = tokenize(stringifyField(fieldValue, field), field)
       const fieldId = this._fieldIds[field]
 
       const uniqueTerms = new Set(tokens).size
@@ -803,7 +836,7 @@ export default class MiniSearch<T = any> {
    * @param document  The document to be removed
    */
   remove (document: T): void {
-    const { tokenize, processTerm, extractField, fields, idField } = this._options
+    const { tokenize, processTerm, extractField, stringifyField, fields, idField } = this._options
     const id = extractField(document, idField)
 
     if (id == null) {
@@ -820,7 +853,7 @@ export default class MiniSearch<T = any> {
       const fieldValue = extractField(document, field)
       if (fieldValue == null) continue
 
-      const tokens = tokenize(fieldValue.toString(), field)
+      const tokens = tokenize(stringifyField(fieldValue, field), field)
       const fieldId = this._fieldIds[field]
 
       const uniqueTerms = new Set(tokens).size
@@ -2143,6 +2176,7 @@ const termToQuerySpec = (options: SearchOptions) => (term: string, i: number, te
 const defaultOptions = {
   idField: 'id',
   extractField: (document: any, fieldName: string) => document[fieldName],
+  stringifyField: (fieldValue: any, fieldName: string) => fieldValue.toString(),
   tokenize: (text: string) => text.split(SPACE_OR_PUNCTUATION),
   processTerm: (term: string) => term.toLowerCase(),
   fields: undefined,