Skip to content

Commit

Permalink
fix: tentative workaround for #383
Browse files Browse the repository at this point in the history
  • Loading branch information
scambier committed Jul 31, 2024
1 parent a778937 commit 439150a
Showing 1 changed file with 27 additions and 22 deletions.
49 changes: 27 additions & 22 deletions src/search/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,41 @@ export class Tokenizer {
* @returns
*/
public tokenizeForIndexing(text: string): string[] {
  try {
    const wholeWords = this.tokenizeWords(text)

    // Optionally extract markdown urls so they become searchable tokens.
    // Extraction failures are logged and ignored; indexing proceeds without urls.
    let extractedUrls: string[] = []
    if (this.plugin.settings.tokenizeUrls) {
      try {
        extractedUrls = markdownLinkExtractor(text)
      } catch (e) {
        logDebug('Error extracting urls', e)
      }
    }

    const baseTokens = this.tokenizeTokens(text, { skipChs: true })

    // Expansion order matters: hyphen splits are derived from the base
    // tokens, then camelCase splits are derived from that already-expanded
    // list, so hyphen fragments are themselves camelCase-split too.
    const withHyphenParts = [...baseTokens, ...baseTokens.flatMap(splitHyphens)]
    const withCamelParts = [
      ...withHyphenParts,
      ...withHyphenParts.flatMap(splitCamelCase),
    ]

    // Append the whole (untokenized) words, then any extracted urls.
    const combined = [...withCamelParts, ...wholeWords]
    if (extractedUrls.length) {
      combined.push(...extractedUrls)
    }

    // Dedupe while preserving first-occurrence order.
    return [...new Set(combined)]
  } catch (e) {
    // NOTE(review): blanket workaround for #383 — a document whose content
    // cannot be tokenized is skipped instead of aborting the whole indexing run.
    console.error('Error tokenizing text, skipping document', e)
    return []
  }
}

/**
Expand Down

0 comments on commit 439150a

Please sign in to comment.