Skip to content

Commit

Permalink
feat: add ideographic and fullwidth punctuation to splitter (#192)
Browse files Browse the repository at this point in the history
  • Loading branch information
tonybaloney authored Mar 14, 2024
1 parent 523f583 commit 1802c46
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions packages/indexer/src/lib/document-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ import { type BaseLogger } from 'pino';
import { getBlobNameFromFile } from './blob-storage.js';
import { type ContentPage, type ContentSection, type Section } from './document.js';

// Pre-commit definitions (the two lines this commit deletes): ASCII-only sentence
// terminators and word-break characters. They are replaced by the declarations that follow.
const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
/**
 * Characters treated as sentence terminators.
 *
 * Contains ASCII '.', '!' and '?', the ideographic full stop '。' (U+3002), the fullwidth
 * full stop (U+FF0E), and the double punctuation marks '‼', '⁇', '⁈' and '⁉'.
 *
 * The fullwidth full stop is written as an escape on purpose: it is visually close to ASCII '.'
 * and is folded into it by Unicode compatibility normalization (NFKC), which would silently
 * turn it into a duplicate of the first entry.
 */
const SENTENCE_ENDINGS = new Set(['.', '。', '\uFF0E', '!', '?', '‼', '⁇', '⁈', '⁉']);

/**
 * Characters treated as word breaks: ASCII separators, brackets and whitespace, plus the
 * ideographic comma '、' (U+3001).
 */
const WORD_BREAKS = new Set([',', '、', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);

// Splitter tuning values. Their use is outside this excerpt; presumably all three are counted
// in characters — TODO confirm against the splitting code.
const MAX_SECTION_LENGTH = 1000;
const SENTENCE_SEARCH_LIMIT = 100;
const SECTION_OVERLAP = 100;
Expand Down

0 comments on commit 1802c46

Please sign in to comment.