Skip to content

Commit

Permalink
refactor(indexer): extract handlers
Browse files — browse the repository at this point in the history
  • Loading branch information
sinedied authored and shibbas committed Dec 12, 2023
1 parent 703d278 commit e3159ed
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 7 deletions.
9 changes: 2 additions & 7 deletions packages/indexer/src/lib/document-processor.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { type BaseLogger } from 'pino';
import { getBlobNameFromFile } from './blob-storage.js';
import { type ContentPage, type ContentSection, type Section } from './document.js';
import { extractText, extractTextFromPdf } from './formats/index.js';

const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
Expand All @@ -12,11 +11,7 @@ const SECTION_OVERLAP = 100;
export class DocumentProcessor {
formatHandlers = new Map<string, (data: Buffer) => Promise<ContentPage[]>>();

constructor(private logger: BaseLogger) {
this.registerFormatHandler('text/plain', extractText);
this.registerFormatHandler('text/markdown', extractText);
this.registerFormatHandler('application/pdf', extractTextFromPdf);
}
constructor(private logger: BaseLogger) {}

async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
const pages = await this.extractText(data, type);
Expand All @@ -25,7 +20,7 @@ export class DocumentProcessor {
return { filename, type, category, sections };
}

private registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
public registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
this.formatHandlers.set(type, handler);
}

Expand Down
4 changes: 4 additions & 0 deletions packages/indexer/src/lib/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { type AzureClients } from '../plugins/azure.js';
import { type OpenAiService } from '../plugins/openai.js';
import { wait } from './util/index.js';
import { DocumentProcessor } from './document-processor.js';
import { extractText, extractTextFromPdf } from './formats/index.js';
import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js';
import { BlobStorage } from './blob-storage.js';
import { type Section } from './document.js';
Expand Down Expand Up @@ -137,6 +138,9 @@ export class Indexer {
}

const documentProcessor = new DocumentProcessor(this.logger);
documentProcessor.registerFormatHandler('text/plain', extractText);
documentProcessor.registerFormatHandler('text/markdown', extractText);
documentProcessor.registerFormatHandler('application/pdf', extractTextFromPdf);
const document = await documentProcessor.createDocumentFromFile(filename, data, type, category);
const sections = document.sections;
if (options.useVectors) {
Expand Down

0 comments on commit e3159ed

Please sign in to comment.