refactor(indexer): extract handlers

Azure-Samples · Dec 12, 2023 · e3159ed
1 parent 703d278
commit e3159ed
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 7 deletions.
diff --git a/packages/indexer/src/lib/document-processor.ts b/packages/indexer/src/lib/document-processor.ts
@@ -1,7 +1,6 @@
 import { type BaseLogger } from 'pino';
 import { getBlobNameFromFile } from './blob-storage.js';
 import { type ContentPage, type ContentSection, type Section } from './document.js';
-import { extractText, extractTextFromPdf } from './formats/index.js';
 
 const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
 const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
@@ -12,11 +11,7 @@ const SECTION_OVERLAP = 100;
 export class DocumentProcessor {
   formatHandlers = new Map<string, (data: Buffer) => Promise<ContentPage[]>>();
 
-  constructor(private logger: BaseLogger) {
-    this.registerFormatHandler('text/plain', extractText);
-    this.registerFormatHandler('text/markdown', extractText);
-    this.registerFormatHandler('application/pdf', extractTextFromPdf);
-  }
+  constructor(private logger: BaseLogger) {}
 
   async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
     const pages = await this.extractText(data, type);
@@ -25,7 +20,7 @@ export class DocumentProcessor {
     return { filename, type, category, sections };
   }
 
-  private registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
+  public registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
     this.formatHandlers.set(type, handler);
   }
 

diff --git a/packages/indexer/src/lib/indexer.ts b/packages/indexer/src/lib/indexer.ts
@@ -5,6 +5,7 @@ import { type AzureClients } from '../plugins/azure.js';
 import { type OpenAiService } from '../plugins/openai.js';
 import { wait } from './util/index.js';
 import { DocumentProcessor } from './document-processor.js';
+import { extractText, extractTextFromPdf } from './formats/index.js';
 import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js';
 import { BlobStorage } from './blob-storage.js';
 import { type Section } from './document.js';
@@ -137,6 +138,9 @@ export class Indexer {
       }
 
       const documentProcessor = new DocumentProcessor(this.logger);
+      documentProcessor.registerFormatHandler('text/plain', extractText);
+      documentProcessor.registerFormatHandler('text/markdown', extractText);
+      documentProcessor.registerFormatHandler('application/pdf', extractTextFromPdf);
       const document = await documentProcessor.createDocumentFromFile(filename, data, type, category);
       const sections = document.sections;
       if (options.useVectors) {