diff --git a/src/DocumentLoaders/Abstractions/src/DataSourceExtensions.cs b/src/DocumentLoaders/Abstractions/src/DataSourceExtensions.cs index 00cd42bc..7b8defeb 100644 --- a/src/DocumentLoaders/Abstractions/src/DataSourceExtensions.cs +++ b/src/DocumentLoaders/Abstractions/src/DataSourceExtensions.cs @@ -28,7 +28,7 @@ public static IReadOnlyDictionary ToMetadata(this DataSource dat /// /// /// - public static IReadOnlyDictionary? CollectMetadata( + public static IReadOnlyDictionary? CollectMetadataIfRequired( this DocumentLoaderSettings? settings, DataSource dataSource) { @@ -43,11 +43,11 @@ public static IReadOnlyDictionary ToMetadata(this DataSource dat /// /// /// - public static IReadOnlyDictionary? With( - this IReadOnlyDictionary? metadata, + public static IReadOnlyDictionary With( + this IReadOnlyDictionary metadata, IReadOnlyDictionary additionalMetadata) { - return metadata? + return metadata .Concat(additionalMetadata) .ToDictionary(pair => pair.Key, pair => pair.Value); } diff --git a/src/DocumentLoaders/Abstractions/src/FileLoader.cs b/src/DocumentLoaders/Abstractions/src/FileLoader.cs index 7debdbb9..c6247acb 100644 --- a/src/DocumentLoaders/Abstractions/src/FileLoader.cs +++ b/src/DocumentLoaders/Abstractions/src/FileLoader.cs @@ -35,8 +35,8 @@ public async Task> LoadAsync( // } // } - var metadata = settings.CollectMetadata(dataSource); - + var metadata = settings.CollectMetadataIfRequired(dataSource); + return [new Document(content, metadata: metadata)]; } } \ No newline at end of file diff --git a/src/DocumentLoaders/Pdf/src/AsposePdfSource.cs b/src/DocumentLoaders/Pdf/src/AsposePdfSource.cs index 106fd844..d3256e5e 100644 --- a/src/DocumentLoaders/Pdf/src/AsposePdfSource.cs +++ b/src/DocumentLoaders/Pdf/src/AsposePdfSource.cs @@ -20,8 +20,8 @@ public async Task> LoadAsync( var textAbsorber = new TextAbsorber(); pdfDocument.Pages.Accept(textAbsorber); - var metadata = settings.CollectMetadata(dataSource); - + var metadata = settings.CollectMetadataIfRequired(dataSource); + return [new Document(textAbsorber.Text, metadata: metadata)]; } } \ No newline at end of file diff --git a/src/DocumentLoaders/Pdf/src/PdfPigPdfSource.cs b/src/DocumentLoaders/Pdf/src/PdfPigPdfSource.cs index ce056c15..1b2911d1 100644 --- a/src/DocumentLoaders/Pdf/src/PdfPigPdfSource.cs +++ b/src/DocumentLoaders/Pdf/src/PdfPigPdfSource.cs @@ -18,11 +18,11 @@ public async Task> LoadAsync( using var stream = await dataSource.GetStreamAsync(cancellationToken).ConfigureAwait(false); using var document = PdfDocument.Open(stream, new ParsingOptions()); - var metadata = settings.CollectMetadata(dataSource); + var metadata = settings.CollectMetadataIfRequired(dataSource); return document .GetPages() - .Select(page => new Document(page.Text, metadata.With(new Dictionary + .Select(page => new Document(page.Text, metadata?.With(new Dictionary { { "page", page.Number }, }))) diff --git a/src/DocumentLoaders/WebBase/src/HtmlLoader.cs b/src/DocumentLoaders/WebBase/src/HtmlLoader.cs index 32da5184..e9417614 100644 --- a/src/DocumentLoaders/WebBase/src/HtmlLoader.cs +++ b/src/DocumentLoaders/WebBase/src/HtmlLoader.cs @@ -33,7 +33,7 @@ public async Task> LoadAsync( document.QuerySelector("html") ?? throw new NotSupportedException("Not supported for pages without tag"); - var metadata = settings.CollectMetadata(dataSource); + var metadata = settings.CollectMetadataIfRequired(dataSource); return [new Document(html.TextContent, metadata: metadata)]; } diff --git a/src/DocumentLoaders/Word/src/WordLoader.cs b/src/DocumentLoaders/Word/src/WordLoader.cs index c069acf9..121d8f2a 100644 --- a/src/DocumentLoaders/Word/src/WordLoader.cs +++ b/src/DocumentLoaders/Word/src/WordLoader.cs @@ -35,7 +35,7 @@ public async Task> LoadAsync( } } - var metadata = settings.CollectMetadata(dataSource); + var metadata = settings.CollectMetadataIfRequired(dataSource); return documents .Select(text => new Document(text, metadata: metadata))