diff --git a/src/Core/src/Extensions/SourceExtensions.cs b/src/Core/src/Extensions/SourceExtensions.cs index cecb1fb0..3e741edd 100644 --- a/src/Core/src/Extensions/SourceExtensions.cs +++ b/src/Core/src/Extensions/SourceExtensions.cs @@ -24,7 +24,7 @@ public static async Task> LoadAndSplit( textSplitter ??= new RecursiveCharacterTextSplitter(); var documents = await documentLoader.LoadAsync(dataSource, cancellationToken).ConfigureAwait(false); - + return textSplitter.SplitDocuments(documents); } } \ No newline at end of file diff --git a/src/Core/src/Extensions/VectorDatabaseExtensions.cs b/src/Core/src/Extensions/VectorDatabaseExtensions.cs index 4ec2e663..05fa39c2 100644 --- a/src/Core/src/Extensions/VectorDatabaseExtensions.cs +++ b/src/Core/src/Extensions/VectorDatabaseExtensions.cs @@ -34,7 +34,7 @@ public static async Task AddDocumentsFromAsync( where TLoader : IDocumentLoader, new() { vectorDatabase = vectorDatabase ?? throw new ArgumentNullException(paramName: nameof(vectorDatabase)); - + var vectorCollection = await vectorDatabase.GetOrCreateCollectionAsync( collectionName: collectionName, dimensions: dimensions, @@ -74,7 +74,7 @@ public static async Task> AddDocumentsFromAsync> AddDocumentsFromAsync /// Create a VectorDatabase table from documents. /// diff --git a/src/Databases/Mongo/src/MongoVectorCollection.cs b/src/Databases/Mongo/src/MongoVectorCollection.cs index d9c38a77..c3cff94f 100644 --- a/src/Databases/Mongo/src/MongoVectorCollection.cs +++ b/src/Databases/Mongo/src/MongoVectorCollection.cs @@ -32,7 +32,7 @@ public async Task DeleteAsync(IEnumerable ids, CancellationToken c var result = await _mongoCollection.FindAsync(filter, cancellationToken: cancellationToken).ConfigureAwait(false); return result.FirstOrDefault(cancellationToken: cancellationToken); } - + public async Task IsEmptyAsync(CancellationToken cancellationToken = default) { return await _mongoCollection.EstimatedDocumentCountAsync(cancellationToken: cancellationToken).ConfigureAwait(false) == 0; diff --git a/src/Sources/Abstractions/src/DataSource.cs b/src/Sources/Abstractions/src/DataSource.cs index e8368908..ce02e81b 100644 --- a/src/Sources/Abstractions/src/DataSource.cs +++ b/src/Sources/Abstractions/src/DataSource.cs @@ -12,22 +12,22 @@ public sealed class DataSource public string? Value { get; init; } public Stream? Stream { get; init; } public Encoding Encoding { get; init; } = Encoding.UTF8; - + public static DataSource FromPath(string path) { path = path ?? throw new ArgumentNullException(nameof(path)); - + return new DataSource { Type = DataSourceType.Path, Value = path, }; } - + public static DataSource FromUri(Uri uri) { uri = uri ?? throw new ArgumentNullException(nameof(uri)); - + return new DataSource { Type = DataSourceType.Uri, @@ -38,7 +38,7 @@ public static DataSource FromUri(Uri uri) public static DataSource FromUrl(string url) { url = url ?? throw new ArgumentNullException(nameof(url)); - + return new DataSource { Type = DataSourceType.Uri, @@ -49,7 +49,7 @@ public static DataSource FromUrl(string url) public static DataSource FromStream(Stream stream) { stream = stream ?? throw new ArgumentNullException(nameof(stream)); - + return new DataSource { Type = DataSourceType.Stream, @@ -60,14 +60,14 @@ public static DataSource FromStream(Stream stream) public static DataSource FromBytes(byte[] bytes) { bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); - + return new DataSource { Type = DataSourceType.Stream, Stream = new MemoryStream(bytes), }; } - + public async Task GetStreamAsync(CancellationToken cancellationToken = default) { if (Stream is not null) @@ -86,7 +86,7 @@ public async Task GetStreamAsync(CancellationToken cancellationToken = d _ => new MemoryStream(Encoding.GetBytes(Value!)), }; } - + private static async Task DownloadAsMemoryStreamAsync( Uri uri, CancellationToken cancellationToken = default) diff --git a/src/Sources/Abstractions/src/FileLoader.cs b/src/Sources/Abstractions/src/FileLoader.cs index 9aadc2bb..e0aad716 100644 --- a/src/Sources/Abstractions/src/FileLoader.cs +++ b/src/Sources/Abstractions/src/FileLoader.cs @@ -9,7 +9,7 @@ public class FileLoader : IDocumentLoader public async Task> LoadAsync(DataSource dataSource, CancellationToken cancellationToken = default) { dataSource = dataSource ?? throw new ArgumentNullException(paramName: nameof(dataSource)); - + var content = await File2.ReadAllTextAsync(dataSource.Value!, dataSource.Encoding, cancellationToken).ConfigureAwait(false); // It makes sense for agents, but we need tests for this diff --git a/src/Sources/Pdf/src/AsposePdfSource.cs b/src/Sources/Pdf/src/AsposePdfSource.cs index 5b88365c..8ee57cf1 100644 --- a/src/Sources/Pdf/src/AsposePdfSource.cs +++ b/src/Sources/Pdf/src/AsposePdfSource.cs @@ -11,7 +11,7 @@ public sealed class AsposePdfLoader : IDocumentLoader public async Task> LoadAsync(DataSource dataSource, CancellationToken cancellationToken = default) { dataSource = dataSource ?? throw new ArgumentNullException(paramName: nameof(dataSource)); - + using var stream = await dataSource.GetStreamAsync(cancellationToken).ConfigureAwait(false); using var pdfDocument = new Aspose.Pdf.Document(stream); var textAbsorber = new TextAbsorber(); diff --git a/src/Sources/Pdf/src/PdfPigPdfSource.cs b/src/Sources/Pdf/src/PdfPigPdfSource.cs index 84756c93..7628e632 100644 --- a/src/Sources/Pdf/src/PdfPigPdfSource.cs +++ b/src/Sources/Pdf/src/PdfPigPdfSource.cs @@ -12,10 +12,10 @@ public async Task> LoadAsync(DataSource dataSource CancellationToken cancellationToken = default) { dataSource = dataSource ?? throw new ArgumentNullException(paramName: nameof(dataSource)); - + using var stream = await dataSource.GetStreamAsync(cancellationToken).ConfigureAwait(false); using var document = PdfDocument.Open(stream, new ParsingOptions()); - + return document .GetPages() .Select(page => new Document(page.Text, new Dictionary diff --git a/src/Sources/WebBase/src/HtmlLoader.cs b/src/Sources/WebBase/src/HtmlLoader.cs index 7281436a..40573b7e 100644 --- a/src/Sources/WebBase/src/HtmlLoader.cs +++ b/src/Sources/WebBase/src/HtmlLoader.cs @@ -11,12 +11,12 @@ public class HtmlLoader : IDocumentLoader public async Task> LoadAsync(DataSource dataSource, CancellationToken cancellationToken = default) { dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource)); - + if (dataSource.Type != DataSourceType.Uri) { throw new NotSupportedException("Only Uri is supported"); } - + var config = Configuration.Default.WithDefaultLoader(); var context = BrowsingContext.New(config); var document = await context.OpenAsync(dataSource.Value!, cancellation: cancellationToken).ConfigureAwait(false); @@ -29,7 +29,7 @@ public async Task> LoadAsync(DataSource dataSource var html = document.QuerySelector("html") ?? throw new NotSupportedException("Not supported for pages without tag"); - + return new Document[] { new(html.TextContent, new Dictionary { { "url", dataSource.Value! }, diff --git a/src/Sources/Word/src/WordLoader.cs b/src/Sources/Word/src/WordLoader.cs index bdb7c42b..0411d803 100644 --- a/src/Sources/Word/src/WordLoader.cs +++ b/src/Sources/Word/src/WordLoader.cs @@ -13,10 +13,10 @@ public sealed class WordLoader : IDocumentLoader public async Task> LoadAsync(DataSource dataSource, CancellationToken cancellationToken = default) { dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource)); - + using var stream = await dataSource.GetStreamAsync(cancellationToken).ConfigureAwait(false); using var wordDocument = WordprocessingDocument.Open(stream, isEditable: false); - + var documents = new List(); foreach (var paragraph in wordDocument.MainDocumentPart?.Document.Body?.Elements() ?? []) {