Skip to content

Commit

Permalink
changing Web and Pdf sources to use proper document
Browse files Browse the repository at this point in the history
  • Loading branch information
TesAnti committed Nov 5, 2023
1 parent f0b40a6 commit 6370d7d
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 36 deletions.
13 changes: 5 additions & 8 deletions src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Aspose.Pdf.Text;

using LangChain.Base;
using Document = LangChain.Docstore.Document;
namespace LangChain.Sources;

/// <summary>
Expand All @@ -20,13 +21,9 @@ public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellat
using var pdfDocument = new Aspose.Pdf.Document(Path);
var textAbsorber = new TextAbsorber();
pdfDocument.Pages.Accept(textAbsorber);

var documents = (Document.Empty with
{
Content = textAbsorber.Text,
}).AsArray();

return Task.FromResult(documents);

var documents = new Document[] { new(textAbsorber.Text, new Dictionary<string, object> { { "path", Path } }) };
return Task.FromResult<IReadOnlyCollection<Document>>(documents);
}
catch (Exception exception)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\LangChain.Sources.Abstractions\LangChain.Sources.Abstractions.csproj" />
<PackageReference Include="Aspose.PDF" />
<PackageReference Include="PdfPig" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="Aspose.PDF" />
<PackageReference Include="PdfPig" />
<ProjectReference Include="..\..\LangChain.Core\LangChain.Core.csproj" />
</ItemGroup>

</Project>
27 changes: 17 additions & 10 deletions src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
using LangChain.Base;
using UglyToad.PdfPig;

using Document=LangChain.Docstore.Document;
namespace LangChain.Sources;

/// <summary>
///
/// </summary>
public class PdfPigPdfSource : ISource
{
/// <summary>
///
/// </summary>
public required string Path { get; init; }

public string Path { get; }

public PdfPigPdfSource(string path)
{
Path = path;
}

/// <inheritdoc/>
public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellationToken = default)
Expand All @@ -19,14 +23,17 @@ public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellat
{
using PdfDocument document = PdfDocument.Open(Path, new ParsingOptions());
var pages = document.GetPages();
var content = String.Join("\n\n", pages.Select(page => page.Text));

var documents = (Document.Empty with

var documents = pages.Select(page => new Document(page.Text, new Dictionary<string, object>
{
Content = content,
}).AsArray();
{"path",Path},
{"page",page.Number}


})).ToArray();

return Task.FromResult(documents);
return Task.FromResult<IReadOnlyCollection<Document>>(documents);
}
catch (Exception exception)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\LangChain.Sources.Abstractions\LangChain.Sources.Abstractions.csproj" />
<PackageReference Include="AngleSharp" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="AngleSharp" />
<ProjectReference Include="..\..\LangChain.Core\LangChain.Core.csproj" />
</ItemGroup>

</Project>
7 changes: 3 additions & 4 deletions src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using AngleSharp;
using LangChain.Base;
using LangChain.Docstore;

namespace LangChain.Sources;

Expand Down Expand Up @@ -46,10 +48,7 @@ protected async Task<IReadOnlyCollection<Document>> LoadCoreAsync(string url)

content = html.TextContent;

var documents = (Document.Empty with
{
Content = content
}).AsArray();
var documents = new Document[] { new(content, new Dictionary<string, object> { { "url", url } }) };

return documents;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@ public class PdfSourceTests
[TestMethod]
public async Task PdfPig_CheckText()
{
var loader = new PdfPigPdfSource
{
Path = "sample.pdf"
};
var loader = new PdfPigPdfSource("sample.pdf");

var documents = await loader.LoadAsync();

documents.Should().NotBeEmpty();
var first = documents.First();


// check text from page 1
first.Content.Should().Contain("A Simple PDF File");
documents.First().PageContent.Should().Contain("A Simple PDF File");

// check text from page 2
first.Content.Should().Contain("Simple PDF File 2");
documents.Skip(1).First().PageContent.Should().Contain("Simple PDF File 2");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public async Task CheckText()
documents.Should().NotBeEmpty();
var first = documents.First();

first.Content.Should().Contain("Web scraping, web harvesting, or web data extraction is");
first.Content.Should().Contain("This page was last edited on");
first.PageContent.Should().Contain("Web scraping, web harvesting, or web data extraction is");
first.PageContent.Should().Contain("This page was last edited on");
}
}

0 comments on commit 6370d7d

Please sign in to comment.