Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/pavlovtech/WebReaper
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Pavlov committed Aug 11, 2023
2 parents 3ed044a + add001b commit 6590533
Show file tree
Hide file tree
Showing 10 changed files with 51 additions and 50 deletions.
6 changes: 3 additions & 3 deletions Examples/BrownsfashionScraper/BrownsfashionScraper.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="7.0.1" />
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.18.1" />
<PackageReference Include="Serilog" Version="2.12.0" />
<PackageReference Include="serilog.aspnetcore" Version="6.1.0" />
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.19.5" />
<PackageReference Include="Serilog" Version="3.0.1" />
<PackageReference Include="serilog.aspnetcore" Version="7.0.0" />
<PackageReference Include="Serilog.Enrichers.Environment" Version="2.2.0" />
<PackageReference Include="Serilog.Enrichers.Process" Version="2.0.2" />
<PackageReference Include="Serilog.Enrichers.Thread" Version="3.1.0" />
Expand Down
8 changes: 4 additions & 4 deletions Examples/WebReaper.AzureFuncs/WebReaper.AzureFuncs.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
<ItemGroup>
<PackageReference Include="Microsoft.Azure.Functions.Extensions" Version="1.1.0" />
<PackageReference Include="Microsoft.Azure.WebJobs.Extensions.OpenApi" Version="1.5.1" />
<PackageReference Include="Microsoft.Azure.WebJobs.Extensions.ServiceBus" Version="5.9.0" />
<PackageReference Include="Microsoft.Extensions.Caching.StackExchangeRedis" Version="7.0.4" />
<PackageReference Include="Microsoft.Azure.WebJobs.Extensions.ServiceBus" Version="5.11.0" />
<PackageReference Include="Microsoft.Extensions.Caching.StackExchangeRedis" Version="7.0.10" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="7.0.0" />
<PackageReference Include="Microsoft.NET.Sdk.Functions" Version="4.1.3" />
<PackageReference Include="StackExchange.Redis" Version="2.6.104" />
<PackageReference Include="Microsoft.NET.Sdk.Functions" Version="4.2.0" />
<PackageReference Include="StackExchange.Redis" Version="2.6.122" />
</ItemGroup>
<ItemGroup>
<None Update="host.json">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk.Worker">
<Project Sdk="Microsoft.NET.Sdk.Worker">

<PropertyGroup>
<TargetFramework>net7.0</TargetFramework>
Expand All @@ -13,8 +13,8 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="7.0.1" />
<PackageReference Include="Serilog" Version="2.12.0" />
<PackageReference Include="serilog.aspnetcore" Version="6.1.0" />
<PackageReference Include="Serilog" Version="3.0.1" />
<PackageReference Include="serilog.aspnetcore" Version="7.0.0" />
<PackageReference Include="Serilog.Enrichers.Environment" Version="2.2.0" />
<PackageReference Include="Serilog.Enrichers.Process" Version="2.0.2" />
<PackageReference Include="Serilog.Enrichers.Thread" Version="3.1.0" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="7.0.1" />
<PackageReference Include="Serilog" Version="2.12.0" />
<PackageReference Include="serilog.aspnetcore" Version="6.1.0" />
<PackageReference Include="Serilog" Version="3.0.1" />
<PackageReference Include="serilog.aspnetcore" Version="7.0.0" />
<PackageReference Include="Serilog.Enrichers.Environment" Version="2.2.0" />
<PackageReference Include="Serilog.Enrichers.Process" Version="2.0.2" />
<PackageReference Include="Serilog.Enrichers.Thread" Version="3.1.0" />
Expand Down
39 changes: 19 additions & 20 deletions WebReaper.Tests/WebReaper.IntegrationTests/ScraperTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,29 @@ public async Task StartScrapingWithMultipleStartUrls()

var startUrls = new[]
{
"https://www.reddit.com/r/dotnet/",
"https://www.reddit.com/r/worldnews/",
"https://www.reddit.com/r/ukraine/"
"https://www.alexpavlov.dev/blog/tags/csharp",
"https://www.alexpavlov.dev/blog/tags/ukraine",
"https://www.alexpavlov.dev/blog/tags/web"
};

var engine = await new ScraperEngineBuilder()
.Get(startUrls)
.Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
.Follow(".text-gray-900.transition")
.Parse(new()
{
new("title", "._eYtD2XCVieq6emjKBH3m"),
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
new("title", ".text-3xl.font-bold"),
new("text", ".max-w-max.prose.prose-dark")
})
.WithLogger(new TestOutputLogger(this.output))
.Subscribe(x => result.Add(x))
.BuildAsync();

_ = engine.RunAsync();

await Task.Delay(15000);
await Task.Delay(25000);

Assert.NotEmpty(result);
Assert.True(result.Any(r => r.Url.StartsWith(startUrls[0])));
Assert.True(result.Any(r => r.Url.StartsWith(startUrls[1])));
Assert.True(result.Any(r => r.Url.StartsWith(startUrls[2])));
Assert.True(result.Count > 1);
}

[Fact]
Expand All @@ -56,23 +54,24 @@ public async Task SimpleTest()
var result = new List<ParsedData>();

var engine = await new ScraperEngineBuilder()
.Get("https://www.reddit.com/r/dotnet/")
.Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
.Get("https://www.alexpavlov.dev/blog")
.Follow(".text-gray-900.transition")
.Parse(new()
{
new("title", "._eYtD2XCVieq6emjKBH3m"),
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
new("title", ".text-3xl.font-bold"),
new("text", ".max-w-max.prose.prose-dark")
})
.WithLogger(new TestOutputLogger(output))
.Subscribe(x => result.Add(x))
.Subscribe(result.Add)
.WithParallelismDegree(1)
.BuildAsync();

_ = engine.RunAsync();

await Task.Delay(10000);
await Task.Delay(15000);

Assert.NotEmpty(result);
Assert.True(result.Count > 1);
}

[Fact (Skip = "No stable proxy at the moment")]
Expand Down Expand Up @@ -114,12 +113,12 @@ public async Task SimpleTestWithSPA()
var result = new List<ParsedData>();

var engine = await new ScraperEngineBuilder()
.GetWithBrowser(new []{"https://www.reddit.com/r/dotnet/"})
.FollowWithBrowser("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
.GetWithBrowser(new []{ "https://www.alexpavlov.dev/blog" })
.FollowWithBrowser(".text-gray-900.transition")
.Parse(new()
{
new("title", "._eYtD2XCVieq6emjKBH3m"),
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
new("title", ".text-3xl.font-bold"),
new("text", ".max-w-max.prose.prose-dark")
})
.WithLogger(new TestOutputLogger(this.output))
.Subscribe(x => result.Add(x))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.5.0" />
<PackageReference Include="xunit" Version="2.4.2" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.7.0" />
<PackageReference Include="xunit" Version="2.5.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="coverlet.collector" Version="3.2.0">
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.5.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.7.0" />
<PackageReference Include="System.Text.Encoding.CodePages" Version="7.0.0" />
<PackageReference Include="xunit" Version="2.4.2" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
<PackageReference Include="xunit" Version="2.5.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="coverlet.collector" Version="3.2.0">
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
Expand Down
3 changes: 2 additions & 1 deletion WebReaper/Core/Spider/Concrete/Spider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ public async Task<List<Job>> CrawlAsync(Job job, CancellationToken cancellationT
var doc = job.PageType switch
{
PageType.Static => await LoadStaticPage(job),
PageType.Dynamic => await LoadDynamicPage(job, config.Headless)
PageType.Dynamic => await LoadDynamicPage(job, config.Headless),
_ => throw new NotImplementedException()
};

if (job.PageCategory == PageCategory.TargetPage)
Expand Down
16 changes: 8 additions & 8 deletions WebReaper/WebReaper.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,18 @@
<None Remove="Azure.Messaging.ServiceBus" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="AngleSharp" Version="1.0.1" />
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.32.3" />
<PackageReference Include="AngleSharp" Version="1.0.4" />
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.35.2" />
<PackageReference Include="Microsoft.Extensions.Http" Version="7.0.0" />
<PackageReference Include="MongoDB.Driver" Version="2.19.1" />
<PackageReference Include="MongoDB.Driver" Version="2.20.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="Polly" Version="7.2.3" />
<PackageReference Include="Polly" Version="7.2.4" />
<PackageReference Include="PuppeteerExtraSharp" Version="2.0.0" />
<PackageReference Include="PuppeteerSharp" Version="9.1.0" />
<PackageReference Include="StackExchange.Redis" Version="2.6.104" />
<PackageReference Include="PuppeteerSharp" Version="10.1.2" />
<PackageReference Include="StackExchange.Redis" Version="2.6.122" />
<PackageReference Include="System.Text.Encoding.CodePages" Version="7.0.0" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.0" />
<PackageReference Include="Azure.Messaging.ServiceBus" Version="7.13.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.1" />
<PackageReference Include="Azure.Messaging.ServiceBus" Version="7.16.0" />
<PackageReference Include="Vsxmd" Version="1.4.5">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
Expand Down
3 changes: 2 additions & 1 deletion WebReaper/todo.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@
- [ ] Sitemap crawling support
- [ ] Add LogTo method with Console and File support
- [ ] Request auto throttling
- [ ] Add bloom filter for revisiting same urls
- [ ] Add bloom filter for revisiting same urls
- [ ] Fix base url bug

0 comments on commit 6590533

Please sign in to comment.