Skip to content

Commit

Permalink
Init
Browse files Browse the repository at this point in the history
  • Loading branch information
David Robinson committed Oct 17, 2024
1 parent 59357ac commit 0adedd1
Show file tree
Hide file tree
Showing 16 changed files with 193 additions and 38 deletions.
41 changes: 38 additions & 3 deletions CocoCrawler/Builders/PageCrawlJobBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using CocoCrawler.Job.PageTasks;
using CocoCrawler.Outputs;
using CocoCrawler.Parser;
using Newtonsoft.Json.Linq;
using System.Collections.Immutable;

namespace CocoCrawler.Builders;
Expand All @@ -19,6 +20,9 @@ public class PageCrawlJobBuilder
private List<IPageCrawlTask> Tasks { get; set; } = [];
private PageActionsBuilder? PageActionsBuilder { get; set; }
internal List<ICrawlOutput> Outputs { get; private set; } = [];
private int PageLoadTimeoutInMs { get; set; } = 1000;
private string? Parent { get; set; }
public string? PageName { get; private set; }

/// <summary>
/// Initializes a new instance of the <see cref="PageCrawlJobBuilder"/> class with the specified URL.
Expand All @@ -43,7 +47,7 @@ public PageCrawlJobBuilder(string url)
/// </summary>
internal PageCrawlJobBuilder()
{
}
}

/// <summary>
/// Configures the page actions for the crawl job.
Expand Down Expand Up @@ -71,7 +75,7 @@ public PageCrawlJobBuilder ConfigurePageActions(Action<PageActionsBuilder> optio
/// <param name="options">The action to configure the page actions for the openLinks tasks.</param>
/// <param name="linksSelectorFunc">A function to execute for each matching element, that produces the URL to follow.</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, string?>? linksSelectorFunc = null)
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, Task<string?>>? linksSelectorFunc = null)
{
PageActionsBuilder? pageActionsBuilder = null;

Expand Down Expand Up @@ -128,6 +132,30 @@ public PageCrawlJobBuilder ExtractList(string containersSelector, List<CssSelect
return this;
}

/// <summary>
/// Sets the page load timeout.
/// </summary>
/// <param name="pageLoadTimeoutInMs">The page load timeout in milliseconds. Zero is passed through unchanged (in PuppeteerSharp a zero timeout conventionally disables the limit — confirm).</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="pageLoadTimeoutInMs"/> is negative.</exception>
public PageCrawlJobBuilder WithPageLoadTimeoutInMs(int pageLoadTimeoutInMs)
{
    // Fail fast on nonsensical values instead of letting the browser layer
    // misbehave later with a negative timeout.
    if (pageLoadTimeoutInMs < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(pageLoadTimeoutInMs), pageLoadTimeoutInMs, "Page load timeout must be non-negative.");
    }

    PageLoadTimeoutInMs = pageLoadTimeoutInMs;

    return this;
}

/// <summary>
/// Assigns a display name to this page crawl job.
/// </summary>
/// <param name="pageName">The name to associate with the page.</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
public PageCrawlJobBuilder WithPageName(string pageName)
{
    PageName = pageName;
    return this;
}

/// <summary>
/// Adds a file crawl output to the page crawl job.
/// </summary>
Expand Down Expand Up @@ -169,6 +197,12 @@ public PageCrawlJobBuilder AddOutput(params ICrawlOutput[] outputAction)
return this;
}

/// <summary>
/// Records the URL of the page that spawned this job (set by the openLinks
/// handling, which passes the current job's URL). Kept internal.
/// </summary>
/// <param name="parent">The parent page's URL.</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
internal PageCrawlJobBuilder WithParent(string parent)
{
    Parent = parent;
    return this;
}

/// <summary>
/// Used by openLinks action. Sets the URL later, keep internal.
/// </summary>
Expand Down Expand Up @@ -219,6 +253,7 @@ internal PageCrawlJob Build()

var pageActions = PageActionsBuilder?.Build();

return new PageCrawlJob(Url, [.. Tasks], [.. Outputs], pageActions);
return new PageCrawlJob(Url, [.. Tasks], [.. Outputs], pageActions, PageLoadTimeoutInMs, PageName, Parent);
}

}
4 changes: 3 additions & 1 deletion CocoCrawler/CocoCrawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

<ItemGroup>
<PackageReference Include="AngleSharp" Version="1.1.2" />
<PackageReference Include="PuppeteerSharp" Version="18.0.2" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="Polly" Version="8.4.2" />
<PackageReference Include="PuppeteerSharp" Version="20.0.2" />
</ItemGroup>

<ItemGroup>
Expand Down
9 changes: 7 additions & 2 deletions CocoCrawler/CrawlJob/PageCrawlJob.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
using CocoCrawler.Job.PageBrowserActions;
using CocoCrawler.Job.PageTasks;
using CocoCrawler.Outputs;
using Newtonsoft.Json.Linq;
using System.Collections.Immutable;

namespace CocoCrawler.Job;

public record PageCrawlJob(
string Url,
ImmutableArray<IPageCrawlTask> Tasks,
ImmutableArray<ICrawlOutput> Outputs,
PageActions? BrowserActions = null);
ImmutableArray<ICrawlOutput> Outputs,
PageActions? BrowserActions = null,
int PageLoadTimeoutInMs = 1000,
string? PageName = null,
string? Parent = null);

3 changes: 2 additions & 1 deletion CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractListTask.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using CocoCrawler.Parser;
using AngleSharp.Dom;
using CocoCrawler.Parser;

namespace CocoCrawler.Job.PageTasks;

Expand Down
3 changes: 2 additions & 1 deletion CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractObjectTask.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using CocoCrawler.Parser;
using AngleSharp.Dom;
using CocoCrawler.Parser;

namespace CocoCrawler.Job.PageTasks;

Expand Down
4 changes: 2 additions & 2 deletions CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ public class CrawlPageOpenLinksTask : IPageCrawlTask
{
public string OpenLinksSelector { get; init; }
public PageActions? PageActions { get; init; }
public Func<IElement, string?>? LinkProcessor { get; }
public Func<IElement, Task<string?>>? LinkProcessor { get; }
public PageCrawlJobBuilder JobBuilder { get; init; }

public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null)
Expand All @@ -18,7 +18,7 @@ public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder,
JobBuilder = builder;
}

public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, string?>? linkProcessor = null)
public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, Task<string?>>? linkProcessor = null)
{
OpenLinksSelector = linksSelector;
PageActions = pageActions;
Expand Down
21 changes: 21 additions & 0 deletions CocoCrawler/CrawlOutputs/ConsoleCrawlHierarchicalOutput.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using CocoCrawler.Job;
using CocoCrawler.Outputs;
using Newtonsoft.Json.Linq;

namespace CocoCrawler.CrawlOutputs;

/// <summary>
/// Writes crawl results to the console together with their page hierarchy
/// (page name, parent URL, and page URL).
/// </summary>
public class ConsoleCrawlHierarchicalOutput : IHierarchicalCrawlOutput
{
    /// <summary>No initialization is required for console output.</summary>
    public Task Initialize(CancellationToken cancellationToken) => Task.CompletedTask;

    /// <summary>
    /// Flat (non-hierarchical) writes are intentionally not supported by this
    /// output; use <see cref="WriteAsync(JObject, PageCrawlJob, CancellationToken)"/> instead.
    /// </summary>
    /// <exception cref="NotSupportedException">Always thrown; this output requires the job context.</exception>
    public Task WriteAsync(JObject obj, CancellationToken _) =>
        throw new NotSupportedException($"{nameof(ConsoleCrawlHierarchicalOutput)} requires the page crawl job context; use the hierarchical WriteAsync overload.");

    /// <summary>
    /// Writes the extracted data to the console, wrapped with the job's
    /// page name, parent URL, and URL so the hierarchy is visible.
    /// </summary>
    /// <param name="jObject">The extracted data for the page.</param>
    /// <param name="job">The crawl job the data came from.</param>
    /// <param name="cancellationToken">Not used; the write completes synchronously.</param>
    public Task WriteAsync(JObject jObject, PageCrawlJob job, CancellationToken cancellationToken)
    {
        Console.WriteLine(JToken.FromObject(new { job.PageName, job.Parent, job.Url, children = jObject }));

        return Task.CompletedTask;
    }
}
2 changes: 1 addition & 1 deletion CocoCrawler/CrawlOutputs/ConsoleCrawlOutput.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ namespace CocoCrawler.CrawlOutputs;

public class ConsoleCrawlOutput : ICrawlOutput
{
public Task Initiaize(CancellationToken cancellationToken) => Task.CompletedTask;
public Task Initialize(CancellationToken cancellationToken) => Task.CompletedTask;

public Task WriteAsync(JObject obj, CancellationToken _)
{
Expand Down
2 changes: 1 addition & 1 deletion CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ public class CsvFileCrawlOutput(string filePath, bool cleanOnStartup) : ICrawlOu
public bool CleanOnStartup { get; init; } = cleanOnStartup;
private readonly SemaphoreSlim _semaphore = new(1, 1);

public virtual Task Initiaize(CancellationToken cancellationToken)
public virtual Task Initialize(CancellationToken cancellationToken)
{
if (CleanOnStartup && File.Exists(filePath))
{
Expand Down
2 changes: 1 addition & 1 deletion CocoCrawler/CrawlOutputs/ICrawlOutput.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ namespace CocoCrawler.Outputs;

public interface ICrawlOutput
{
Task Initiaize(CancellationToken cancellationToken);
Task Initialize(CancellationToken cancellationToken);
Task WriteAsync(JObject jObject, CancellationToken cancellationToken);
}
9 changes: 9 additions & 0 deletions CocoCrawler/CrawlOutputs/IHierarchicalCrawlOutput.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
using CocoCrawler.Job;
using Newtonsoft.Json.Linq;

namespace CocoCrawler.Outputs;

/// <summary>
/// A crawl output that receives the originating <see cref="PageCrawlJob"/>
/// alongside the extracted data, allowing results to be related to the page
/// they were crawled from.
/// </summary>
public interface IHierarchicalCrawlOutput : ICrawlOutput
{
    /// <summary>
    /// Writes the extracted data together with the job that produced it.
    /// </summary>
    Task WriteAsync(JObject jObject, PageCrawlJob job, CancellationToken cancellationToken);
}
83 changes: 72 additions & 11 deletions CocoCrawler/Crawler/PuppeteerCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,90 @@
using CocoCrawler.Parser;
using Microsoft.Extensions.Logging;
using Newtonsoft.Json.Linq;
using Polly.Retry;
using Polly;
using PuppeteerSharp;
using Polly.Timeout;
using System.Net.Security;

namespace CocoCrawler.Crawler;

public class PuppeteerCrawler : ICrawler
{
// Polly resilience pipeline: 4 constant-backoff retries, 1 second apart,
// retrying on every outcome (ShouldHandle unconditionally returns true).
// NOTE(review): this pipeline is built but does not appear to be executed
// anywhere in the visible code — confirm it is actually applied, or remove it.
private ResiliencePipeline retryPipeline;

public PuppeteerCrawler()
{
    retryPipeline = new ResiliencePipelineBuilder()
        .AddRetry(new RetryStrategyOptions() {
            BackoffType = DelayBackoffType.Constant,
            MaxRetryAttempts = 4,
            Delay = TimeSpan.FromSeconds(1),
            // Retries indiscriminately on all exceptions and results.
            ShouldHandle = (args) => ValueTask.FromResult(true)
        }) // Add retry
        //.AddTimeout(TimeSpan.FromSeconds(20)) // Optional 20-second overall timeout (currently disabled)
        .Build(); // Builds the resilience pipeline
}

private ILogger? Logger { get; set; }
private IParser? Parser { get; set; }

public virtual async Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob currentPageJob)
{
Logger?.LogInformation("Getting page {Url}", currentPageJob.Url);
var browser = browserTab.Browser ?? throw new Exception("Browser is null");
IPage? tab = browserTab;
int i = 0;
try
{
int retries = 0;
do
{
Logger?.LogInformation("Getting page {Url} Attempt: {Attempt}", currentPageJob.Url, i++);
if (tab == null)
{
tab = await browser.NewPageAsync();
tab.DefaultNavigationTimeout = currentPageJob.PageLoadTimeoutInMs;
tab.DefaultTimeout = currentPageJob.PageLoadTimeoutInMs;
}
if (tab == null)
throw new Exception("Can't get new page");

try
{
await tab.GoToAsync(currentPageJob.Url, currentPageJob.PageLoadTimeoutInMs, [WaitUntilNavigation.Load, WaitUntilNavigation.Networkidle0, WaitUntilNavigation.DOMContentLoaded]);

await ExecutePageActions(tab, currentPageJob.BrowserActions);

var newJobs = new List<PageCrawlJob>();
var jArray = new JArray();

await Parse(currentPageJob, await tab.GetContentAsync(), newJobs, jArray);
Logger?.LogInformation("Finished getting {Url}. Total new jobs: {totaljobs}. Total new objects: {totalobj}", currentPageJob.Url, newJobs.Count, jArray.Count);

return new CrawlResult([.. newJobs], jArray);
} catch (Exception ex)
{
Logger?.LogError(ex, "Error while navigating");
tab.Dispose();
tab = null;
Thread.Sleep(TimeSpan.FromSeconds(3));
retries++;
if (retries > 20)
throw;
}
} while(true);

await browserTab.GoToAsync(currentPageJob.Url);

await ExecutePageActions(browserTab, currentPageJob.BrowserActions);
}
catch (Exception ex)
{
Logger?.LogError(ex, "Error while navigating");
throw;
}

var newJobs = new List<PageCrawlJob>();
var jArray = new JArray();

await Parse(currentPageJob, await browserTab.GetContentAsync(), newJobs, jArray);

Logger?.LogInformation("Finished getting {Url}. Total new jobs: {totaljobs}. Total new objects: {totalobj}", currentPageJob.Url, newJobs.Count, jArray.Count);

return new CrawlResult([.. newJobs], jArray);
}

protected virtual async Task ExecutePageActions(IPage page, PageActions? browserActions)
Expand Down Expand Up @@ -66,7 +125,7 @@ protected virtual async Task Parse(PageCrawlJob job, string html, List<PageCrawl
switch (task)
{
case CrawlPageOpenLinksTask openLinks:
HandleOpenLinksTask(openLinks, job, newJobs);
HandleOpenLinksTask(openLinks, job, newJobs, jArray);
break;
case CrawlPagePaginateTask paginate:
HandlePaginateTask(paginate, job, newJobs);
Expand Down Expand Up @@ -113,7 +172,7 @@ protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCr
newJobs.AddRange(newPages);
}

protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs, JArray jArray)
{
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor);

Expand All @@ -123,9 +182,11 @@ protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, Pag
{
var newPageBuilder = openLinks.JobBuilder;

newPageBuilder.WithParent(job.Url);
newPageBuilder.WithUrl(url);
newPageBuilder.AddOutput([.. job.Outputs]);
newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());
//TODO Why are extractobject tasks inherited?
//newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());

var newPage = openLinks.JobBuilder.Build();

Expand Down
Loading

0 comments on commit 0adedd1

Please sign in to comment.