Skip to content

Commit

Permalink
Add the possibility to transform the URL in "OpenLink".
Browse files Browse the repository at this point in the history
  • Loading branch information
David Robinson committed Sep 26, 2024
1 parent eb5d18e commit 59357ac
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 13 deletions.
10 changes: 7 additions & 3 deletions CocoCrawler/Builders/PageCrawlJobBuilder.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using CocoCrawler.CrawlOutputs;
using AngleSharp.Dom;
using CocoCrawler.CrawlOutputs;
using CocoCrawler.Exceptions;
using CocoCrawler.Job;
using CocoCrawler.Job.PageBrowserActions;
Expand Down Expand Up @@ -60,14 +61,17 @@ public PageCrawlJobBuilder ConfigurePageActions(Action<PageActionsBuilder> optio
return this;
}



/// <summary>
/// Adds a task to open a page and perform openLinks tasks.
/// </summary>
/// <param name="linksSelector">The CSS selector that matches the elements whose links should be opened.</param>
/// <param name="jobOptions">The action that configures the <see cref="PageCrawlJobBuilder"/> handed to the openLinks task.</param>
/// <param name="options">The action to configure the page actions for the openLinks tasks.</param>
/// <param name="linksSelectorFunc">A function to execute for each matching element, that produces the URL to follow.</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null)
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, string?>? linksSelectorFunc = null)
{
PageActionsBuilder? pageActionsBuilder = null;

Expand All @@ -82,7 +86,7 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBu

jobOptions(builder);

Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build()));
Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build(), linksSelectorFunc));

return this;
}
Expand Down
27 changes: 22 additions & 5 deletions CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
using CocoCrawler.Builders;
using AngleSharp.Dom;
using CocoCrawler.Builders;
using CocoCrawler.Job.PageBrowserActions;

namespace CocoCrawler.Job.PageTasks;

public class CrawlPageOpenLinksTask(string paginationSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null) : IPageCrawlTask
public class CrawlPageOpenLinksTask : IPageCrawlTask
{
public string OpenLinksSelector { get; init; } = paginationSelector;
public PageActions? PageActions { get; init; } = pageActions;
public PageCrawlJobBuilder JobBuilder { get; init; } = builder;
public string OpenLinksSelector { get; init; }
public PageActions? PageActions { get; init; }
public Func<IElement, string?>? LinkProcessor { get; }
public PageCrawlJobBuilder JobBuilder { get; init; }

/// <summary>
/// Creates an openLinks task without a custom link processor; <see cref="LinkProcessor"/> stays <c>null</c>.
/// </summary>
/// <param name="linksSelector">Selector stored in <see cref="OpenLinksSelector"/>.</param>
/// <param name="builder">Job builder stored in <see cref="JobBuilder"/>.</param>
/// <param name="pageActions">Page actions stored in <see cref="PageActions"/>; may be <c>null</c>.</param>
/// <remarks>
/// <paramref name="pageActions"/> is deliberately required on this overload. The overload below
/// already accepts calls that omit it, and two overloads that both default the same parameter
/// would compete for a two-argument call.
/// </remarks>
public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions)
    : this(linksSelector, builder, pageActions, linkProcessor: null)
{
}

/// <summary>
/// Creates an openLinks task, optionally with a function that turns each matched element into a URL.
/// </summary>
/// <param name="linksSelector">Selector stored in <see cref="OpenLinksSelector"/>.</param>
/// <param name="builder">Job builder stored in <see cref="JobBuilder"/>.</param>
/// <param name="pageActions">Page actions stored in <see cref="PageActions"/>; may be <c>null</c>.</param>
/// <param name="linkProcessor">
/// Function stored in <see cref="LinkProcessor"/>; receives a matched element and returns a URL or <c>null</c>.
/// May itself be <c>null</c>.
/// </param>
public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, string?>? linkProcessor = null)
{
    // Single place where the task's state is assigned; the shorter overload chains here.
    OpenLinksSelector = linksSelector;
    PageActions = pageActions;
    LinkProcessor = linkProcessor;
    JobBuilder = builder;
}
}

2 changes: 1 addition & 1 deletion CocoCrawler/Crawler/PuppeteerCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCr

protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
{
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector);
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor);

Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length);

Expand Down
6 changes: 4 additions & 2 deletions CocoCrawler/Parser/AngleSharpParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ public virtual async Task Init(string html)
_document = await context.OpenAsync(req => req.Content(html));
}

public virtual string[] ParseForLinks(string linksSelector)
public virtual string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null)
{
linkProcessor ??= (element) => element.GetAttribute("href");

return _document!.QuerySelectorAll(linksSelector)
.Select(link => link.GetAttribute("href"))
.Select(link => linkProcessor(link))
.Where(link => link is not null)
.Select(link => link!)
.ToArray();
Expand Down
5 changes: 3 additions & 2 deletions CocoCrawler/Parser/IParser.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
using CocoCrawler.Job.PageTasks;
using AngleSharp.Dom;
using CocoCrawler.Job.PageTasks;
using Newtonsoft.Json.Linq;

namespace CocoCrawler.Parser;

public interface IParser
{
Task Init(string html);
string[] ParseForLinks(string linksSelector);
string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null);
JArray ExtractList(CrawlPageExtractListTask scrapeList);
JObject ExtractObject(CrawlPageExtractObjectTask task);
}

0 comments on commit 59357ac

Please sign in to comment.