Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the possibility to transform the URL in "OpenLinks". #1

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions CocoCrawler/Builders/PageCrawlJobBuilder.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using CocoCrawler.CrawlOutputs;
using AngleSharp.Dom;
using CocoCrawler.CrawlOutputs;
using CocoCrawler.Exceptions;
using CocoCrawler.Job;
using CocoCrawler.Job.PageBrowserActions;
Expand Down Expand Up @@ -60,14 +61,17 @@ public PageCrawlJobBuilder ConfigurePageActions(Action<PageActionsBuilder> optio
return this;
}



/// <summary>
/// Adds a task to open a page and perform openLinks tasks.
/// </summary>
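/// <param name="linksSelector">The CSS selector that matches the elements whose links should be opened.</param>
/// <param name="jobOptions">The action that configures the <see cref="PageCrawlJobBuilder"/> used by the openLinks task.</param>
/// <param name="options">The action to configure the page actions for the openLinks tasks.</param>
/// <param name="linksSelectorFunc">An optional function, executed for each matching element, that produces the URL to follow. When omitted, the element's <c>href</c> attribute is used; elements for which the result is <c>null</c> are skipped.</param>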
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null)
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, string?>? linksSelectorFunc = null)
{
PageActionsBuilder? pageActionsBuilder = null;

Expand All @@ -82,7 +86,7 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBu

jobOptions(builder);

Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build()));
Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build(), linksSelectorFunc));

return this;
}
Expand Down
27 changes: 22 additions & 5 deletions CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
using CocoCrawler.Builders;
using AngleSharp.Dom;
using CocoCrawler.Builders;
using CocoCrawler.Job.PageBrowserActions;

namespace CocoCrawler.Job.PageTasks;

public class CrawlPageOpenLinksTask(string paginationSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null) : IPageCrawlTask
public class CrawlPageOpenLinksTask : IPageCrawlTask
{
public string OpenLinksSelector { get; init; } = paginationSelector;
public PageActions? PageActions { get; init; } = pageActions;
public PageCrawlJobBuilder JobBuilder { get; init; } = builder;
public string OpenLinksSelector { get; init; }
public PageActions? PageActions { get; init; }
public Func<IElement, string?>? LinkProcessor { get; }
public PageCrawlJobBuilder JobBuilder { get; init; }

public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null)
{
OpenLinksSelector = linksSelector;
PageActions = pageActions;
JobBuilder = builder;
}

public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, string?>? linkProcessor = null)
{
OpenLinksSelector = linksSelector;
PageActions = pageActions;
LinkProcessor = linkProcessor;
JobBuilder = builder;
}
}

2 changes: 1 addition & 1 deletion CocoCrawler/Crawler/PuppeteerCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCr

protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
{
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector);
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor);

Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length);

Expand Down
6 changes: 4 additions & 2 deletions CocoCrawler/Parser/AngleSharpParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ public virtual async Task Init(string html)
_document = await context.OpenAsync(req => req.Content(html));
}

public virtual string[] ParseForLinks(string linksSelector)
public virtual string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null)
{
linkProcessor ??= (element) => element.GetAttribute("href");

return _document!.QuerySelectorAll(linksSelector)
.Select(link => link.GetAttribute("href"))
.Select(link => linkProcessor(link))
.Where(link => link is not null)
.Select(link => link!)
.ToArray();
Expand Down
5 changes: 3 additions & 2 deletions CocoCrawler/Parser/IParser.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
using CocoCrawler.Job.PageTasks;
using AngleSharp.Dom;
using CocoCrawler.Job.PageTasks;
using Newtonsoft.Json.Linq;

namespace CocoCrawler.Parser;

public interface IParser
{
Task Init(string html);
string[] ParseForLinks(string linksSelector);
string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null);
JArray ExtractList(CrawlPageExtractListTask scrapeList);
JObject ExtractObject(CrawlPageExtractObjectTask task);
}
Loading