From 0adedd1b7dbff37c6371c83ab6c092ce0cacc747 Mon Sep 17 00:00:00 2001
From: David Robinson
Date: Thu, 17 Oct 2024 23:10:31 +0200
Subject: [PATCH] Init

---
 CocoCrawler/Builders/PageCrawlJobBuilder.cs   | 41 ++++++++-
 CocoCrawler/CocoCrawler.csproj                |  4 +-
 CocoCrawler/CrawlJob/PageCrawlJob.cs          |  9 +-
 .../PageTasks/CrawlPageExtractListTask.cs     |  3 +-
 .../PageTasks/CrawlPageExtractObjectTask.cs   |  3 +-
 .../PageTasks/CrawlPageOpenLinksTask.cs       |  4 +-
 .../ConsoleCrawlHierarchicalOutput.cs         | 21 +++++
 .../CrawlOutputs/ConsoleCrawlOutput.cs        |  2 +-
 .../CrawlOutputs/CsvFileCrawlOutput.cs        |  2 +-
 CocoCrawler/CrawlOutputs/ICrawlOutput.cs      |  2 +-
 .../CrawlOutputs/IHierarchicalCrawlOutput.cs  |  9 ++
 CocoCrawler/Crawler/PuppeteerCrawler.cs       | 83 ++++++++++++++++---
 CocoCrawler/CrawlerEngine.cs                  | 31 +++++--
 CocoCrawler/Parser/AngleSharpParser.cs        |  8 +-
 CocoCrawler/Parser/CssSelector.cs             |  7 +-
 CocoCrawler/Parser/IParser.cs                 |  2 +-
 16 files changed, 193 insertions(+), 38 deletions(-)
 create mode 100644 CocoCrawler/CrawlOutputs/ConsoleCrawlHierarchicalOutput.cs
 create mode 100644 CocoCrawler/CrawlOutputs/IHierarchicalCrawlOutput.cs

diff --git a/CocoCrawler/Builders/PageCrawlJobBuilder.cs b/CocoCrawler/Builders/PageCrawlJobBuilder.cs
index 3ba47c9..7a5f36d 100644
--- a/CocoCrawler/Builders/PageCrawlJobBuilder.cs
+++ b/CocoCrawler/Builders/PageCrawlJobBuilder.cs
@@ -6,6 +6,7 @@
 using CocoCrawler.Job.PageTasks;
 using CocoCrawler.Outputs;
 using CocoCrawler.Parser;
+using Newtonsoft.Json.Linq;
 using System.Collections.Immutable;
 
 namespace CocoCrawler.Builders;
@@ -19,6 +20,9 @@ public class PageCrawlJobBuilder
     private List<IPageCrawlTask> Tasks { get; set; } = [];
     private PageActionsBuilder? PageActionsBuilder { get; set; }
     internal List<ICrawlOutput> Outputs { get; private set; } = [];
+    private int PageLoadTimeoutInMs { get; set; } = 1000;
+    private string? Parent { get; set; }
+    public string? PageName { get; private set; }
 
     /// <summary>
     /// Initializes a new instance of the <see cref="PageCrawlJobBuilder"/> class with the specified URL.
@@ -43,7 +47,7 @@ public PageCrawlJobBuilder(string url)
     /// </summary>
     internal PageCrawlJobBuilder()
     {
-    }    
+    }
 
     /// <summary>
     /// Configures the page actions for the crawl job.
@@ -71,7 +75,7 @@ public PageCrawlJobBuilder ConfigurePageActions(Action<PageActionsBuilder> options)
     /// <param name="options">The action to configure the page actions for the openLinks tasks.</param>
     /// <param name="linksSelectorFunc">A function to execute for each matching element, that produces the URL to follow.</param>
     /// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
-    public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, string?>? linksSelectorFunc = null)
+    public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, Task<string?>>? linksSelectorFunc = null)
     {
         PageActionsBuilder? pageActionsBuilder = null;
 
@@ -128,6 +132,30 @@ public PageCrawlJobBuilder ExtractList(string containersSelector, List<CssSelector> cssSelectors)
         return this;
     }
 
+    /// <summary>
+    /// Sets the page load timeout.
+    /// </summary>
+    /// <param name="pageLoadTimeoutInMs">The page load timeout in milliseconds.</param>
+    /// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
+    public PageCrawlJobBuilder WithPageLoadTimeoutInMs(int pageLoadTimeoutInMs)
+    {
+        PageLoadTimeoutInMs = pageLoadTimeoutInMs;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the page name.
+    /// </summary>
+    /// <param name="pageName">The page name.</param>
+    /// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
+    public PageCrawlJobBuilder WithPageName(string pageName)
+    {
+        PageName = pageName;
+
+        return this;
+    }
+
     /// <summary>
     /// Adds a file crawl output to the page crawl job.
     /// </summary>
@@ -169,6 +197,12 @@ public PageCrawlJobBuilder AddOutput(params ICrawlOutput[] outputAction)
         return this;
     }
 
+    internal PageCrawlJobBuilder WithParent(string parent)
+    {
+        Parent = parent;
+        return this;
+    }
+
     /// <summary>
     /// Used by openLinks action. Sets the URL later, keep internal.
     /// </summary>
@@ -219,6 +253,7 @@ internal PageCrawlJob Build()
         var pageActions = PageActionsBuilder?.Build();
 
-        return new PageCrawlJob(Url, [.. Tasks], [.. Outputs], pageActions);
+        return new PageCrawlJob(Url, [.. Tasks], [.. Outputs], pageActions, PageLoadTimeoutInMs, PageName, Parent);
     }
+
 }
diff --git a/CocoCrawler/CocoCrawler.csproj b/CocoCrawler/CocoCrawler.csproj
index d222306..b3a3d25 100644
--- a/CocoCrawler/CocoCrawler.csproj
+++ b/CocoCrawler/CocoCrawler.csproj
@@ -16,7 +16,9 @@
 
-    
+    
+    
+    
 
diff --git a/CocoCrawler/CrawlJob/PageCrawlJob.cs b/CocoCrawler/CrawlJob/PageCrawlJob.cs
index 14582aa..ff36ce4 100644
--- a/CocoCrawler/CrawlJob/PageCrawlJob.cs
+++ b/CocoCrawler/CrawlJob/PageCrawlJob.cs
@@ -1,6 +1,7 @@
 using CocoCrawler.Job.PageBrowserActions;
 using CocoCrawler.Job.PageTasks;
 using CocoCrawler.Outputs;
+using Newtonsoft.Json.Linq;
 using System.Collections.Immutable;
 
 namespace CocoCrawler.Job;
@@ -8,5 +9,9 @@ namespace CocoCrawler.Job;
 public record PageCrawlJob(
     string Url,
     ImmutableArray<IPageCrawlTask> Tasks,
-    ImmutableArray<ICrawlOutput> Outputs,
-    PageActions? BrowserActions = null);
+    ImmutableArray<ICrawlOutput> Outputs,
+    PageActions? BrowserActions = null,
+    int PageLoadTimeoutInMs = 1000,
+    string? PageName = null,
+    string? Parent = null);
+
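Reviewer note (not part of the patch): a minimal sketch of the new builder surface, using only methods introduced or touched above. The URL and selectors are made up.

```csharp
// Illustrative wiring; how the job is ultimately scheduled depends on the engine setup.
var builder = new PageCrawlJobBuilder("https://example.com/jobs")   // made-up URL
    .WithPageName("listing")            // new: identifies the page in hierarchical output
    .WithPageLoadTimeoutInMs(5000)      // new: per-job timeout; the default stays at 1000 ms
    .ExtractList("div.job-card", [new CssSelector("title", "h2.title")])
    .AddOutput(new ConsoleCrawlHierarchicalOutput());
```

The `Parent` property has no public setter by design: it is filled in by `WithParent` when the crawler follows links, so each child job records the URL it was discovered on.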
diff --git a/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractListTask.cs b/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractListTask.cs
index 6bb74fc..08f4f49 100644
--- a/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractListTask.cs
+++ b/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractListTask.cs
@@ -1,4 +1,5 @@
-using CocoCrawler.Parser;
+using AngleSharp.Dom;
+using CocoCrawler.Parser;
 
 namespace CocoCrawler.Job.PageTasks;
diff --git a/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractObjectTask.cs b/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractObjectTask.cs
index 7f7e891..b69d6fa 100644
--- a/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractObjectTask.cs
+++ b/CocoCrawler/CrawlJob/PageTasks/CrawlPageExtractObjectTask.cs
@@ -1,4 +1,5 @@
-using CocoCrawler.Parser;
+using AngleSharp.Dom;
+using CocoCrawler.Parser;
 
 namespace CocoCrawler.Job.PageTasks;
diff --git a/CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs b/CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
index 937135d..7aa6bbf 100644
--- a/CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
+++ b/CocoCrawler/CrawlJob/PageTasks/CrawlPageOpenLinksTask.cs
@@ -8,7 +8,7 @@ public class CrawlPageOpenLinksTask : IPageCrawlTask
 {
     public string OpenLinksSelector { get; init; }
     public PageActions? PageActions { get; init; }
-    public Func<IElement, string?>? LinkProcessor { get; }
+    public Func<IElement, Task<string?>>? LinkProcessor { get; }
     public PageCrawlJobBuilder JobBuilder { get; init; }
 
     public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null)
@@ -18,7 +18,7 @@ public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null)
     {
         OpenLinksSelector = linksSelector;
         PageActions = pageActions;
         JobBuilder = builder;
     }
 
-    public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, string?>? linkProcessor = null)
+    public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, Task<string?>>? linkProcessor = null)
     {
         OpenLinksSelector = linksSelector;
         PageActions = pageActions;
diff --git a/CocoCrawler/CrawlOutputs/ConsoleCrawlHierarchicalOutput.cs b/CocoCrawler/CrawlOutputs/ConsoleCrawlHierarchicalOutput.cs
new file mode 100644
index 0000000..d3b74f8
--- /dev/null
+++ b/CocoCrawler/CrawlOutputs/ConsoleCrawlHierarchicalOutput.cs
@@ -0,0 +1,21 @@
+using CocoCrawler.Job;
+using CocoCrawler.Outputs;
+using Newtonsoft.Json.Linq;
+
+namespace CocoCrawler.CrawlOutputs;
+
+public class ConsoleCrawlHierarchicalOutput : IHierarchicalCrawlOutput
+{
+    public Task Initialize(CancellationToken cancellationToken) => Task.CompletedTask;
+
+    // Flat writes are unsupported; this output only makes sense with page context.
+    public Task WriteAsync(JObject obj, CancellationToken _) => throw new NotImplementedException();
+
+    public Task WriteAsync(JObject jObject, PageCrawlJob job, CancellationToken cancellationToken)
+    {
+        Console.WriteLine(JToken.FromObject(new { job.PageName, job.Parent, job.Url, children = jObject }));
+
+        return Task.CompletedTask;
+    }
+}
diff --git a/CocoCrawler/CrawlOutputs/ConsoleCrawlOutput.cs b/CocoCrawler/CrawlOutputs/ConsoleCrawlOutput.cs
index aaffce6..56c59bc 100644
--- a/CocoCrawler/CrawlOutputs/ConsoleCrawlOutput.cs
+++ b/CocoCrawler/CrawlOutputs/ConsoleCrawlOutput.cs
@@ -5,7 +5,7 @@ namespace CocoCrawler.CrawlOutputs;
 
 public class ConsoleCrawlOutput : ICrawlOutput
 {
-    public Task Initiaize(CancellationToken cancellationToken) => Task.CompletedTask;
+    public Task Initialize(CancellationToken cancellationToken) => Task.CompletedTask;
 
     public Task WriteAsync(JObject obj, CancellationToken _)
     {
diff --git a/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs b/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs
index 4902dc4..d3a771f 100644
--- a/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs
+++ b/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs
@@ -8,7 +8,7 @@ public class CsvFileCrawlOutput(string filePath, bool cleanOnStartup) : ICrawlOutput
     public bool CleanOnStartup { get; init; } = cleanOnStartup;
     private readonly SemaphoreSlim _semaphore = new(1, 1);
 
-    public virtual Task Initiaize(CancellationToken cancellationToken)
+    public virtual Task Initialize(CancellationToken cancellationToken)
     {
         if (CleanOnStartup && File.Exists(filePath))
         {
diff --git a/CocoCrawler/CrawlOutputs/ICrawlOutput.cs b/CocoCrawler/CrawlOutputs/ICrawlOutput.cs
index 37c32f6..37a2d64 100644
--- a/CocoCrawler/CrawlOutputs/ICrawlOutput.cs
+++ b/CocoCrawler/CrawlOutputs/ICrawlOutput.cs
@@ -4,6 +4,6 @@ namespace CocoCrawler.Outputs;
 
 public interface ICrawlOutput
 {
-    Task Initiaize(CancellationToken cancellationToken);
+    Task Initialize(CancellationToken cancellationToken);
     Task WriteAsync(JObject jObject, CancellationToken cancellationToken);
 }
diff --git a/CocoCrawler/CrawlOutputs/IHierarchicalCrawlOutput.cs b/CocoCrawler/CrawlOutputs/IHierarchicalCrawlOutput.cs
new file mode 100644
index 0000000..c54b07c
--- /dev/null
+++ b/CocoCrawler/CrawlOutputs/IHierarchicalCrawlOutput.cs
@@ -0,0 +1,9 @@
+using CocoCrawler.Job;
+using Newtonsoft.Json.Linq;
+
+namespace CocoCrawler.Outputs;
+
+public interface IHierarchicalCrawlOutput : ICrawlOutput
+{
+    Task WriteAsync(JObject jObject, PageCrawlJob job, CancellationToken cancellationToken);
+}
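Reviewer note (not part of the patch): the contract split is that `ICrawlOutput.WriteAsync` receives bare objects while `IHierarchicalCrawlOutput.WriteAsync` also receives the job, exposing `PageName`, `Parent`, and `Url`. A hypothetical second implementation shows how the interface is meant to be consumed; the class name and JSON-lines format are invented.

```csharp
using CocoCrawler.Job;
using CocoCrawler.Outputs;
using Newtonsoft.Json.Linq;

// Hypothetical JSON-lines output that records the page hierarchy alongside each object.
public class JsonLinesHierarchicalOutput(string filePath) : IHierarchicalCrawlOutput
{
    private readonly SemaphoreSlim _semaphore = new(1, 1);

    public Task Initialize(CancellationToken cancellationToken) => Task.CompletedTask;

    // Fallback for non-hierarchical callers: write the object without page context.
    public Task WriteAsync(JObject jObject, CancellationToken cancellationToken) =>
        WriteLineAsync(jObject.ToString(Newtonsoft.Json.Formatting.None), cancellationToken);

    public Task WriteAsync(JObject jObject, PageCrawlJob job, CancellationToken cancellationToken)
    {
        var line = new JObject
        {
            ["pageName"] = job.PageName,
            ["parent"] = job.Parent,
            ["url"] = job.Url,
            ["data"] = jObject
        };
        return WriteLineAsync(line.ToString(Newtonsoft.Json.Formatting.None), cancellationToken);
    }

    private async Task WriteLineAsync(string line, CancellationToken cancellationToken)
    {
        // Serialize writers so concurrent crawl jobs don't interleave partial lines.
        await _semaphore.WaitAsync(cancellationToken);
        try
        {
            await File.AppendAllTextAsync(filePath, line + Environment.NewLine, cancellationToken);
        }
        finally
        {
            _semaphore.Release();
        }
    }
}
```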
diff --git a/CocoCrawler/Crawler/PuppeteerCrawler.cs b/CocoCrawler/Crawler/PuppeteerCrawler.cs
index 629b7d7..94e68f5 100644
--- a/CocoCrawler/Crawler/PuppeteerCrawler.cs
+++ b/CocoCrawler/Crawler/PuppeteerCrawler.cs
@@ -4,31 +4,90 @@
 using CocoCrawler.Parser;
 using Microsoft.Extensions.Logging;
 using Newtonsoft.Json.Linq;
+using Polly.Retry;
+using Polly;
 using PuppeteerSharp;
+using Polly.Timeout;
+using System.Net.Security;
 
 namespace CocoCrawler.Crawler;
 
 public class PuppeteerCrawler : ICrawler
 {
+    // NOTE: built here but not yet wired into Crawl below, which still retries manually.
+    private ResiliencePipeline retryPipeline;
+
+    public PuppeteerCrawler()
+    {
+        retryPipeline = new ResiliencePipelineBuilder()
+            .AddRetry(new RetryStrategyOptions()
+            {
+                BackoffType = DelayBackoffType.Constant,
+                MaxRetryAttempts = 4,
+                Delay = TimeSpan.FromSeconds(1),
+                ShouldHandle = (args) => ValueTask.FromResult(true) // Retry on any exception
+            })
+            //.AddTimeout(TimeSpan.FromSeconds(20)) // Optional 20-second overall timeout, currently disabled
+            .Build();
+    }
+
     private ILogger? Logger { get; set; }
     private IParser? Parser { get; set; }
 
     public virtual async Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob currentPageJob)
     {
         Logger?.LogInformation("Getting page {Url}", currentPageJob.Url);
+        var browser = browserTab.Browser ?? throw new Exception("Browser is null");
+        IPage? tab = browserTab;
+        int attempt = 0;
+        try
+        {
+            do
+            {
+                Logger?.LogInformation("Getting page {Url} Attempt: {Attempt}", currentPageJob.Url, attempt++);
+                if (tab == null)
+                {
+                    // The previous tab was disposed after a failure; open a fresh one.
+                    tab = await browser.NewPageAsync();
+                    tab.DefaultNavigationTimeout = currentPageJob.PageLoadTimeoutInMs;
+                    tab.DefaultTimeout = currentPageJob.PageLoadTimeoutInMs;
+                }
+
+                try
+                {
+                    await tab.GoToAsync(currentPageJob.Url, currentPageJob.PageLoadTimeoutInMs, [WaitUntilNavigation.Load, WaitUntilNavigation.Networkidle0, WaitUntilNavigation.DOMContentLoaded]);
+
+                    await ExecutePageActions(tab, currentPageJob.BrowserActions);
+
+                    var newJobs = new List<PageCrawlJob>();
+                    var jArray = new JArray();
+
+                    await Parse(currentPageJob, await tab.GetContentAsync(), newJobs, jArray);
+                    Logger?.LogInformation("Finished getting {Url}. Total new jobs: {totaljobs}. Total new objects: {totalobj}", currentPageJob.Url, newJobs.Count, jArray.Count);
+
+                    return new CrawlResult([.. newJobs], jArray);
+                }
+                catch (Exception ex)
+                {
+                    Logger?.LogError(ex, "Error while navigating");
+                    tab.Dispose();
+                    tab = null;
+                    await Task.Delay(TimeSpan.FromSeconds(3));
+                    if (attempt > 20)
+                        throw;
+                }
+            } while (true);
 
-        await browserTab.GoToAsync(currentPageJob.Url);
-
-        await ExecutePageActions(browserTab, currentPageJob.BrowserActions);
+        }
+        catch (Exception ex)
+        {
+            Logger?.LogError(ex, "Error while navigating");
+            throw;
+        }
 
-        var newJobs = new List<PageCrawlJob>();
-        var jArray = new JArray();
-
-        await Parse(currentPageJob, await browserTab.GetContentAsync(), newJobs, jArray);
-
-        Logger?.LogInformation("Finished getting {Url}. Total new jobs: {totaljobs}. Total new objects: {totalobj}", currentPageJob.Url, newJobs.Count, jArray.Count);
-
-        return new CrawlResult([.. newJobs], jArray);
     }
 
     protected virtual async Task ExecutePageActions(IPage page, PageActions? browserActions)
@@ -66,7 +125,7 @@ protected virtual async Task Parse(PageCrawlJob job, string html, List<PageCrawlJob> newJobs, JArray jArray)
 
-    protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
+    protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs, JArray jArray)
     {
         var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor);
 
@@ -123,9 +182,11 @@ protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs, JArray jArray)
         {
             var newPageBuilder = openLinks.JobBuilder;
 
+            newPageBuilder.WithParent(job.Url);
             newPageBuilder.WithUrl(url);
             newPageBuilder.AddOutput([.. job.Outputs]);
-            newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());
+            // TODO: Why are ExtractObject tasks inherited?
+            //newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());
 
             var newPage = openLinks.JobBuilder.Build();
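Reviewer note (not part of the patch): the Polly pipeline the constructor builds is currently unused, and the hand-rolled do/while above duplicates what it provides. For comparison, a rough sketch of what `Crawl` could look like routed through the pipeline instead, assuming the surrounding class members (`retryPipeline`, `ExecutePageActions`, `Parse`) as defined in this patch:

```csharp
// Sketch only: each attempt navigates and parses; the retry strategy
// (4 attempts, constant 1 s delay) handles pacing and gives up by rethrowing.
public virtual async Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob currentPageJob)
{
    return await retryPipeline.ExecuteAsync(async token =>
    {
        await browserTab.GoToAsync(currentPageJob.Url, currentPageJob.PageLoadTimeoutInMs,
            [WaitUntilNavigation.Load, WaitUntilNavigation.Networkidle0, WaitUntilNavigation.DOMContentLoaded]);

        await ExecutePageActions(browserTab, currentPageJob.BrowserActions);

        var newJobs = new List<PageCrawlJob>();
        var jArray = new JArray();
        await Parse(currentPageJob, await browserTab.GetContentAsync(), newJobs, jArray);

        return new CrawlResult([.. newJobs], jArray);
    }, CancellationToken.None);
}
```

One behavioral difference: the manual loop replaces the tab after each failure, which this sketch does not attempt.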
diff --git a/CocoCrawler/CrawlerEngine.cs b/CocoCrawler/CrawlerEngine.cs
index 2367dd3..80813ef 100644
--- a/CocoCrawler/CrawlerEngine.cs
+++ b/CocoCrawler/CrawlerEngine.cs
@@ -46,6 +46,7 @@ public virtual async Task RunAsync(CancellationToken cancellationToken = default)
         await Parallel.ForEachAsync(settings.Scheduler.GetAll(cancellationToken), parallelOptions, async (job, token) =>
         {
             using var page = await browser.NewPageAsync();
+
             await AddUserAgent(page, settings.UserAgent);
             await AddCookies(page, settings.Cookies);
 
@@ -92,14 +93,17 @@ protected virtual async Task AddCookies(IPage page, ImmutableArray<Cookie> cookies)
         }
     }
 
+
     protected virtual async Task CrawlPage(IPage page, PageCrawlJob job, EngineSettings engine, CancellationToken token)
     {
         await AddUrlToHistoryAndCheckLimit(job.Url, engine.VisitedUrlTracker, engine.MaxPagesToCrawl, token);
 
+        page.DefaultNavigationTimeout = job.PageLoadTimeoutInMs;
+        page.DefaultTimeout = job.PageLoadTimeoutInMs;
         var result = await engine.Crawler.Crawl(page, job);
 
+        await HandleParsedResults(result.ScrapedData, job, token);
         await HandleNewJobs(result.NewJobs, engine, token);
-        await HandleParsedResults(result.ScrapedData, job.Outputs, token);
     }
 
     protected virtual async Task HandleNewJobs(IList<PageCrawlJob> newJobs, EngineSettings engine, CancellationToken token)
@@ -111,13 +115,20 @@ protected virtual async Task HandleNewJobs(IList<PageCrawlJob> newJobs, EngineSettings engine, CancellationToken token)
         await engine.Scheduler.Add(jobs.ToImmutableArray(), token);
     }
 
-    protected virtual async Task HandleParsedResults(JArray jArray, ImmutableArray<ICrawlOutput> outputs, CancellationToken token)
+    protected virtual async Task HandleParsedResults(JArray jArray, PageCrawlJob job, CancellationToken token)
     {
-        foreach (var output in outputs)
+        foreach (var output in job.Outputs)
         {
             foreach (var obj in jArray.Cast<JObject>())
             {
-                await output.WriteAsync(obj, token);
+                if (output is IHierarchicalCrawlOutput hierarchicalCrawlOutput)
+                {
+                    await hierarchicalCrawlOutput.WriteAsync(obj, job, token);
+                }
+                else
+                {
+                    await output.WriteAsync(obj, token);
+                }
             }
         }
     }
@@ -134,14 +145,20 @@ protected virtual async Task AddUrlToHistoryAndCheckLimit(string url, IVisitedUrlTracker visitedUrlTracker, int maxPagesToCrawl, CancellationToken token)
 
     protected virtual async Task<IBrowser> DownloadAndLaunchBrowser(EngineSettings settings)
    {
-        var browserFetcher = new BrowserFetcher();
+        PuppeteerSharp.Helpers.TaskHelper.DefaultTimeout = 5000;
+        var browserFetcher = new BrowserFetcher(SupportedBrowser.Chromium);
         await browserFetcher.DownloadAsync();
 
         var launchOptions = new LaunchOptions()
         {
-            Headless = settings.IsHeadless
+            Browser = SupportedBrowser.Chromium,
+            Headless = settings.IsHeadless,
+            //Args = ["--disable-features=site-per-process"],
+            ProtocolTimeout = 180000
         };
+
         return await Puppeteer.LaunchAsync(launchOptions);
     }
 
@@ -150,7 +167,7 @@ private static async Task Initialize(EngineSettings settings, ImmutableArray<PageCrawlJob> jobs, CancellationToken cancellationToken)
         List<Task> tasks = [
             settings.Scheduler.Initialize(jobs, cancellationToken),
             settings.VisitedUrlTracker.Initialize(cancellationToken),
-            ..jobs.SelectMany(j => j.Outputs.Select(x => x.Initiaize(cancellationToken)))
+            ..jobs.SelectMany(j => j.Outputs.Select(x => x.Initialize(cancellationToken)))
         ];
 
         await Task.WhenAll(tasks);
diff --git a/CocoCrawler/Parser/AngleSharpParser.cs b/CocoCrawler/Parser/AngleSharpParser.cs
index b4156bd..f7749ea 100644
--- a/CocoCrawler/Parser/AngleSharpParser.cs
+++ b/CocoCrawler/Parser/AngleSharpParser.cs
@@ -17,12 +17,12 @@ public virtual async Task Init(string html)
         _document = await context.OpenAsync(req => req.Content(html));
     }
 
-    public virtual string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null)
+    public virtual string[] ParseForLinks(string linksSelector, Func<IElement, Task<string?>>? linkProcessor = null)
     {
-        linkProcessor ??= (element) => element.GetAttribute("href");
+        linkProcessor ??= (element) => Task.FromResult(element.GetAttribute("href"));
 
         return _document!.QuerySelectorAll(linksSelector)
-            .Select(link => linkProcessor(link))
+            .Select(link => linkProcessor(link).GetAwaiter().GetResult()) // Blocks per link; ParseForLinks stays synchronous
             .Where(link => link is not null)
             .Select(link => link!)
             .ToArray();
     }
@@ -58,7 +58,7 @@ protected virtual JObject ParseObject(IElement node, IEnumerable<CssSelector> cssSelectors)
 
         foreach (var selector in cssSelectors)
         {
-            jObject[selector.Name] = GetSelectorValue(node, selector.Selector, selector.Attribute);
+            jObject[selector.Name] = selector.ValueSelector == null ? GetSelectorValue(node, selector.Selector, selector.Attribute) : selector.ValueSelector(node, selector.Selector, selector.Attribute);
         }
 
         return jObject;
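Reviewer note (not part of the patch): the switch from `Func<IElement, string?>` to `Func<IElement, Task<string?>>` means link processors can now await per-element work, though the parser currently resolves them synchronously via `GetAwaiter().GetResult()`. A contrived sketch of the new shape; the URL, selectors, and rewriting logic are invented:

```csharp
// Hypothetical: rewrite relative hrefs asynchronously while opening links.
var builder = new PageCrawlJobBuilder("https://example.com/catalog")   // made-up URL
    .OpenLinks(
        "a.product-link",                                              // made-up selector
        sub => sub.ExtractList("div.detail", [new CssSelector("name", "h1")]),
        linksSelectorFunc: async element =>
        {
            var href = element.GetAttribute("href");
            if (href is null)
                return null;                  // null links are filtered out by the parser
            await Task.Yield();               // stand-in for real async work per element
            return href.StartsWith('/') ? "https://example.com" + href : href;
        });
```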
diff --git a/CocoCrawler/Parser/CssSelector.cs b/CocoCrawler/Parser/CssSelector.cs
index 0f479fe..a7de080 100644
--- a/CocoCrawler/Parser/CssSelector.cs
+++ b/CocoCrawler/Parser/CssSelector.cs
@@ -1,3 +1,6 @@
-namespace CocoCrawler.Parser;
+using AngleSharp.Dom;
+using Newtonsoft.Json.Linq;
 
-public record CssSelector(string Name, string Selector, string? Attribute = null);
\ No newline at end of file
+namespace CocoCrawler.Parser;
+
+public record CssSelector(string Name, string Selector, string? Attribute = null, Func<IElement, string, string?, JToken?>? ValueSelector = null);
\ No newline at end of file
diff --git a/CocoCrawler/Parser/IParser.cs b/CocoCrawler/Parser/IParser.cs
index 4669d9b..37a1500 100644
--- a/CocoCrawler/Parser/IParser.cs
+++ b/CocoCrawler/Parser/IParser.cs
@@ -7,7 +7,7 @@ namespace CocoCrawler.Parser;
 public interface IParser
 {
     Task Init(string html);
-    string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null);
+    string[] ParseForLinks(string linksSelector, Func<IElement, Task<string?>>? linkProcessor = null);
     JArray ExtractList(CrawlPageExtractListTask scrapeList);
     JObject ExtractObject(CrawlPageExtractObjectTask task);
 }
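Reviewer note (not part of the patch): the new `ValueSelector` hook lets a `CssSelector` compute its own `JToken` instead of going through `GetSelectorValue`. The delegate shape `Func<IElement, string, string?, JToken?>` is inferred from the call site in `ParseObject` (the extraction stripped the generic arguments), and the selector names and parsing logic below are invented:

```csharp
using AngleSharp.Dom;
using CocoCrawler.Parser;
using Newtonsoft.Json.Linq;

// Default behavior: value resolved by GetSelectorValue (text content / attribute).
var title = new CssSelector("title", "h2.title");

// Custom behavior: parse the matched node's text into a number.
var price = new CssSelector("price", "span.price",
    ValueSelector: (node, selector, attribute) =>
    {
        var text = node.QuerySelector(selector)?.TextContent.Trim().TrimStart('$');
        return decimal.TryParse(text, out var value) ? new JValue(value) : JValue.CreateNull();
    });
```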