From 19c24b90060877e71225f30c98e3dfbd33866c7b Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Sun, 1 Dec 2024 00:10:28 +0900 Subject: [PATCH] Add Groovy crawl configs This enables crawl configuration files to use Spring's [Groovy Bean Definition DSL] as an optional alternative to Spring XML. It uses the same bean configuration model but the syntax is more terse and human-readable. No more need for `&` in seed URLs. :-) ```groovy checkpointService(CheckpointService) { checkpointIntervalMinutes = 15 checkpointsDir = 'checkpoints' forgetAllButLatest = true } ``` It also enables some powerful scripting capabilities. For example, defining a custom DecideRule directly in the crawl scope: ```groovy scope(DecideRuleSequence) { rules = [ new RejectDecideRule(), // ACCEPT everything linked from a .pdf file new PredicatedDecideRule() { boolean evaluate(CrawlURI uri) { return uri.via?.path?.endsWith(".pdf") } }, // ... ] } ``` The main downsides are defining nested inner beans can be a bit awkward, some of the errors can be cryptic, and you can't just manipulate the config files with an XML parser. This commit includes a Groovy version of the default crawl profile for reference, but doesn't expose a way to use it in the UI yet. For now, you need to manually create a `crawler-beans.groovy` file in your job directory. 
[Groovy Bean Definition DSL]: https://docs.spring.io/spring-framework/reference/core/beans/basics.html#beans-factory-groovy --- commons/pom.xml | 5 + .../archive/spring/PathSharingContext.java | 46 +- .../spring/PathSharingContextTest.java | 49 ++ .../spring/PathSharingContextTestBeans.cxml | 14 + .../spring/PathSharingContextTestBeans.groovy | 8 + .../archive/crawler/framework/CrawlJob.java | 10 +- .../org/archive/crawler/framework/Engine.java | 2 +- .../restlet/profile-crawler-beans.groovy | 687 ++++++++++++++++++ .../restlet/ProfileCrawlerBeansTest.java | 35 + modules/pom.xml | 1 - pom.xml | 1 + 11 files changed, 852 insertions(+), 6 deletions(-) create mode 100644 commons/src/test/java/org/archive/spring/PathSharingContextTest.java create mode 100644 commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml create mode 100644 commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy create mode 100644 engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy create mode 100644 engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java diff --git a/commons/pom.xml b/commons/pom.xml index 28acdcb1d..dadaa8e4e 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -153,6 +153,11 @@ jsch 0.2.21 + + org.apache.groovy + groovy + ${groovy.version} + diff --git a/commons/src/main/java/org/archive/spring/PathSharingContext.java b/commons/src/main/java/org/archive/spring/PathSharingContext.java index edd1898fd..a793afcbb 100644 --- a/commons/src/main/java/org/archive/spring/PathSharingContext.java +++ b/commons/src/main/java/org/archive/spring/PathSharingContext.java @@ -34,9 +34,15 @@ import org.apache.commons.io.FileUtils; import org.archive.util.ArchiveUtils; import org.springframework.beans.BeansException; +import org.springframework.beans.factory.BeanDefinitionStoreException; import org.springframework.beans.factory.config.ConfigurableListableBeanFactory; +import 
org.springframework.beans.factory.groovy.GroovyBeanDefinitionReader; +import org.springframework.beans.factory.xml.XmlBeanDefinitionReader; import org.springframework.context.ApplicationContext; +import org.springframework.context.annotation.AnnotationConfigUtils; import org.springframework.context.support.FileSystemXmlApplicationContext; +import org.springframework.core.io.Resource; +import org.springframework.core.io.support.EncodedResource; import org.springframework.validation.BeanPropertyBindingResult; import org.springframework.validation.Errors; import org.springframework.validation.Validator; @@ -46,9 +52,13 @@ * * Notable extensions: * - * Remembers its primary XML configuration file, and can report its filesystem + * Remembers its primary configuration file, and can report its filesystem * path. - * + * + * Supports both Spring XML and Groovy Bean Definition DSL. + * + * Automatically enables annotation processing (<context:annotation-config/>). + * * Reports a summary of Errors collected from self-Validating Beans. * * Generates launchId from timestamp, creates launch directory @@ -212,5 +222,37 @@ public ConcurrentHashMap getData() { return data; } + /** + * Load bean definitions from XML or Groovy. + */ + @Override + protected void loadBeanDefinitions(XmlBeanDefinitionReader xmlReader) throws BeansException, IOException { + // This is essentially <context:annotation-config/>. + // By doing it here we don't need to include it in every crawl config. + AnnotationConfigUtils.registerAnnotationConfigProcessors(xmlReader.getRegistry()); + + GroovyBeanDefinitionReader groovyReader = new GroovyBeanDefinitionReader(xmlReader.getRegistry()) { + // By default, the Groovy reader loads XML from .xml and Groovy for everything else, but + // Heritrix uses .cxml so we override it to only use the Groovy reader for .groovy files + // and the XML reader for everything else. 
+ @Override + public int loadBeanDefinitions(EncodedResource encodedResource) throws BeanDefinitionStoreException { + String filename = encodedResource.getResource().getFilename(); + if (filename != null && filename.endsWith(".groovy")) { + return super.loadBeanDefinitions(encodedResource); + } + return xmlReader.loadBeanDefinitions(encodedResource); + } + }; + groovyReader.setEnvironment(getEnvironment()); + Resource[] configResources = getConfigResources(); + if (configResources != null) { + groovyReader.loadBeanDefinitions(configResources); + } + String[] configLocations = getConfigLocations(); + if (configLocations != null) { + groovyReader.loadBeanDefinitions(configLocations); + } + } } diff --git a/commons/src/test/java/org/archive/spring/PathSharingContextTest.java b/commons/src/test/java/org/archive/spring/PathSharingContextTest.java new file mode 100644 index 000000000..4c535d5de --- /dev/null +++ b/commons/src/test/java/org/archive/spring/PathSharingContextTest.java @@ -0,0 +1,49 @@ +package org.archive.spring; + +import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import static org.junit.Assert.*; + +public class PathSharingContextTest { + @Test + public void testGroovyConfig() { + testConfig("groovy", "classpath:org/archive/spring/PathSharingContextTestBeans.groovy"); + } + + @Test + public void testXmlConfig() { + testConfig("xml", "classpath:org/archive/spring/PathSharingContextTestBeans.cxml"); + } + + private static void testConfig(String name, String configPath) { + try (var context = new PathSharingContext(configPath)) { + context.validate(); + assertTrue("should be no validation errors", context.getAllErrors().isEmpty()); + assertEquals("primaryConfigurationPath should be correct", configPath, context.getPrimaryConfigurationPath()); + Bean1 bean1 = context.getBean("bean1", Bean1.class); + Bean2 bean2 = context.getBean("bean2", Bean2.class); + assertNotNull("bean1 should not be null", bean1); + 
assertNotNull("bean2 should not be null", bean2); + assertEquals("bean1.name should be set", name, bean1.name); + assertEquals("bean1 should be autowired into bean2", bean1, bean2.bean1); + } + } + + public static class Bean1 { + private String name; + + public void setName(String name) { + this.name = name; + } + } + + public static class Bean2 { + private Bean1 bean1; + + @Autowired + public void setBean1(Bean1 bean1) { + this.bean1 = bean1; + } + } +} \ No newline at end of file diff --git a/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml new file mode 100644 index 000000000..3bac16a5d --- /dev/null +++ b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy new file mode 100644 index 000000000..657375c91 --- /dev/null +++ b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy @@ -0,0 +1,8 @@ +import org.archive.spring.PathSharingContextTest + +beans { + bean1(PathSharingContextTest.Bean1) { + name = "groovy" + } + bean2(PathSharingContextTest.Bean2) +} \ No newline at end of file diff --git a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java index ded08b85e..4e19ab668 100644 --- a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java +++ b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java @@ -251,8 +251,14 @@ public void writeHtmlTo(PrintWriter pw, String uriPrefix) { public void checkXML() { // TODO: suppress check if XML unchanged? job.log when XML changed? 
- Instant testTime = Instant.ofEpochMilli(getPrimaryConfig().lastModified()); - Document doc = getDomDocument(getPrimaryConfig()); + File primaryConfig = getPrimaryConfig(); + Instant testTime = Instant.ofEpochMilli(primaryConfig.lastModified()); + if (primaryConfig.toString().endsWith(".groovy")) { + // just assume Groovy configs are OK + xmlOkAt = testTime; + return; + } + Document doc = getDomDocument(primaryConfig); // TODO: check for other minimal requirements, like // presence of a few key components (CrawlController etc.)? if(doc!=null) { diff --git a/engine/src/main/java/org/archive/crawler/framework/Engine.java b/engine/src/main/java/org/archive/crawler/framework/Engine.java index 875c3e6b0..b331efe14 100644 --- a/engine/src/main/java/org/archive/crawler/framework/Engine.java +++ b/engine/src/main/java/org/archive/crawler/framework/Engine.java @@ -138,7 +138,7 @@ public boolean addJobDirectory(File dir) { } File[] candidateConfigs = dir.listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { - return name.endsWith(".cxml"); + return name.endsWith(".cxml") || name.equals("crawler-beans.groovy"); }}); if(candidateConfigs==null || candidateConfigs.length == 0) { // no CXML file found! diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy new file mode 100644 index 000000000..e55785585 --- /dev/null +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy @@ -0,0 +1,687 @@ +/* + * HERITRIX 3 CRAWL JOB CONFIGURATION FILE + * + * This is a relatively minimal configuration suitable for many crawls. + * + * Commented-out beans and properties are provided as an example; values + * shown in comments reflect the actual defaults which are in effect + * if not otherwise specified. (To change from the default + * behavior, uncomment AND alter the shown values.) 
+ */ + +import org.archive.bdb.BdbModule +import org.archive.crawler.framework.* +import org.archive.crawler.frontier.* +import org.archive.crawler.monitor.DiskSpaceMonitor +import org.archive.crawler.postprocessor.* +import org.archive.crawler.prefetch.* +import org.archive.crawler.reporting.* +import org.archive.crawler.spring.* +import org.archive.crawler.util.BdbUriUniqFilter +import org.archive.modules.* +import org.archive.modules.canonicalize.* +import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule +import org.archive.modules.extractor.* +import org.archive.modules.fetcher.* +import org.archive.modules.net.BdbServerCache +import org.archive.modules.seeds.TextSeedModule +import org.archive.modules.writer.WARCWriterChainProcessor +import org.springframework.beans.factory.config.PropertyOverrideConfigurer +import org.archive.modules.deciderules.* +import org.archive.spring.* + +beans { + /* + * OVERRIDES + * Values elsewhere in the configuration may be replaced ('overridden') + * by a Properties map declared in a PropertiesOverrideConfigurer, + * using a dotted-bean-path to address individual bean properties. + * This allows us to collect a few of the most-often changed values + * in an easy-to-edit format here at the beginning of the model + * configuration. 
+ */ + + /** + * overrides from a text property list + */ + simpleOverrides(PropertyOverrideConfigurer) { + properties = ''' +# This Properties map is specified in the Java 'property list' text format +# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 + +metadata.operatorContactUrl=ENTER_AN_URL_WITH_YOUR_CONTACT_INFO_HERE_FOR_WEBMASTERS_AFFECTED_BY_YOUR_CRAWL +metadata.jobName=basic +metadata.description=Basic crawl starting with useful defaults + +##..more?..## +''' + } + + /** + * overrides from declared elements, more easily allowing + * multiline values or even declared beans + */ + longerOverrides(PropertyOverrideConfigurer) { + properties = ['seeds.textSource.value': ''' + +# URLS HERE +http://example.example/example + + '''] + } + + /** + * CRAWL METADATA: including identification of crawler/operator + */ + metadata(CrawlMetadata) { bean -> + bean.autowire = 'byName' + operatorContactUrl = '[see override above]' + jobName = '[see override above]' + description = '[see override above]' + // robotsPolicyName = 'obey' + // operator = '' + // operatorFrom = '' + // organization = '' + // audience = '' + // userAgentTemplate = 'Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)' + } + + /** + * SEEDS: crawl starting points + * + * ConfigString allows simple, inline specification of a moderate + * number of seeds; see below comment for example of using an + * arbitrarily-large external file. + */ + seeds(TextSeedModule) { + textSource = new ConfigString(''' +# [see override above] + ''') + // sourceTagSeeds = false + // blockAwaitingSeedLines = -1 + } + + /** + * SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in + * the job directory, similar to the H1 approach. + * Use either the above, or this, but not both. 
+ */ + /* + seeds(TextSeedModule) { + textSource = new ConfigFile(path: 'seeds.txt') + // sourceTagSeeds = false + // blockAwaitingSeedLines = -1 + } + */ + + acceptSurts(SurtPrefixedDecideRule) { + // decision = 'ACCEPT' + // seedsAsSurtPrefixes = true + // alsoCheckVia = false + // surtsSourceFile = '' + // surtsDumpFile = '${launchId}/surts.dump' + /* + surtsSource = new ConfigString(''' +# example.com +# http://www.example.edu/path1/ +# +http://(org,example, + ''') + } + */ + } + + /** + * SCOPE: rules for which discovered URIs to crawl; order is very + * important because last decision returned other than 'NONE' wins. + */ + scope(DecideRuleSequence) { + logToFile = false + rules = [ + // Begin by REJECTing all... + new RejectDecideRule(), + // ...then ACCEPT those within configured/seed-implied SURT prefixes... + new TooManyHopsDecideRule( + // maxHops: 20, + ), + // ...but ACCEPT those more than a configured link-hop-count from start... + new TransclusionDecideRule( + // maxTransHops: 2, + // maxSpeculativeHops: 1, + ), + // ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... + new SurtPrefixedDecideRule( + decision: 'REJECT', + seedsAsSurtPrefixes: false, + surtsDumpFile: new ConfigFile(path: '${launchId}/negative-surts.dump'), + // surtsSource: new ConfigFile(path: 'negative-surts.txt'), + ), + // ...and REJECT those from a configurable (initially empty) set of URI regexes... + new MatchesListRegexDecideRule( + decision: 'REJECT', + // listLogicalOr: false, + // regexList: [], + ), + // ...and REJECT those with suspicious repeating path-segments... + new PathologicalPathDecideRule( + // maxRepetitions: 2, + ), + // ...and REJECT those with more than threshold number of path-segments... + new TooManyPathSegmentsDecideRule( + // maxPathDepth: 20, + ), + // ...but always ACCEPT those marked as prerequisitee for another URI... 
+ new PrerequisiteAcceptDecideRule(), + // ...but always REJECT those with unsupported URI schemes + new SchemeNotInSetDecideRule(), + ] + } + + /* + * PROCESSING CHAINS + * Much of the crawler's work is specified by the sequential + * application of swappable Processor modules. These Processors + * are collected into three 'chains'. The CandidateChain is applied + * to URIs being considered for inclusion, before a URI is enqueued + * for collection. The FetchChain is applied to URIs when their + * turn for collection comes up. The DispositionChain is applied + * after a URI is fetched and analyzed/link-extracted. + */ + + /* + * CANDIDATE CHAIN + */ + // first, processors are declared as top-level named beans + candidateScoper(CandidateScoper) + preparer(FrontierPreparer) { + // preferenceDepthHops = -1 + // preferenceEmbedHops = 1 + // canonicalizationPolicy = ref('canonicalizationPolicy') + // queueAssignmentPolicy = ref('queueAssignmentPolicy') + // uriPrecedencePolicy = ref('uriPrecedencePolicy') + // costAssignmentPolicy = ref('costAssignmentPolicy') + } + // now, processors are assembled into ordered CandidateChain bean + candidateProcessors(CandidateChain) { + processors = [ + // apply scoping rules to each individual candidate URI... + ref('candidateScoper'), + // ...then prepare those ACCEPTed to be enqueued to frontier. 
+ ref('preparer'), + ] + } + + /* + * FETCH CHAIN + */ + // first, processors are declared as top-level named beans + preselector(Preselector) { + // recheckScope = false + // blockAll = false + // blockByRegex = '' + // allowByRegex = '' + } + preconditions(PreconditionEnforcer) { + // ipValidityDurationSeconds = 21600 + // robotsValidityDurationSeconds = 86400 + // calculateRobotsOnly = false + } + fetchDns(FetchDNS) { + // acceptNonDnsResolves = false + // digestContent = true + // digestAlgorithm = 'sha1' + // dnsOverHttpServer = 'https://dns.google/dns-query' + } + /* + fetchWhois(FetchWhois) { + specialQueryTemplates = [ + 'whois.verisign-grs.com': 'domain %s', + 'whois.arin.net': 'z + %s', + 'whois.denic.de': '-T dn %s' + ] + } + */ + fetchHttp(FetchHTTP) { + // useHTTP11 = false + // maxLengthBytes = 0 + // timeoutSeconds = 1200 + // maxFetchKBSec = 0 + // defaultEncoding = 'ISO-8859-1' + // shouldFetchBodyRule = new AcceptDecideRule() + // soTimeoutMs = 20000 + // sendIfModifiedSince = true + // sendIfNoneMatch = true + // sendConnectionClose = true + // sendReferer = true + // sendRange = false + // ignoreCookies = false + // sslTrustLevel = 'OPEN' + // acceptHeaders = [ + // 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + // ] + // httpBindAddress = '' + // httpProxyHost = '' + // httpProxyPort = 0 + // httpProxyUser = '' + // httpProxyPassword = '' + // socksProxyHost = '' + // socksProxyPort = '' + // digestContent = true + // digestAlgorithm = 'sha1' + } + extractorHttp(ExtractorHTTP) + extractorRobotsTxt(ExtractorRobotsTxt) + extractorSitemap(ExtractorSitemap) + extractorHtml(ExtractorHTML) { + // extractJavascript = true + // extractValueAttributes = true + // ignoreFormActionUrls = false + // extractOnlyFormGets = true + // treatFramesAsEmbedLinks = true + // ignoreUnexpectedHtml = true + // maxElementLength = 1024 + // maxAttributeNameLength = 1024 + // maxAttributeValueLength = 16384 + } + extractorCss(ExtractorCSS) + 
extractorJs(ExtractorJS) + extractorSwf(ExtractorSWF) + // now, processors are assembled into ordered FetchChain bean + fetchProcessors(FetchChain) { + processors = [ + // re-check scope, if so enabled... + ref('preselector'), + // ...then verify or trigger prerequisite URIs fetched, allow crawling... + ref('preconditions'), + // ...fetch if DNS URI... + ref('fetchDns'), + // ref('fetchWhois'), + // ...fetch if HTTP URI... + ref('fetchHttp'), + // ...extract outlinks from HTTP headers... + ref('extractorHttp'), + // ...extract sitemap urls from robots.txt... + ref('extractorRobotsTxt'), + // ...extract links from sitemaps... + ref('extractorSitemap'), + // ...extract outlinks from HTML content... + ref('extractorHtml'), + // ...extract outlinks from CSS content... + ref('extractorCss'), + // ...extract outlinks from Javascript content... + ref('extractorJs'), + // ...extract outlinks from Flash content... + ref('extractorSwf') + ] + } + + /* + * DISPOSITION CHAIN + */ + // first, processors are declared as top-level named beans + warcWriter(WARCWriterChainProcessor) { + // compress = true + // prefix = 'IAH' + // maxFileSizeBytes = 1000000000 + // poolMaxActive = 1 + // MaxWaitForIdleMs = 500 + // skipIdenticalDigests = false + // maxTotalBytesToWrite = 0 + // directory = '${launchId}' + // storePaths = ['warcs'] + // template = '${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}' + // startNewFilesOnCheckpoint = true + /* + chain = [ + new org.archive.modules.warc.DnsResponseRecordBuilder(), + new org.archive.modules.warc.HttpResponseRecordBuilder(), + new org.archive.modules.warc.WhoisResponseRecordBuilder(), + new org.archive.modules.warc.FtpControlConversationRecordBuilder(), + new org.archive.modules.warc.FtpResponseRecordBuilder(), + new org.archive.modules.warc.RevisitRecordBuilder(), + new org.archive.modules.warc.HttpRequestRecordBuilder(), + new org.archive.modules.warc.MetadataRecordBuilder() + ] + */ + } + 
candidates(CandidatesProcessor) { + // seedsRedirectNewSeeds = true + // processErrorOutlinks = false + } + disposition(DispositionProcessor) { + // delayFactor = 5.0 + // minDelayMs = 3000 + // respectCrawlDelayUpToSeconds = 300 + // maxDelayMs = 30000 + // maxPerHostBandwidthUsageKbSec = 0 + } + /* + rescheduler(ReschedulingProcessor) { + rescheduleDelaySeconds = -1 + } + */ + // now, processors are assembled into ordered DispositionChain bean + dispositionProcessors(DispositionChain) { + processors = [ + // write to aggregate archival files... + ref('warcWriter'), + // ...send each outlink candidate URI to CandidateChain, + // and enqueue those ACCEPTed to the frontier... + ref('candidates'), + // ...then update stats, shared-structures, frontier decisions + ref('disposition') + // ref('rescheduler') + ] + } + + /** + * CRAWLCONTROLLER: Control interface, unifying context + */ + crawlController(CrawlController) { + // maxToeThreads = 25 + // pauseAtStart = true + // runWhileEmpty = false + // recorderInBufferBytes = 524288 + // recorderOutBufferBytes = 16384 + // scratchDir = 'scratch' + } + + /** + * FRONTIER: Record of all URIs discovered and queued-for-collection + */ + frontier(BdbFrontier) { + // queueTotalBudget = -1 + // balanceReplenishAmount = 3000 + // errorPenaltyAmount = 100 + // precedenceFloor = 255 + // queuePrecedencePolicy = new org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy() + // snoozeLongMs = 300000 + // retryDelaySeconds = 900 + // maxRetries = 30 + // recoveryLogEnabled = true + // maxOutlinks = 6000 + // extractIndependently = false + // outbound = new ArrayBlockingQueue(200, true) + // inbound = new ArrayBlockingQueue(40000, true) + // dumpPendingAtClose = false + } + + /** + * URI UNIQ FILTER: Used by frontier to remember already-included URIs + */ + uriUniqFilter(BdbUriUniqFilter) + + /* + * EXAMPLE SETTINGS OVERLAY SHEETS + * Sheets allow some settings to vary by context - usually by URI context, + * so that 
different sites or sections of sites can be treated differently. + * Here are some example Sheets for common purposes. The SheetOverlaysManager + * (below) automatically collects all Sheet instances declared among the + * original beans, but others can be added during the crawl via the scripting + * interface. + */ + + /** + * forceRetire: any URI to which this sheet's settings are applied + * will force its containing queue to 'retired' status. + */ + forceRetire(Sheet) { + map = [ + 'disposition.forceRetire': 'true' + ] + } + + /** + * smallBudget: any URI to which this sheet's settings are applied + * will give its containing queue small values for balanceReplenishAmount + * (causing it to have shorter 'active' periods while other queues are + * waiting) and queueTotalBudget (causing the queue to enter 'retired' + * status once that expenditure is reached by URI attempts and errors) + */ + smallBudget(Sheet) { + map = [ + 'frontier.balanceReplenishAmount': '20', + 'frontier.queueTotalBudget': '100' + ] + } + + /** + * veryPolite: any URI to which this sheet's settings are applied + * will cause its queue to take extra-long politeness snoozes + */ + veryPolite(Sheet) { + map = [ + 'disposition.delayFactor': '10', + 'disposition.minDelayMs': '10000', + 'disposition.maxDelayMs': '1000000', + 'disposition.respectCrawlDelayUpToSeconds': '3600' + ] + } + + /** + * highPrecedence: any URI to which this sheet's settings are applied + * will give its containing queue a slightly-higher than default + * queue precedence value. That queue will then be preferred over + * other queues for active crawling, never waiting behind lower- + * precedence queues. + */ + highPrecedence(Sheet) { + map = [ + 'frontier.balanceReplenishAmount': '20', + 'frontier.queueTotalBudget': '100' + ] + } + + /* + * EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION + * A SheetAssociation says certain URIs should have certain overlay Sheets + * applied. 
This example applies two sheets to URIs matching two SURT-prefixes. + * New associations may also be added mid-crawl using the scripting facility. + */ + + /* + surtPrefixesSheetAssociation(SurtPrefixesSheetAssociation) { + surtPrefixes = [ + 'http://(org,example,', + 'http://(com,example,www,)/' + ] + targetSheetNames = [ + 'veryPolite', + 'smallBudget' + ] + } + */ + + /* + * OPTIONAL BUT RECOMMENDED BEANS + */ + + /** + * ACTIONDIRECTORY: disk directory for mid-crawl operations + * Running job will watch directory for new files with URIs, + * scripts, and other data to be processed during a crawl. + */ + actionDirectory(ActionDirectory) { + // actionDir = 'action' + // doneDir = '${launchId}/actions-done' + // initialDelaySeconds = 10 + // delaySeconds = 30 + } + + /** + * CRAWLLIMITENFORCER: stops crawl when it reaches configured limits + */ + crawlLimiter(CrawlLimitEnforcer) { + // maxBytesDownload = 0 + // maxDocumentsDownload = 0 + // maxTimeSeconds = 0 + } + + /** + * CHECKPOINTSERVICE: checkpointing assistance + */ + checkpointService(CheckpointService) { + // checkpointIntervalMinutes = -1 + // checkpointOnShutdown = true + // checkpointsDir = 'checkpoints' + // forgetAllButLatest = true + } + + /* + * OPTIONAL BEANS + * + * Uncomment and expand as needed, or if non-default alternate + * implementations are preferred. 
+ */ + + /** + * CANONICALIZATION POLICY + */ + /* + canonicalizationPolicy(RulesCanonicalizationPolicy) { + rules = [ + new LowercaseRule(), + new StripUserinfoRule(), + new StripWWWNRule(), + new StripSessionIDs(), + new StripSessionCFIDs(), + new FixupQueryString() + ] + } + */ + + /** + * QUEUE ASSIGNMENT POLICY + */ + /* + queueAssignmentPolicy(SurtAuthorityQueueAssignmentPolicy) { + forceQueueAssignment = '' + deferToPrevious = true + parallelQueues = 1 + } + */ + + /** + * URI PRECEDENCE POLICY + */ + // uriPrecedencePolicy(CostUriPrecedencePolicy) + + /** + * COST ASSIGNMENT POLICY + */ + costAssignmentPolicy(UnitCostAssignmentPolicy) + + /** + * CREDENTIAL STORE: HTTP authentication or FORM POST credentials + */ + // credentialStore(org.archive.modules.credential.CredentialStore) + + /** + * DISK SPACE MONITOR: + * Pauses the crawl if disk space at monitored paths falls below minimum threshold + * Note: If there's less than 5 GiB free for state directory BDB will throw + * an error which the crawl job will likely not be able to fully recover from. + */ + /* + diskSpaceMonitor(DiskSpaceMonitor) { + pauseThresholdMiB = 8192 + monitorConfigPaths = true + monitorPaths = [ + 'PATH' + ] + } + */ + + /* + * REQUIRED STANDARD BEANS + * It will be very rare to replace or reconfigure the following beans. 
+ */ + + /** + * STATISTICSTRACKER: standard stats/reporting collector + */ + statisticsTracker(StatisticsTracker) { bean -> + bean.autowire = 'byName' + /* + reports = [ + new CrawlSummaryReport(), + new SeedsReport(), + new HostsReport( + maxSortSize: -1, + suppressEmptyHosts: false, + ), + new SourceTagsReport(), + new MimetypesReport(), + new ResponseCodeReport(), + new ProcessorsReport(), + new FrontierSummaryReport(), + new FrontierNonemptyReport(), + new ToeThreadsReport(), + ] + */ + // reportsDir = '${launchId}/reports' + // liveHostReportSize = 20 + // intervalSeconds = 20 + // keepSnapshotsCount = 5 + // liveHostReportSize = 20 + } + + /** + * CRAWLERLOGGERMODULE: shared logging facility + */ + loggerModule(CrawlerLoggerModule) { + // path = '${launchId}/logs' + // crawlLogPath = 'crawl.log' + // alertsLogPath = 'alerts.log' + // progressLogPath = 'progress-statistics.log' + // uriErrorsLogPath = 'uri-errors.log' + // runtimeErrorsLogPath = 'runtime-errors.log' + // nonfatalErrorsLogPath = 'nonfatal-errors.log' + // logExtraInfo = false + } + + /** + * SHEETOVERLAYMANAGER: manager of sheets of contextual overlays + * Autowired to include any SheetForSurtPrefix or + * SheetForDecideRuled beans + */ + sheetOverlaysManager(SheetOverlaysManager) { bean -> + bean.autowire = 'byType' + } + + /** + * BDBMODULE: shared BDB-JE disk persistence manager + */ + bdb(BdbModule) { + // dir = 'state' + /* + * if neither cachePercent or cacheSize are specified (the default), bdb + * uses its own default of 60% + */ + // cachePercent = 0 + // cacheSize = 0 + // useSharedCache = true + // expectedConcurrency = 25 + } + + /** + * BDBCOOKIESTORE: disk-based cookie storage for FetchHTTP + */ + cookieStore(BdbCookieStore) { + // cookiesLoadFile = null + // cookiesSaveFile = null + // bdbModule = ref('bdb') + } + + /** + * SERVERCACHE: shared cache of server/host info + */ + serverCache(BdbServerCache) { + // bdb = ref('bdb') + } + + /** + * CONFIG PATH CONFIGURER: required 
helper making crawl paths relative + * to crawler-beans.cxml file, and tracking crawl files for web UI + */ + configPathConfigurer(ConfigPathConfigurer) +} \ No newline at end of file diff --git a/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java b/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java new file mode 100644 index 000000000..398d220f6 --- /dev/null +++ b/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java @@ -0,0 +1,35 @@ +package org.archive.crawler.restlet; + +import org.archive.spring.PathSharingContext; +import org.junit.Test; +import org.springframework.validation.Errors; + +import java.util.HashMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ProfileCrawlerBeansTest { + @Test + public void testXmlProfile() { + testProfile("classpath:/org/archive/crawler/restlet/profile-crawler-beans.cxml"); + } + + @Test + public void testGroovyProfile() { + testProfile("classpath:/org/archive/crawler/restlet/profile-crawler-beans.groovy"); + } + + private static void testProfile(String location) { + String profile = location.substring(location.lastIndexOf('/') + 1); + try (var context = new PathSharingContext(location)) { + context.validate(); + HashMap<String, Errors> allErrors = context.getAllErrors(); + assertEquals(profile + " should have one bean with errors", 1, allErrors.size()); + Errors metadataErrors = allErrors.get("metadata"); + assertEquals(profile + " Metadata bean should have one error", 1, metadataErrors.getErrorCount()); + assertTrue(profile + " should have the operator contact info error", + metadataErrors.getAllErrors().get(0).toString().contains("ENTER_AN_URL_WITH_YOUR_CONTACT_INFO")); + } + } +} diff --git a/modules/pom.xml b/modules/pom.xml index 7d74dc300..138a6670e 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -115,6 +115,5 @@ UTF-8 - 4.0.24 diff --git a/pom.xml b/pom.xml index 4222a5db4..f5fcc1fbb 100644 
--- a/pom.xml +++ b/pom.xml @@ -370,6 +370,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html ${maven.build.timestamp} none -Xdoclint:none + 4.0.24 9.4.56.v20240826 2.0.16 17