From 19c24b90060877e71225f30c98e3dfbd33866c7b Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Sun, 1 Dec 2024 00:10:28 +0900 Subject: [PATCH] Add Groovy crawl configs This enables crawl configuration files to use Spring's [Groovy Bean Definition DSL] as an optional alternative to Spring XML. It uses the same bean configuration model but the syntax is more terse and human-readable. No more need for `&` in seed URLs. :-) ```groovy checkpointService(CheckpointService) { checkpointIntervalMinutes = 15 checkpointsDir = 'checkpoints' forgetAllButLatest = true } ``` It also enables some powerful scripting capabilities. For example, defining a custom DecideRule directly in the crawl scope: ```groovy scope(DecideRuleSequence) { rules = [ new RejectDecideRule(), // ACCEPT everything linked from a .pdf file new PredicatedDecideRule() { boolean evaluate(CrawlURI uri) { return uri.via?.path?.endsWith(".pdf") } }, // ... ] } ``` The main downsides are defining nested inner beans can be a bit awkward, some of the errors can be cryptic, and you can't just manipulate the config files with an XML parser. This commit includes a Groovy version of the default crawl profile for reference, but doesn't expose a way to use it in the UI yet. For now, you need to manually create a `crawler-beans.groovy` file in your job directory. 
[Groovy Bean Definition DSL]: https://docs.spring.io/spring-framework/reference/core/beans/basics.html#beans-factory-groovy --- commons/pom.xml | 5 + .../archive/spring/PathSharingContext.java | 46 +- .../spring/PathSharingContextTest.java | 49 ++ .../spring/PathSharingContextTestBeans.cxml | 14 + .../spring/PathSharingContextTestBeans.groovy | 8 + .../archive/crawler/framework/CrawlJob.java | 10 +- .../org/archive/crawler/framework/Engine.java | 2 +- .../restlet/profile-crawler-beans.groovy | 687 ++++++++++++++++++ .../restlet/ProfileCrawlerBeansTest.java | 35 + modules/pom.xml | 1 - pom.xml | 1 + 11 files changed, 852 insertions(+), 6 deletions(-) create mode 100644 commons/src/test/java/org/archive/spring/PathSharingContextTest.java create mode 100644 commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml create mode 100644 commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy create mode 100644 engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy create mode 100644 engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java diff --git a/commons/pom.xml b/commons/pom.xml index 28acdcb1d..dadaa8e4e 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -153,6 +153,11 @@ jsch 0.2.21 + + org.apache.groovy + groovy + ${groovy.version} + diff --git a/commons/src/main/java/org/archive/spring/PathSharingContext.java b/commons/src/main/java/org/archive/spring/PathSharingContext.java index edd1898fd..a793afcbb 100644 --- a/commons/src/main/java/org/archive/spring/PathSharingContext.java +++ b/commons/src/main/java/org/archive/spring/PathSharingContext.java @@ -34,9 +34,15 @@ import org.apache.commons.io.FileUtils; import org.archive.util.ArchiveUtils; import org.springframework.beans.BeansException; +import org.springframework.beans.factory.BeanDefinitionStoreException; import org.springframework.beans.factory.config.ConfigurableListableBeanFactory; +import 
org.springframework.beans.factory.groovy.GroovyBeanDefinitionReader; +import org.springframework.beans.factory.xml.XmlBeanDefinitionReader; import org.springframework.context.ApplicationContext; +import org.springframework.context.annotation.AnnotationConfigUtils; import org.springframework.context.support.FileSystemXmlApplicationContext; +import org.springframework.core.io.Resource; +import org.springframework.core.io.support.EncodedResource; import org.springframework.validation.BeanPropertyBindingResult; import org.springframework.validation.Errors; import org.springframework.validation.Validator; @@ -46,9 +52,13 @@ * * Notable extensions: * - * Remembers its primary XML configuration file, and can report its filesystem + * Remembers its primary configuration file, and can report its filesystem * path. - * + * + * Supports both Spring XML and Groovy Bean Definition DSL. + * + * Automatically enables annotation processing (<context:annotation-config/>). + * * Reports a summary of Errors collected from self-Validating Beans. * * Generates launchId from timestamp, creates launch directory @@ -212,5 +222,37 @@ public ConcurrentHashMap getData() { return data; } + /** + * Load bean definitions from XML or Groovy. + */ + @Override + protected void loadBeanDefinitions(XmlBeanDefinitionReader xmlReader) throws BeansException, IOException { + // This is essentially <context:annotation-config/>. + // By doing it here we don't need to include it in every crawl config. + AnnotationConfigUtils.registerAnnotationConfigProcessors(xmlReader.getRegistry()); + + GroovyBeanDefinitionReader groovyReader = new GroovyBeanDefinitionReader(xmlReader.getRegistry()) { + // By default, the Groovy reader loads XML from .xml and Groovy for everything else, but + // Heritrix uses .cxml so we override it to only use the Groovy reader for .groovy files + // and the XML reader for everything else. 
+ @Override + public int loadBeanDefinitions(EncodedResource encodedResource) throws BeanDefinitionStoreException { + String filename = encodedResource.getResource().getFilename(); + if (filename != null && filename.endsWith(".groovy")) { + return super.loadBeanDefinitions(encodedResource); + } + return xmlReader.loadBeanDefinitions(encodedResource); + } + }; + groovyReader.setEnvironment(getEnvironment()); + Resource[] configResources = getConfigResources(); + if (configResources != null) { + groovyReader.loadBeanDefinitions(configResources); + } + String[] configLocations = getConfigLocations(); + if (configLocations != null) { + groovyReader.loadBeanDefinitions(configLocations); + } + } } diff --git a/commons/src/test/java/org/archive/spring/PathSharingContextTest.java b/commons/src/test/java/org/archive/spring/PathSharingContextTest.java new file mode 100644 index 000000000..4c535d5de --- /dev/null +++ b/commons/src/test/java/org/archive/spring/PathSharingContextTest.java @@ -0,0 +1,49 @@ +package org.archive.spring; + +import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import static org.junit.Assert.*; + +public class PathSharingContextTest { + @Test + public void testGroovyConfig() { + testConfig("groovy", "classpath:org/archive/spring/PathSharingContextTestBeans.groovy"); + } + + @Test + public void testXmlConfig() { + testConfig("xml", "classpath:org/archive/spring/PathSharingContextTestBeans.cxml"); + } + + private static void testConfig(String name, String configPath) { + try (var context = new PathSharingContext(configPath)) { + context.validate(); + assertTrue("should be no validation errors", context.getAllErrors().isEmpty()); + assertEquals("primaryConfigurationPath should be correct", configPath, context.getPrimaryConfigurationPath()); + Bean1 bean1 = context.getBean("bean1", Bean1.class); + Bean2 bean2 = context.getBean("bean2", Bean2.class); + assertNotNull("bean1 should not be null", bean1); + 
assertNotNull("bean2 should not be null", bean2); + assertEquals("bean1.name should be set", name, bean1.name); + assertEquals("bean1 should be autowired into bean2", bean1, bean2.bean1); + } + } + + public static class Bean1 { + private String name; + + public void setName(String name) { + this.name = name; + } + } + + public static class Bean2 { + private Bean1 bean1; + + @Autowired + public void setBean1(Bean1 bean1) { + this.bean1 = bean1; + } + } +} \ No newline at end of file diff --git a/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml new file mode 100644 index 000000000..3bac16a5d --- /dev/null +++ b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.cxml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy new file mode 100644 index 000000000..657375c91 --- /dev/null +++ b/commons/src/test/resources/org/archive/spring/PathSharingContextTestBeans.groovy @@ -0,0 +1,8 @@ +import org.archive.spring.PathSharingContextTest + +beans { + bean1(PathSharingContextTest.Bean1) { + name = "groovy" + } + bean2(PathSharingContextTest.Bean2) +} \ No newline at end of file diff --git a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java index ded08b85e..4e19ab668 100644 --- a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java +++ b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java @@ -251,8 +251,14 @@ public void writeHtmlTo(PrintWriter pw, String uriPrefix) { public void checkXML() { // TODO: suppress check if XML unchanged? job.log when XML changed? 
- Instant testTime = Instant.ofEpochMilli(getPrimaryConfig().lastModified()); - Document doc = getDomDocument(getPrimaryConfig()); + File primaryConfig = getPrimaryConfig(); + Instant testTime = Instant.ofEpochMilli(primaryConfig.lastModified()); + if (primaryConfig.toString().endsWith(".groovy")) { + // just assume Groovy configs are OK + xmlOkAt = testTime; + return; + } + Document doc = getDomDocument(primaryConfig); // TODO: check for other minimal requirements, like // presence of a few key components (CrawlController etc.)? if(doc!=null) { diff --git a/engine/src/main/java/org/archive/crawler/framework/Engine.java b/engine/src/main/java/org/archive/crawler/framework/Engine.java index 875c3e6b0..b331efe14 100644 --- a/engine/src/main/java/org/archive/crawler/framework/Engine.java +++ b/engine/src/main/java/org/archive/crawler/framework/Engine.java @@ -138,7 +138,7 @@ public boolean addJobDirectory(File dir) { } File[] candidateConfigs = dir.listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { - return name.endsWith(".cxml"); + return name.endsWith(".cxml") || name.equals("crawler-beans.groovy"); }}); if(candidateConfigs==null || candidateConfigs.length == 0) { // no CXML file found! diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy new file mode 100644 index 000000000..e55785585 --- /dev/null +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy @@ -0,0 +1,687 @@ +/* + * HERITRIX 3 CRAWL JOB CONFIGURATION FILE + * + * This is a relatively minimal configuration suitable for many crawls. + * + * Commented-out beans and properties are provided as an example; values + * shown in comments reflect the actual defaults which are in effect + * if not otherwise specified. (To change from the default + * behavior, uncomment AND alter the shown values.) 
+ */ + +import org.archive.bdb.BdbModule +import org.archive.crawler.framework.* +import org.archive.crawler.frontier.* +import org.archive.crawler.monitor.DiskSpaceMonitor +import org.archive.crawler.postprocessor.* +import org.archive.crawler.prefetch.* +import org.archive.crawler.reporting.* +import org.archive.crawler.spring.* +import org.archive.crawler.util.BdbUriUniqFilter +import org.archive.modules.* +import org.archive.modules.canonicalize.* +import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule +import org.archive.modules.extractor.* +import org.archive.modules.fetcher.* +import org.archive.modules.net.BdbServerCache +import org.archive.modules.seeds.TextSeedModule +import org.archive.modules.writer.WARCWriterChainProcessor +import org.springframework.beans.factory.config.PropertyOverrideConfigurer +import org.archive.modules.deciderules.* +import org.archive.spring.* + +beans { + /* + * OVERRIDES + * Values elsewhere in the configuration may be replaced ('overridden') + * by a Properties map declared in a PropertiesOverrideConfigurer, + * using a dotted-bean-path to address individual bean properties. + * This allows us to collect a few of the most-often changed values + * in an easy-to-edit format here at the beginning of the model + * configuration. 
+ */ + + /** + * overrides from a text property list + */ + simpleOverrides(PropertyOverrideConfigurer) { + properties = ''' +# This Properties map is specified in the Java 'property list' text format +# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 + +metadata.operatorContactUrl=ENTER_AN_URL_WITH_YOUR_CONTACT_INFO_HERE_FOR_WEBMASTERS_AFFECTED_BY_YOUR_CRAWL +metadata.jobName=basic +metadata.description=Basic crawl starting with useful defaults + +##..more?..## +''' + } + + /** + * overrides from declared elements, more easily allowing + * multiline values or even declared beans + */ + longerOverrides(PropertyOverrideConfigurer) { + properties = ['seeds.textSource.value': ''' + +# URLS HERE +http://example.example/example + + '''] + } + + /** + * CRAWL METADATA: including identification of crawler/operator + */ + metadata(CrawlMetadata) { bean -> + bean.autowire = 'byName' + operatorContactUrl = '[see override above]' + jobName = '[see override above]' + description = '[see override above]' + // robotsPolicyName = 'obey' + // operator = '' + // operatorFrom = '' + // organization = '' + // audience = '' + // userAgentTemplate = 'Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)' + } + + /** + * SEEDS: crawl starting points + * + * ConfigString allows simple, inline specification of a moderate + * number of seeds; see below comment for example of using an + * arbitrarily-large external file. + */ + seeds(TextSeedModule) { + textSource = new ConfigString(''' +# [see override above] + ''') + // sourceTagSeeds = false + // blockAwaitingSeedLines = -1 + } + + /** + * SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in + * the job directory, similar to the H1 approach. + * Use either the above, or this, but not both. 
+ */ + /* + seeds(TextSeedModule) { + textSource = new ConfigFile(path: 'seeds.txt') + // sourceTagSeeds = false + // blockAwaitingSeedLines = -1 + } + */ + + acceptSurts(SurtPrefixedDecideRule) { + // decision = 'ACCEPT' + // seedsAsSurtPrefixes = true + // alsoCheckVia = false + // surtsSourceFile = '' + // surtsDumpFile = '${launchId}/surts.dump' + /* + surtsSource = new ConfigString(''' +# example.com +# http://www.example.edu/path1/ +# +http://(org,example, + ''') + } + */ + } + + /** + * SCOPE: rules for which discovered URIs to crawl; order is very + * important because last decision returned other than 'NONE' wins. + */ + scope(DecideRuleSequence) { + logToFile = false + rules = [ + // Begin by REJECTing all... + new RejectDecideRule(), + // ...then ACCEPT those within configured/seed-implied SURT prefixes... + new TooManyHopsDecideRule( + // maxHops: 20, + ), + // ...but ACCEPT those more than a configured link-hop-count from start... + new TransclusionDecideRule( + // maxTransHops: 2, + // maxSpeculativeHops: 1, + ), + // ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... + new SurtPrefixedDecideRule( + decision: 'REJECT', + seedsAsSurtPrefixes: false, + surtsDumpFile: new ConfigFile(path: '${launchId}/negative-surts.dump'), + // surtsSource: new ConfigFile(path: 'negative-surts.txt'), + ), + // ...and REJECT those from a configurable (initially empty) set of URI regexes... + new MatchesListRegexDecideRule( + decision: 'REJECT', + // listLogicalOr: false, + // regexList: [], + ), + // ...and REJECT those with suspicious repeating path-segments... + new PathologicalPathDecideRule( + // maxRepetitions: 2, + ), + // ...and REJECT those with more than threshold number of path-segments... + new TooManyPathSegmentsDecideRule( + // maxPathDepth: 20, + ), + // ...but always ACCEPT those marked as prerequisitee for another URI... 
+ new PrerequisiteAcceptDecideRule(), + // ...but always REJECT those with unsupported URI schemes + new SchemeNotInSetDecideRule(), + ] + } + + /* + * PROCESSING CHAINS + * Much of the crawler's work is specified by the sequential + * application of swappable Processor modules. These Processors + * are collected into three 'chains'. The CandidateChain is applied + * to URIs being considered for inclusion, before a URI is enqueued + * for collection. The FetchChain is applied to URIs when their + * turn for collection comes up. The DispositionChain is applied + * after a URI is fetched and analyzed/link-extracted. + */ + + /* + * CANDIDATE CHAIN + */ + // first, processors are declared as top-level named beans + candidateScoper(CandidateScoper) + preparer(FrontierPreparer) { + // preferenceDepthHops = -1 + // preferenceEmbedHops = 1 + // canonicalizationPolicy = ref('canonicalizationPolicy') + // queueAssignmentPolicy = ref('queueAssignmentPolicy') + // uriPrecedencePolicy = ref('uriPrecedencePolicy') + // costAssignmentPolicy = ref('costAssignmentPolicy') + } + // now, processors are assembled into ordered CandidateChain bean + candidateProcessors(CandidateChain) { + processors = [ + // apply scoping rules to each individual candidate URI... + ref('candidateScoper'), + // ...then prepare those ACCEPTed to be enqueued to frontier. 
+ ref('preparer'), + ] + } + + /* + * FETCH CHAIN + */ + // first, processors are declared as top-level named beans + preselector(Preselector) { + // recheckScope = false + // blockAll = false + // blockByRegex = '' + // allowByRegex = '' + } + preconditions(PreconditionEnforcer) { + // ipValidityDurationSeconds = 21600 + // robotsValidityDurationSeconds = 86400 + // calculateRobotsOnly = false + } + fetchDns(FetchDNS) { + // acceptNonDnsResolves = false + // digestContent = true + // digestAlgorithm = 'sha1' + // dnsOverHttpServer = 'https://dns.google/dns-query' + } + /* + fetchWhois(FetchWhois) { + specialQueryTemplates = [ + 'whois.verisign-grs.com': 'domain %s', + 'whois.arin.net': 'z + %s', + 'whois.denic.de': '-T dn %s' + ] + } + */ + fetchHttp(FetchHTTP) { + // useHTTP11 = false + // maxLengthBytes = 0 + // timeoutSeconds = 1200 + // maxFetchKBSec = 0 + // defaultEncoding = 'ISO-8859-1' + // shouldFetchBodyRule = new AcceptDecideRule() + // soTimeoutMs = 20000 + // sendIfModifiedSince = true + // sendIfNoneMatch = true + // sendConnectionClose = true + // sendReferer = true + // sendRange = false + // ignoreCookies = false + // sslTrustLevel = 'OPEN' + // acceptHeaders = [ + // 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + // ] + // httpBindAddress = '' + // httpProxyHost = '' + // httpProxyPort = 0 + // httpProxyUser = '' + // httpProxyPassword = '' + // socksProxyHost = '' + // socksProxyPort = '' + // digestContent = true + // digestAlgorithm = 'sha1' + } + extractorHttp(ExtractorHTTP) + extractorRobotsTxt(ExtractorRobotsTxt) + extractorSitemap(ExtractorSitemap) + extractorHtml(ExtractorHTML) { + // extractJavascript = true + // extractValueAttributes = true + // ignoreFormActionUrls = false + // extractOnlyFormGets = true + // treatFramesAsEmbedLinks = true + // ignoreUnexpectedHtml = true + // maxElementLength = 1024 + // maxAttributeNameLength = 1024 + // maxAttributeValueLength = 16384 + } + extractorCss(ExtractorCSS) + 
extractorJs(ExtractorJS) + extractorSwf(ExtractorSWF) + // now, processors are assembled into ordered FetchChain bean + fetchProcessors(FetchChain) { + processors = [ + // re-check scope, if so enabled... + ref('preselector'), + // ...then verify or trigger prerequisite URIs fetched, allow crawling... + ref('preconditions'), + // ...fetch if DNS URI... + ref('fetchDns'), + // ref('fetchWhois'), + // ...fetch if HTTP URI... + ref('fetchHttp'), + // ...extract outlinks from HTTP headers... + ref('extractorHttp'), + // ...extract sitemap urls from robots.txt... + ref('extractorRobotsTxt'), + // ...extract links from sitemaps... + ref('extractorSitemap'), + // ...extract outlinks from HTML content... + ref('extractorHtml'), + // ...extract outlinks from CSS content... + ref('extractorCss'), + // ...extract outlinks from Javascript content... + ref('extractorJs'), + // ...extract outlinks from Flash content... + ref('extractorSwf') + ] + } + + /* + * DISPOSITION CHAIN + */ + // first, processors are declared as top-level named beans + warcWriter(WARCWriterChainProcessor) { + // compress = true + // prefix = 'IAH' + // maxFileSizeBytes = 1000000000 + // poolMaxActive = 1 + // MaxWaitForIdleMs = 500 + // skipIdenticalDigests = false + // maxTotalBytesToWrite = 0 + // directory = '${launchId}' + // storePaths = ['warcs'] + // template = '${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}' + // startNewFilesOnCheckpoint = true + /* + chain = [ + new org.archive.modules.warc.DnsResponseRecordBuilder(), + new org.archive.modules.warc.HttpResponseRecordBuilder(), + new org.archive.modules.warc.WhoisResponseRecordBuilder(), + new org.archive.modules.warc.FtpControlConversationRecordBuilder(), + new org.archive.modules.warc.FtpResponseRecordBuilder(), + new org.archive.modules.warc.RevisitRecordBuilder(), + new org.archive.modules.warc.HttpRequestRecordBuilder(), + new org.archive.modules.warc.MetadataRecordBuilder() + ] + */ + } + 
candidates(CandidatesProcessor) { + // seedsRedirectNewSeeds = true + // processErrorOutlinks = false + } + disposition(DispositionProcessor) { + // delayFactor = 5.0 + // minDelayMs = 3000 + // respectCrawlDelayUpToSeconds = 300 + // maxDelayMs = 30000 + // maxPerHostBandwidthUsageKbSec = 0 + } + /* + rescheduler(ReschedulingProcessor) { + rescheduleDelaySeconds = -1 + } + */ + // now, processors are assembled into ordered DispositionChain bean + dispositionProcessors(DispositionChain) { + processors = [ + // write to aggregate archival files... + ref('warcWriter'), + // ...send each outlink candidate URI to CandidateChain, + // and enqueue those ACCEPTed to the frontier... + ref('candidates'), + // ...then update stats, shared-structures, frontier decisions + ref('disposition') + // ref('rescheduler') + ] + } + + /** + * CRAWLCONTROLLER: Control interface, unifying context + */ + crawlController(CrawlController) { + // maxToeThreads = 25 + // pauseAtStart = true + // runWhileEmpty = false + // recorderInBufferBytes = 524288 + // recorderOutBufferBytes = 16384 + // scratchDir = 'scratch' + } + + /** + * FRONTIER: Record of all URIs discovered and queued-for-collection + */ + frontier(BdbFrontier) { + // queueTotalBudget = -1 + // balanceReplenishAmount = 3000 + // errorPenaltyAmount = 100 + // precedenceFloor = 255 + // queuePrecedencePolicy = new org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy() + // snoozeLongMs = 300000 + // retryDelaySeconds = 900 + // maxRetries = 30 + // recoveryLogEnabled = true + // maxOutlinks = 6000 + // extractIndependently = false + // outbound = new ArrayBlockingQueue(200, true) + // inbound = new ArrayBlockingQueue(40000, true) + // dumpPendingAtClose = false + } + + /** + * URI UNIQ FILTER: Used by frontier to remember already-included URIs + */ + uriUniqFilter(BdbUriUniqFilter) + + /* + * EXAMPLE SETTINGS OVERLAY SHEETS + * Sheets allow some settings to vary by context - usually by URI context, + * so that 
different sites or sections of sites can be treated differently. + * Here are some example Sheets for common purposes. The SheetOverlaysManager + * (below) automatically collects all Sheet instances declared among the + * original beans, but others can be added during the crawl via the scripting + * interface. + */ + + /** + * forceRetire: any URI to which this sheet's settings are applied + * will force its containing queue to 'retired' status. + */ + forceRetire(Sheet) { + map = [ + 'disposition.forceRetire': 'true' + ] + } + + /** + * smallBudget: any URI to which this sheet's settings are applied + * will give its containing queue small values for balanceReplenishAmount + * (causing it to have shorter 'active' periods while other queues are + * waiting) and queueTotalBudget (causing the queue to enter 'retired' + * status once that expenditure is reached by URI attempts and errors) + */ + smallBudget(Sheet) { + map = [ + 'frontier.balanceReplenishAmount': '20', + 'frontier.queueTotalBudget': '100' + ] + } + + /** + * veryPolite: any URI to which this sheet's settings are applied + * will cause its queue to take extra-long politeness snoozes + */ + veryPolite(Sheet) { + map = [ + 'disposition.delayFactor': '10', + 'disposition.minDelayMs': '10000', + 'disposition.maxDelayMs': '1000000', + 'disposition.respectCrawlDelayUpToSeconds': '3600' + ] + } + + /** + * highPrecedence: any URI to which this sheet's settings are applied + * will give its containing queue a slightly-higher than default + * queue precedence value. That queue will then be preferred over + * other queues for active crawling, never waiting behind lower- + * precedence queues. + */ + highPrecedence(Sheet) { + map = [ + 'frontier.balanceReplenishAmount': '20', + 'frontier.queueTotalBudget': '100' + ] + } + + /* + * EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION + * A SheetAssociation says certain URIs should have certain overlay Sheets + * applied. 
This example applies two sheets to URIs matching two SURT-prefixes. + * New associations may also be added mid-crawl using the scripting facility. + */ + + /* + surtPrefixesSheetAssociation(SurtPrefixesSheetAssociation) { + surtPrefixes = [ + 'http://(org,example,', + 'http://(com,example,www,)/' + ] + targetSheetNames = [ + 'veryPolite', + 'smallBudget' + ] + } + */ + + /* + * OPTIONAL BUT RECOMMENDED BEANS + */ + + /** + * ACTIONDIRECTORY: disk directory for mid-crawl operations + * Running job will watch directory for new files with URIs, + * scripts, and other data to be processed during a crawl. + */ + actionDirectory(ActionDirectory) { + // actionDir = 'action' + // doneDir = '${launchId}/actions-done' + // initialDelaySeconds = 10 + // delaySeconds = 30 + } + + /** + * CRAWLLIMITENFORCER: stops crawl when it reaches configured limits + */ + crawlLimiter(CrawlLimitEnforcer) { + // maxBytesDownload = 0 + // maxDocumentsDownload = 0 + // maxTimeSeconds = 0 + } + + /** + * CHECKPOINTSERVICE: checkpointing assistance + */ + checkpointService(CheckpointService) { + // checkpointIntervalMinutes = -1 + // checkpointOnShutdown = true + // checkpointsDir = 'checkpoints' + // forgetAllButLatest = true + } + + /* + * OPTIONAL BEANS + * + * Uncomment and expand as needed, or if non-default alternate + * implementations are preferred. 
+ */ + + /** + * CANONICALIZATION POLICY + */ + /* + canonicalizationPolicy(RulesCanonicalizationPolicy) { + rules = [ + new LowercaseRule(), + new StripUserinfoRule(), + new StripWWWNRule(), + new StripSessionIDs(), + new StripSessionCFIDs(), + new FixupQueryString() + ] + } + */ + + /** + * QUEUE ASSIGNMENT POLICY + */ + /* + queueAssignmentPolicy(SurtAuthorityQueueAssignmentPolicy) { + forceQueueAssignment = '' + deferToPrevious = true + parallelQueues = 1 + } + */ + + /** + * URI PRECEDENCE POLICY + */ + // uriPrecedencePolicy(CostUriPrecedencePolicy) + + /** + * COST ASSIGNMENT POLICY + */ + costAssignmentPolicy(UnitCostAssignmentPolicy) + + /** + * CREDENTIAL STORE: HTTP authentication or FORM POST credentials + */ + // credentialStore(org.archive.modules.credential.CredentialStore) + + /** + * DISK SPACE MONITOR: + * Pauses the crawl if disk space at monitored paths falls below minimum threshold + * Note: If there's less than 5 GiB free for state directory BDB will throw + * an error which the crawl job will likely not be able to fully recover from. + */ + /* + diskSpaceMonitor(DiskSpaceMonitor) { + pauseThresholdMiB = 8192 + monitorConfigPaths = true + monitorPaths = [ + 'PATH' + ] + } + */ + + /* + * REQUIRED STANDARD BEANS + * It will be very rare to replace or reconfigure the following beans. 
+ */ + + /** + * STATISTICSTRACKER: standard stats/reporting collector + */ + statisticsTracker(StatisticsTracker) { bean -> + bean.autowire = 'byName' + /* + reports = [ + new CrawlSummaryReport(), + new SeedsReport(), + new HostsReport( + maxSortSize: -1, + suppressEmptyHosts: false, + ), + new SourceTagsReport(), + new MimetypesReport(), + new ResponseCodeReport(), + new ProcessorsReport(), + new FrontierSummaryReport(), + new FrontierNonemptyReport(), + new ToeThreadsReport(), + ] + */ + // reportsDir = '${launchId}/reports' + // liveHostReportSize = 20 + // intervalSeconds = 20 + // keepSnapshotsCount = 5 + // liveHostReportSize = 20 + } + + /** + * CRAWLERLOGGERMODULE: shared logging facility + */ + loggerModule(CrawlerLoggerModule) { + // path = '${launchId}/logs' + // crawlLogPath = 'crawl.log' + // alertsLogPath = 'alerts.log' + // progressLogPath = 'progress-statistics.log' + // uriErrorsLogPath = 'uri-errors.log' + // runtimeErrorsLogPath = 'runtime-errors.log' + // nonfatalErrorsLogPath = 'nonfatal-errors.log' + // logExtraInfo = false + } + + /** + * SHEETOVERLAYMANAGER: manager of sheets of contextual overlays + * Autowired to include any SheetForSurtPrefix or + * SheetForDecideRuled beans + */ + sheetOverlaysManager(SheetOverlaysManager) { bean -> + bean.autowire = 'byType' + } + + /** + * BDBMODULE: shared BDB-JE disk persistence manager + */ + bdb(BdbModule) { + // dir = 'state' + /* + * if neither cachePercent or cacheSize are specified (the default), bdb + * uses its own default of 60% + */ + // cachePercent = 0 + // cacheSize = 0 + // useSharedCache = true + // expectedConcurrency = 25 + } + + /** + * BDBCOOKIESTORE: disk-based cookie storage for FetchHTTP + */ + cookieStore(BdbCookieStore) { + // cookiesLoadFile = null + // cookiesSaveFile = null + // bdbModule = ref('bdb') + } + + /** + * SERVERCACHE: shared cache of server/host info + */ + serverCache(BdbServerCache) { + // bdb = ref('bdb') + } + + /** + * CONFIG PATH CONFIGURER: required 
helper making crawl paths relative + * to crawler-beans.cxml file, and tracking crawl files for web UI + */ + configPathConfigurer(ConfigPathConfigurer) +} \ No newline at end of file diff --git a/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java b/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java new file mode 100644 index 000000000..398d220f6 --- /dev/null +++ b/engine/src/test/java/org/archive/crawler/restlet/ProfileCrawlerBeansTest.java @@ -0,0 +1,35 @@ +package org.archive.crawler.restlet; + +import org.archive.spring.PathSharingContext; +import org.junit.Test; +import org.springframework.validation.Errors; + +import java.util.HashMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ProfileCrawlerBeansTest { + @Test + public void testXmlProfile() { + testProfile("classpath:/org/archive/crawler/restlet/profile-crawler-beans.cxml"); + } + + @Test + public void testGroovyProfile() { + testProfile("classpath:/org/archive/crawler/restlet/profile-crawler-beans.groovy"); + } + + private static void testProfile(String location) { + String profile = location.substring(location.lastIndexOf('/') + 1); + try (var context = new PathSharingContext(location)) { + context.validate(); + HashMap<String, Errors> allErrors = context.getAllErrors(); + assertEquals(profile + " should have one bean with errors", 1, allErrors.size()); + Errors metadataErrors = allErrors.get("metadata"); + assertEquals(profile + " Metadata bean should have one error", 1, metadataErrors.getErrorCount()); + assertTrue(profile + " should have the operator contact info error", + metadataErrors.getAllErrors().get(0).toString().contains("ENTER_AN_URL_WITH_YOUR_CONTACT_INFO")); + } + } +} diff --git a/modules/pom.xml b/modules/pom.xml index 7d74dc300..138a6670e 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -115,6 +115,5 @@ UTF-8 - 4.0.24 diff --git a/pom.xml b/pom.xml index 4222a5db4..f5fcc1fbb 100644 
--- a/pom.xml +++ b/pom.xml @@ -370,6 +370,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html ${maven.build.timestamp} none -Xdoclint:none + 4.0.24 9.4.56.v20240826 2.0.16 17