Skip to content

Commit

Permalink
Merge pull request #632 from internetarchive/groovy-config
Browse files Browse the repository at this point in the history
Add Groovy crawl configs
  • Loading branch information
ato authored Dec 24, 2024
2 parents bfa8692 + 19c24b9 commit 4c4510a
Show file tree
Hide file tree
Showing 11 changed files with 852 additions and 6 deletions.
5 changes: 5 additions & 0 deletions commons/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@
<artifactId>jsch</artifactId>
<version>0.2.21</version>
</dependency>
<dependency>
<groupId>org.apache.groovy</groupId>
<artifactId>groovy</artifactId>
<version>${groovy.version}</version>
</dependency>
</dependencies>
<build>
<resources>
Expand Down
46 changes: 44 additions & 2 deletions commons/src/main/java/org/archive/spring/PathSharingContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,15 @@
import org.apache.commons.io.FileUtils;
import org.archive.util.ArchiveUtils;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanDefinitionStoreException;
import org.springframework.beans.factory.config.ConfigurableListableBeanFactory;
import org.springframework.beans.factory.groovy.GroovyBeanDefinitionReader;
import org.springframework.beans.factory.xml.XmlBeanDefinitionReader;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigUtils;
import org.springframework.context.support.FileSystemXmlApplicationContext;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.EncodedResource;
import org.springframework.validation.BeanPropertyBindingResult;
import org.springframework.validation.Errors;
import org.springframework.validation.Validator;
Expand All @@ -46,9 +52,13 @@
*
* Notable extensions:
*
* Remembers its primary XML configuration file, and can report its filesystem
* Remembers its primary configuration file, and can report its filesystem
* path.
*
*
* Supports both Spring XML and Groovy Bean Definition DSL.
*
* Automatically enables annotation processing (&lt;context:annotation-config/&gt;).
*
* Reports a summary of Errors collected from self-Validating Beans.
*
* Generates launchId from timestamp, creates launch directory
Expand Down Expand Up @@ -212,5 +222,37 @@ public ConcurrentHashMap<Object, Object> getData() {
return data;
}

/**
 * Loads this context's bean definitions, choosing the Groovy or the XML reader
 * per resource based on its filename.
 *
 * @param xmlReader the XML reader supplied by the superclass; also used as the
 *                  fallback for every resource that is not a {@code .groovy} file
 * @throws BeansException if a reader fails to load bean definitions
 * @throws IOException declared by the overridden method
 */
@Override
protected void loadBeanDefinitions(XmlBeanDefinitionReader xmlReader) throws BeansException, IOException {
    // Equivalent of <context:annotation-config/>; registering the processors here
    // means individual crawl configs do not have to declare it themselves.
    AnnotationConfigUtils.registerAnnotationConfigProcessors(xmlReader.getRegistry());

    GroovyBeanDefinitionReader dispatchingReader = new GroovyBeanDefinitionReader(xmlReader.getRegistry()) {
        // The stock Groovy reader treats .xml as XML and everything else as Groovy.
        // Heritrix configs use .cxml, so the rule is inverted here: only .groovy
        // files go to the Groovy reader and all other resources go to the XML reader.
        @Override
        public int loadBeanDefinitions(EncodedResource encodedResource) throws BeanDefinitionStoreException {
            String resourceName = encodedResource.getResource().getFilename();
            boolean isGroovy = resourceName != null && resourceName.endsWith(".groovy");
            return isGroovy
                    ? super.loadBeanDefinitions(encodedResource)
                    : xmlReader.loadBeanDefinitions(encodedResource);
        }
    };
    dispatchingReader.setEnvironment(getEnvironment());

    // Explicit Resource objects are loaded first, then location strings.
    Resource[] resources = getConfigResources();
    if (resources != null) {
        dispatchingReader.loadBeanDefinitions(resources);
    }

    String[] locations = getConfigLocations();
    if (locations != null) {
        dispatchingReader.loadBeanDefinitions(locations);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package org.archive.spring;

import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/**
 * Checks that {@link PathSharingContext} builds the same pair of beans from a
 * Groovy bean DSL config and from a Spring XML (.cxml) config, and that the
 * {@code @Autowired} setter on {@link Bean2} is honoured in both cases.
 */
public class PathSharingContextTest {
    @Test
    public void testGroovyConfig() {
        testConfig("groovy", "classpath:org/archive/spring/PathSharingContextTestBeans.groovy");
    }

    @Test
    public void testXmlConfig() {
        testConfig("xml", "classpath:org/archive/spring/PathSharingContextTestBeans.cxml");
    }

    /**
     * Loads the given config into a fresh context and asserts on the resulting beans.
     *
     * @param name       the value the config is expected to have set on {@code bean1.name}
     * @param configPath the config location passed to the context constructor
     */
    private static void testConfig(String name, String configPath) {
        try (var context = new PathSharingContext(configPath)) {
            context.validate();
            assertTrue("should be no validation errors", context.getAllErrors().isEmpty());
            assertEquals("primaryConfigurationPath should be correct", configPath, context.getPrimaryConfigurationPath());
            Bean1 bean1 = context.getBean("bean1", Bean1.class);
            Bean2 bean2 = context.getBean("bean2", Bean2.class);
            assertNotNull("bean1 should not be null", bean1);
            assertNotNull("bean2 should not be null", bean2);
            assertEquals("bean1.name should be set", name, bean1.name);
            // Identity, not equality: bean2 must hold the very instance the context returns.
            assertSame("bean1 should be autowired into bean2", bean1, bean2.bean1);
        }
    }

    /** Simple bean with one property set explicitly by the config under test. */
    public static class Bean1 {
        private String name;

        public void setName(String name) {
            this.name = name;
        }
    }

    /** Bean whose only dependency is supplied through an {@code @Autowired} setter. */
    public static class Bean2 {
        private Bean1 bean1;

        @Autowired
        public void setBean1(Bean1 bean1) {
            this.bean1 = bean1;
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring XML fixture loaded by PathSharingContextTest.testXmlConfig. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">

<!-- NOTE(review): PathSharingContext.loadBeanDefinitions registers the annotation
     config processors itself, so this element looks redundant here. Presumably it is
     kept to mirror existing crawl configs; confirm whether a variant without it
     should also be tested. -->
<context:annotation-config/>

<!-- bean1: the test asserts that its name property equals "xml". -->
<bean id="bean1" class="org.archive.spring.PathSharingContextTest$Bean1">
<property name="name" value="xml"/>
</bean>
<!-- bean2: no properties declared; the test expects bean1 to arrive via the
     @Autowired setter on Bean2. -->
<bean id="bean2" class="org.archive.spring.PathSharingContextTest$Bean2"/>
</beans>
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import org.archive.spring.PathSharingContextTest

// Groovy bean DSL fixture loaded by PathSharingContextTest.testGroovyConfig.
beans {
// bean1: the test asserts that its name property equals "groovy".
bean1(PathSharingContextTest.Bean1) {
name = "groovy"
}
// bean2: no properties set here; the test expects bean1 to arrive via the
// @Autowired setter on Bean2.
bean2(PathSharingContextTest.Bean2)
}
10 changes: 8 additions & 2 deletions engine/src/main/java/org/archive/crawler/framework/CrawlJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,14 @@ public void writeHtmlTo(PrintWriter pw, String uriPrefix) {
public void checkXML() {
// TODO: suppress check if XML unchanged? job.log when XML changed?

Instant testTime = Instant.ofEpochMilli(getPrimaryConfig().lastModified());
Document doc = getDomDocument(getPrimaryConfig());
File primaryConfig = getPrimaryConfig();
Instant testTime = Instant.ofEpochMilli(primaryConfig.lastModified());
if (primaryConfig.toString().endsWith(".groovy")) {
// just assume Groovy configs are OK
xmlOkAt = testTime;
return;
}
Document doc = getDomDocument(primaryConfig);
// TODO: check for other minimal requirements, like
// presence of a few key components (CrawlController etc.)?
if(doc!=null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ public boolean addJobDirectory(File dir) {
}
File[] candidateConfigs = dir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".cxml");
return name.endsWith(".cxml") || name.equals("crawler-beans.groovy");
}});
if(candidateConfigs==null || candidateConfigs.length == 0) {
// no CXML file found!
Expand Down
Loading

0 comments on commit 4c4510a

Please sign in to comment.