diff --git a/commons/pom.xml b/commons/pom.xml
index 4fdedfc60..6729c616d 100644
--- a/commons/pom.xml
+++ b/commons/pom.xml
@@ -110,12 +110,6 @@
2.5.1-final-20040804
compile
-
- com.lowagie
- itext
- 1.3
- compile
-
junit
junit
diff --git a/contrib/pom.xml b/contrib/pom.xml
index 4841396fe..6c3e61d51 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -58,9 +58,9 @@
4.8.0
- com.itextpdf
- itextpdf
- 5.5.12
+ org.apache.pdfbox
+ pdfbox
+ 3.0.1
org.easymock
diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java
index b09bde3a6..7972211c5 100644
--- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java
+++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java
@@ -18,27 +18,22 @@
*/
package org.archive.modules.extractor;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.archive.modules.CrawlURI;
+
+import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.httpclient.URIException;
-import org.archive.modules.CrawlURI;
-
-import com.itextpdf.text.pdf.PdfReader;
-import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
-import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
-import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
-
/**
* PDF Content Extractor. This will parse the text content of a PDF and apply a
* regex to search for links within the body of the text.
*
- * Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar
- *
* @author adam
*/
public class ExtractorPDFContent extends ContentExtractor {
@@ -48,7 +43,7 @@ public class ExtractorPDFContent extends ContentExtractor {
private static final Logger LOGGER =
Logger.getLogger(ExtractorPDFContent.class.getName());
-
+
public static final Pattern URLPattern = Pattern.compile(
"(?i)\\(?(https?):\\/\\/"+ // protocol
"(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+ // username
@@ -65,7 +60,7 @@ public class ExtractorPDFContent extends ContentExtractor {
"((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // continue possible path
"(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+ // or possible query
")?");
-
+
{
setMaxSizeToParse(10*1024*1024L); // 10MB
}
@@ -79,47 +74,56 @@ public long getMaxSizeToParse() {
public void setMaxSizeToParse(long threshold) {
kp.put("maxSizeToParse",threshold);
}
-
-
+
+
public ExtractorPDFContent() {
}
protected boolean innerExtract(CrawlURI curi){
- PdfReader documentReader;
ArrayList uris = new ArrayList();
-
+ File tempFile = null;
try {
- documentReader = new PdfReader(curi.getRecorder().getContentReplayInputStream());
-
- for(int i=1; i<= documentReader.getNumberOfPages(); i++) { //Page numbers start at 1
- String pageParseText = extractPageText(documentReader,i);
- Matcher matcher = URLPattern.matcher(pageParseText);
-
- while(matcher.find()) {
- String prospectiveURL = pageParseText.substring(matcher.start(),matcher.end()).trim();
-
- //handle URLs wrapped in parentheses
- if(prospectiveURL.startsWith("(")) {
- prospectiveURL=prospectiveURL.substring(1,prospectiveURL.length());
- if(prospectiveURL.endsWith(")"))
- prospectiveURL=prospectiveURL.substring(0,prospectiveURL.length()-1);
- }
-
- uris.add(prospectiveURL);
-
- //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
- if(prospectiveURL.endsWith(".") && prospectiveURL.length()>2)
- uris.add(prospectiveURL.substring(0, prospectiveURL.length()-1));
-
- //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
- if(matcher.group(19)!=null) {
- String alternateURL = matcher.group(1)+"://"+(matcher.group(2)!=null?matcher.group(2):"")+matcher.group(6)+matcher.group(13);
-
- //Again, handle URLs wrapped in parentheses
- if(prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
- alternateURL=alternateURL.substring(0,alternateURL.length()-1);
-
- uris.add(alternateURL);
+ tempFile = File.createTempFile("heritrix-ExtractorPDFContent", "tmp.pdf");
+ curi.getRecorder().copyContentBodyTo(tempFile);
+ try (PDDocument document = Loader.loadPDF(tempFile)) {
+ PDFTextStripper textStripper = new PDFTextStripper();
+
+ for (int i = 1; i <= document.getNumberOfPages(); i++) { //Page numbers start at 1
+ textStripper.setStartPage(i);
+ textStripper.setEndPage(i);
+ String pageParseText = textStripper.getText(document);
+ Matcher matcher = URLPattern.matcher(pageParseText);
+
+ while (matcher.find()) {
+ String prospectiveURL = pageParseText.substring(matcher.start(), matcher.end()).trim();
+
+ //handle URLs wrapped in parentheses
+ if (prospectiveURL.startsWith("(")) {
+ prospectiveURL = prospectiveURL.substring(1, prospectiveURL.length());
+ if (prospectiveURL.endsWith(")"))
+ prospectiveURL = prospectiveURL.substring(0, prospectiveURL.length() - 1);
+ }
+
+ uris.add(prospectiveURL);
+
+ //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
+ if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2)
+ uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1));
+
+ //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
+ if (matcher.group(19) != null) {
+ String alternateURL = matcher.group(1) + "://" + (matcher.group(2) != null ? matcher.group(2) : "") + matcher.group(6) + matcher.group(13);
+
+ //Again, handle URLs wrapped in parentheses
+ if (prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
+ alternateURL = alternateURL.substring(0, alternateURL.length() - 1);
+
+ // Again, remove trailing '.'
+ if (alternateURL.endsWith(".") && alternateURL.length() > 2)
+ alternateURL = alternateURL.substring(0, alternateURL.length() - 1);
+
+ uris.add(alternateURL);
+ }
}
}
}
@@ -130,7 +134,11 @@ protected boolean innerExtract(CrawlURI curi){
} catch (RuntimeException e) {
curi.getNonFatalFailures().add(e);
return false;
- }
+ } finally {
+ if (tempFile != null) {
+ tempFile.delete();
+ }
+ }
if (uris.size()<1) {
return true;
@@ -146,21 +154,7 @@ protected boolean innerExtract(CrawlURI curi){
// Set flag to indicate that link extraction is completed.
return true;
}
-
- public String extractPageText(PdfReader documentReader, int pageNum){
- String content ="";
- PdfReaderContentParser parser = new PdfReaderContentParser(documentReader);
- TextExtractionStrategy strat;
- try {
- strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy());
- content = strat.getResultantText();
-
- } catch (IOException e) {
- LOGGER.log(Level.WARNING, "Failed to parse pdf text in "
- + Thread.currentThread().getName(), e);
- }
- return content;
- }
+
@Override
protected boolean shouldExtract(CrawlURI uri) {
long max = getMaxSizeToParse();
diff --git a/contrib/src/main/resources/log4j.xml b/contrib/src/main/resources/log4j.xml
index e04f34ab8..8a4dae8e8 100644
--- a/contrib/src/main/resources/log4j.xml
+++ b/contrib/src/main/resources/log4j.xml
@@ -11,6 +11,10 @@
+
+
+
+
diff --git a/contrib/src/test/resources/log4j.xml b/contrib/src/test/resources/log4j.xml
index e04f34ab8..8a4dae8e8 100644
--- a/contrib/src/test/resources/log4j.xml
+++ b/contrib/src/test/resources/log4j.xml
@@ -11,6 +11,10 @@
+
+
+
+
diff --git a/dist/src/main/conf/logging.properties b/dist/src/main/conf/logging.properties
index 094cb1762..b14a804fd 100644
--- a/dist/src/main/conf/logging.properties
+++ b/dist/src/main/conf/logging.properties
@@ -6,6 +6,7 @@
org.apache.commons.httpclient.level = SEVERE
org.restlet.Component.LogFilter.level = SEVERE
org.eclipse.jetty.log.level = SEVERE
+org.apache.pdfbox.level = SEVERE
# ...but INFO for our classes, which reserve FINE/FINER/FINEST for bulk/trivia...
org.archive.level = INFO
diff --git a/dist/src/main/licenses/itext.LICENSE b/dist/src/main/licenses/itext.LICENSE
deleted file mode 100644
index b34862001..000000000
--- a/dist/src/main/licenses/itext.LICENSE
+++ /dev/null
@@ -1,24 +0,0 @@
-From http://www.lowagie.com/iText/download.html
-
-iText: a Free Java-PDF library by Bruno Lowagie and Paulo Soares
-
-License Agreement
-
-iText is published under 2 different licenses: MPL and LGPL.
-
-If you are a new user of iText, MPL is recommended (without the LGPL).
-MPL is less strict than LGPL. Please read the MPL license agreement
-[http://www.lowagie.com/iText/MPL-1.1.txt] before downloading and/or
-using iText.
-
-LGPL is maintained for backward compatibility only. If you choose the
-LGPL, you need to mention the MPL as an alternative license. Please
-read the LGPL license agreement [http://www.lowagie.com/iText/lgpl.txt]
-for more info.
-
-This library is free and I want it to stay free: you can use it
-without paying a fee; you don't need to register anywhere. Only keep
-in mind that agreeing with the license is crucial to protect the
-software, its developers and its users. This library is distributed
-in the hope that it will be useful, but WITHOUT any warranty. If you
-don't like free software, don't (ab)use it!
diff --git a/modules/pom.xml b/modules/pom.xml
index a6755776a..e827f91f7 100644
--- a/modules/pom.xml
+++ b/modules/pom.xml
@@ -95,6 +95,11 @@
jsch
0.1.54
+
+ org.apache.pdfbox
+ pdfbox
+ 3.0.1
+
diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java
index 035264e19..683ee43fd 100644
--- a/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java
@@ -89,12 +89,12 @@ protected boolean innerExtract(CrawlURI curi){
throw new RuntimeException(ioe);
}
- PDFParser parser;
ArrayList uris;
try {
curi.getRecorder().copyContentBodyTo(tempFile);
- parser = new PDFParser(tempFile.getAbsolutePath());
- uris = parser.extractURIs();
+ try (PDFParser parser = new PDFParser(tempFile.getAbsolutePath())){
+ uris = parser.extractURIs();
+ }
} catch (IOException e) {
curi.getNonFatalFailures().add(e);
return false;
diff --git a/modules/src/main/java/org/archive/modules/extractor/PDFParser.java b/modules/src/main/java/org/archive/modules/extractor/PDFParser.java
index 8aa07cde0..7a6fecc66 100644
--- a/modules/src/main/java/org/archive/modules/extractor/PDFParser.java
+++ b/modules/src/main/java/org/archive/modules/extractor/PDFParser.java
@@ -18,12 +18,13 @@
*/
package org.archive.modules.extractor;
-import com.lowagie.text.pdf.PdfReader;
-import com.lowagie.text.pdf.PdfName;
-import com.lowagie.text.pdf.PdfObject;
-import com.lowagie.text.pdf.PdfDictionary;
-import com.lowagie.text.pdf.PRIndirectReference;
-import com.lowagie.text.pdf.PdfArray;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import java.io.*;
import java.util.*;
@@ -36,16 +37,11 @@
* @author Parker Thompson
*
*/
-//TODO make this more effecient, it currently had to read the whole file into memory
-// before processing can begin, and appears to take much longer than it "should"
-// to parse small, but admittedly complex, documents.
-public class PDFParser {
+public class PDFParser implements Closeable {
protected ArrayList foundURIs;
- protected ArrayList> encounteredReferences;
- protected PdfReader documentReader;
+ protected PDDocument documentReader;
protected byte[] document;
- protected PdfDictionary catalog;
public PDFParser(String doc) throws IOException {
resetState();
@@ -62,14 +58,8 @@ public PDFParser(byte[] doc) throws IOException{
*/
protected void resetState(){
foundURIs = new ArrayList();
- encounteredReferences = new ArrayList>();
documentReader = null;
document = null;
- catalog = null;
-
- for(int i=0; i < encounteredReferences.size(); i++){
- encounteredReferences.add(new ArrayList());
- }
}
/**
@@ -101,55 +91,7 @@ public void resetState(String doc) throws IOException{
*/
protected void getInFromFile(String doc) throws IOException{
File documentOnDisk = new File(doc);
-
- long length = documentOnDisk.length();
- document = new byte[(int)length];
-
- FileInputStream inStream = new FileInputStream(documentOnDisk);
-
- inStream.read(document);
- }
-
- /**
- * Indicates, based on a PDFObject's generation/id pair whether
- * the parser has already encountered this object (or a reference to it)
- * so we don't infinitely loop on circuits within the PDF.
- * @param generation
- * @param id
- * @return True if already seen.
- */
- protected boolean haveSeen(int generation, int id){
-
- // if we can't store this generation grow our list until we can
- if(generation >= encounteredReferences.size()){
- for(int i=encounteredReferences.size(); i <= generation; i++){
- encounteredReferences.add(new ArrayList());
- }
-
- // clearly we haven't seen it
- return false;
- }
-
- ArrayList generationList
- = encounteredReferences.get(generation);
-
- for (int i: generationList) {
- if(i == id){
- return true;
- }
- }
- return false;
- }
-
- /**
- * Note that an object (id/generation pair) has been seen by this parser
- * so that it can be handled differently when it is encountered again.
- * @param generation
- * @param id
- */
- protected void markAsSeen(int generation, int id){
- ArrayList objectIds = encounteredReferences.get(generation);
- objectIds.add(id);
+ documentReader = Loader.loadPDF(documentOnDisk);
}
/**
@@ -170,10 +112,8 @@ public ArrayList getURIs(){
*/
protected void initialize() throws IOException{
if(document != null){
- documentReader = new PdfReader(document);
+ documentReader = Loader.loadPDF(document); // in-memory input only; getInFromFile assigns documentReader itself for file input
}
-
- catalog = documentReader.getCatalog();
}
/**
@@ -181,73 +121,32 @@ protected void initialize() throws IOException{
* Returns an array list representing all URIs found in the document catalog tree.
* @return URIs from all objects found in a Pdf document's catalog.
*/
- public ArrayList extractURIs(){
- extractURIs(catalog);
- return getURIs();
- }
-
- /**
- * Parse a PdfDictionary, looking for URIs recursively and adding
- * them to foundURIs
- * @param entity
- */
- @SuppressWarnings("unchecked")
- protected void extractURIs(PdfObject entity){
-
- // deal with dictionaries
- if(entity.isDictionary()){
-
- PdfDictionary dictionary= (PdfDictionary)entity;
-
- Set allkeys = dictionary.getKeys();
- for (PdfName key: allkeys) {
- PdfObject value = dictionary.get(key);
-
- // see if it's the key is a UR[I,L]
- if( key.toString().equals("/URI") ||
- key.toString().equals("/URL") ) {
- foundURIs.add(value.toString());
-
- }else{
- this.extractURIs(value);
+ public ArrayList extractURIs() throws IOException {
+ for (PDPage page : documentReader.getPages()) {
+ for (PDAnnotation annotation : page.getAnnotations()) {
+ if (annotation instanceof PDAnnotationLink) { // only link annotations are inspected
+ PDAnnotationLink link = (PDAnnotationLink) annotation;
+ PDAction action = link.getAction();
+ if (action instanceof PDActionURI) {
+ String target = ((PDActionURI) action).getURI(); // null when the action has no URI entry
+ if (target != null) { foundURIs.add(target); } // never hand a null "URI" to callers
}
-
- }
-
- // deal with arrays
- }else if(entity.isArray()){
-
- PdfArray array = (PdfArray)entity;
- for (PdfObject pdfObject : (Iterable)array.getArrayList()) {
- this.extractURIs(pdfObject);
}
-
- // deal with indirect references
- }else if(entity.getClass() == PRIndirectReference.class){
-
- PRIndirectReference indirect = (PRIndirectReference)entity;
-
- // if we've already seen a reference to this object
- if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
- return;
-
- // note that we've seen it if it's new
- }else{
- markAsSeen(indirect.getGeneration(), indirect.getNumber() );
- }
-
- // dereference the "pointer" and process the object
- indirect.getReader(); // FIXME: examine side-effects
- PdfObject direct = PdfReader.getPdfObject(indirect);
-
- this.extractURIs(direct);
}
+ }
+ return getURIs();
}
- public static void main(String[] argv){
+ @Override
+ public void close() throws IOException { // releases the PDDocument held in documentReader
+ if (documentReader != null) { // resetState() leaves it null until a PDF has been loaded
+ documentReader.close();
+ }
+ }
+ public static void main(String[] argv){
try {
- PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
+ PDFParser parser = new PDFParser("/tmp/pdfspec.pdf");
ArrayList uris = parser.extractURIs();
Iterator i = uris.iterator();
while(i.hasNext()){
diff --git a/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java b/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java
new file mode 100644
index 000000000..58dedc7aa
--- /dev/null
+++ b/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java
@@ -0,0 +1,24 @@
+package org.archive.modules.extractor;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+/**
+ * Checks that {@link PDFParser} reports the URI of the link annotation in the bundled sample PDF.
+ */
+public class PDFParserTest {
+    @Test
+    public void test() throws IOException {
+        byte[] data = IOUtils.resourceToByteArray("/org/archive/crawler/modules/extractor/PDFParserTest.pdf");
+        // PDFParser is Closeable and holds the loaded PDDocument; release it even if the assertion fails.
+        try (PDFParser parser = new PDFParser(data)) {
+            ArrayList<String> uris = parser.extractURIs();
+            Assert.assertEquals(Collections.singletonList("https://example.com/link-annotation"), uris);
+        }
+    }
+}
\ No newline at end of file
diff --git a/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf b/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf
new file mode 100644
index 000000000..3d2e2f9a4
Binary files /dev/null and b/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf differ