Merge pull request #575 from internetarchive/pdfbox

Switch ExtractorPDF and ExtractorPDFContent to pdfbox
internetarchive · Apr 12, 2024 · b3d276b
2 parents 4552509 + 19dd5c0
commit b3d276b
Show file tree

Hide file tree

Showing 12 changed files with 127 additions and 231 deletions.
diff --git a/commons/pom.xml b/commons/pom.xml
@@ -110,12 +110,6 @@
 			<version>2.5.1-final-20040804</version>
 			<scope>compile</scope>
 		</dependency>
-		<dependency>
-			<groupId>com.lowagie</groupId>
-			<artifactId>itext</artifactId>
-			<version>1.3</version>
-			<scope>compile</scope>
-		</dependency>
 		<dependency>
 			<groupId>junit</groupId>
 			<artifactId>junit</artifactId>

diff --git a/contrib/pom.xml b/contrib/pom.xml
@@ -58,9 +58,9 @@
 			<version>4.8.0</version>
 		</dependency>
 		<dependency>
-			<groupId>com.itextpdf</groupId>
-			<artifactId>itextpdf</artifactId>
-			<version>5.5.12</version>
+			<groupId>org.apache.pdfbox</groupId>
+			<artifactId>pdfbox</artifactId>
+			<version>3.0.1</version>
 		</dependency>
 		<dependency>
 			<groupId>org.easymock</groupId>

diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java
@@ -18,27 +18,22 @@
  */
 package org.archive.modules.extractor;
 
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.archive.modules.CrawlURI;
+
+import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.commons.httpclient.URIException;
-import org.archive.modules.CrawlURI;
-
-import com.itextpdf.text.pdf.PdfReader;
-import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
-import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
-import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
-
 /**
  * PDF Content Extractor. This will parse the text content of a PDF and apply a
  * regex to search for links within the body of the text.
  * 
- * Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar
- * 
  * @author adam
  */
 public class ExtractorPDFContent extends ContentExtractor {
@@ -48,7 +43,7 @@ public class ExtractorPDFContent extends ContentExtractor {
 
     private static final Logger LOGGER =
         Logger.getLogger(ExtractorPDFContent.class.getName());
-    
+
     public static final Pattern URLPattern = Pattern.compile(
             "(?i)\\(?(https?):\\/\\/"+                                                  // protocol
             "(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+           // username
@@ -65,7 +60,7 @@ public class ExtractorPDFContent extends ContentExtractor {
             "((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+    // continue possible path
             "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+       // or possible query
             ")?");
-    
+
     {
         setMaxSizeToParse(10*1024*1024L); // 10MB
     }
@@ -79,47 +74,56 @@ public long getMaxSizeToParse() {
     public void setMaxSizeToParse(long threshold) {
         kp.put("maxSizeToParse",threshold);
     }
-    
-    
+
+
     public ExtractorPDFContent() {
     }
 
     protected boolean innerExtract(CrawlURI curi){
-        PdfReader documentReader;
         ArrayList<String> uris = new ArrayList<String>();
-
+        File tempFile = null;
         try {
-            documentReader = new PdfReader(curi.getRecorder().getContentReplayInputStream());
-
-            for(int i=1; i<= documentReader.getNumberOfPages(); i++) { //Page numbers start at 1
-                String pageParseText = extractPageText(documentReader,i);
-                Matcher matcher = URLPattern.matcher(pageParseText);
-
-                while(matcher.find()) {
-                    String prospectiveURL = pageParseText.substring(matcher.start(),matcher.end()).trim();
-
-                    //handle URLs wrapped in parentheses
-                    if(prospectiveURL.startsWith("(")) {
-                        prospectiveURL=prospectiveURL.substring(1,prospectiveURL.length());
-                        if(prospectiveURL.endsWith(")"))
-                            prospectiveURL=prospectiveURL.substring(0,prospectiveURL.length()-1);
-                    }
-
-                    uris.add(prospectiveURL);
-
-                    //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
-                    if(prospectiveURL.endsWith(".") && prospectiveURL.length()>2)
-                        uris.add(prospectiveURL.substring(0, prospectiveURL.length()-1));
-
-                    //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
-                    if(matcher.group(19)!=null) {
-                        String alternateURL = matcher.group(1)+"://"+(matcher.group(2)!=null?matcher.group(2):"")+matcher.group(6)+matcher.group(13);
-
-                        //Again, handle URLs wrapped in parentheses
-                        if(prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
-                            alternateURL=alternateURL.substring(0,alternateURL.length()-1);
-
-                        uris.add(alternateURL);
+            tempFile = File.createTempFile("heritrix-ExtractorPDFContent", "tmp.pdf");
+            curi.getRecorder().copyContentBodyTo(tempFile);
+            try (PDDocument document = Loader.loadPDF(tempFile)) {
+                PDFTextStripper textStripper = new PDFTextStripper();
+
+                for (int i = 1; i <= document.getNumberOfPages(); i++) { //Page numbers start at 1
+                    textStripper.setStartPage(i);
+                    textStripper.setEndPage(i);
+                    String pageParseText = textStripper.getText(document);
+                    Matcher matcher = URLPattern.matcher(pageParseText);
+
+                    while (matcher.find()) {
+                        String prospectiveURL = pageParseText.substring(matcher.start(), matcher.end()).trim();
+
+                        //handle URLs wrapped in parentheses
+                        if (prospectiveURL.startsWith("(")) {
+                            prospectiveURL = prospectiveURL.substring(1, prospectiveURL.length());
+                            if (prospectiveURL.endsWith(")"))
+                                prospectiveURL = prospectiveURL.substring(0, prospectiveURL.length() - 1);
+                        }
+
+                        uris.add(prospectiveURL);
+
+                        //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
+                        if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2)
+                            uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1));
+
+                        //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
+                        if (matcher.group(19) != null) {
+                            String alternateURL = matcher.group(1) + "://" + (matcher.group(2) != null ? matcher.group(2) : "") + matcher.group(6) + matcher.group(13);
+
+                            //Again, handle URLs wrapped in parentheses
+                            if (prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
+                                alternateURL = alternateURL.substring(0, alternateURL.length() - 1);
+
+                            // Again, remove trailing '.'
+                            if (alternateURL.endsWith(".") && alternateURL.length() > 2)
+                                alternateURL = alternateURL.substring(0, alternateURL.length() - 1);
+
+                            uris.add(alternateURL);
+                        }
                     }
                 }
             }
@@ -130,7 +134,11 @@ protected boolean innerExtract(CrawlURI curi){
         } catch (RuntimeException e) {
             curi.getNonFatalFailures().add(e);
             return false;
-        } 
+        } finally {
+            if (tempFile != null) {
+                tempFile.delete();
+            }
+        }
 
         if (uris.size()<1) {
             return true;
@@ -146,21 +154,7 @@ protected boolean innerExtract(CrawlURI curi){
         // Set flag to indicate that link extraction is completed.
         return true;
     }
-
-    public String extractPageText(PdfReader documentReader, int pageNum){
-        String content ="";
-        PdfReaderContentParser parser = new PdfReaderContentParser(documentReader);
-        TextExtractionStrategy strat;
-        try {
-            strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy());
-            content = strat.getResultantText();
-
-        } catch (IOException e) {
-            LOGGER.log(Level.WARNING, "Failed to parse pdf text in "
-                    + Thread.currentThread().getName(), e);
-        }
-        return content;
-    }
+
     @Override
     protected boolean shouldExtract(CrawlURI uri) {
         long max = getMaxSizeToParse();

diff --git a/contrib/src/main/resources/log4j.xml b/contrib/src/main/resources/log4j.xml
@@ -11,6 +11,10 @@
 		<level value="ERROR" />
 	</logger>
 
+	<logger name="org.apache.pdfbox">
+		<level value="ERROR" />
+	</logger>
+
 	<logger name="org.eclipse.jetty">
 		<level value="ERROR" />
 	</logger>

diff --git a/contrib/src/test/resources/log4j.xml b/contrib/src/test/resources/log4j.xml
@@ -11,6 +11,10 @@
 		<level value="ERROR" />
 	</logger>
 
+	<logger name="org.apache.pdfbox">
+		<level value="ERROR" />
+	</logger>
+
 	<logger name="org.eclipse.jetty">
 		<level value="ERROR" />
 	</logger>

diff --git a/dist/src/main/conf/logging.properties b/dist/src/main/conf/logging.properties
@@ -6,6 +6,7 @@
 org.apache.commons.httpclient.level = SEVERE
 org.restlet.Component.LogFilter.level = SEVERE
 org.eclipse.jetty.log.level = SEVERE
+org.apache.pdfbox.level = SEVERE
 # ...but INFO for our classes, which reserve FINE/FINER/FINEST for bulk/trivia...
 org.archive.level = INFO
 

diff --git a/dist/src/main/licenses/itext.LICENSE b/dist/src/main/licenses/itext.LICENSE
diff --git a/modules/pom.xml b/modules/pom.xml
@@ -95,6 +95,11 @@
 			<artifactId>jsch</artifactId>
 			<version>0.1.54</version>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.pdfbox</groupId>
+			<artifactId>pdfbox</artifactId>
+			<version>3.0.1</version>
+		</dependency>
 	</dependencies>
 	<build>
 		<plugins>

diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java
@@ -89,12 +89,12 @@ protected boolean innerExtract(CrawlURI curi){
             throw new RuntimeException(ioe);
         }
 
-        PDFParser parser;
         ArrayList<String> uris;
         try {
             curi.getRecorder().copyContentBodyTo(tempFile);
-            parser = new PDFParser(tempFile.getAbsolutePath());
-            uris = parser.extractURIs();
+            try (PDFParser parser = new PDFParser(tempFile.getAbsolutePath())){
+                uris = parser.extractURIs();
+            }
         } catch (IOException e) {
             curi.getNonFatalFailures().add(e);
             return false;