Skip to content

Commit

Permalink
Merge pull request #575 from internetarchive/pdfbox
Browse files Browse the repository at this point in the history
Switch ExtractorPDF and ExtractorPDFContent to pdfbox
  • Loading branch information
ato authored Apr 12, 2024
2 parents 4552509 + 19dd5c0 commit b3d276b
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 231 deletions.
6 changes: 0 additions & 6 deletions commons/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,6 @@
<version>2.5.1-final-20040804</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.lowagie</groupId>
<artifactId>itext</artifactId>
<version>1.3</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
6 changes: 3 additions & 3 deletions contrib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
<version>4.8.0</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.12</version>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.easymock</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,22 @@
*/
package org.archive.modules.extractor;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.archive.modules.CrawlURI;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
* PDF Content Extractor. This will parse the text content of a PDF and apply a
* regex to search for links within the body of the text.
*
* Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar
*
* @author adam
*/
public class ExtractorPDFContent extends ContentExtractor {
Expand All @@ -48,7 +43,7 @@ public class ExtractorPDFContent extends ContentExtractor {

private static final Logger LOGGER =
Logger.getLogger(ExtractorPDFContent.class.getName());

public static final Pattern URLPattern = Pattern.compile(
"(?i)\\(?(https?):\\/\\/"+ // protocol
"(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+ // username
Expand All @@ -65,7 +60,7 @@ public class ExtractorPDFContent extends ContentExtractor {
"((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // continue possible path
"(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+ // or possible query
")?");

// Instance initializer: applies the default maxSizeToParse setting via the setter.
{
setMaxSizeToParse(10*1024*1024L); // 10MB
}
Expand All @@ -79,47 +74,56 @@ public long getMaxSizeToParse() {
/**
 * Stores the given threshold in {@code kp} under the key "maxSizeToParse".
 *
 * @param threshold the new limit; the instance initializer passes
 *                  10*1024*1024 (annotated there as 10MB)
 */
public void setMaxSizeToParse(long threshold) {
kp.put("maxSizeToParse",threshold);
}


/**
 * No-argument constructor; the constructor body itself performs no work.
 */
public ExtractorPDFContent() {
}

protected boolean innerExtract(CrawlURI curi){
PdfReader documentReader;
ArrayList<String> uris = new ArrayList<String>();

File tempFile = null;
try {
documentReader = new PdfReader(curi.getRecorder().getContentReplayInputStream());

for(int i=1; i<= documentReader.getNumberOfPages(); i++) { //Page numbers start at 1
String pageParseText = extractPageText(documentReader,i);
Matcher matcher = URLPattern.matcher(pageParseText);

while(matcher.find()) {
String prospectiveURL = pageParseText.substring(matcher.start(),matcher.end()).trim();

//handle URLs wrapped in parentheses
if(prospectiveURL.startsWith("(")) {
prospectiveURL=prospectiveURL.substring(1,prospectiveURL.length());
if(prospectiveURL.endsWith(")"))
prospectiveURL=prospectiveURL.substring(0,prospectiveURL.length()-1);
}

uris.add(prospectiveURL);

//parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
if(prospectiveURL.endsWith(".") && prospectiveURL.length()>2)
uris.add(prospectiveURL.substring(0, prospectiveURL.length()-1));

//Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
if(matcher.group(19)!=null) {
String alternateURL = matcher.group(1)+"://"+(matcher.group(2)!=null?matcher.group(2):"")+matcher.group(6)+matcher.group(13);

//Again, handle URLs wrapped in parentheses
if(prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
alternateURL=alternateURL.substring(0,alternateURL.length()-1);

uris.add(alternateURL);
tempFile = File.createTempFile("heritrix-ExtractorPDFContent", "tmp.pdf");
curi.getRecorder().copyContentBodyTo(tempFile);
try (PDDocument document = Loader.loadPDF(tempFile)) {
PDFTextStripper textStripper = new PDFTextStripper();

for (int i = 1; i <= document.getNumberOfPages(); i++) { //Page numbers start at 1
textStripper.setStartPage(i);
textStripper.setEndPage(i);
String pageParseText = textStripper.getText(document);
Matcher matcher = URLPattern.matcher(pageParseText);

while (matcher.find()) {
String prospectiveURL = pageParseText.substring(matcher.start(), matcher.end()).trim();

//handle URLs wrapped in parentheses
if (prospectiveURL.startsWith("(")) {
prospectiveURL = prospectiveURL.substring(1, prospectiveURL.length());
if (prospectiveURL.endsWith(")"))
prospectiveURL = prospectiveURL.substring(0, prospectiveURL.length() - 1);
}

uris.add(prospectiveURL);

//parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2)
uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1));

//Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
if (matcher.group(19) != null) {
String alternateURL = matcher.group(1) + "://" + (matcher.group(2) != null ? matcher.group(2) : "") + matcher.group(6) + matcher.group(13);

//Again, handle URLs wrapped in parentheses
if (prospectiveURL.startsWith("(") && alternateURL.endsWith(")"))
alternateURL = alternateURL.substring(0, alternateURL.length() - 1);

// Again, remove trailing '.'
if (alternateURL.endsWith(".") && alternateURL.length() > 2)
alternateURL = alternateURL.substring(0, alternateURL.length() - 1);

uris.add(alternateURL);
}
}
}
}
Expand All @@ -130,7 +134,11 @@ protected boolean innerExtract(CrawlURI curi){
} catch (RuntimeException e) {
curi.getNonFatalFailures().add(e);
return false;
}
} finally {
if (tempFile != null) {
tempFile.delete();
}
}

if (uris.size()<1) {
return true;
Expand All @@ -146,21 +154,7 @@ protected boolean innerExtract(CrawlURI curi){
// Set flag to indicate that link extraction is completed.
return true;
}

public String extractPageText(PdfReader documentReader, int pageNum){
String content ="";
PdfReaderContentParser parser = new PdfReaderContentParser(documentReader);
TextExtractionStrategy strat;
try {
strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy());
content = strat.getResultantText();

} catch (IOException e) {
LOGGER.log(Level.WARNING, "Failed to parse pdf text in "
+ Thread.currentThread().getName(), e);
}
return content;
}

@Override
protected boolean shouldExtract(CrawlURI uri) {
long max = getMaxSizeToParse();
Expand Down
4 changes: 4 additions & 0 deletions contrib/src/main/resources/log4j.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
<level value="ERROR" />
</logger>

<logger name="org.apache.pdfbox">
<level value="ERROR" />
</logger>

<logger name="org.eclipse.jetty">
<level value="ERROR" />
</logger>
Expand Down
4 changes: 4 additions & 0 deletions contrib/src/test/resources/log4j.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
<level value="ERROR" />
</logger>

<logger name="org.apache.pdfbox">
<level value="ERROR" />
</logger>

<logger name="org.eclipse.jetty">
<level value="ERROR" />
</logger>
Expand Down
1 change: 1 addition & 0 deletions dist/src/main/conf/logging.properties
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
org.apache.commons.httpclient.level = SEVERE
org.restlet.Component.LogFilter.level = SEVERE
org.eclipse.jetty.log.level = SEVERE
# java.util.logging reads logger levels from "<logger>.level" keys; a bare logger name is ignored
org.apache.pdfbox.level = SEVERE
# ...but INFO for our classes, which reserve FINE/FINER/FINEST for bulk/trivia...
org.archive.level = INFO

Expand Down
24 changes: 0 additions & 24 deletions dist/src/main/licenses/itext.LICENSE

This file was deleted.

5 changes: 5 additions & 0 deletions modules/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@
<artifactId>jsch</artifactId>
<version>0.1.54</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.1</version>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ protected boolean innerExtract(CrawlURI curi){
throw new RuntimeException(ioe);
}

PDFParser parser;
ArrayList<String> uris;
try {
curi.getRecorder().copyContentBodyTo(tempFile);
parser = new PDFParser(tempFile.getAbsolutePath());
uris = parser.extractURIs();
try (PDFParser parser = new PDFParser(tempFile.getAbsolutePath())){
uris = parser.extractURIs();
}
} catch (IOException e) {
curi.getNonFatalFailures().add(e);
return false;
Expand Down
Loading

0 comments on commit b3d276b

Please sign in to comment.