diff --git a/commons/pom.xml b/commons/pom.xml index 4fdedfc60..6729c616d 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -110,12 +110,6 @@ 2.5.1-final-20040804 compile - - com.lowagie - itext - 1.3 - compile - junit junit diff --git a/contrib/pom.xml b/contrib/pom.xml index 4841396fe..6c3e61d51 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -58,9 +58,9 @@ 4.8.0 - com.itextpdf - itextpdf - 5.5.12 + org.apache.pdfbox + pdfbox + 3.0.1 org.easymock diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java index b09bde3a6..7972211c5 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorPDFContent.java @@ -18,27 +18,22 @@ */ package org.archive.modules.extractor; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.archive.modules.CrawlURI; + +import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.httpclient.URIException; -import org.archive.modules.CrawlURI; - -import com.itextpdf.text.pdf.PdfReader; -import com.itextpdf.text.pdf.parser.PdfReaderContentParser; -import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; -import com.itextpdf.text.pdf.parser.TextExtractionStrategy; - /** * PDF Content Extractor. This will parse the text content of a PDF and apply a * regex to search for links within the body of the text. 
* - * Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar - * * @author adam */ public class ExtractorPDFContent extends ContentExtractor { @@ -48,7 +43,7 @@ public class ExtractorPDFContent extends ContentExtractor { private static final Logger LOGGER = Logger.getLogger(ExtractorPDFContent.class.getName()); - + public static final Pattern URLPattern = Pattern.compile( "(?i)\\(?(https?):\\/\\/"+ // protocol "(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+ // username @@ -65,7 +60,7 @@ public class ExtractorPDFContent extends ContentExtractor { "((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // continue possible path "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+ // or possible query ")?"); - + { setMaxSizeToParse(10*1024*1024L); // 10MB } @@ -79,47 +74,56 @@ public long getMaxSizeToParse() { public void setMaxSizeToParse(long threshold) { kp.put("maxSizeToParse",threshold); } - - + + public ExtractorPDFContent() { } protected boolean innerExtract(CrawlURI curi){ - PdfReader documentReader; ArrayList uris = new ArrayList(); - + File tempFile = null; try { - documentReader = new PdfReader(curi.getRecorder().getContentReplayInputStream()); - - for(int i=1; i<= documentReader.getNumberOfPages(); i++) { //Page numbers start at 1 - String pageParseText = extractPageText(documentReader,i); - Matcher matcher = URLPattern.matcher(pageParseText); - - while(matcher.find()) { - String prospectiveURL = pageParseText.substring(matcher.start(),matcher.end()).trim(); - - //handle URLs wrapped in parentheses - if(prospectiveURL.startsWith("(")) { - prospectiveURL=prospectiveURL.substring(1,prospectiveURL.length()); - if(prospectiveURL.endsWith(")")) - prospectiveURL=prospectiveURL.substring(0,prospectiveURL.length()-1); - } - - uris.add(prospectiveURL); - - //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.' 
- if(prospectiveURL.endsWith(".") && prospectiveURL.length()>2) - uris.add(prospectiveURL.substring(0, prospectiveURL.length()-1)); - - //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong - if(matcher.group(19)!=null) { - String alternateURL = matcher.group(1)+"://"+(matcher.group(2)!=null?matcher.group(2):"")+matcher.group(6)+matcher.group(13); - - //Again, handle URLs wrapped in parentheses - if(prospectiveURL.startsWith("(") && alternateURL.endsWith(")")) - alternateURL=alternateURL.substring(0,alternateURL.length()-1); - - uris.add(alternateURL); + tempFile = File.createTempFile("heritrix-ExtractorPDFContent", "tmp.pdf"); + curi.getRecorder().copyContentBodyTo(tempFile); + try (PDDocument document = Loader.loadPDF(tempFile)) { + PDFTextStripper textStripper = new PDFTextStripper(); + + for (int i = 1; i <= document.getNumberOfPages(); i++) { //Page numbers start at 1 + textStripper.setStartPage(i); + textStripper.setEndPage(i); + String pageParseText = textStripper.getText(document); + Matcher matcher = URLPattern.matcher(pageParseText); + + while (matcher.find()) { + String prospectiveURL = pageParseText.substring(matcher.start(), matcher.end()).trim(); + + //handle URLs wrapped in parentheses + if (prospectiveURL.startsWith("(")) { + prospectiveURL = prospectiveURL.substring(1, prospectiveURL.length()); + if (prospectiveURL.endsWith(")")) + prospectiveURL = prospectiveURL.substring(0, prospectiveURL.length() - 1); + } + + uris.add(prospectiveURL); + + //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.' + if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2) + uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1)); + + //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong + if (matcher.group(19) != null) { + String alternateURL = matcher.group(1) + "://" + (matcher.group(2) != null ? 
matcher.group(2) : "") + matcher.group(6) + matcher.group(13); + + //Again, handle URLs wrapped in parentheses + if (prospectiveURL.startsWith("(") && alternateURL.endsWith(")")) + alternateURL = alternateURL.substring(0, alternateURL.length() - 1); + + // Again, remove trailing '.' + if (alternateURL.endsWith(".") && alternateURL.length() > 2) + alternateURL = alternateURL.substring(0, alternateURL.length() - 1); + + uris.add(alternateURL); + } } } } @@ -130,7 +134,11 @@ protected boolean innerExtract(CrawlURI curi){ } catch (RuntimeException e) { curi.getNonFatalFailures().add(e); return false; - } + } finally { + if (tempFile != null) { + tempFile.delete(); + } + } if (uris.size()<1) { return true; @@ -146,21 +154,7 @@ protected boolean innerExtract(CrawlURI curi){ // Set flag to indicate that link extraction is completed. return true; } - - public String extractPageText(PdfReader documentReader, int pageNum){ - String content =""; - PdfReaderContentParser parser = new PdfReaderContentParser(documentReader); - TextExtractionStrategy strat; - try { - strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy()); - content = strat.getResultantText(); - - } catch (IOException e) { - LOGGER.log(Level.WARNING, "Failed to parse pdf text in " - + Thread.currentThread().getName(), e); - } - return content; - } + @Override protected boolean shouldExtract(CrawlURI uri) { long max = getMaxSizeToParse(); diff --git a/contrib/src/main/resources/log4j.xml b/contrib/src/main/resources/log4j.xml index e04f34ab8..8a4dae8e8 100644 --- a/contrib/src/main/resources/log4j.xml +++ b/contrib/src/main/resources/log4j.xml @@ -11,6 +11,10 @@ + + + + diff --git a/contrib/src/test/resources/log4j.xml b/contrib/src/test/resources/log4j.xml index e04f34ab8..8a4dae8e8 100644 --- a/contrib/src/test/resources/log4j.xml +++ b/contrib/src/test/resources/log4j.xml @@ -11,6 +11,10 @@ + + + + diff --git a/dist/src/main/conf/logging.properties b/dist/src/main/conf/logging.properties 
index 094cb1762..b14a804fd 100644 --- a/dist/src/main/conf/logging.properties +++ b/dist/src/main/conf/logging.properties @@ -6,6 +6,7 @@ org.apache.commons.httpclient.level = SEVERE org.restlet.Component.LogFilter.level = SEVERE org.eclipse.jetty.log.level = SEVERE +org.apache.pdfbox.level = SEVERE # ...but INFO for our classes, which reserve FINE/FINER/FINEST for bulk/trivia... org.archive.level = INFO diff --git a/dist/src/main/licenses/itext.LICENSE b/dist/src/main/licenses/itext.LICENSE deleted file mode 100644 index b34862001..000000000 --- a/dist/src/main/licenses/itext.LICENSE +++ /dev/null @@ -1,24 +0,0 @@ -From http://www.lowagie.com/iText/download.html - -iText: a Free Java-PDF library by Bruno Lowagie and Paulo Soares - -License Agreement - -iText is published under 2 different licenses: MPL and LGPL. - -If you are a new user of iText, MPL is recommended (without the LGPL). -MPL is less strict than LGPL. Please read the MPL license agreement -[http://www.lowagie.com/iText/MPL-1.1.txt] before downloading and/or -using iText. - -LGPL is maintained for backward compatibility only. If you choose the -LGPL, you need to mention the MPL as an alternative license. Please -read the LGPL license agreement [http://www.lowagie.com/iText/lgpl.txt] -for more info. - -This library is free and I want it to stay free: you can use it -without paying a fee; you don't need to register anywhere. Only keep -in mind that agreeing with the license is crucial to protect the -software, its developers and its users. This library is distributed -in the hope that it will be useful, but WITHOUT any warranty. If you -don't like free software, don't (ab)use it! 
diff --git a/modules/pom.xml b/modules/pom.xml index a6755776a..e827f91f7 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -95,6 +95,11 @@ jsch 0.1.54 + + org.apache.pdfbox + pdfbox + 3.0.1 + diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java index 035264e19..683ee43fd 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorPDF.java @@ -89,12 +89,12 @@ protected boolean innerExtract(CrawlURI curi){ throw new RuntimeException(ioe); } - PDFParser parser; ArrayList uris; try { curi.getRecorder().copyContentBodyTo(tempFile); - parser = new PDFParser(tempFile.getAbsolutePath()); - uris = parser.extractURIs(); + try (PDFParser parser = new PDFParser(tempFile.getAbsolutePath())){ + uris = parser.extractURIs(); + } } catch (IOException e) { curi.getNonFatalFailures().add(e); return false; diff --git a/modules/src/main/java/org/archive/modules/extractor/PDFParser.java b/modules/src/main/java/org/archive/modules/extractor/PDFParser.java index 8aa07cde0..7a6fecc66 100644 --- a/modules/src/main/java/org/archive/modules/extractor/PDFParser.java +++ b/modules/src/main/java/org/archive/modules/extractor/PDFParser.java @@ -18,12 +18,13 @@ */ package org.archive.modules.extractor; -import com.lowagie.text.pdf.PdfReader; -import com.lowagie.text.pdf.PdfName; -import com.lowagie.text.pdf.PdfObject; -import com.lowagie.text.pdf.PdfDictionary; -import com.lowagie.text.pdf.PRIndirectReference; -import com.lowagie.text.pdf.PdfArray; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import java.io.*; import java.util.*; @@ -36,16 +37,11 @@ * @author Parker Thompson * */ -//TODO make this more effecient, it currently had to read the whole file into memory -// before processing can begin, and appears to take much longer than it "should" -// to parse small, but admittedly complex, documents. -public class PDFParser { +public class PDFParser implements Closeable { protected ArrayList foundURIs; - protected ArrayList> encounteredReferences; - protected PdfReader documentReader; + protected PDDocument documentReader; protected byte[] document; - protected PdfDictionary catalog; public PDFParser(String doc) throws IOException { resetState(); @@ -62,14 +58,8 @@ public PDFParser(byte[] doc) throws IOException{ */ protected void resetState(){ foundURIs = new ArrayList(); - encounteredReferences = new ArrayList>(); documentReader = null; document = null; - catalog = null; - - for(int i=0; i < encounteredReferences.size(); i++){ - encounteredReferences.add(new ArrayList()); - } } /** @@ -101,55 +91,7 @@ public void resetState(String doc) throws IOException{ */ protected void getInFromFile(String doc) throws IOException{ File documentOnDisk = new File(doc); - - long length = documentOnDisk.length(); - document = new byte[(int)length]; - - FileInputStream inStream = new FileInputStream(documentOnDisk); - - inStream.read(document); - } - - /** - * Indicates, based on a PDFObject's generation/id pair whether - * the parser has already encountered this object (or a reference to it) - * so we don't infinitely loop on circuits within the PDF. - * @param generation - * @param id - * @return True if already seen. 
- */ - protected boolean haveSeen(int generation, int id){ - - // if we can't store this generation grow our list until we can - if(generation >= encounteredReferences.size()){ - for(int i=encounteredReferences.size(); i <= generation; i++){ - encounteredReferences.add(new ArrayList()); - } - - // clearly we haven't seen it - return false; - } - - ArrayList generationList - = encounteredReferences.get(generation); - - for (int i: generationList) { - if(i == id){ - return true; - } - } - return false; - } - - /** - * Note that an object (id/generation pair) has been seen by this parser - * so that it can be handled differently when it is encountered again. - * @param generation - * @param id - */ - protected void markAsSeen(int generation, int id){ - ArrayList objectIds = encounteredReferences.get(generation); - objectIds.add(id); + documentReader = Loader.loadPDF(documentOnDisk); } /** @@ -170,10 +112,8 @@ public ArrayList getURIs(){ */ protected void initialize() throws IOException{ if(document != null){ - documentReader = new PdfReader(document); + documentReader = Loader.loadPDF(document); } - - catalog = documentReader.getCatalog(); } /** @@ -181,73 +121,32 @@ protected void initialize() throws IOException{ * Returns an array list representing all URIs found in the document catalog tree. * @return URIs from all objects found in a Pdf document's catalog. 
*/ - public ArrayList extractURIs(){ - extractURIs(catalog); - return getURIs(); - } - - /** - * Parse a PdfDictionary, looking for URIs recursively and adding - * them to foundURIs - * @param entity - */ - @SuppressWarnings("unchecked") - protected void extractURIs(PdfObject entity){ - - // deal with dictionaries - if(entity.isDictionary()){ - - PdfDictionary dictionary= (PdfDictionary)entity; - - Set allkeys = dictionary.getKeys(); - for (PdfName key: allkeys) { - PdfObject value = dictionary.get(key); - - // see if it's the key is a UR[I,L] - if( key.toString().equals("/URI") || - key.toString().equals("/URL") ) { - foundURIs.add(value.toString()); - - }else{ - this.extractURIs(value); + public ArrayList extractURIs() throws IOException { + for (PDPage page : documentReader.getPages()) { + for (PDAnnotation annotation : page.getAnnotations()) { + if (annotation instanceof PDAnnotationLink) { + PDAnnotationLink link = (PDAnnotationLink) annotation; + PDAction action = link.getAction(); + if (action instanceof PDActionURI) { + PDActionURI uri = (PDActionURI) action; + foundURIs.add(uri.getURI()); } - - } - - // deal with arrays - }else if(entity.isArray()){ - - PdfArray array = (PdfArray)entity; - for (PdfObject pdfObject : (Iterable)array.getArrayList()) { - this.extractURIs(pdfObject); } - - // deal with indirect references - }else if(entity.getClass() == PRIndirectReference.class){ - - PRIndirectReference indirect = (PRIndirectReference)entity; - - // if we've already seen a reference to this object - if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){ - return; - - // note that we've seen it if it's new - }else{ - markAsSeen(indirect.getGeneration(), indirect.getNumber() ); - } - - // dereference the "pointer" and process the object - indirect.getReader(); // FIXME: examine side-effects - PdfObject direct = PdfReader.getPdfObject(indirect); - - this.extractURIs(direct); } + } + return getURIs(); } - public static void main(String[] argv){ + 
@Override + public void close() throws IOException { + if (documentReader != null) { + documentReader.close(); + } + } + public static void main(String[] argv){ try { - PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf"); + PDFParser parser = new PDFParser("/tmp/pdfspec.pdf"); ArrayList uris = parser.extractURIs(); Iterator i = uris.iterator(); while(i.hasNext()){ diff --git a/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java b/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java new file mode 100644 index 000000000..58dedc7aa --- /dev/null +++ b/modules/src/test/java/org/archive/modules/extractor/PDFParserTest.java @@ -0,0 +1,19 @@ +package org.archive.modules.extractor; + +import org.apache.commons.io.IOUtils; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; + +public class PDFParserTest { + @Test + public void test() throws IOException { + byte[] data = IOUtils.resourceToByteArray("/org/archive/crawler/modules/extractor/PDFParserTest.pdf"); + PDFParser parser = new PDFParser(data); + ArrayList uris = parser.extractURIs(); + Assert.assertEquals(Collections.singletonList("https://example.com/link-annotation"), uris); + } +} \ No newline at end of file diff --git a/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf b/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf new file mode 100644 index 000000000..3d2e2f9a4 Binary files /dev/null and b/modules/src/test/resources/org/archive/crawler/modules/extractor/PDFParserTest.pdf differ