Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PDF relativization test #12621

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -20,11 +20,7 @@ public ExternalFilesContentImporter(ImportFormatPreferences importFormatPreferen
}

public ParserResult importPDFContent(Path file, BibDatabaseContext context, FilePreferences filePreferences) {
try {
return new PdfMergeMetadataImporter(importFormatPreferences).importDatabase(file, context, filePreferences);
} catch (IOException e) {
return ParserResult.fromError(e);
}
return new PdfMergeMetadataImporter(importFormatPreferences).importDatabase(file, context, filePreferences);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The try-catch block removal shifts exception handling responsibility to the caller without documenting this change in method signature. Method should declare throws IOException.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IntelliJ said that it is not thrown at all

}

public ParserResult importFromBibFile(Path bibFile, FileUpdateMonitor fileUpdateMonitor) throws IOException {
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;
@@ -26,7 +25,6 @@
import org.jabref.logic.importer.fileformat.pdf.PdfXmpImporter;
import org.jabref.logic.importer.util.FileFieldParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.StandardFileType;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.model.database.BibDatabaseContext;
import org.jabref.model.entry.BibEntry;
@@ -76,8 +74,8 @@ public PdfMergeMetadataImporter(ImportFormatPreferences importFormatPreferences)
* 2. Run {@link PdfImporter}s, and store extracted candidates in the list.
*/
@Override
public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
List<BibEntry> extractedCandidates = extractCandidatesFromPdf(filePath, document);
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException, ParseException {
List<BibEntry> extractedCandidates = extractCandidatesFromPdf(fullPath, document);
if (extractedCandidates.isEmpty()) {
return List.of();
}
@@ -87,12 +85,24 @@ public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws
Stream<BibEntry> allCandidates = Stream.concat(fetchedCandidates.stream(), extractedCandidates.stream());
BibEntry entry = mergeCandidates(allCandidates);

// We use the absolute path here as we do not know the context where this import will be used.
// The caller is responsible for making the path relative if necessary.
entry.addFile(new LinkedFile("", filePath, StandardFileType.PDF.getName()));
return List.of(entry);
}

/**
* Enhanced {@link PdfImporter#importDatabase(Path)} method that relativizes paths of PDF files (if turned on in the preferences).
*/
public ParserResult importDatabase(Path filePath, BibDatabaseContext bibDatabaseContext, FilePreferences filePreferences) {
ParserResult parserResult = importDatabase(filePath);

if (filePreferences.shouldStoreFilesRelativeToBibFile() && bibDatabaseContext.getDatabasePath().isPresent()) {
Path storePath = bibDatabaseContext.getDatabasePath().get().getParent().relativize(filePath);
parserResult.getDatabase().getEntries().forEach(entry ->
entry.addFile(new LinkedFile("", storePath, StandardField.PDF.getName())));
}

return parserResult;
}

private List<BibEntry> extractCandidatesFromPdf(Path filePath, PDDocument document) {
List<BibEntry> candidates = new ArrayList<>();

@@ -186,13 +196,6 @@ private static BibEntry mergeCandidates(Stream<BibEntry> candidates) {
return entry;
}

public ParserResult importDatabase(Path filePath, BibDatabaseContext context, FilePreferences filePreferences) throws IOException {
Objects.requireNonNull(context);
Objects.requireNonNull(filePreferences);

return importDatabase(filePath);
}

@Override
public String getId() {
return "pdfMerged";
Original file line number Diff line number Diff line change
@@ -185,7 +185,7 @@ private String streamlineTitle(String title) {
return removeNonLettersAtEnd(title);
}

public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException {
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException {
List<BibEntry> result = new ArrayList<>(1);
String firstPageContents = PdfUtils.getFirstPageContents(document);
Optional<String> titleByFontSize = extractTitleFromDocument(document);
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ public PdfEmbeddedBibFileImporter(ImportFormatPreferences importFormatPreference
* Extraction of embedded files in pdfs adapted from:
* <a href="https://svn.apache.org/repos/asf/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.javaj">...</a>
*/
public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException, ParseException {
List<BibEntry> allParsedEntries = new ArrayList<>();
PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
if (nameDictionary != null) {
Original file line number Diff line number Diff line change
@@ -25,8 +25,8 @@ public PdfGrobidImporter(ImportFormatPreferences importFormatPreferences) {
this.importFormatPreferences = importFormatPreferences;
}

public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
return grobidService.processPDF(filePath, importFormatPreferences);
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException, ParseException {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method accepts PDDocument parameter but doesn't use it in the implementation, violating the Single Responsibility Principle by having unused parameters.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it just uses grobid.

Is there any way to mark this argument as explicitly unused?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't it be removed - I don't see any @Override annotation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh! It should be overriden

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then you can add JavaDoc and use @param and then say that it is ignored ^^

return grobidService.processPDF(fullPath, importFormatPreferences);
}

@Override
Original file line number Diff line number Diff line change
@@ -31,7 +31,14 @@
* {@link PdfImporter#importDatabase(Path)} or {@link PdfMergeMetadataImporter}.
*/
public abstract class PdfImporter extends Importer {
public abstract List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException;
/**
* Abstract method that inheritors should define.
* <p>
* It contains two arguments: `fullPath`, which you can use to read file again for other properties
* (like XMP metadata); and `document`, which is a parsed PDF file. Many importers parse PDF files,
* so it was decided to parse it only once and provide the parsed data structure as an argument.
*/
public abstract List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException, ParseException;

@Override
public boolean isRecognizedFormat(BufferedReader input) throws IOException {
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ public PdfVerbatimBibtexImporter(ImportFormatPreferences importFormatPreferences
this.importFormatPreferences = importFormatPreferences;
}

public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException, ParseException {
List<BibEntry> result;
String firstPageContents = PdfUtils.getFirstPageContents(document);
BibtexParser parser = new BibtexParser(importFormatPreferences);
Original file line number Diff line number Diff line change
@@ -22,8 +22,8 @@ public PdfXmpImporter(XmpPreferences xmpPreferences) {
this.xmpPreferences = xmpPreferences;
}

public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException {
return new XmpUtilReader().readXmp(filePath, xmpPreferences);
public List<BibEntry> importDatabase(Path fullPath, PDDocument document) throws IOException {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Method accepts PDDocument parameter but doesn't use it in the implementation, violating Single Responsibility Principle by having unused parameters.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is expected.

Is there any way to mark this argument as explicitly unused?

return new XmpUtilReader().readXmp(fullPath, xmpPreferences);
}

@Override
Original file line number Diff line number Diff line change
@@ -90,22 +90,22 @@ void importWorksAsExpected() throws Exception {

@Test
void importRelativizesFilePath() throws Exception {
// Initialize database and preferences
FilePreferences preferences = mock(FilePreferences.class);
BibDatabaseContext database = new BibDatabaseContext();

// Initialize file and working directory
Path file = Path.of(PdfMergeMetadataImporter.class.getResource("/pdfs/minimal.pdf").toURI());
Path directory = Path.of(PdfMergeMetadataImporter.class.getResource("/pdfs/").toURI());
preferences.setWorkingDirectory(directory);

List<BibEntry> result = importer.importDatabase(file, database, preferences).getDatabase().getEntries();
FilePreferences filePreferences = mock(FilePreferences.class);
when(filePreferences.shouldStoreFilesRelativeToBibFile()).thenReturn(true);

BibDatabaseContext database = new BibDatabaseContext();
database.setDatabasePath(directory.resolve("db.bib"));

List<BibEntry> result = importer.importDatabase(file, database, filePreferences).getDatabase().getEntries();

BibEntry expected = new BibEntry(StandardEntryType.InProceedings)
.withField(StandardField.AUTHOR, "1 ")
.withField(StandardField.TITLE, "Hello World")
// Expecting relative path
.withField(StandardField.FILE, ":minimal.pdf:PDF");
.withField(StandardField.FILE, ":minimal.pdf:pdf");

assertEquals(List.of(expected), result);
}
Loading