-
-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
EPUB import #12457
base: main
Are you sure you want to change the base?
EPUB import #12457
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,177 @@ | ||||
package org.jabref.logic.importer.fileformat; | ||||
|
||||
import java.io.BufferedReader; | ||||
import java.io.File; | ||||
import java.io.IOException; | ||||
import java.nio.file.FileSystem; | ||||
import java.nio.file.FileSystems; | ||||
import java.nio.file.FileVisitResult; | ||||
import java.nio.file.Files; | ||||
import java.nio.file.Path; | ||||
import java.nio.file.SimpleFileVisitor; | ||||
import java.nio.file.attribute.BasicFileAttributes; | ||||
import java.util.Arrays; | ||||
import java.util.List; | ||||
import java.util.Optional; | ||||
|
||||
import javax.xml.parsers.DocumentBuilder; | ||||
import javax.xml.parsers.DocumentBuilderFactory; | ||||
import javax.xml.parsers.ParserConfigurationException; | ||||
import javax.xml.xpath.XPath; | ||||
import javax.xml.xpath.XPathExpression; | ||||
import javax.xml.xpath.XPathExpressionException; | ||||
import javax.xml.xpath.XPathFactory; | ||||
|
||||
import org.jabref.gui.util.OptionalObjectProperty; | ||||
import org.jabref.logic.importer.ImportFormatPreferences; | ||||
import org.jabref.logic.importer.Importer; | ||||
import org.jabref.logic.importer.ParserResult; | ||||
import org.jabref.logic.l10n.Localization; | ||||
import org.jabref.logic.util.FileType; | ||||
import org.jabref.logic.util.StandardFileType; | ||||
import org.jabref.logic.util.io.FileUtil; | ||||
import org.jabref.logic.util.io.XMLUtil; | ||||
import org.jabref.model.entry.BibEntry; | ||||
import org.jabref.model.entry.LinkedFile; | ||||
import org.jabref.model.entry.field.Field; | ||||
import org.jabref.model.entry.field.StandardField; | ||||
import org.jabref.model.entry.types.StandardEntryType; | ||||
|
||||
import org.w3c.dom.Document; | ||||
import org.xml.sax.SAXException; | ||||
|
||||
public class EpubImporter extends Importer { | ||||
/** Leading characters 'P', 'K', 0x03, 0x04 of a ZIP container; ePUB files are ZIP archives. */
private static final char[] EPUB_HEADER_MAGIC_NUMBER = {0x50, 0x4b, 0x03, 0x04};

private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
// Assigned in the constructor, after the factory has been hardened against XXE.
private final DocumentBuilder builder;

private final XPathFactory xPathFactory = XPathFactory.newInstance();
private final XPath xpath = xPathFactory.newXPath();

// XPath (on a DOM) is used instead of a streaming parser because only a handful of values are
// extracted from the OPF package document, which holds just the metadata of the book.
private final XPathExpression titlePath = xpath.compile("/package/metadata/title");
private final XPathExpression creatorPath = xpath.compile("/package/metadata/creator");
private final XPathExpression identifierPath = xpath.compile("/package/metadata/identifier");
private final XPathExpression languagePath = xpath.compile("/package/metadata/language");
private final XPathExpression sourcePath = xpath.compile("/package/metadata/source");
private final XPathExpression descriptionPath = xpath.compile("/package/metadata/description");
private final XPathExpression subjectPath = xpath.compile("/package/metadata/subject");

// Entry currently being filled; it is replaced at the start of every importDatabase(Path) call,
// so one importer instance must not run several imports concurrently.
private BibEntry entry = new BibEntry(StandardEntryType.Book);

private final ImportFormatPreferences importFormatPreferences;

/**
 * Creates the importer and prepares a hardened XML parser for reading OPF metadata.
 *
 * @param importFormatPreferences preferences consulted for the keyword separator during import
 * @throws XPathExpressionException     if one of the metadata XPath expressions cannot be compiled
 * @throws ParserConfigurationException if the XML parser cannot be created or does not support
 *                                      the security features requested below
 */
public EpubImporter(ImportFormatPreferences importFormatPreferences) throws XPathExpressionException, ParserConfigurationException {
    this.importFormatPreferences = importFormatPreferences;

    // ePUB files come from arbitrary sources: forbid external entities and external DTDs so that a
    // crafted OPF file cannot read local files or trigger network requests (XXE).
    factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    factory.setXIncludeAware(false);

    this.builder = factory.newDocumentBuilder();
}
|
||||
// ePUB is a ZIP-based format, so this method will clash with other ZIP-based formats. | ||||
// Currently, only `.ctv6bak` is found. | ||||
@Override | ||||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||||
char[] header = new char[EPUB_HEADER_MAGIC_NUMBER.length]; | ||||
int nRead = input.read(header); | ||||
return nRead == EPUB_HEADER_MAGIC_NUMBER.length && Arrays.equals(header, EPUB_HEADER_MAGIC_NUMBER); | ||||
} | ||||
|
||||
@Override | ||||
public ParserResult importDatabase(Path filePath) throws IOException { | ||||
// Not in functional programming style, but making {@link entry} a local mutable variable makes it easier | ||||
// to write {@link addField}. | ||||
// Potentially, this class won't work properly in concurrent situations. | ||||
|
||||
// TODO: JabRef has {@link DublinCoreExtractor}, which is exactly the schema used in OPF. However, that class | ||||
// is tied to {@link DublinCoreSchema}, which is tied to {@link XMPSchema}. It seems there are no way to pass | ||||
// ordinary XML nodes to {@link DublinCoreSchema}. | ||||
// | ||||
// Current implementation uses some hand-crafted {@link XPath}s, which work okayish, but not as good as a | ||||
// full-featured {@link DublinCoreExtractor}. | ||||
|
||||
entry = new BibEntry(StandardEntryType.Book); | ||||
|
||||
try (FileSystem fileSystem = FileSystems.newFileSystem(filePath)) { | ||||
OptionalObjectProperty<Path> metadataFilePath = OptionalObjectProperty.empty(); | ||||
|
||||
Files.walkFileTree(fileSystem.getPath("/"), new SimpleFileVisitor<>() { | ||||
@Override | ||||
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { | ||||
if (file.toString().endsWith(".opf")) { | ||||
metadataFilePath.set(Optional.of(file)); | ||||
return FileVisitResult.TERMINATE; | ||||
} | ||||
return FileVisitResult.CONTINUE; | ||||
} | ||||
}); | ||||
|
||||
if (metadataFilePath.get().isEmpty()) { | ||||
return ParserResult.fromErrorMessage(Localization.lang("Could not find metadata file. Possibly corrupted ePUB file.")); | ||||
} | ||||
|
||||
File metadataFile = FileUtil.remapZipPath(metadataFilePath.get().get()).toFile(); | ||||
Document document = builder.parse(metadataFile); | ||||
|
||||
Optional<String> title = XMLUtil.getNodeContentByXPath(document, titlePath); | ||||
Optional<String> identifier = XMLUtil.getNodeContentByXPath(document, identifierPath); | ||||
Optional<String> source = XMLUtil.getNodeContentByXPath(document, sourcePath); | ||||
Optional<String> description = XMLUtil.getNodeContentByXPath(document, descriptionPath); | ||||
|
||||
List<String> authors = XMLUtil.getNodesContentByXPath(document, creatorPath); | ||||
List<String> subjects = XMLUtil.getNodesContentByXPath(document, subjectPath); | ||||
List<String> languages = XMLUtil.getNodesContentByXPath(document, languagePath); | ||||
|
||||
addField(StandardField.TITLE, title); | ||||
addField(StandardField.ABSTRACT, description); | ||||
|
||||
if (source.isPresent()) { | ||||
addField(StandardField.URL, source); | ||||
} else { | ||||
addField(StandardField.URL, identifier); | ||||
} | ||||
|
||||
addField(StandardField.AUTHOR, Optional.of(String.join(" and ", authors))); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use Authorlist parser There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does it do? Parsing authors? Why do I need it there? Authors are specified separately in UPDATE: Ah, crap, I remember seeing sometimes it's not. Should investigate a bit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I remember that there is a fetcher which has a similar schema, take a look at that one There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh yes! There is Dublin Core scheme (which OPF internally uses)!
Well, anyway, it was an interesting experience with parsing XML files and using `XPath`s.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Terrible, it's tied to XMP format. I can't find a way to pass ordinary XML nodes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah damn. Can you try the Dublin Core Extractor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes! It is this class that relies on XMP There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
And thus this refs #12457 (comment) |
||||
|
||||
// Might not be the right way. Leaving, as it still contains information. | ||||
addField(StandardField.LANGUAGE, Optional.of(String.join(" and ", languages))); | ||||
|
||||
entry.addKeywords(subjects, importFormatPreferences.bibEntryPreferences().getKeywordSeparator()); | ||||
|
||||
entry.addFile(new LinkedFile("", filePath.toAbsolutePath(), StandardFileType.EPUB.getName())); | ||||
|
||||
return ParserResult.fromEntry(entry); | ||||
} catch (SAXException | XPathExpressionException e) { | ||||
return ParserResult.fromError(e); | ||||
} | ||||
} | ||||
|
||||
// Tradeoff between conforming to controversial code standard and code simplicity. | ||||
// This refs: https://peps.python.org/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds. | ||||
private void addField(Field field, Optional<String> value) { | ||||
value.ifPresent(it -> entry.setField(field, it)); | ||||
} | ||||
|
||||
@Override | ||||
public ParserResult importDatabase(BufferedReader input) throws IOException { | ||||
throw new UnsupportedOperationException("EpubImporter does not support importDatabase(BufferedReader reader). " | ||||
+ "Instead use importDatabase(Path filePath)."); | ||||
} | ||||
|
||||
@Override | ||||
public String getId() { | ||||
return "epub"; | ||||
} | ||||
|
||||
@Override | ||||
public String getName() { | ||||
return "ePUB"; | ||||
} | ||||
|
||||
@Override | ||||
public String getDescription() { | ||||
return Localization.lang("Import the popular e-book file format ePUB"); | ||||
} | ||||
|
||||
@Override | ||||
public FileType getFileType() { | ||||
return StandardFileType.EPUB; | ||||
} | ||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Normally, one uses a stax parser to parse XML. (Not DOM, not SAX, not XPath)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I remember it was used. But it's not like I parse whole XML file: I only extract tiny bits of data.
I tried not to include any third-party libraries and to use what we already have (module info must be unchanged). If there is a specific reason why it's better to use StAX here, I'll rewrite.
StAX is used for importing libraries for other formats; I guess its core strength is its parsing interface, which doesn't load whole files into memory. However, in this PR an OPF file is parsed, which contains just metadata (all content, source, chapters are in separate XML files in the ePUB). Maybe I can write this as a comment to justify using `XPath`s there?
I'll also look into whether the types generated by StAX could somehow be transformed into `DublinCoreSchema`...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just add a JavaDoc comment that you used XPath because you only parse a fragment of the file.
I think XPath goes through the whole DOM nevertheless, and StAX would be more efficient. Nevertheless, StAX is more imperative, and we as SQL guys like declarative (which is XPath).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StAX is more a "nice" way to walk around the DOM tree.
I commented because of consistency to the other importers