Skip to content

Commit

Permalink
Formatter: enable to select multiple identifiers and save records as …
Browse files Browse the repository at this point in the history
…MARCXML #558
  • Loading branch information
pkiraly committed Dec 18, 2024
1 parent 1a9aaf5 commit 5ba7550
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 14 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,10 @@ or with a bash script

options:
* [general parameters](#general-parameters)
* `-f`, `--format`: the name of the format (at time of writing there is no any)
* `-f`, `--format`: the MARC output format
* if not set, the output format follows the examples in the MARC21
documentation (see the example below)
* `xml`: the output will be MARCXML
* `-c <number>`, `-countNr <number>`: count number of the record (e.g. 1 means
the first record)
* `-s [path=query]`, `-search [path=query]`: print records matching the query.
Expand All @@ -1118,6 +1121,8 @@ options:
(default: TAB)
* `-e <file>`, `--fileName <file>`: the name of the report the program produces
(default: `extracted.csv`)
* `-A <identifiers>`, `--ids <identifiers>`: a comma-separated list of record
identifiers

The output of displaying a single MARC record is something like this one:

Expand Down Expand Up @@ -1823,6 +1828,7 @@ options:
with `_txt`). \[This parameter is available from v0.8.0\]
* `-D <int>`, `--commitAt <int>`: commit index after this number of records \[This parameter is available from v0.8.0\]
* `-E`, `--indexFieldCounts`: index the count of field instances \[This parameter is available from v0.8.0\]
* `-G`, `--indexSubfieldCounts`: index the count of subfield instances \[This parameter is available from v0.8.0\]
* `-F`, `--fieldPrefix <arg>`: field prefix

The `./index` file (which is used by `catalogues/[catalogue].sh` and `./qa-catalogue` scripts) has additional parameters:
Expand Down
61 changes: 48 additions & 13 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Formatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.MarcException;
import org.marc4j.MarcXmlWriter;
import org.marc4j.marc.Record;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand All @@ -40,6 +45,7 @@ public class Formatter implements BibliographicInputProcessor {
private final FormatterParameters parameters;
private final boolean readyToProcess;
private BufferedWriter writer;
private MarcXmlWriter marcXmlWriter;

public Formatter(String[] args) throws ParseException {
parameters = new FormatterParameters(args);
Expand Down Expand Up @@ -103,6 +109,18 @@ public void beforeIteration() {
logger.log(Level.WARNING, "beforeIteration", e);
}
}
if (parameters.getIds() != null && !parameters.getIds().isEmpty()
&& parameters.getFormat().equals("xml")) {
var path = Paths.get(parameters.getOutputDir(), parameters.getFileName());
logger.info("path: " + path.toAbsolutePath());
try {
// outputStream = new FileOutputStream(path.toFile());
marcXmlWriter = new MarcXmlWriter(new FileOutputStream(path.toFile()));
marcXmlWriter.setIndent(true);
} catch (FileNotFoundException e) {
logger.log(Level.WARNING, "beforeIteration", e);
}
}
}

@Override
Expand All @@ -112,15 +130,28 @@ public void fileOpened(Path file) {

@Override
public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
boolean hasSpecifiedId = parameters.hasId() &&
marc4jRecord.getControlNumber() != null &&
marc4jRecord.getControlNumber().trim().equals(parameters.getId());
String id = marc4jRecord.getControlNumber() != null ?
marc4jRecord.getControlNumber().trim() : null;

boolean hasSpecifiedId = parameters.hasId() && id != null && id.equals(parameters.getId());

if (!hasSpecifiedId)
hasSpecifiedId = id != null
&& parameters.getIds() != null
&& !parameters.getIds().isEmpty()
&& parameters.getIds().contains(id);

boolean hasSpecifiedRecordNumber = parameters.getCountNr() > -1
&& parameters.getCountNr() == recordNumber;

if (hasSpecifiedId || hasSpecifiedRecordNumber) {
logger.info(marc4jRecord::toString);
if (parameters.getFormat().equals("xml")) {
marcXmlWriter.write(marc4jRecord);
// MarcXmlWriter.writeSingleRecord(marc4jRecord, System.out, true);
// MarcXmlWriter.writeSingleRecord(marc4jRecord, outputStream, true);
} else {
logger.info(marc4jRecord::toString);
}
}
}

Expand All @@ -135,7 +166,6 @@ public void processRecord(BibliographicRecord marcRecord, int recordNumber) thro
for (DataField field : marcRecord.getDatafields()) {
logger.info(field.getTag());
}
logger.info(() -> "has STA: " + marcRecord.hasDatafield("STA"));
}

if (parameters.hasSearch()) {
Expand Down Expand Up @@ -174,15 +204,20 @@ public void fileProcessed() {

@Override
public void afterIteration(int numberOfprocessedRecords, long duration) {
if (writer == null) {
return;
}

try {
writer.close();
} catch (IOException e) {
logger.log(Level.SEVERE, "afterIteration", e);
}
if (writer != null)
try {
writer.close();
} catch (IOException e) {
logger.log(Level.SEVERE, "afterIteration", e);
}

if (marcXmlWriter != null)
try {
marcXmlWriter.close();
} catch (MarcException e) {
logger.log(Level.SEVERE, "afterIteration", e);
}
}

private List<String> selectPicaResults(PicaRecord picaRecord) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class FormatterParameters extends CommonParameters {
private boolean withId = false;
private String separator = "\t";
private String fileName = DEFAULT_FILE_NAME;
private List<String> ids;

private boolean isOptionSet = false;

Expand All @@ -35,6 +36,7 @@ protected void setOptions() {
options.addOption("l", "selector", true, "selectors");
options.addOption("w", "withId", false, "the generated CSV should contain record ID as first field");
options.addOption("p", "separator", true, "separator between the parts (default: TAB)");
options.addOption("A", "ids", true, "list of identifiers separated by comma");
options.addOption("e", "fileName", true, String.format("output file (default: %s)", DEFAULT_FILE_NAME));
isOptionSet = true;
}
Expand Down Expand Up @@ -80,6 +82,10 @@ public FormatterParameters(String[] arguments) throws ParseException {

if (cmd.hasOption("fileName"))
fileName = cmd.getOptionValue("fileName");

if (cmd.hasOption("ids")) {
ids = List.of(cmd.getOptionValue("ids").split(","));
}
}

public String getFormat() {
Expand Down Expand Up @@ -126,6 +132,10 @@ public String getFileName() {
return fileName;
}

/**
 * Returns the record identifiers taken from the {@code ids} command line option.
 *
 * @return the list of identifiers; presumably {@code null} when the option was
 *         not given, as the field has no visible initializer (TODO confirm)
 */
public List<String> getIds() {
return ids;
}

@Override
public String formatParameters() {
String text = super.formatParameters();
Expand All @@ -135,6 +145,7 @@ public String formatParameters() {
text += String.format("withId: %s%n", withId);
text += String.format("separator: %s%n", separator);
text += String.format("outputFile: %s%n", fileName);
text += String.format("ids: %s%n", StringUtils.join(ids, ", "));
return text;
}

Expand Down

0 comments on commit 5ba7550

Please sign in to comment.