Skip to content

Commit

Permalink
Implement completeness for UNIMARC #421: fix the package information
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Mar 5, 2024
1 parent ad5cc27 commit d9e9a24
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 17 deletions.
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1697,8 +1697,10 @@ Field suffixes:

* `*_sni`: not indexed, stored string fields -- good for storing fields used for displaying information
* `*_ss`: not parsed, stored, indexed string fields -- good for display and facets
* `*_tt`: parsed, not stored, indexed string fields -- good for term searches
* `*_tt`: parsed, not stored, indexed string fields -- good for term searches (these fields will be available if the
`--indexWithTokenizedField` parameter is applied)
* `*_is`: parsed, not stored, indexed integer fields -- good for searching for numbers, such as error or group identifiers
(these fields will be available if the `--indexFieldCounts` parameter is applied)

The mapped value

Expand Down Expand Up @@ -1770,7 +1772,8 @@ Besides these two indices there is a third index that contains different kind of
writing it contains only the results of validation, but later it will cover other information as well. It can be set by
the following parameter:

* `-4`, `--solrForScoresUrl <arg>`: the URL of the Solr server used to store scores
* `-4`, `--solrForScoresUrl <arg>`: the URL of the Solr server used to store scores (it is populated in the
`validate-sqlite` process which runs after validation)

During the indexing process the content of this index is merged into the `_dev` index, so after a successful completion of the
process this index is not needed anymore.
Expand Down Expand Up @@ -1840,6 +1843,24 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{

See the [solr-functions](https://github.com/pkiraly/qa-catalogue/blob/main/solr-functions) file for full code.

QA catalogue has a helper script to get information about the status of the Solr index (Solr URL, location, the list of cores,
number of documents, size on disk, and last modification):

```bash
$ ./index --status
Solr index status at http://localhost:8983
Solr directory: /opt/solr-9.3.0/server/solr

core | location | nr of docs | size | last modified
.................... | ............... | .......... | .......... | ...................
nls | nls_1 | 403946 | 1002.22 MB | 2023-11-25 21:59:39
nls_dev | nls_2 | 403943 | 987.22 MB | 2023-11-11 15:59:49
nls_validation | nls_validation | 403946 | 17.89 MB | 2023-11-25 21:35:44
yale | yale_2 | 2346976 | 9.51 GB | 2023-11-11 13:12:35
yale_dev | yale_1 | 2346976 | 9.27 GB | 2023-11-11 10:58:08
```


### Indexing MARC JSON records with Solr

```bash
Expand Down
4 changes: 4 additions & 0 deletions catalogues/bnpt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
NAME=bnpt
# TYPE_PARAMS="--marcVersion GENT"
TYPE_PARAMS="--schemaType UNIMARC --marcxml --emptyLargeCollectors"
TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/bnpt_validation"
TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts"

MARC_DIR=${BASE_INPUT_DIR}/bnpt
MASK=bibliographics_*.xml

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,8 @@ public TagHierarchy getTagHierarchy(String path) {
String subfieldCode = paths[1];

// This subfield code is either indicator or subfield code. E.g. "ind1" or "a"
// This is an indicator
if (subfieldCode.startsWith("ind")) {

// This is an indicator
String indicatorNumber = subfieldCode.substring(3);
int indicatorIndex = Integer.parseInt(indicatorNumber) - 1;

Expand All @@ -96,6 +95,7 @@ public TagHierarchy getTagHierarchy(String path) {

@Override
public String getPackageName(DataField field) {
return Utils.extractPackageName(field);
TagCategory category = TagCategory.valueOf("UNIMARC_" + field.getTag().charAt(0));
return category.getPackageName();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@ public enum TagCategory {
PICA_2(52, "pica2", "2...", "PICA+ item", false),

// UNIMARC
UNIMARC_0(60, null, "0--", "Identification block", false),
UNIMARC_1(61, null, "1--", "Coded information block", false),
UNIMARC_2(62, null, "2--", "Descriptive information block", false),
UNIMARC_3(63, null, "3--", "Notes block", false),
UNIMARC_4(64, null, "4--", "Linking entry block", false),
UNIMARC_5(65, null, "5--", "Related title block", false),
UNIMARC_6(66, null, "6--", "Subject analysis and bibliographic history block", false),
UNIMARC_7(67, null, "7--", "Responsibility block", false),
UNIMARC_8(68, null, "8--", "International use block", false),
UNIMARC_9(68, null, "9--", "National use block", false),
UNIMARC_0(60, "tags0--", "0--", "Identification block", false),
UNIMARC_1(61, "tags1--", "1--", "Coded information block", false),
UNIMARC_2(62, "tags2--", "2--", "Descriptive information block", false),
UNIMARC_3(63, "tags3--", "3--", "Notes block", false),
UNIMARC_4(64, "tags4--", "4--", "Linking entry block", false),
UNIMARC_5(65, "tags5--", "5--", "Related title block", false),
UNIMARC_6(66, "tags6--", "6--", "Subject analysis and bibliographic history block", false),
UNIMARC_7(67, "tags7--", "7--", "Responsibility block", false),
UNIMARC_8(68, "tags8--", "8--", "International use block", false),
UNIMARC_9(68, "tags9--", "9--", "National use block", false),
OTHER(99, "unknown", "unknown", "unknown origin", false);


Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
documenttype,packageid,name,label,iscoretag,count
all,62,2--,Descriptive information block,false,1
all,61,1--,Coded information block,false,1
all,64,4--,Linking entry block,false,1
all,63,3--,Notes block,false,1
all,66,6--,Subject analysis and bibliographic history block,false,1
all,0,00X,Control Fields,true,1
all,100,unimarc,,false,1
all,68,8--,International use block,false,1
all,67,7--,Responsibility block,false,1
all,60,0--,Identification block,false,1
all,99,unknown,unknown origin,false,1
Books,62,2--,Descriptive information block,false,1
Books,61,1--,Coded information block,false,1
Books,64,4--,Linking entry block,false,1
Books,63,3--,Notes block,false,1
Books,66,6--,Subject analysis and bibliographic history block,false,1
Books,0,00X,Control Fields,true,1
Books,100,unimarc,,false,1
Books,68,8--,International use block,false,1
Books,67,7--,Responsibility block,false,1
Books,60,0--,Identification block,false,1
Books,99,unknown,unknown origin,false,1

0 comments on commit d9e9a24

Please sign in to comment.