index

#!/usr/bin/env bash
# Run Solr related tasks (index, get status information, purge a collection)
#
# The helper files below are sourced with paths relative to the current
# working directory, so the script has to be started from the directory that
# contains them.

# -u: using an unset variable is an error; -e: stop at the first failing
# command; -o pipefail: a pipeline fails when any of its stages fails.
set -ueo pipefail

# Shared settings. SOLR_HOST and JAR are read later in this script without
# being assigned here, so they presumably come from this file (TODO confirm).
. ./common-variables
# Not referenced again in this script; presumably read by ./solr-functions,
# which is sourced right after this assignment (TODO confirm).
PORT=8983

# Solr helpers. status, purge_and_exit, purge_core, optimize_core and
# store_fields are called below but not defined in this script, so they are
# presumably provided by this file (TODO confirm).
. ./solr-functions

# File name of the script as invoked; shown in the usage text and passed to
# getopt as the program name for its error messages.
ME=$(basename "$0")

#######################################
# Print the help message and terminate the script.
# Globals:   ME (read) - script name shown in the usage line
# Arguments: none
# Outputs:   writes the help text to stdout
# Returns:   never returns; exits the shell with status 1
#######################################
show_usage() {
  cat <<EOF
QA catalogue Indexing MARC records with Solr

usage:
 $ME [options] <files>

options:
 -m, --marcVersion <arg>              MARC version ('OCLC' or 'DNB')
 -h, --help                           display help
 -n, --nolog                          do not display log messages
 -l, --limit <arg>                    limit the number of records to process
 -o, --offset <arg>                   the first record to process
 -i, --id <arg>                       the MARC identifier (content of 001)
 -d, --defaultRecordType <arg>        the default record type if the record's type is undetectable
 -q, --fixAlephseq                    fix the known issues of Alephseq format
 -a, --fixAlma                        fix the known issues of Alma format
 -b, --fixKbr                         fix the known issues of KBR format
 -p, --alephseq                       the source is in Alephseq format
 -x, --marcxml                        the source is in MARCXML format
 -y, --lineSeparated                  the source is in line separated MARC format
 -t, --outputDir <arg>                output directory
 -r, --trimId                         remove spaces from the end of record IDs
 -z, --ignorableFields <arg>          ignore fields from the analysis
 -v, --ignorableRecords <arg>         ignore records from the analysis
 -f, --marcFormat <arg>               MARC format (like 'ISO' or 'MARCXML')
 -s, --dataSource <arg>               data source (file or stream)
 -g, --defaultEncoding <arg>          default character encoding
 -1, --alephseqLineType <arg>         Alephseq line type
 -2, --picaIdField <arg>              PICA id field
 -u, --picaSubfieldSeparator <arg>    PICA subfield separator
 -j, --picaSchemaFile <arg>           Avram PICA schema file
 -w, --schemaType <arg>               metadata schema type ('MARC21', 'UNIMARC', or 'PICA')
 -k, --picaRecordType <arg>           picaRecordType
 -c, --allowableRecords <arg>         allow records for the analysis
 -e, --groupBy <arg>                  group the results by the value of this data element (e.g. the ILN of a library)
 -3, --groupListFile <arg>            the file which contains a list of ILN codes
 -4, --solrForScoresUrl <arg>         the URL of the Solr server used to store scores
 -5, --processRecordsWithoutId        process the record even if it does not have an identifier
 -S, --solrUrl <arg>                  the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)
 -A, --doCommit                       commits Solr index regularly
 -T, --solrFieldType <arg>            type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'
 -B, --useEmbedded                    use embedded Solr server (used in tests only)
 -C, --indexWithTokenizedField        index data elements as tokenized field as well
 -D, --commitAt <arg>                 commit index after this number of records
 -E, --indexFieldCounts               index the count of field instances
 -G, --indexSubfieldCounts            index the count of subfield instances
 -F, --fieldPrefix <arg>              field prefix
 -Z, --core <arg>                     The index name (core)
 -Y, --file-path <arg>                File path
 -X, --file-mask <arg>                File mask
 -W, --purge                          Purge index and exit
 -V, --status                         Show the status of index(es) and exit
 -U, --no-delete                      Do not delete index before starting indexing

more info: https://github.com/pkiraly/qa-catalogue#indexing-marc-records-with-solr

EOF
  # Status 1 is kept on purpose: callers may rely on the existing exit code.
  exit 1
}

# Called without any argument: there is nothing to do, so print the help
# text (show_usage terminates the script).
[[ $# -gt 0 ]] || show_usage

# Option tables for getopt(1); a trailing colon marks an option that takes a
# value. Every entry here needs a matching arm in the parsing loop.
SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:5S:AT:BCD:EF:GZ:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,processRecordsWithoutId,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,indexSubfieldCounts,core:,file-path:,file-mask:,purge,status,no-delete"

# getopt validates the command line and prints it back in a canonical,
# shell-quoted form; a non-zero status aborts the script because of `set -e`.
GETOPT=$(getopt --name "$ME" --options "$SHORT_OPTIONS" --longoptions "$LONG_OPTIONS" -- "$@")
# Replace the positional parameters with the canonical form produced above.
eval set -- "$GETOPT"

# ---------------------------------------------------------------------------
# Parse the canonicalized command line, then run the requested Solr task:
#   --status  : call status and exit
#   --purge   : call purge_and_exit for the core
#   otherwise : (optionally) purge the core, run the Java indexer, optimize
#               the core and store the field list.
#
# Reads:  SOLR_HOST, JAR (not assigned in this script; expected from the
#         sourced files), SOLR_DB_URL (optional, see the last step).
# Calls:  show_usage, status, purge_and_exit, purge_core, optimize_core,
#         store_fields.
# ---------------------------------------------------------------------------
CORE=""
FILE_PATH=0
FILE_MASK=0
DO_PURGE=0
DO_STATUS=0
SKIP_DELETE=0
SOLR_URL=""
OUTPUT_DIR=""
HELP=0
# Options forwarded to the Java indexer. An array keeps every option value as
# a single word, so values with whitespace or glob characters (or empty
# values) reach Java exactly as they were typed.
PARAM_ARGS=()

while true; do
  case "$1" in
    -m|--marcVersion)              PARAM_ARGS+=(--marcVersion "$2") ;            shift 2 ;;
    -h|--help)                     PARAM_ARGS+=(--help) ; HELP=1 ;               shift   ;;
    -n|--nolog)                    PARAM_ARGS+=(--nolog) ;                       shift   ;;
    -l|--limit)                    PARAM_ARGS+=(--limit "$2") ;                  shift 2 ;;
    -o|--offset)                   PARAM_ARGS+=(--offset "$2") ;                 shift 2 ;;
    -i|--id)                       PARAM_ARGS+=(--id "$2") ;                     shift 2 ;;
    -d|--defaultRecordType)        PARAM_ARGS+=(--defaultRecordType "$2") ;      shift 2 ;;
    -q|--fixAlephseq)              PARAM_ARGS+=(--fixAlephseq) ;                 shift   ;;
    -a|--fixAlma)                  PARAM_ARGS+=(--fixAlma) ;                     shift   ;;
    -b|--fixKbr)                   PARAM_ARGS+=(--fixKbr) ;                      shift   ;;
    -p|--alephseq)                 PARAM_ARGS+=(--alephseq) ;                    shift   ;;
    -x|--marcxml)                  PARAM_ARGS+=(--marcxml) ;                     shift   ;;
    -y|--lineSeparated)            PARAM_ARGS+=(--lineSeparated) ;               shift   ;;
    -t|--outputDir)                PARAM_ARGS+=(--outputDir "$2") ; OUTPUT_DIR="$2" ; shift 2 ;;
    -r|--trimId)                   PARAM_ARGS+=(--trimId) ;                      shift   ;;
    -z|--ignorableFields)          PARAM_ARGS+=(--ignorableFields "$2") ;        shift 2 ;;
    -v|--ignorableRecords)         PARAM_ARGS+=(--ignorableRecords "$2") ;       shift 2 ;;
    -f|--marcFormat)               PARAM_ARGS+=(--marcFormat "$2") ;             shift 2 ;;
    -s|--dataSource)               PARAM_ARGS+=(--dataSource "$2") ;             shift 2 ;;
    -g|--defaultEncoding)          PARAM_ARGS+=(--defaultEncoding "$2") ;        shift 2 ;;
    -1|--alephseqLineType)         PARAM_ARGS+=(--alephseqLineType "$2") ;       shift 2 ;;
    -2|--picaIdField)              PARAM_ARGS+=(--picaIdField "$2") ;            shift 2 ;;
    -u|--picaSubfieldSeparator)    PARAM_ARGS+=(--picaSubfieldSeparator "$2") ;  shift 2 ;;
    -j|--picaSchemaFile)           PARAM_ARGS+=(--picaSchemaFile "$2") ;         shift 2 ;;
    -w|--schemaType)               PARAM_ARGS+=(--schemaType "$2") ;             shift 2 ;;
    -k|--picaRecordType)           PARAM_ARGS+=(--picaRecordType "$2") ;         shift 2 ;;
    -c|--allowableRecords)         PARAM_ARGS+=(--allowableRecords "$2") ;       shift 2 ;;
    -e|--groupBy)                  PARAM_ARGS+=(--groupBy "$2") ;                shift 2 ;;
    -3|--groupListFile)            PARAM_ARGS+=(--groupListFile "$2") ;          shift 2 ;;
    -4|--solrForScoresUrl)         PARAM_ARGS+=(--solrForScoresUrl "$2") ;       shift 2 ;;
    -5|--processRecordsWithoutId)  PARAM_ARGS+=(--processRecordsWithoutId) ;     shift   ;;
    -S|--solrUrl)                  PARAM_ARGS+=(--solrUrl "$2") ; SOLR_URL="$2" ; shift 2 ;;
    -A|--doCommit)                 PARAM_ARGS+=(--doCommit) ;                    shift   ;;
    -T|--solrFieldType)            PARAM_ARGS+=(--solrFieldType "$2") ;          shift 2 ;;
    -B|--useEmbedded)              PARAM_ARGS+=(--useEmbedded) ;                 shift   ;;
    -C|--indexWithTokenizedField)  PARAM_ARGS+=(--indexWithTokenizedField) ;     shift   ;;
    -D|--commitAt)                 PARAM_ARGS+=(--commitAt "$2") ;               shift 2 ;;
    -E|--indexFieldCounts)         PARAM_ARGS+=(--indexFieldCounts) ;            shift   ;;
    -G|--indexSubfieldCounts)      PARAM_ARGS+=(--indexSubfieldCounts) ;         shift   ;;
    -F|--fieldPrefix)              PARAM_ARGS+=(--fieldPrefix "$2") ;            shift 2 ;;
    # The options below steer this script only and are not forwarded to Java.
    -Z|--core)                     CORE="$2" ;                                   shift 2 ;;
    -Y|--file-path)                FILE_PATH="$2" ;                              shift 2 ;;
    -X|--file-mask)                FILE_MASK="$2" ;                              shift 2 ;;
    -W|--purge)                    DO_PURGE=1 ;                                  shift   ;;
    -V|--status)                   DO_STATUS=1 ;                                 shift   ;;
    -U|--no-delete)                SKIP_DELETE=1 ;                               shift   ;;
    # getopt always terminates the option list with "--".
    --) shift ; break ;;
    # Only reachable when the getopt option tables and this loop disagree.
    *) echo "Internal error!: $1" >&2 ; exit 1 ;;
  esac
done

if [[ "$HELP" -eq 1 ]]; then
  show_usage
fi

if [[ -z "$SOLR_URL" ]]; then
  # No explicit URL: build it from the configured host and the --core value,
  # and tell the indexer about it.
  SOLR_URL="$SOLR_HOST/solr/$CORE"
  PARAM_ARGS+=(--solrUrl "$SOLR_URL")
else
  # Explicit URL: derive the host (scheme://authority) and the core (last
  # path segment) from it. Either one becomes empty when the URL does not
  # have the expected shape.
  SOLR_HOST_PATTERN='^https?://[^/]+'
  if [[ "$SOLR_URL" =~ $SOLR_HOST_PATTERN ]]; then
    SOLR_HOST="${BASH_REMATCH[0]}"
  else
    SOLR_HOST=""
  fi
  CORE="${SOLR_URL##*/}"
fi

# Space-joined copy of the forwarded options, used for logging only.
# PARAM_ARGS always holds at least --solrUrl at this point.
PARAMS="${PARAM_ARGS[*]}"

echo "SOLR URL: $SOLR_URL"
echo "SOLR HOST: $SOLR_HOST"
echo "CORE: $CORE"

if [[ "$DO_STATUS" -eq 1 ]]; then
  status
  exit 0
fi

if [[ "$DO_PURGE" -eq 1 ]]; then
  purge_and_exit "$CORE"
fi

if [[ "$SKIP_DELETE" -eq 0 ]]; then
  purge_core "$CORE"
fi

echo "Start indexing"
# -S keeps curl quiet on success but lets it report why a request failed
# before `set -e` stops the script.
curl -sS "$SOLR_URL/update" -H "Content-type: text/xml" --data-binary '<commit/>'

# The command is kept as an array so that a JAR path containing spaces stays
# a single argument.
CMD=(/usr/bin/java -Xmx2g -cp "$JAR" de.gwdg.metadataqa.marc.cli.MarcToSolr)

echo "${CMD[*]} $PARAMS ${FILE_PATH}/${FILE_MASK}"
# FILE_MASK is deliberately left unquoted so that the shell expands the glob
# into the list of input files.
# shellcheck disable=SC2086
"${CMD[@]}" "${PARAM_ARGS[@]}" "$FILE_PATH"/${FILE_MASK}

echo "Start optimizing"
optimize_core "$CORE"

echo "store field list to JSON file"
# SOLR_DB_URL is not assigned anywhere in this script. Fall back to the URL
# of the core that was just indexed, so `set -u` does not abort the final
# step when the sourced files do not define it.
# NOTE(review): confirm that store_fields expects the core URL.
store_fields "${SOLR_DB_URL:-$SOLR_URL}" "$OUTPUT_DIR"

echo "indexing DONE"