-
Notifications
You must be signed in to change notification settings - Fork 17
/
index
executable file
·229 lines (204 loc) · 10.8 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env bash
#####
# Run Solr related tasks (index, get status information, purge a collection)
#
# The two helper files below are sourced with paths relative to the current
# working directory, so the script has to be started from the directory
# that contains them.
#   ./common-variables  shared settings (presumably provides JAR and
#                       SOLR_HOST, which this script reads but never
#                       assigns - TODO confirm)
#   ./solr-functions    Solr helpers (presumably provides status,
#                       purge_core, purge_and_exit, optimize_core and
#                       store_fields, which this script calls but does not
#                       define - TODO confirm)
#####
. ./common-variables
# Assigned between the two source commands, i.e. after common-variables and
# before solr-functions (presumably read by the latter - TODO confirm).
PORT=8983
. ./solr-functions
# Name of this script, used in the usage text and in getopt error messages.
# "$0" is quoted so that a script path containing whitespace is not split
# into several arguments of basename.
ME=$(basename "$0")
#######################################
# Display the help message and terminate the script.
# Globals:   ME (read) - script name shown in the usage line
# Arguments: none
# Outputs:   writes the usage text to stdout
# Returns:   never returns; exits the shell with status 1
#######################################
show_usage() {
  cat <<EOF
QA catalogue Indexing MARC records with Solr
usage:
${ME} [options] <files>
options:
-m, --marcVersion <arg> MARC version ('OCLC' or 'DNB')
-h, --help display help
-n, --nolog do not display log messages
-l, --limit <arg> limit the number of records to process
-o, --offset <arg> the first record to process
-i, --id <arg> the MARC identifier (content of 001)
-d, --defaultRecordType <arg> the default record type if the record's type is undetectable
-q, --fixAlephseq fix the known issues of Alephseq format
-a, --fixAlma fix the known issues of Alma format
-b, --fixKbr fix the known issues of KBR format
-p, --alephseq the source is in Alephseq format
-x, --marcxml the source is in MARCXML format
-y, --lineSeparated the source is in line separated MARC format
-t, --outputDir <arg> output directory
-r, --trimId remove spaces from the end of record IDs
-z, --ignorableFields <arg> ignore fields from the analysis
-v, --ignorableRecords <arg> ignore records from the analysis
-f, --marcFormat <arg> MARC format (like 'ISO' or 'MARCXML')
-s, --dataSource <arg> data source (file or stream)
-g, --defaultEncoding <arg> default character encoding
-1, --alephseqLineType <arg> Alephseq line type
-2, --picaIdField <arg> PICA id field
-u, --picaSubfieldSeparator <arg> PICA subfield separator
-j, --picaSchemaFile <arg> Avram PICA schema file
-w, --schemaType <arg> metadata schema type ('MARC21', 'UNIMARC', or 'PICA')
-k, --picaRecordType <arg> picaRecordType
-c, --allowableRecords <arg> allow records for the analysis
-e, --groupBy <arg> group the results by the value of this data element (e.g. the ILN of library)
-3, --groupListFile <arg> the file which contains a list of ILN codes
-4, --solrForScoresUrl <arg> the URL of the Solr server used to store scores
-S, --solrUrl <arg> the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)
-A, --doCommit commits Solr index regularly
-T, --solrFieldType <arg> type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'
-B, --useEmbedded use embedded Solr server (used in tests only)
-C, --indexWithTokenizedField index data elements as tokenized field as well
-D, --commitAt <arg> commit index after this number of records
-E, --indexFieldCounts index the count of field instances
-G, --indexSubfieldCounts index the count of subfield instances
-F, --fieldPrefix <arg> field prefix
-Z, --core <arg> The index name (core)
-Y, --file-path <arg> File path
-X, --file-mask <arg> File mask
-W, --purge Purge index and exit
-V, --status Show the status of index(es) and exit
-U, --no-delete Do not delete index before starting indexing
more info: https://github.com/pkiraly/qa-catalogue#indexing-marc-records-with-solr
EOF
  exit 1
}
# Called without any argument: print the help text (show_usage ends the
# script).
if [[ $# -eq 0 ]]; then
  show_usage
fi

# Option specification handed to getopt(1). A trailing ':' marks an option
# that requires an argument; entries without it are plain switches.
SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:GZ:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,indexSubfieldCounts,core:,file-path:,file-mask:,purge,status,no-delete"

# Let getopt check and normalise the command line. When it reports a problem
# (unknown option, missing argument) it has already printed a message
# prefixed with the script name, so stop here: carrying on with a partially
# parsed command line would run the remaining steps of this script with
# options the user did not intend.
if ! GETOPT=$(getopt \
  -o "${SHORT_OPTIONS}" \
  --long "${LONG_OPTIONS}" \
  -n "${ME}" -- "$@"); then
  echo "Try '${ME} --help' for more information." >&2
  exit 1
fi
# Replace the positional parameters with getopt's normalised, shell-quoted
# output so that the option loop can walk through them.
eval set -- "${GETOPT}"

# Defaults for the settings handled by this script itself.
CORE=""        # value of --core
FILE_PATH=0    # value of --file-path; 0 is the initial value
FILE_MASK=0    # value of --file-mask; 0 is the initial value
DO_PURGE=0     # set to 1 by --purge
DO_STATUS=0    # set to 1 by --status
SKIP_DELETE=0  # set to 1 by --no-delete
SOLR_URL=""    # value of --solrUrl
PARAMS=""      # options collected as one flat string for the Java command
OUTPUT_DIR=""  # value of --outputDir
HELP=0         # set to 1 by --help
# Walk through the command line as normalised by getopt. Options meant for
# the Java indexer are appended to PARAMS under their long names; options
# that steer this script are stored in their own variables. The loop ends at
# the "--" marker; any other word is reported as an internal error.
while :; do
  case "$1" in
    -m|--marcVersion)
      PARAMS+=" --marcVersion $2"
      shift 2
      ;;
    -h|--help)
      PARAMS+=" --help"
      HELP=1
      shift
      ;;
    -n|--nolog)
      PARAMS+=" --nolog"
      shift
      ;;
    -l|--limit)
      PARAMS+=" --limit $2"
      shift 2
      ;;
    -o|--offset)
      PARAMS+=" --offset $2"
      shift 2
      ;;
    -i|--id)
      PARAMS+=" --id $2"
      shift 2
      ;;
    -d|--defaultRecordType)
      PARAMS+=" --defaultRecordType $2"
      shift 2
      ;;
    -q|--fixAlephseq)
      PARAMS+=" --fixAlephseq"
      shift
      ;;
    -a|--fixAlma)
      PARAMS+=" --fixAlma"
      shift
      ;;
    -b|--fixKbr)
      PARAMS+=" --fixKbr"
      shift
      ;;
    -p|--alephseq)
      PARAMS+=" --alephseq"
      shift
      ;;
    -x|--marcxml)
      PARAMS+=" --marcxml"
      shift
      ;;
    -y|--lineSeparated)
      PARAMS+=" --lineSeparated"
      shift
      ;;
    -t|--outputDir)
      # also remembered locally in OUTPUT_DIR
      OUTPUT_DIR="$2"
      PARAMS+=" --outputDir $2"
      shift 2
      ;;
    -r|--trimId)
      PARAMS+=" --trimId"
      shift
      ;;
    -z|--ignorableFields)
      PARAMS+=" --ignorableFields $2"
      shift 2
      ;;
    -v|--ignorableRecords)
      PARAMS+=" --ignorableRecords $2"
      shift 2
      ;;
    -f|--marcFormat)
      PARAMS+=" --marcFormat $2"
      shift 2
      ;;
    -s|--dataSource)
      PARAMS+=" --dataSource $2"
      shift 2
      ;;
    -g|--defaultEncoding)
      PARAMS+=" --defaultEncoding $2"
      shift 2
      ;;
    -1|--alephseqLineType)
      PARAMS+=" --alephseqLineType $2"
      shift 2
      ;;
    -2|--picaIdField)
      PARAMS+=" --picaIdField $2"
      shift 2
      ;;
    -u|--picaSubfieldSeparator)
      PARAMS+=" --picaSubfieldSeparator $2"
      shift 2
      ;;
    -j|--picaSchemaFile)
      PARAMS+=" --picaSchemaFile $2"
      shift 2
      ;;
    -w|--schemaType)
      PARAMS+=" --schemaType $2"
      shift 2
      ;;
    -k|--picaRecordType)
      PARAMS+=" --picaRecordType $2"
      shift 2
      ;;
    -c|--allowableRecords)
      PARAMS+=" --allowableRecords $2"
      shift 2
      ;;
    -e|--groupBy)
      PARAMS+=" --groupBy $2"
      shift 2
      ;;
    -3|--groupListFile)
      PARAMS+=" --groupListFile $2"
      shift 2
      ;;
    -4|--solrForScoresUrl)
      PARAMS+=" --solrForScoresUrl $2"
      shift 2
      ;;
    -S|--solrUrl)
      # also remembered locally in SOLR_URL
      SOLR_URL="$2"
      PARAMS+=" --solrUrl $2"
      shift 2
      ;;
    -A|--doCommit)
      PARAMS+=" --doCommit"
      shift
      ;;
    -T|--solrFieldType)
      PARAMS+=" --solrFieldType $2"
      shift 2
      ;;
    -B|--useEmbedded)
      PARAMS+=" --useEmbedded"
      shift
      ;;
    -C|--indexWithTokenizedField)
      PARAMS+=" --indexWithTokenizedField"
      shift
      ;;
    -D|--commitAt)
      PARAMS+=" --commitAt $2"
      shift 2
      ;;
    -E|--indexFieldCounts)
      PARAMS+=" --indexFieldCounts"
      shift
      ;;
    -G|--indexSubfieldCounts)
      PARAMS+=" --indexSubfieldCounts"
      shift
      ;;
    -F|--fieldPrefix)
      PARAMS+=" --fieldPrefix $2"
      shift 2
      ;;
    # The remaining options are consumed by this script and are not added
    # to PARAMS.
    -Z|--core)
      CORE="$2"
      shift 2
      ;;
    -Y|--file-path)
      FILE_PATH="$2"
      shift 2
      ;;
    -X|--file-mask)
      FILE_MASK="$2"
      shift 2
      ;;
    -W|--purge)
      DO_PURGE=1
      shift
      ;;
    -V|--status)
      DO_STATUS=1
      shift
      ;;
    -U|--no-delete)
      SKIP_DELETE=1
      shift
      ;;
    --)
      # end of options
      shift
      break
      ;;
    *)
      echo "Internal error!: $1"
      exit 1
      ;;
  esac
done
# --help was given: print the usage text (show_usage ends the script).
if [[ $HELP -eq 1 ]]; then
  show_usage
fi

# --status was given: run the status helper and stop.
if [[ $DO_STATUS -eq 1 ]]; then
  status
  exit 0
fi

# Reconcile the Solr URL, host and core name:
# - without --solrUrl the URL is composed from SOLR_HOST and --core and is
#   appended to the parameters of the Java command;
# - with --solrUrl the host (scheme://authority) and the core (last path
#   segment) are extracted from the URL. "|| true" keeps a non-matching grep
#   from producing a failing status.
if [[ "${SOLR_URL}" == "" ]]; then
  SOLR_URL="${SOLR_HOST}/solr/${CORE}"
  PARAMS="${PARAMS} --solrUrl ${SOLR_URL}"
else
  SOLR_HOST=$(echo "${SOLR_URL}" | grep -oP "^https?://[^/]+" || true)
  CORE=$(echo "${SOLR_URL}" | grep -oP "[^/]+$" || true)
fi

echo "SOLR URL: ${SOLR_URL}"
echo "SOLR HOST: ${SOLR_HOST}"
echo "CORE: ${CORE}"

# --purge: hand over to purge_and_exit (by its name it is expected to end the
# script - verify in solr-functions).
if [[ "${DO_PURGE}" == "1" ]]; then
  purge_and_exit "${CORE}"
fi

# Unless --no-delete was given, call purge_core on the core before indexing.
if [[ "${SKIP_DELETE}" == "0" ]]; then
  purge_core "${CORE}"
fi

echo "Start indexing"
# Send a <commit/> to the core's update handler before the indexer starts.
curl -s "${SOLR_URL}/update" -H "Content-type: text/xml" --data-binary '<commit/>'

# JAR is not assigned in this script (presumably it comes from
# common-variables - TODO confirm).
CMD="/usr/bin/java -Xmx2g -cp $JAR de.gwdg.metadataqa.marc.cli.MarcToSolr"

# CMD and PARAMS are flat strings and are expanded unquoted on purpose, so
# that they are split into separate words. FILE_PATH/FILE_MASK is unquoted on
# purpose as well: the mask is expanded by the shell at this point.
#
# The usage text advertises positional <files>. When neither --file-path nor
# --file-mask was given (both still hold their initial value "0") and files
# were listed after the options, index those files instead of the
# meaningless literal "0/0".
if [[ "${FILE_PATH}" == "0" && "${FILE_MASK}" == "0" && $# -gt 0 ]]; then
  echo $CMD $PARAMS "$@"
  $CMD $PARAMS "$@"
else
  echo $CMD $PARAMS ${FILE_PATH}/${FILE_MASK}
  $CMD $PARAMS ${FILE_PATH}/${FILE_MASK}
fi

echo "Start optimizing"
optimize_core "${CORE}"

echo "store field list to JSON file"
# NOTE(review): SOLR_DB_URL is not assigned anywhere in this script; it is
# presumably defined in one of the sourced files - confirm. The arguments are
# left unquoted exactly as before, because quoting would change what
# store_fields receives if SOLR_DB_URL happens to be empty.
store_fields ${SOLR_DB_URL} ${OUTPUT_DIR}

echo "indexing DONE"