index

#!/usr/bin/env bash
#####
# Run Solr related tasks (index, get status information, purge a collection)
#
#

. ./common-variables
PORT=8983

. ./solr-functions

ME=$(basename $0)

show_usage() { # display help message
  cat <<EOF
QA catalogue Indexing MARC records with Solr

usage:
 ${ME} [options] <files>

options:
 -m, --marcVersion <arg>              MARC version ('OCLC' or 'DNB')
 -h, --help                           display help
 -n, --nolog                          do not display log messages
 -l, --limit <arg>                    limit the number of records to process
 -o, --offset <arg>                   the first record to process
 -i, --id <arg>                       the MARC identifier (content of 001)
 -d, --defaultRecordType <arg>        the default record type if the record's type is undetectable
 -q, --fixAlephseq                    fix the known issues of Alephseq format
 -a, --fixAlma                        fix the known issues of Alma format
 -b, --fixKbr                         fix the known issues of Alma format
 -p, --alephseq                       the source is in Alephseq format
 -x, --marcxml                        the source is in MARCXML format
 -y, --lineSeparated                  the source is in line separated MARC format
 -t, --outputDir <arg>                output directory
 -r, --trimId                         remove spaces from the end of record IDs
 -z, --ignorableFields <arg>          ignore fields from the analysis
 -v, --ignorableRecords <arg>         ignore records from the analysis
 -f, --marcFormat <arg>               MARC format (like 'ISO' or 'MARCXML')
 -s, --dataSource <arg>               data source (file of stream)
 -g, --defaultEncoding <arg>          default character encoding
 -1, --alephseqLineType <arg>         Alephseq line type
 -2, --picaIdField <arg>              PICA id field
 -u, --picaSubfieldSeparator <arg>    PICA subfield separator
 -j, --picaSchemaFile <arg>           Avram PICA schema file
 -w, --schemaType <arg>               metadata schema type ('MARC21', 'UNIMARC', or 'PICA')
 -k, --picaRecordType <arg>           picaRecordType
 -c, --allowableRecords <arg>         allow records for the analysis
 -e, --groupBy <arg>                  group the results by the value of this data element (e.g. the ILN of  library)
 -3, --groupListFile <arg>            the file which contains a list of ILN codes
 -4, --solrForScoresUrl <arg>         the URL of the Solr server used to store scores
 -S, --solrUrl <arg>                  the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)
 -A, --doCommit                       commits Solr index regularly
 -T, --solrFieldType <arg>            type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'
 -B, --useEmbedded                    use embedded Solr server (used in tests only)
 -C, --indexWithTokenizedField        index data elements as tokenized field as well
 -D, --commitAt <arg>                 commit index after this number of records
 -E, --indexFieldCounts               index the count of field instances
 -F, --fieldPrefix <arg>              field prefix
 -Z, --core <arg>                     The index name (core)
 -Y, --file-path <arg>                File path
 -X, --file-mask <arg>                File mask
 -W, --purge                          Purge index and exit
 -V, --status                         Show the status of index(es) and exit
 -U, --no-delete                      Do not delete index before starting indexing

more info: https://github.com/pkiraly/qa-catalogue#indexing-marc-records-with-solr

EOF
  exit 1
}

if [ $# -eq 0 ]; then
  show_usage
fi

SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:Z:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,core:,file-path:,file-mask:,purge,status,no-delete"

GETOPT=$(getopt \
  -o ${SHORT_OPTIONS} \
  --long ${LONG_OPTIONS} \
  -n ${ME} -- "$@")
eval set -- "${GETOPT}"

CORE=""
FILE_PATH=0
FILE_MASK=0
DO_PURGE=0
DO_STATUS=0
SKIP_DELETE=0
SOLR_URL=""
PARAMS=""
OUTPUT_DIR=""
HELP=0
while true ; do
  case "$1" in
    -m|--marcVersion)              PARAMS="$PARAMS --marcVersion $2" ;             shift 2 ;;
    -h|--help)                     PARAMS="$PARAMS --help" ; HELP=1;               shift   ;;
    -n|--nolog)                    PARAMS="$PARAMS --nolog" ;                      shift   ;;
    -l|--limit)                    PARAMS="$PARAMS --limit $2" ;                   shift 2 ;;
    -o|--offset)                   PARAMS="$PARAMS --offset $2" ;                  shift 2 ;;
    -i|--id)                       PARAMS="$PARAMS --id $2" ;                      shift 2 ;;
    -d|--defaultRecordType)        PARAMS="$PARAMS --defaultRecordType $2" ;       shift 2 ;;
    -q|--fixAlephseq)              PARAMS="$PARAMS --fixAlephseq" ;                shift   ;;
    -a|--fixAlma)                  PARAMS="$PARAMS --fixAlma" ;                    shift   ;;
    -b|--fixKbr)                   PARAMS="$PARAMS --fixKbr" ;                     shift   ;;
    -p|--alephseq)                 PARAMS="$PARAMS --alephseq" ;                   shift   ;;
    -x|--marcxml)                  PARAMS="$PARAMS --marcxml" ;                    shift   ;;
    -y|--lineSeparated)            PARAMS="$PARAMS --lineSeparated" ;              shift   ;;
    -t|--outputDir)                PARAMS="$PARAMS --outputDir $2" ; OUTPUT_DIR="$2" ; shift 2 ;;
    -r|--trimId)                   PARAMS="$PARAMS --trimId" ;                     shift   ;;
    -z|--ignorableFields)          PARAMS="$PARAMS --ignorableFields $2" ;         shift 2 ;;
    -v|--ignorableRecords)         PARAMS="$PARAMS --ignorableRecords $2" ;        shift 2 ;;
    -f|--marcFormat)               PARAMS="$PARAMS --marcFormat $2" ;              shift 2 ;;
    -s|--dataSource)               PARAMS="$PARAMS --dataSource $2" ;              shift 2 ;;
    -g|--defaultEncoding)          PARAMS="$PARAMS --defaultEncoding $2" ;         shift 2 ;;
    -1|--alephseqLineType)         PARAMS="$PARAMS --alephseqLineType $2" ;        shift 2 ;;
    -2|--picaIdField)              PARAMS="$PARAMS --picaIdField $2" ;             shift 2 ;;
    -u|--picaSubfieldSeparator)    PARAMS="$PARAMS --picaSubfieldSeparator $2" ;   shift 2 ;;
    -j|--picaSchemaFile)           PARAMS="$PARAMS --picaSchemaFile $2" ;          shift 2 ;;
    -w|--schemaType)               PARAMS="$PARAMS --schemaType $2" ;              shift 2 ;;
    -k|--picaRecordType)           PARAMS="$PARAMS --picaRecordType $2" ;          shift 2 ;;
    -c|--allowableRecords)         PARAMS="$PARAMS --allowableRecords $2" ;        shift 2 ;;
    -e|--groupBy)                  PARAMS="$PARAMS --groupBy $2" ;                 shift 2 ;;
    -3|--groupListFile)            PARAMS="$PARAMS --groupListFile $2" ;           shift 2 ;;
    -4|--solrForScoresUrl)         PARAMS="$PARAMS --solrForScoresUrl $2" ;        shift 2 ;;
    -S|--solrUrl)                  PARAMS="$PARAMS --solrUrl $2" ; SOLR_URL="$2" ; shift 2 ;;
    -A|--doCommit)                 PARAMS="$PARAMS --doCommit" ;                   shift   ;;
    -T|--solrFieldType)            PARAMS="$PARAMS --solrFieldType $2" ;           shift 2 ;;
    -B|--useEmbedded)              PARAMS="$PARAMS --useEmbedded" ;                shift   ;;
    -C|--indexWithTokenizedField)  PARAMS="$PARAMS --indexWithTokenizedField" ;    shift   ;;
    -D|--commitAt)                 PARAMS="$PARAMS --commitAt $2" ;                shift 2 ;;
    -E|--indexFieldCounts)         PARAMS="$PARAMS --indexFieldCounts" ;           shift   ;;
    -F|--fieldPrefix)              PARAMS="$PARAMS --fieldPrefix $2" ;             shift 2 ;;
    -Z|--core)                     CORE="$2" ;                                     shift 2 ;;
    -Y|--file-path)                FILE_PATH="$2" ;                                shift 2 ;;
    -X|--file-mask)                FILE_MASK="$2" ;                                shift 2 ;;
    -W|--purge)                    DO_PURGE=1 ;                                    shift   ;;
    -V|--status)                   DO_STATUS=1 ;                                   shift   ;;
    -U|--no-delete)                SKIP_DELETE=1 ;                                 shift   ;;
    --) shift ; break ;;
    *) echo "Internal error!: $1" ; exit 1 ;;
  esac
done

if [[ $HELP -eq 1 ]]; then
  show_usage
fi

if [[ $DO_STATUS -eq 1 ]]; then
  status
  exit 0;
fi

if [[ "${SOLR_URL}" == "" ]]; then
  SOLR_URL="${SOLR_HOST}/solr/${CORE}"
  PARAMS="${PARAMS} --solrUrl ${SOLR_URL}"
else
    # HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
  SOLR_HOST=$(echo $SOLR_URL | grep -oP "^https?://[^/]+" || true)
  CORE=$(echo $SOLR_URL | grep -oP "[^/]+$" || true)
fi
echo "SOLR URL: ${SOLR_URL}"
echo "SOLR HOST: ${SOLR_HOST}"
echo "CORE: ${CORE}"

if [[ "${DO_PURGE}" == "1" ]]; then
  purge_and_exit ${CORE}
fi

if [[ "${SKIP_DELETE}" == "0" ]]; then
  purge_core $CORE
fi

echo "Start indexing"
curl -s $SOLR_URL/update -H "Content-type: text/xml" --data-binary '<commit/>'

# cat <<EOT
# running the command
# ---BEGIN
# /usr/bin/java -cp $JAR de.gwdg.metadataqa.marc.cli.MarcToSolr \
#   --solrUrl ${SOLR_DB_URL} \
#   --solrFieldType $solrFieldType \
#   --defaultRecordType $defaultRecordType \
#   --marcVersion $marcVersion \
#   ${VALIDATION_PARAMS} \
#   $limit \
#   $trimId \
#   $marcxml \
#   $alephseq \
#   $ignorableRecords \
#   $defaultEncoding \
#   $alephseqLineType \
#   $schemaType \
#   $marcFormat \
#   $ignorableFields \
#   $groupBy \
#   $outputDir \
#   $PARAMS \
#   ${FILE_PATH}/${FILE_MASK}
# ---END
# EOT

CMD="/usr/bin/java -Xmx2g -cp $JAR de.gwdg.metadataqa.marc.cli.MarcToSolr"

# echo $CMD $PARAMS "$@"
# $CMD $PARAMS "$@"
echo $CMD $PARAMS ${FILE_PATH}/${FILE_MASK}
$CMD $PARAMS ${FILE_PATH}/${FILE_MASK}

# /usr/bin/java -cp $JAR de.gwdg.metadataqa.marc.cli.MarcToSolr \
#   --solrUrl ${SOLR_DB_URL} --solrFieldType $solrFieldType \
#   --defaultRecordType $defaultRecordType \
#   --marcVersion $marcVersion \
#   ${VALIDATION_PARAMS} \
#   $limit $trimId $marcxml $alephseq $ignorableRecords $defaultEncoding $alephseqLineType $schemaType \
#   $marcFormat $ignorableFields $groupBy $outputDir $PARAMS \
#   ${FILE_PATH}/${FILE_MASK}

echo "Start optimizing"
optimize_core $CORE

echo "store field list to JSON file"
store_fields ${SOLR_DB_URL} ${OUTPUT_DIR}

echo "indexing DONE"