Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
metadata: fix dojson for virtua records
Browse files Browse the repository at this point in the history
* Fixes dojson transformation errors.
* Adds data path parameter to setup script.
* Adds all languages to documents json schema and form.
* Adds marc21tojson utils cli for parallel dojson transformation.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep and rerowep committed Oct 21, 2019
1 parent f9d91d2 commit 3e62499
Showing 15 changed files with 6,837 additions and 1,679 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@ include docker/postgres/Dockerfile
include Dockerfile
include scripts/bootstrap
include scripts/console
include scripts/dojson_virtua
include scripts/server
include scripts/setup
include scripts/update
2,386 changes: 1,193 additions & 1,193 deletions data/documents_big.xml

Large diffs are not rendered by default.

572 changes: 286 additions & 286 deletions data/documents_small.xml

Large diffs are not rendered by default.

76 changes: 55 additions & 21 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
@@ -18,11 +18,26 @@
"""Dojson utils."""

import re
import sys
import traceback

import click
from dojson import Overdo, utils


def not_repetitive(bibid, key, value, subfield, default=None):
    """Get the first value if the value is a list or tuple.

    MARC subfields that must not be repeated sometimes arrive repeated
    (as a list or tuple).  In that case a warning with the bib id, the
    field key and the raw value is printed to stderr and only the first
    occurrence is kept.

    :param bibid: bibliographic record id, used in the warning message
    :param key: MARC field key (tag + indicators), used in the warning
    :param value: dict-like MARC field containing the subfields
    :param subfield: subfield code to read from ``value``
    :param default: value returned when the subfield is missing
    :returns: the (first) subfield value, or ``default``/``None``
    """
    if default is None:
        data = value.get(subfield)
    else:
        data = value.get(subfield, default)
    # Fix: the original tested ``(dict, tuple)``; a dict cannot be
    # indexed with ``[0]`` and the documented repeated case is a list.
    if isinstance(data, (list, tuple)):
        print('WARNING NOT REPETITIVE:', bibid, key, subfield, value,
              sep='\t', file=sys.stderr)
        data = data[0]
    return data


def remove_trailing_punctuation(
data,
punctuation=',',
@@ -121,6 +136,7 @@ class ReroIlsMarc21Overdo(ReroIlsOverdo):
This class adds RERO Marc21 properties and functions to the ReroIlsOverdo.
"""

bib_id = ''
field_008_data = ''
lang_from_008 = None
date1_from_008 = None
@@ -134,29 +150,43 @@ def __init__(self, bases=None, entry_point_group=None):
"""Reroilsmarc21overdo init."""
super(ReroIlsMarc21Overdo, self).__init__(
bases=bases, entry_point_group=entry_point_group)
self.count = 0

def do(self, blob, ignore_missing=True, exception_handlers=None):
"""Translate blob values and instantiate new model instance."""
self.blob_record = blob
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
)
self.count += 1
# print('INFO:', blob.get('001', '???'), self.count, file=sys.stderr)
result = None
try:
self.blob_record = blob
try:
self.bib_id = self.get_fields(tag='001')[0]['data']
except Exception as err:
self.bib_id = '???'
import sys
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
)
except Exception as err:
print('ERROR:', self.bib_id, self.count, err,
sep='\t', file=sys.stderr)
traceback.print_exc()
return result

def get_link_data(self, subfields_6_data):
@@ -185,7 +215,11 @@ def init_country(self):
field_044 = fields_044[0]
cantons_codes = self.get_subfields(field_044, 'c')
for cantons_codes in self.get_subfields(field_044, 'c'):
self.cantons.append(cantons_codes.split('-')[1])
try:
self.cantons.append(cantons_codes.split('-')[1])
except:
print('ERROR INIT CANTONS:', self.bib_id, cantons_codes,
sep='\t', file=sys.stderr)
if self.cantons:
self.country = 'sz'
else:
238 changes: 231 additions & 7 deletions rero_ils/modules/cli.py
Original file line number Diff line number Diff line change
@@ -20,16 +20,21 @@
from __future__ import absolute_import, print_function

import difflib
import gc
import json
import logging
import multiprocessing
import os
import sys
import time
from collections import OrderedDict
from glob import glob
from json import loads

import click
import jsonref
import yaml
from dojson.contrib.marc21.utils import create_record, split_stream
from flask import current_app
from flask.cli import with_appcontext
from flask_security.confirmable import confirm_user
@@ -41,9 +46,11 @@
from invenio_search.proxies import current_search
from jsonschema import validate
from jsonschema.exceptions import ValidationError
from lxml import etree
from pkg_resources import resource_string
from werkzeug.local import LocalProxy

from .documents.dojson.contrib.marc21tojson import marc21tojson
from .items.cli import create_items, reindex_items
from .loans.cli import create_loans
from .patrons.cli import import_users
@@ -427,8 +434,10 @@ def test_license(file, extension, copyrights, license_lines, verbose):
@click.argument('type', default='documents')
@click.argument('schema', default='document-v0.0.1.json')
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-s', '--save', 'savefile', type=click.File('w'), default=None)
def check_validate(jsonfile, type, schema, verbose, savefile):
@click.option('-e', '--error_file', 'error_file', type=click.File('w'),
default=None)
@click.option('-o', '--ok_file', 'ok_file', type=click.File('w'), default=None)
def check_validate(jsonfile, type, schema, verbose, error_file, ok_file):
"""Check record validation."""
click.secho('Testing json schema for file', fg='green')
schema_in_bytes = resource_string(
@@ -438,7 +447,6 @@ def check_validate(jsonfile, type, schema, verbose, savefile):
schema=schema
)
)

schema = loads(schema_in_bytes.decode('utf8'))
datas = json.load(jsonfile)
count = 0
@@ -454,13 +462,17 @@ def check_validate(jsonfile, type, schema, verbose, savefile):
data["pid"] = 'dummy'
try:
validate(data, schema)
except ValidationError as excp:
if savefile:
savefile.write(json.dumps(data, indent=2))
if ok_file:
if data["pid"] == 'dummy':
del data["pid"]
ok_file.write(json.dumps(data, indent=2))
except ValidationError as err:
if error_file:
error_file.write(json.dumps(data, indent=2))
click.secho(
'Error validate in record: {count}'.format(count=count),
fg='red')
click.secho(str(excp))
click.secho(str(err))


@utils.command('compile_json')
@@ -475,3 +487,215 @@ def compile_json(src_jsonfile, output, verbose):
if not output:
output = sys.stdout
json.dump(data, fp=output, indent=2)


def do_worker(marc21records, results, pid_requierd):
    """Worker for marc21 to json transformation."""
    # Load the documents JSON schema once per worker process.
    raw_schema = resource_string(
        'rero_ils.modules.documents.jsonschemas',
        'documents/document-v0.0.1.json'
    )
    doc_schema = loads(raw_schema.decode('utf8'))
    for marc21record in marc21records:
        marc_json = marc21record['json']
        pid = marc_json.get('001', '???')
        try:
            converted = marc21tojson.do(marc_json)
            # Inject placeholder values so schema validation can pass
            # for records that do not carry them yet.
            if not converted.get("$schema"):
                converted["$schema"] = 'dummy'
            if not pid_requierd and not converted.get("pid"):
                converted["pid"] = 'dummy'
            validate(converted, doc_schema)
            # Remove the placeholders again before reporting success.
            if converted["$schema"] == 'dummy':
                del converted["$schema"]
            if not pid_requierd and converted["pid"] == 'dummy':
                del converted["pid"]
            results.append({
                'status': True,
                'data': converted
            })
        except Exception as err:
            # Shorten the (possibly multi-line) error message: keep only
            # non-indented lines and truncate very long ones.
            short_lines = []
            for part in str(err).split('\n'):
                if part and part[0] != ' ':
                    if len(part) > 100:
                        part = part[:25] + ' ...'
                    short_lines.append(part)
            print('ERROR:', pid, '\t'.join(short_lines),
                  sep='\t', file=sys.stderr)
            # import traceback
            # traceback.print_exc()
            results.append({
                'pid': pid,
                'status': False,
                'data': marc21record['xml']
            })


class Marc21toJson():
    """Class for Marc21 records to Json transformation.

    Reads a marcxml input file, groups the records into chunks and
    converts them to JSON in parallel worker processes (``do_worker``).
    Valid records are appended to ``json_file_ok`` as a JSON array;
    records failing the dojson transformation or the schema validation
    are written back as marcxml to ``xml_file_error``.
    """

    # Counters (class defaults; shadowed by instance attributes at runtime).
    count = 0      # total number of records read from the input file
    count_ok = 0   # records transformed and validated successfully
    count_ko = 0   # records with transformation/validation errors

    def __init__(self, xml_file, json_file_ok, xml_file_error,
                 parallel=8, chunk=5000,
                 verbose=False, debug=False, pid_requierd=False):
        """Store the configuration and run the transformation.

        :param xml_file: marcxml input file (opened binary)
        :param json_file_ok: JSON output file for valid records (text)
        :param xml_file_error: marcxml output for bad records (binary)
        :param parallel: maximum number of concurrent worker processes
        :param chunk: number of records handed to each worker
        :param verbose: print progress information
        :param debug: enable multiprocessing debug logging
        :param pid_requierd: if True, records must provide their own pid
            (name kept as-is — callers pass it by keyword)
        """
        self.xml_file = xml_file
        self.json_file_ok = json_file_ok
        self.xml_file_error = xml_file_error
        self.parallel = parallel
        self.chunk = chunk
        self.verbose = verbose
        if verbose:
            print('Main process pid:', multiprocessing.current_process().pid)
        self.debug = debug
        if debug:
            multiprocessing.log_to_stderr(logging.DEBUG)
        self.pid_requierd = pid_requierd
        # 'spawn' gives every worker a fresh interpreter state
        self.ctx = multiprocessing.get_context("spawn")
        manager = self.ctx.Manager()
        self.results = manager.list()
        self.process_records = {}
        self.process_records['new'] = {'records': []}
        # Fix: the original ended with ``return self.start()``, which raises
        # ``TypeError: __init__() should return None``.  The transformation
        # is run here instead and ``__iter__`` exposes the counters, so the
        # existing call site keeps working:
        # ``count, count_ok, count_ko = Marc21toJson(...)``.
        self.start()

    def __iter__(self):
        """Iterate over ``(count, count_ok, count_ko)`` for unpacking."""
        return iter((self.count, self.count_ok, self.count_ko))

    # get_index_of_smallest_pid could be used to order the results
    # but it slows down the transformation
    def get_index_of_smallest_pid(self):
        """Get the index of the result with the smallest numeric pid.

        Returns 0 as soon as a result without pid is encountered.
        """
        smallest_pid = float('inf')
        smallest_index = 0
        for index, result in enumerate(self.results):
            pid = result.get('data').get('pid')
            if not pid:
                return 0
            pid = int(pid)
            if pid < smallest_pid:
                smallest_pid = pid
                smallest_index = index
        return smallest_index

    def write_results(self):
        """Write results from multiprocess to file."""
        while self.results:
            # use for ordered pid results:
            # value = results.pop(get_index_of_smallest_pid(results))
            value = self.results.pop(0)
            status = value.get('status')
            data = value.get('data')
            if status:
                # Fix: write the separator *before* every record except the
                # first — the original appended ',' after each record,
                # leaving a trailing comma that made the output invalid JSON.
                if self.count_ok:
                    self.json_file_ok.write(',')
                self.count_ok += 1
                for line in json.dumps(data, indent=2).split('\n'):
                    self.json_file_ok.write('\n ' + line)
            else:
                self.count_ko += 1
                self.xml_file_error.write(data)
        # free memory from garbage collector
        gc.collect()

    def wait_free_process(self, parallel):
        """Wait until fewer than ``parallel`` process slots are used.

        ``process_records`` always contains the 'new' key, so passing 1
        waits for every running worker to finish.
        """
        while len(self.process_records) > parallel:
            for key in list(self.process_records.keys()):
                process = self.process_records[key].get('process')
                if process:
                    if not process.is_alive():
                        process.join()
                        # drop references so the chunk can be collected
                        self.process_records[key]['records'] = None
                        self.process_records[key]['process'] = None
                        del self.process_records[key]
            time.sleep(1)

    def start_new_process(self):
        """Start a new worker process for the pending 'new' records."""
        self.wait_free_process(self.parallel)
        # key the chunk by the running record count
        self.process_records[self.count] = self.process_records.pop('new')
        new_process = self.ctx.Process(
            target=do_worker,
            args=(self.process_records[self.count]['records'],
                  self.results, self.pid_requierd)
        )
        new_process.start()
        self.process_records[self.count]['process'] = new_process
        self.process_records['new'] = {'records': []}
        if self.verbose:
            if self.count < self.chunk:
                start = 1
            else:
                start = self.count - self.chunk + 1
            print('Start process pid:', new_process.pid, 'records:',
                  start, '..', self.count)

    def start(self):
        """Run the transformation and return the counters.

        Streams the marcxml input, dispatches chunks of ``self.chunk``
        records to worker processes and writes the collected results.

        :returns: tuple ``(count, count_ok, count_ko)``
        """
        self.json_file_ok.write('[')
        self.xml_file_error.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        self.xml_file_error.write(
            b'<collection xmlns="http://www.loc.gov/MARC21/slim">\n\n'
        )
        for marc21xml in split_stream(self.xml_file):
            marc21json_record = create_record(marc21xml)
            self.process_records['new']['records'].append({
                'json': marc21json_record,
                'xml': etree.tostring(
                    marc21xml,
                    pretty_print=True,
                    encoding='UTF-8').strip()
            })
            self.count += 1
            if len(self.process_records['new']['records']) % self.chunk == 0:
                self.write_results()
                self.start_new_process()
        # process the remaining records
        self.start_new_process()
        # wait for all processes to finish (only process_records['new'])
        self.wait_free_process(1)
        self.write_results()
        self.json_file_ok.write('\n]')
        self.xml_file_error.write(b'\n</collection>')
        return self.count, self.count_ok, self.count_ko


@utils.command('marc21tojson')
@click.argument('xml_file', type=click.File('rb'))
@click.argument('json_file_ok', type=click.File('w'))
@click.argument('xml_file_error', type=click.File('wb'))
@click.option('-p', '--parallel', 'parallel', default=8)
@click.option('-c', '--chunk', 'chunk', default=5000)
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-d', '--debug', 'debug', is_flag=True, default=False)
# NOTE(review): '--pidrequierd' is a typo for '--pidrequired'; kept as-is
# because renaming the flag would break existing invocations/scripts.
@click.option('-r', '--pidrequierd', 'pid_requierd', is_flag=True,
              default=False)
def marc21json(xml_file, json_file_ok, xml_file_error, parallel, chunk,
               verbose, debug, pid_requierd):
    """Convert xml file to json with dojson."""

    click.secho('Marc21 to Json transform: ', fg='green', nl=False)
    if pid_requierd and verbose:
        click.secho(' (validation tests pid) ', nl=False)
    click.secho(xml_file.name)

    # NOTE(review): this unpacks the value of the constructor expression
    # into three counters — verify that instantiating Marc21toJson really
    # yields an iterable of (count, count_ok, count_ko); ``__init__``
    # itself cannot return a value.
    count, count_ok, count_ko = Marc21toJson(xml_file, json_file_ok,
                                             xml_file_error,
                                             parallel, chunk,
                                             verbose, debug, pid_requierd)

    # Prints "Total records: <count>-<ok+ko>" (read minus errors check).
    click.secho('Total records: ', fg='green', nl=False)
    click.secho(str(count), nl=False)
    click.secho('-', nl=False)
    click.secho(str(count_ok + count_ko))

    click.secho('Records transformed: ', fg='green', nl=False)
    click.secho(str(count_ok))
    if count_ko:
        click.secho('Records with errors: ', fg='red', nl=False)
        click.secho(str(count_ko))
228 changes: 145 additions & 83 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

990 changes: 982 additions & 8 deletions rero_ils/modules/documents/jsonschemas/documents/document-v0.0.1.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,934 changes: 1,907 additions & 27 deletions rero_ils/modules/documents/jsonschemas/form_documents/document-v0.0.1.json

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions scripts/dojson_virtua
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2019 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Transform a Virtua marcxml file (first argument) to JSON with dojson
# and validate the result against the documents JSON schema.

NC='\033[0m'                   # Default color
COLORED='\033[1;97;44m'        # Bold + white + blue background

# Abort immediately if any command fails.
set -e

# Print a highlighted "[INFO]: <message>" line to stderr.
# NOTE(review): ${EMPHASIS} is never defined in this script — presumably
# expected from the environment; confirm or remove.
msg() {
  echo -e "${COLORED}${EMPHASIS}[INFO]${NC}${COLORED}: ${1}${NC}" 1>&2
}

# Run the dojson transformation; ${1%.*} strips the input file's
# extension to derive the .json output and .log error-output names.
msg Dojson ${1}
pipenv run dojson -i ${1} -l marcxml -d pjson do marc21tojson >${1%.*}.json 2>${1%.*}.log

# Validate the generated JSON: invalid records go to *_error.json,
# valid ones to *_ok.json.
msg Validate: ${1%.*}.json
pipenv run invenio utils validate ${1%.*}.json documents document-v0.0.1.json -e ${1%.*}_error.json -o ${1%.*}_ok.json
16 changes: 13 additions & 3 deletions scripts/setup
Original file line number Diff line number Diff line change
@@ -44,7 +44,7 @@ CREATE_ITEMS_HOLDINGS_BIG=false
STOP_EXECUTION=true

# options may be followed by one colon to indicate they have a required argument
if ! options=$(getopt -o dsb -l deployment,create_items_holdings_small,create_items_holdings_big -- "$@")
if ! options=$(getopt -o dsb -l deployment,create_items_holdings_small,create_items_holdings_big,data_path: -- "$@")
then
# something went wrong, getopt will put out an error message for us
exit 1
@@ -57,13 +57,20 @@ do
-s|--create_items_holdings_small) CREATE_ITEMS_HOLDINGS_SMALL=true ;;
-b|--create_items_holdings_big) CREATE_ITEMS_HOLDINGS_BIG=true ;;
-c|--continue) STOP_EXECUTION=false ;;
-D|--data_path) DATA_PATH=$2 ;;
(--) shift; break;;
(-*) display_error_message "$0: error - unrecognized option $1"; exit 1;;
(*) break;;
esac
shift
done


if [ ! -d $DATA_PATH ]; then
display_error_message "Error - data path does not exist: $DATA_PATH"
exit 1
fi

if $CREATE_ITEMS_HOLDINGS_SMALL && $CREATE_ITEMS_HOLDINGS_BIG
then
display_error_message "Error - chose option for 'small' or 'big' documents generation"
@@ -163,9 +170,12 @@ pipenv run invenio fixtures import_users ${DATA_PATH}/users.json -v

# # xml to json transformation for rero marcxml records
# pipenv run dojson -i ${DATA_PATH}/documents_big.xml -l marcxml -d pjson do marc21tojson > ${DATA_PATH}/documents_big.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json -s documents_big_errors.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json -e documents_big_errors.json -o documents_big_ok.json

# pipenv run dojson -i ${DATA_PATH}/documents_small.xml -l marcxml -d pjson do marc21tojson > ${DATA_PATH}/documents_small.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json -s documents_small_errors.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json -e documents_small_errors.json -o documents_small_errors.json

if $DEPLOYMENT
then
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -155,7 +155,7 @@ def run(self):
'head = rero_ils.dojson.cli:head',
],
'dojson.cli.dump': [
'pjson = rero_ils.dojson.cli:pretty_json_dump',
'pjson = rero_ils.dojson.cli:pretty_json_dump'
],
'dojson.cli.rule': [
'marc21tojson = rero_ils.modules.documents.dojson'
7 changes: 5 additions & 2 deletions tests/unit/test_documents_dojson.py
Original file line number Diff line number Diff line change
@@ -1663,6 +1663,7 @@ def test_get_mef_person_link(mock_get, capsys):
}
})
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)A003945843',
key='100..',
value={'0': '(RERO)A003945843'}
@@ -1671,6 +1672,7 @@ def test_get_mef_person_link(mock_get, capsys):

os.environ['RERO_ILS_MEF_HOST'] = 'mefdev.test.rero.ch'
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)A003945843',
key='100..',
value={'0': '(RERO)A003945843'}
@@ -1679,11 +1681,12 @@ def test_get_mef_person_link(mock_get, capsys):

mock_get.return_value = mock_response(status=400)
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)AXXXXXXXXX',
key='100..',
value={'0': '(RERO)AAXXXXXXXXX'}
)
assert not mef_url
out, err = capsys.readouterr()
assert err == 'ERROR: MEF request ' +\
'https://mefdev.test.rero.ch/api/mef/?q=rero.pid:AXXXXXXXXX 400\n'
assert err == "ERROR MEF REQUEST:\t1\t" +\
'https://mefdev.test.rero.ch/api/mef/?q=rero.pid:AXXXXXXXXX\t400\n'

0 comments on commit 3e62499

Please sign in to comment.