Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
metadata: fix dojson for virtua records
Browse files Browse the repository at this point in the history
* Fixes dojson transformation errors.
* Adds data path parameter to setup script.
* Adds all languages to documents json schema and form.
* Adds marc21tojson utils cli for parallel dojson transformation.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep and rerowep committed Oct 21, 2019
1 parent f9d91d2 commit 3e62499
Showing 15 changed files with 6,837 additions and 1,679 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@ include docker/postgres/Dockerfile
include Dockerfile
include scripts/bootstrap
include scripts/console
include scripts/dojson_virtua
include scripts/server
include scripts/setup
include scripts/update
2,386 changes: 1,193 additions & 1,193 deletions data/documents_big.xml

Large diffs are not rendered by default.

572 changes: 286 additions & 286 deletions data/documents_small.xml

Large diffs are not rendered by default.

76 changes: 55 additions & 21 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
@@ -18,11 +18,26 @@
"""Dojson utils."""

import re
import sys
import traceback

import click
from dojson import Overdo, utils


def not_repetitive(bibid, key, value, subfield, default=None):
    """Get the first value if the value is a list or tuple.

    MARC subfields that must not be repeated sometimes arrive repeated
    (as a list or tuple).  In that case a warning with the bib id, the
    field key and the raw value is printed to stderr and only the first
    occurrence is kept.

    :param bibid: bibliographic record id, used in the warning message
    :param key: MARC field key (tag + indicators), used in the warning
    :param value: dict-like MARC field containing the subfields
    :param subfield: subfield code to read from ``value``
    :param default: value returned when the subfield is missing
    :returns: the (first) subfield value, or ``default``/``None``
    """
    if default is None:
        data = value.get(subfield)
    else:
        data = value.get(subfield, default)
    # Fix: the original tested ``(dict, tuple)``; a dict cannot be
    # indexed with ``[0]`` and the documented repeated case is a list.
    if isinstance(data, (list, tuple)):
        print('WARNING NOT REPETITIVE:', bibid, key, subfield, value,
              sep='\t', file=sys.stderr)
        data = data[0]
    return data


def remove_trailing_punctuation(
data,
punctuation=',',
@@ -121,6 +136,7 @@ class ReroIlsMarc21Overdo(ReroIlsOverdo):
This class adds RERO Marc21 properties and functions to the ReroIlsOverdo.
"""

bib_id = ''
field_008_data = ''
lang_from_008 = None
date1_from_008 = None
@@ -134,29 +150,43 @@ def __init__(self, bases=None, entry_point_group=None):
"""Reroilsmarc21overdo init."""
super(ReroIlsMarc21Overdo, self).__init__(
bases=bases, entry_point_group=entry_point_group)
self.count = 0

def do(self, blob, ignore_missing=True, exception_handlers=None):
"""Translate blob values and instantiate new model instance."""
self.blob_record = blob
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
)
self.count += 1
# print('INFO:', blob.get('001', '???'), self.count, file=sys.stderr)
result = None
try:
self.blob_record = blob
try:
self.bib_id = self.get_fields(tag='001')[0]['data']
except Exception as err:
self.bib_id = '???'
import sys
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
)
except Exception as err:
print('ERROR:', self.bib_id, self.count, err,
sep='\t', file=sys.stderr)
traceback.print_exc()
return result

def get_link_data(self, subfields_6_data):
@@ -185,7 +215,11 @@ def init_country(self):
field_044 = fields_044[0]
cantons_codes = self.get_subfields(field_044, 'c')
for cantons_codes in self.get_subfields(field_044, 'c'):
self.cantons.append(cantons_codes.split('-')[1])
try:
self.cantons.append(cantons_codes.split('-')[1])
except:
print('ERROR INIT CANTONS:', self.bib_id, cantons_codes,
sep='\t', file=sys.stderr)
if self.cantons:
self.country = 'sz'
else:
238 changes: 231 additions & 7 deletions rero_ils/modules/cli.py
Original file line number Diff line number Diff line change
@@ -20,16 +20,21 @@
from __future__ import absolute_import, print_function

import difflib
import gc
import json
import logging
import multiprocessing
import os
import sys
import time
from collections import OrderedDict
from glob import glob
from json import loads

import click
import jsonref
import yaml
from dojson.contrib.marc21.utils import create_record, split_stream
from flask import current_app
from flask.cli import with_appcontext
from flask_security.confirmable import confirm_user
@@ -41,9 +46,11 @@
from invenio_search.proxies import current_search
from jsonschema import validate
from jsonschema.exceptions import ValidationError
from lxml import etree
from pkg_resources import resource_string
from werkzeug.local import LocalProxy

from .documents.dojson.contrib.marc21tojson import marc21tojson
from .items.cli import create_items, reindex_items
from .loans.cli import create_loans
from .patrons.cli import import_users
@@ -427,8 +434,10 @@ def test_license(file, extension, copyrights, license_lines, verbose):
@click.argument('type', default='documents')
@click.argument('schema', default='document-v0.0.1.json')
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-s', '--save', 'savefile', type=click.File('w'), default=None)
def check_validate(jsonfile, type, schema, verbose, savefile):
@click.option('-e', '--error_file', 'error_file', type=click.File('w'),
default=None)
@click.option('-o', '--ok_file', 'ok_file', type=click.File('w'), default=None)
def check_validate(jsonfile, type, schema, verbose, error_file, ok_file):
"""Check record validation."""
click.secho('Testing json schema for file', fg='green')
schema_in_bytes = resource_string(
@@ -438,7 +447,6 @@ def check_validate(jsonfile, type, schema, verbose, savefile):
schema=schema
)
)

schema = loads(schema_in_bytes.decode('utf8'))
datas = json.load(jsonfile)
count = 0
@@ -454,13 +462,17 @@ def check_validate(jsonfile, type, schema, verbose, savefile):
data["pid"] = 'dummy'
try:
validate(data, schema)
except ValidationError as excp:
if savefile:
savefile.write(json.dumps(data, indent=2))
if ok_file:
if data["pid"] == 'dummy':
del data["pid"]
ok_file.write(json.dumps(data, indent=2))
except ValidationError as err:
if error_file:
error_file.write(json.dumps(data, indent=2))
click.secho(
'Error validate in record: {count}'.format(count=count),
fg='red')
click.secho(str(excp))
click.secho(str(err))


@utils.command('compile_json')
@@ -475,3 +487,215 @@ def compile_json(src_jsonfile, output, verbose):
if not output:
output = sys.stdout
json.dump(data, fp=output, indent=2)


def do_worker(marc21records, results, pid_requierd):
    """Worker for marc21 to json transformation."""
    # Load the documents JSON schema once per worker process.
    raw_schema = resource_string(
        'rero_ils.modules.documents.jsonschemas',
        'documents/document-v0.0.1.json'
    )
    doc_schema = loads(raw_schema.decode('utf8'))
    for marc21record in marc21records:
        marc_json = marc21record['json']
        pid = marc_json.get('001', '???')
        try:
            converted = marc21tojson.do(marc_json)
            # Inject placeholder values so schema validation can pass
            # for records that do not carry them yet.
            if not converted.get("$schema"):
                converted["$schema"] = 'dummy'
            if not pid_requierd and not converted.get("pid"):
                converted["pid"] = 'dummy'
            validate(converted, doc_schema)
            # Remove the placeholders again before reporting success.
            if converted["$schema"] == 'dummy':
                del converted["$schema"]
            if not pid_requierd and converted["pid"] == 'dummy':
                del converted["pid"]
            results.append({
                'status': True,
                'data': converted
            })
        except Exception as err:
            # Shorten the (possibly multi-line) error message: keep only
            # non-indented lines and truncate very long ones.
            short_lines = []
            for part in str(err).split('\n'):
                if part and part[0] != ' ':
                    if len(part) > 100:
                        part = part[:25] + ' ...'
                    short_lines.append(part)
            print('ERROR:', pid, '\t'.join(short_lines),
                  sep='\t', file=sys.stderr)
            # import traceback
            # traceback.print_exc()
            results.append({
                'pid': pid,
                'status': False,
                'data': marc21record['xml']
            })


class Marc21toJson():
    """Class for Marc21 records to Json transformation.

    Reads a marcxml input file, groups the records into chunks and
    converts them to JSON in parallel worker processes (``do_worker``).
    Valid records are appended to ``json_file_ok`` as a JSON array;
    records failing the dojson transformation or the schema validation
    are written back as marcxml to ``xml_file_error``.
    """

    # Counters (class defaults; shadowed by instance attributes at runtime).
    count = 0      # total number of records read from the input file
    count_ok = 0   # records transformed and validated successfully
    count_ko = 0   # records with transformation/validation errors

    def __init__(self, xml_file, json_file_ok, xml_file_error,
                 parallel=8, chunk=5000,
                 verbose=False, debug=False, pid_requierd=False):
        """Store the configuration and run the transformation.

        :param xml_file: marcxml input file (opened binary)
        :param json_file_ok: JSON output file for valid records (text)
        :param xml_file_error: marcxml output for bad records (binary)
        :param parallel: maximum number of concurrent worker processes
        :param chunk: number of records handed to each worker
        :param verbose: print progress information
        :param debug: enable multiprocessing debug logging
        :param pid_requierd: if True, records must provide their own pid
            (name kept as-is — callers pass it by keyword)
        """
        self.xml_file = xml_file
        self.json_file_ok = json_file_ok
        self.xml_file_error = xml_file_error
        self.parallel = parallel
        self.chunk = chunk
        self.verbose = verbose
        if verbose:
            print('Main process pid:', multiprocessing.current_process().pid)
        self.debug = debug
        if debug:
            multiprocessing.log_to_stderr(logging.DEBUG)
        self.pid_requierd = pid_requierd
        # 'spawn' gives every worker a fresh interpreter state
        self.ctx = multiprocessing.get_context("spawn")
        manager = self.ctx.Manager()
        self.results = manager.list()
        self.process_records = {}
        self.process_records['new'] = {'records': []}
        # Fix: the original ended with ``return self.start()``, which raises
        # ``TypeError: __init__() should return None``.  The transformation
        # is run here instead and ``__iter__`` exposes the counters, so the
        # existing call site keeps working:
        # ``count, count_ok, count_ko = Marc21toJson(...)``.
        self.start()

    def __iter__(self):
        """Iterate over ``(count, count_ok, count_ko)`` for unpacking."""
        return iter((self.count, self.count_ok, self.count_ko))

    # get_index_of_smallest_pid could be used to order the results
    # but it slows down the transformation
    def get_index_of_smallest_pid(self):
        """Get the index of the result with the smallest numeric pid.

        Returns 0 as soon as a result without pid is encountered.
        """
        smallest_pid = float('inf')
        smallest_index = 0
        for index, result in enumerate(self.results):
            pid = result.get('data').get('pid')
            if not pid:
                return 0
            pid = int(pid)
            if pid < smallest_pid:
                smallest_pid = pid
                smallest_index = index
        return smallest_index

    def write_results(self):
        """Write results from multiprocess to file."""
        while self.results:
            # use for ordered pid results:
            # value = results.pop(get_index_of_smallest_pid(results))
            value = self.results.pop(0)
            status = value.get('status')
            data = value.get('data')
            if status:
                # Fix: write the separator *before* every record except the
                # first — the original appended ',' after each record,
                # leaving a trailing comma that made the output invalid JSON.
                if self.count_ok:
                    self.json_file_ok.write(',')
                self.count_ok += 1
                for line in json.dumps(data, indent=2).split('\n'):
                    self.json_file_ok.write('\n ' + line)
            else:
                self.count_ko += 1
                self.xml_file_error.write(data)
        # free memory from garbage collector
        gc.collect()

    def wait_free_process(self, parallel):
        """Wait until fewer than ``parallel`` process slots are used.

        ``process_records`` always contains the 'new' key, so passing 1
        waits for every running worker to finish.
        """
        while len(self.process_records) > parallel:
            for key in list(self.process_records.keys()):
                process = self.process_records[key].get('process')
                if process:
                    if not process.is_alive():
                        process.join()
                        # drop references so the chunk can be collected
                        self.process_records[key]['records'] = None
                        self.process_records[key]['process'] = None
                        del self.process_records[key]
            time.sleep(1)

    def start_new_process(self):
        """Start a new worker process for the pending 'new' records."""
        self.wait_free_process(self.parallel)
        # key the chunk by the running record count
        self.process_records[self.count] = self.process_records.pop('new')
        new_process = self.ctx.Process(
            target=do_worker,
            args=(self.process_records[self.count]['records'],
                  self.results, self.pid_requierd)
        )
        new_process.start()
        self.process_records[self.count]['process'] = new_process
        self.process_records['new'] = {'records': []}
        if self.verbose:
            if self.count < self.chunk:
                start = 1
            else:
                start = self.count - self.chunk + 1
            print('Start process pid:', new_process.pid, 'records:',
                  start, '..', self.count)

    def start(self):
        """Run the transformation and return the counters.

        Streams the marcxml input, dispatches chunks of ``self.chunk``
        records to worker processes and writes the collected results.

        :returns: tuple ``(count, count_ok, count_ko)``
        """
        self.json_file_ok.write('[')
        self.xml_file_error.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        self.xml_file_error.write(
            b'<collection xmlns="http://www.loc.gov/MARC21/slim">\n\n'
        )
        for marc21xml in split_stream(self.xml_file):
            marc21json_record = create_record(marc21xml)
            self.process_records['new']['records'].append({
                'json': marc21json_record,
                'xml': etree.tostring(
                    marc21xml,
                    pretty_print=True,
                    encoding='UTF-8').strip()
            })
            self.count += 1
            if len(self.process_records['new']['records']) % self.chunk == 0:
                self.write_results()
                self.start_new_process()
        # process the remaining records
        self.start_new_process()
        # wait for all processes to finish (only process_records['new'])
        self.wait_free_process(1)
        self.write_results()
        self.json_file_ok.write('\n]')
        self.xml_file_error.write(b'\n</collection>')
        return self.count, self.count_ok, self.count_ko


@utils.command('marc21tojson')
@click.argument('xml_file', type=click.File('rb'))
@click.argument('json_file_ok', type=click.File('w'))
@click.argument('xml_file_error', type=click.File('wb'))
@click.option('-p', '--parallel', 'parallel', default=8)
@click.option('-c', '--chunk', 'chunk', default=5000)
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-d', '--debug', 'debug', is_flag=True, default=False)
# NOTE(review): '--pidrequierd' is a typo for '--pidrequired'; kept as-is
# because renaming the flag would break existing invocations/scripts.
@click.option('-r', '--pidrequierd', 'pid_requierd', is_flag=True,
              default=False)
def marc21json(xml_file, json_file_ok, xml_file_error, parallel, chunk,
               verbose, debug, pid_requierd):
    """Convert xml file to json with dojson."""

    click.secho('Marc21 to Json transform: ', fg='green', nl=False)
    if pid_requierd and verbose:
        click.secho(' (validation tests pid) ', nl=False)
    click.secho(xml_file.name)

    # NOTE(review): this unpacks the value of the constructor expression
    # into three counters — verify that instantiating Marc21toJson really
    # yields an iterable of (count, count_ok, count_ko); ``__init__``
    # itself cannot return a value.
    count, count_ok, count_ko = Marc21toJson(xml_file, json_file_ok,
                                             xml_file_error,
                                             parallel, chunk,
                                             verbose, debug, pid_requierd)

    # Prints "Total records: <count>-<ok+ko>" (read minus errors check).
    click.secho('Total records: ', fg='green', nl=False)
    click.secho(str(count), nl=False)
    click.secho('-', nl=False)
    click.secho(str(count_ok + count_ko))

    click.secho('Records transformed: ', fg='green', nl=False)
    click.secho(str(count_ok))
    if count_ko:
        click.secho('Records with errors: ', fg='red', nl=False)
        click.secho(str(count_ko))
228 changes: 145 additions & 83 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

990 changes: 982 additions & 8 deletions rero_ils/modules/documents/jsonschemas/documents/document-v0.0.1.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,934 changes: 1,907 additions & 27 deletions rero_ils/modules/documents/jsonschemas/form_documents/document-v0.0.1.json

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions scripts/dojson_virtua
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2019 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Transform a Virtua marcxml file (first argument) to JSON with dojson
# and validate the result against the documents JSON schema.

NC='\033[0m'                   # Default color
COLORED='\033[1;97;44m'        # Bold + white + blue background

# Abort immediately if any command fails.
set -e

# Print a highlighted "[INFO]: <message>" line to stderr.
# NOTE(review): ${EMPHASIS} is never defined in this script — presumably
# expected from the environment; confirm or remove.
msg() {
  echo -e "${COLORED}${EMPHASIS}[INFO]${NC}${COLORED}: ${1}${NC}" 1>&2
}

# Run the dojson transformation; ${1%.*} strips the input file's
# extension to derive the .json output and .log error-output names.
msg Dojson ${1}
pipenv run dojson -i ${1} -l marcxml -d pjson do marc21tojson >${1%.*}.json 2>${1%.*}.log

# Validate the generated JSON: invalid records go to *_error.json,
# valid ones to *_ok.json.
msg Validate: ${1%.*}.json
pipenv run invenio utils validate ${1%.*}.json documents document-v0.0.1.json -e ${1%.*}_error.json -o ${1%.*}_ok.json
16 changes: 13 additions & 3 deletions scripts/setup
Original file line number Diff line number Diff line change
@@ -44,7 +44,7 @@ CREATE_ITEMS_HOLDINGS_BIG=false
STOP_EXECUTION=true

# options may be followed by one colon to indicate they have a required argument
if ! options=$(getopt -o dsb -l deployment,create_items_holdings_small,create_items_holdings_big -- "$@")
if ! options=$(getopt -o dsb -l deployment,create_items_holdings_small,create_items_holdings_big,data_path: -- "$@")
then
# something went wrong, getopt will put out an error message for us
exit 1
@@ -57,13 +57,20 @@ do
-s|--create_items_holdings_small) CREATE_ITEMS_HOLDINGS_SMALL=true ;;
-b|--create_items_holdings_big) CREATE_ITEMS_HOLDINGS_BIG=true ;;
-c|--continue) STOP_EXECUTION=false ;;
-D|--data_path) DATA_PATH=$2 ;;
(--) shift; break;;
(-*) display_error_message "$0: error - unrecognized option $1"; exit 1;;
(*) break;;
esac
shift
done


if [ ! -d $DATA_PATH ]; then
display_error_message "Error - data path does not exist: $DATA_PATH"
exit 1
fi

if $CREATE_ITEMS_HOLDINGS_SMALL && $CREATE_ITEMS_HOLDINGS_BIG
then
display_error_message "Error - chose option for 'small' or 'big' documents generation"
@@ -163,9 +170,12 @@ pipenv run invenio fixtures import_users ${DATA_PATH}/users.json -v

# # xml to json transformation for rero marcxml records
# pipenv run dojson -i ${DATA_PATH}/documents_big.xml -l marcxml -d pjson do marc21tojson > ${DATA_PATH}/documents_big.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json -s documents_big_errors.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json
# pipenv run invenio utils validate data/documents_big.json documents document-v0.0.1.json -e documents_big_errors.json -o documents_big_ok.json

# pipenv run dojson -i ${DATA_PATH}/documents_small.xml -l marcxml -d pjson do marc21tojson > ${DATA_PATH}/documents_small.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json -s documents_small_errors.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json
# pipenv run invenio utils validate data/documents_small.json documents document-v0.0.1.json -e documents_small_errors.json -o documents_small_errors.json

if $DEPLOYMENT
then
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -155,7 +155,7 @@ def run(self):
'head = rero_ils.dojson.cli:head',
],
'dojson.cli.dump': [
'pjson = rero_ils.dojson.cli:pretty_json_dump',
'pjson = rero_ils.dojson.cli:pretty_json_dump'
],
'dojson.cli.rule': [
'marc21tojson = rero_ils.modules.documents.dojson'
7 changes: 5 additions & 2 deletions tests/unit/test_documents_dojson.py
Original file line number Diff line number Diff line change
@@ -1663,6 +1663,7 @@ def test_get_mef_person_link(mock_get, capsys):
}
})
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)A003945843',
key='100..',
value={'0': '(RERO)A003945843'}
@@ -1671,6 +1672,7 @@ def test_get_mef_person_link(mock_get, capsys):

os.environ['RERO_ILS_MEF_HOST'] = 'mefdev.test.rero.ch'
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)A003945843',
key='100..',
value={'0': '(RERO)A003945843'}
@@ -1679,11 +1681,12 @@ def test_get_mef_person_link(mock_get, capsys):

mock_get.return_value = mock_response(status=400)
mef_url = get_mef_person_link(
bibid='1',
id='(RERO)AXXXXXXXXX',
key='100..',
value={'0': '(RERO)AAXXXXXXXXX'}
)
assert not mef_url
out, err = capsys.readouterr()
assert err == 'ERROR: MEF request ' +\
'https://mefdev.test.rero.ch/api/mef/?q=rero.pid:AXXXXXXXXX 400\n'
assert err == "ERROR MEF REQUEST:\t1\t" +\
'https://mefdev.test.rero.ch/api/mef/?q=rero.pid:AXXXXXXXXX\t400\n'

0 comments on commit 3e62499

Please sign in to comment.