From 2e8b63f4a512356e6647afa47c4850bd33511e25 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Mon, 30 May 2022 11:56:20 +0200 Subject: [PATCH] Schemas * Added schemas for publisher (Springer) parser * Added schemas for generic parser * Added schemas for enhancer output * Added tests * Added note about schemas in README * ref: cern-sis/issues-scoap3#65 --- README.md | 6 + dags/schemas/enhancement_schema.py | 84 ++++++++++++++ dags/schemas/generic_parser_schema.py | 57 +++++++++ dags/schemas/parser_schema.py | 57 +++++++++ requirements.txt | 1 + .../test_enchancement_schema/correct.json | 20 ++++ .../data/test_enchancement_schema/empty.json | 1 + .../missing_fields.json | 18 +++ .../test_generic_parser_schema/correct.json | 108 ++++++++++++++++++ .../test_generic_parser_schema/empty.json | 1 + .../missing_fields.json | 92 +++++++++++++++ .../data/test_parser_schema/correct.json | 67 +++++++++++ .../data/test_parser_schema/empty.json | 1 + .../test_parser_schema/missing_fields.json | 50 ++++++++ .../units/schemas/test_enhancement_schema.py | 38 ++++++ .../schemas/test_generic_parser_schema.py | 51 +++++++++ tests/units/schemas/test_parser_schema.py | 59 ++++++++++ 17 files changed, 711 insertions(+) create mode 100644 dags/schemas/enhancement_schema.py create mode 100644 dags/schemas/generic_parser_schema.py create mode 100644 dags/schemas/parser_schema.py create mode 100644 tests/units/schemas/data/test_enchancement_schema/correct.json create mode 100644 tests/units/schemas/data/test_enchancement_schema/empty.json create mode 100644 tests/units/schemas/data/test_enchancement_schema/missing_fields.json create mode 100644 tests/units/schemas/data/test_generic_parser_schema/correct.json create mode 100644 tests/units/schemas/data/test_generic_parser_schema/empty.json create mode 100644 tests/units/schemas/data/test_generic_parser_schema/missing_fields.json create mode 100644 tests/units/schemas/data/test_parser_schema/correct.json create mode 100644 
tests/units/schemas/data/test_parser_schema/empty.json create mode 100644 tests/units/schemas/data/test_parser_schema/missing_fields.json create mode 100644 tests/units/schemas/test_enhancement_schema.py create mode 100644 tests/units/schemas/test_generic_parser_schema.py create mode 100644 tests/units/schemas/test_parser_schema.py diff --git a/README.md b/README.md index 253a4a6a..074ac25e 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,9 @@ A Makefile has been created to ease this process. The available targets are the Airflow UI will be rinning on localhost:8080. More details about Airflow installation and running can be found [here](https://airflow.apache.org/docs/apache-airflow/stable/start/local.html) + +## Schemas + +1. Parser schema - describes the structure of the publisher parser's output, which is the input for the generic parser +2. Generic parser schema - describes the structure of the generic parser's output, which is the input for the enhancer +3. Enhancer schema - describes the structure of the enhancer's output diff --git a/dags/schemas/enhancement_schema.py b/dags/schemas/enhancement_schema.py new file mode 100644 index 00000000..174a534f --- /dev/null +++ b/dags/schemas/enhancement_schema.py @@ -0,0 +1,84 @@ +from marshmallow import Schema, fields + + +class Abstracts(Schema): + value = fields.Str( + required=True, error_messages={"required": "Value in abstracts is required"} + ) + source = fields.Str( + required=True, error_messages={"required": "Source in abstracts is required"} + ) + + +class AquisitionSource(Schema): + source = fields.Str( + required=True, + error_messages={"required": "Source in acquisition source is required"}, + ) + method = fields.Str( + required=True, + error_messages={"required": "Method in acquisition source is required"}, + ) + date = fields.DateTime( + required=True, + error_messages={"required": "Date in acquisition source is required"}, + ) + submission_number = fields.Str( + required=True, + error_messages={ + "required": "Submission 
number in acquisition source is required" + }, + ) + + +class CopyRight(Schema): + holder = fields.Str( + required=True, + error_messages={"required": "Holder in copyright source is required"}, + ) + year = fields.Int( + required=True, + error_messages={"required": "Year in copyright source is required"}, + ) + statement = fields.Str( + required=True, + error_messages={"required": "Statement in copyright source is required"}, + ) + material = fields.Str( + required=True, + error_messages={"required": "Material in copyright source is required"}, + ) + + +class Imprints(Schema): + date = fields.Date( + required=True, + error_messages={"required": "Date in imprints source is required"}, + ) + publisher = fields.Str( + required=True, + error_messages={"required": "Publisher in imprints source is required"}, + ) + + +class Titles(Schema): + title = fields.Str( + required=True, error_messages={"required": "Title in titles source is required"} + ) + subtitle = fields.Str( + required=True, + error_messages={"required": "Subtitle in titles source is required"}, + ) + source = fields.Str( + required=True, + error_messages={"required": "Source in titles source is required"}, + ) + + +class EnhancementSchema(Schema): + abstracts = fields.List(fields.Nested(Abstracts()), required=True) + acquisition_source = fields.Nested(AquisitionSource(), required=True) + copyright = fields.List(fields.Nested(CopyRight()), required=True) + imprints = fields.List(fields.Nested(Imprints()), required=True) + record_creation_date = fields.DateTime(required=True) + titles = fields.List(fields.Nested(Titles()), required=True) diff --git a/dags/schemas/generic_parser_schema.py b/dags/schemas/generic_parser_schema.py new file mode 100644 index 00000000..6b868bcf --- /dev/null +++ b/dags/schemas/generic_parser_schema.py @@ -0,0 +1,57 @@ +from marshmallow import Schema, fields +from schemas.parser_schema import Affiliations, Author, License, ValueDict + + +class ClassificationNumber(Schema): + 
classification_number = fields.Str(required=True) + standard = fields.Str(required=True) + + +class Collection(Schema): + primary = fields.Str(required=True) + + +class FreeKeyword(Schema): + source = fields.Str(required=True) + value = fields.Str(required=True) + + +class ThesisSupervisor(Schema): + affiliations = fields.List(fields.Nested(Affiliations()), required=True) + full_name = fields.Str(required=True) + + +class PublicationInfo(Schema): + artid = fields.Str(required=True) + journal_issue = fields.Str(required=True) + journal_title = fields.Str(required=True) + journal_volume = fields.Str(required=True) + material = fields.Str(required=True) + page_end = fields.Str(required=True) + page_start = fields.Str(required=True) + year = fields.Int(required=True) + + +class GenericParserSchema(Schema): + abstract = fields.Str(required=True) + arxiv_eprints = fields.List(fields.Nested(ValueDict()), required=True) + authors = fields.List(fields.Nested(Author()), required=True) + classification_numbers = fields.List( + fields.Nested(ClassificationNumber()), required=True + ) + collaborations = fields.List(fields.Nested(ValueDict()), required=True) + collections = fields.List(fields.Nested(Collection()), required=True) + control_field = fields.Str(required=True) + copyright_holder = fields.Str(required=True) + copyright_year = fields.Str(required=True) + date_published = fields.Date(required=True) + dois = fields.List(fields.Nested(ValueDict()), required=True) + free_keywords = fields.List(fields.Nested(FreeKeyword()), required=True) + license = fields.List(fields.Nested(License()), required=True) + local_files = fields.List(fields.Nested(ValueDict()), required=True) + page_nr = fields.List(fields.Int(required=True)) + publication_info = fields.List(fields.Nested(PublicationInfo()), required=True) + thesis = fields.Str(required=True) + thesis_supervisor = fields.List(fields.Nested(ThesisSupervisor()), required=True) + title = fields.Str(required=True) + urls = 
fields.List(fields.Nested(ValueDict()), required=True) diff --git a/dags/schemas/parser_schema.py b/dags/schemas/parser_schema.py new file mode 100644 index 00000000..908c9baf --- /dev/null +++ b/dags/schemas/parser_schema.py @@ -0,0 +1,57 @@ +from marshmallow import Schema, fields + + +class ValueDict(Schema): + value = fields.Str(required=True) + + +class Affiliations(Schema): + value = fields.Str(required=True) + organization = fields.Str(required=True) + country = fields.Str(required=True) + + +class Author(Schema): + surname = fields.Str(required=True) + given_names = fields.Str(required=True) + email = fields.Str(required=True) + affiliations = fields.List(fields.Nested(Affiliations())) + full_name = fields.Str(required=True) + + +class License(Schema): + license = fields.Str(required=True) + url = fields.Str(required=True) + + +class ParserSchema(Schema): + journal_doctype = fields.Str(required=True) + dois = fields.List(fields.Str(), required=True) + arxiv_eprints = fields.List(fields.Nested(ValueDict()), required=True) + page_nr = fields.List(fields.Int(), required=True) + abstract = fields.Str(required=True) + title = fields.Str(required=True) + classification_numbers = fields.List(fields.Str(), required=True) + authors = fields.List(fields.Nested(Author()), required=True) + collaborations = fields.List(fields.Str(), required=True) + journal_title = fields.Str(required=True) + journal_issue = fields.Str(required=True) + journal_volume = fields.Str(required=True) + journal_artid = fields.Str(required=True) + journal_fpage = fields.Str(required=True) + journal_lpage = fields.Str(required=True) + journal_year = fields.Int(required=True) + date_published = fields.Date(required=True) + related_article_doi = fields.List(fields.Str(), required=True) + copyright_holder = fields.Str(required=True) + # Really copy right year is a string? 
+ copyright_year = fields.Str(required=True) + license = fields.List(fields.Nested(License()), required=True) + collections = fields.List(fields.Str(), required=True) + control_field = fields.List(fields.Str(), required=True) + free_keywords = fields.List(fields.Str(), required=True) + # is thesis supervisor really the same as author? + thesis_supervisor = fields.List(fields.Nested(Author()), required=True) + thesis = fields.List(fields.Str(), required=True) + urls = fields.List(fields.Str(), required=True) + local_files = fields.List(fields.Str(), required=True) diff --git a/requirements.txt b/requirements.txt index 98460367..e66a7bb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ PyYAML==6.0 furl==2.1.3 structlog==21.5.0 bleach==4.1.0 +marshmallow==3.15.0 diff --git a/tests/units/schemas/data/test_enchancement_schema/correct.json b/tests/units/schemas/data/test_enchancement_schema/correct.json new file mode 100644 index 00000000..7d426ece --- /dev/null +++ b/tests/units/schemas/data/test_enchancement_schema/correct.json @@ -0,0 +1,20 @@ +{ + "abstracts": [{ "value": "this is abstracts", "source": "Springer" }], + "acquisition_source": { + "source": "Springer", + "method": "Springer", + "date": "2022-05-20T00:00:00", + "submission_number": "path/to/the/file" + }, + "copyright": [ + { + "holder": "copyright_holder", + "year": "2020", + "statement": "copyright_statement", + "material": "copyright_material" + } + ], + "imprints": [{ "date": "2022-05-20", "publisher": "Springer" }], + "record_creation_date": "2022-05-20T00:00:00", + "titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }] +} diff --git a/tests/units/schemas/data/test_enchancement_schema/empty.json b/tests/units/schemas/data/test_enchancement_schema/empty.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/tests/units/schemas/data/test_enchancement_schema/empty.json @@ -0,0 +1 @@ +{} diff --git 
a/tests/units/schemas/data/test_enchancement_schema/missing_fields.json b/tests/units/schemas/data/test_enchancement_schema/missing_fields.json new file mode 100644 index 00000000..cd56d547 --- /dev/null +++ b/tests/units/schemas/data/test_enchancement_schema/missing_fields.json @@ -0,0 +1,18 @@ +{ + "acquisition_source": { + "source": "Springer", + "method": "Springer", + "date": "2022-05-20T00:00:00", + "submission_number": "path/to/the/file" + }, + "copyright": [ + { + "holder": "copyright_holder", + "year": "2020", + "statement": "copyright_statement", + "material": "copyright_material" + } + ], + "record_creation_date": "2022-05-20T00:00:00", + "titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }] +} diff --git a/tests/units/schemas/data/test_generic_parser_schema/correct.json b/tests/units/schemas/data/test_generic_parser_schema/correct.json new file mode 100644 index 00000000..f3379cb0 --- /dev/null +++ b/tests/units/schemas/data/test_generic_parser_schema/correct.json @@ -0,0 +1,108 @@ +{ + "abstract": "Test abstract", + "arxiv_eprints": [ + { + "value": "Test Eprint" + } + ], + "authors": [ + { + "affiliations": [ + { + "country": "Test country", + "organization": "Test org", + "value": "Test affiliation" + } + ], + "email": "test@email.com", + "full_name": "Test Surname, Test names", + "given_names": "Test names", + "surname": "Test Surname" + } + ], + "classification_numbers": [ + { + "classification_number": "Test classification 1", + "standard": "PACS" + }, + { + "classification_number": "Test classification 2", + "standard": "PACS" + } + ], + "collaborations": [ + { + "value": "Test collaboration" + } + ], + "collections": [ + { + "primary": "Test Collection" + } + ], + "control_field": "Test control field", + "copyright_holder": "Test Copyright", + "copyright_year": "2019", + "date_published": "2019-02-06", + "dois": [ + { + "value": "Test dois" + }, + { + "value": "Test related article doi" + } + ], + "free_keywords": [ + { 
+ "source": "author", + "value": "Test free 1" + }, + { + "source": "author", + "value": "Test free 2" + } + ], + "license": [ + { + "license": "CC-BY-4.0", + "url": "https://creativecommons.org/licenses//by/4.0" + } + ], + "local_files": [ + { + "value": "Test local file" + } + ], + "page_nr": [45], + "publication_info": [ + { + "artid": "Test art-id", + "journal_issue": "2", + "journal_title": "Test title", + "journal_volume": "79", + "material": "article", + "page_end": "45", + "page_start": "1", + "year": 2019 + } + ], + "thesis": "Test thesis", + "thesis_supervisor": [ + { + "affiliations": [ + { + "country": "Test country", + "organization": "Test org", + "value": "Test affiliation" + } + ], + "full_name": "Test Surname, Test names" + } + ], + "title": "Test title", + "urls": [ + { + "value": "test.com" + } + ] +} diff --git a/tests/units/schemas/data/test_generic_parser_schema/empty.json b/tests/units/schemas/data/test_generic_parser_schema/empty.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/tests/units/schemas/data/test_generic_parser_schema/empty.json @@ -0,0 +1 @@ +{} diff --git a/tests/units/schemas/data/test_generic_parser_schema/missing_fields.json b/tests/units/schemas/data/test_generic_parser_schema/missing_fields.json new file mode 100644 index 00000000..6a8c1348 --- /dev/null +++ b/tests/units/schemas/data/test_generic_parser_schema/missing_fields.json @@ -0,0 +1,92 @@ +{ + "arxiv_eprints": [ + { + "value": "Test Eprint" + } + ], + "classification_numbers": [ + { + "classification_number": "Test classification 1", + "standard": "PACS" + }, + { + "classification_number": "Test classification 2", + "standard": "PACS" + } + ], + "collaborations": [ + { + "value": "Test collaboration" + } + ], + "collections": [ + { + "primary": "Test Collection" + } + ], + "control_field": "Test control field", + "copyright_holder": "Test Copyright", + "copyright_year": "2019", + "date_published": "2019-02-06", + "dois": [ + { + "value": "Test 
dois" + }, + { + "value": "Test related article doi" + } + ], + "free_keywords": [ + { + "source": "author", + "value": "Test free 1" + }, + { + "source": "author", + "value": "Test free 2" + } + ], + "license": [ + { + "license": "CC-BY-4.0", + "url": "https://creativecommons.org/licenses//by/4.0" + } + ], + "local_files": [ + { + "value": "Test local file" + } + ], + "page_nr": [45], + "publication_info": [ + { + "artid": "Test art-id", + "journal_issue": "2", + "journal_title": "Test title", + "journal_volume": "79", + "material": "article", + "page_end": "45", + "page_start": "1", + "year": 2019 + } + ], + "thesis": "Test thesis", + "thesis_supervisor": [ + { + "affiliations": [ + { + "country": "Test country", + "organization": "Test org", + "value": "Test affiliation" + } + ], + "full_name": "Test Surname, Test names" + } + ], + "title": "Test title", + "urls": [ + { + "value": "test.com" + } + ] +} diff --git a/tests/units/schemas/data/test_parser_schema/correct.json b/tests/units/schemas/data/test_parser_schema/correct.json new file mode 100644 index 00000000..f8484ed0 --- /dev/null +++ b/tests/units/schemas/data/test_parser_schema/correct.json @@ -0,0 +1,67 @@ +{ + "journal_doctype": "article", + "dois": ["Test dois"], + "arxiv_eprints": [ + { + "value": "Test Eprint" + } + ], + "page_nr": [45], + "abstract": "Test abstract", + "title": "Test title", + "classification_numbers": ["Test classification 1", "Test classification 2"], + "authors": [ + { + "surname": "Test Surname", + "given_names": "Test names", + "email": "test@email.com", + "affiliations": [ + { + "value": "Test affiliation", + "organization": "Test org", + "country": "Test country" + } + ], + "full_name": "Test, Name" + } + ], + "collaborations": ["Test collaboration"], + "journal_title": "Test title", + "journal_issue": "2", + "journal_volume": "79", + "journal_artid": "Test art-id", + "journal_fpage": "1", + "journal_lpage": "45", + "journal_year": 2019, + "date_published": "2019-02-06", + 
"related_article_doi": ["Test related article doi"], + "copyright_holder": "Test Copyright", + "copyright_year": "2019", + "license": [ + { + "license": "CC-BY-4.0", + "url": "https://creativecommons.org/licenses//by/4.0" + } + ], + "collections": ["Test Collection"], + "control_field": ["Test control field"], + "free_keywords": ["Test free 1", "Test free 2"], + "thesis_supervisor": [ + { + "surname": "Test Surname", + "given_names": "Test names", + "email": "test@email.com", + "affiliations": [ + { + "value": "Test affiliation", + "organization": "Test org", + "country": "Test country" + } + ], + "full_name": "Test, Name" + } + ], + "thesis": ["Test thesis", "Test other thesis"], + "urls": ["test.com"], + "local_files": ["Test local file"] +} diff --git a/tests/units/schemas/data/test_parser_schema/empty.json b/tests/units/schemas/data/test_parser_schema/empty.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/tests/units/schemas/data/test_parser_schema/empty.json @@ -0,0 +1 @@ +{} diff --git a/tests/units/schemas/data/test_parser_schema/missing_fields.json b/tests/units/schemas/data/test_parser_schema/missing_fields.json new file mode 100644 index 00000000..01bb435c --- /dev/null +++ b/tests/units/schemas/data/test_parser_schema/missing_fields.json @@ -0,0 +1,50 @@ +{ + "journal_doctype": "article", + "dois": ["Test dois"], + "arxiv_eprints": [ + { + "value": "Test Eprint" + } + ], + "page_nr": [45], + "classification_numbers": ["Test classification 1", "Test classification 2"], + "collaborations": ["Test collaboration"], + "journal_title": "Test title", + "journal_issue": "2", + "journal_volume": "79", + "journal_artid": "Test art-id", + "journal_fpage": "1", + "journal_lpage": "45", + "journal_year": 2019, + "date_published": "2019-02-06", + "related_article_doi": ["Test related article doi"], + "copyright_holder": "Test Copyright", + "copyright_year": "2019", + "license": [ + { + "license": "CC-BY-4.0", + "url": 
"https://creativecommons.org/licenses//by/4.0" + } + ], + "collections": ["Test Collection"], + "control_field": ["Test control field"], + "free_keywords": ["Test free 1", "Test free 2"], + "thesis_supervisor": [ + { + "surname": "Test Surname", + "given_names": "Test names", + "email": "test@email.com", + "affiliations": [ + { + "value": "Test affiliation", + "organization": "Test org", + "country": "Test country" + } + ], + "full_name": "Test, Name" + } + ], + "thesis": ["Test thesis", "Test other thesis"], + "urls": ["test.com"], + "local_files": ["Test local file"] +} diff --git a/tests/units/schemas/test_enhancement_schema.py b/tests/units/schemas/test_enhancement_schema.py new file mode 100644 index 00000000..73fcfbfe --- /dev/null +++ b/tests/units/schemas/test_enhancement_schema.py @@ -0,0 +1,38 @@ +import json + +import pytest +from schemas.enhancement_schema import EnhancementSchema + + +@pytest.mark.parametrize( + "file_name, expected", + [ + pytest.param( + "correct.json", {}, id="test_enhancement_schema_with_correct_input" + ), + pytest.param( + "missing_fields.json", + { + "abstracts": ["Missing data for required field."], + "imprints": ["Missing data for required field."], + }, + id="test_enhancement_schema_with_missing_fields", + ), + pytest.param( + "empty.json", + { + "abstracts": ["Missing data for required field."], + "acquisition_source": ["Missing data for required field."], + "copyright": ["Missing data for required field."], + "imprints": ["Missing data for required field."], + "record_creation_date": ["Missing data for required field."], + "titles": ["Missing data for required field."], + }, + id="test_enhancement_schema_with_empty_json", + ), + ], +) +def test_enhancement_schema(file_name, expected, shared_datadir): + file = (shared_datadir / "test_enchancement_schema" / file_name).read_text() + validation = EnhancementSchema().validate(json.loads(file)) + assert validation == expected diff --git 
a/tests/units/schemas/test_generic_parser_schema.py b/tests/units/schemas/test_generic_parser_schema.py new file mode 100644 index 00000000..95ed741f --- /dev/null +++ b/tests/units/schemas/test_generic_parser_schema.py @@ -0,0 +1,51 @@ +import json + +import pytest +from schemas.generic_parser_schema import GenericParserSchema + + +@pytest.mark.parametrize( + "file_name, expected", + [ + pytest.param( + "correct.json", {}, id="test_generic_parser_schema_with_correct_input" + ), + pytest.param( + "missing_fields.json", + { + "abstract": ["Missing data for required field."], + "authors": ["Missing data for required field."], + }, + id="test_generic_parser_schema_with_missing_fields", + ), + pytest.param( + "empty.json", + { + "abstract": ["Missing data for required field."], + "arxiv_eprints": ["Missing data for required field."], + "authors": ["Missing data for required field."], + "classification_numbers": ["Missing data for required field."], + "collaborations": ["Missing data for required field."], + "collections": ["Missing data for required field."], + "control_field": ["Missing data for required field."], + "copyright_holder": ["Missing data for required field."], + "copyright_year": ["Missing data for required field."], + "date_published": ["Missing data for required field."], + "dois": ["Missing data for required field."], + "free_keywords": ["Missing data for required field."], + "license": ["Missing data for required field."], + "local_files": ["Missing data for required field."], + "publication_info": ["Missing data for required field."], + "thesis": ["Missing data for required field."], + "thesis_supervisor": ["Missing data for required field."], + "title": ["Missing data for required field."], + "urls": ["Missing data for required field."], + }, + id="test_generic_parser_schema_with_empty_json", + ), + ], +) +def test_generic_parser_schema(file_name, expected, shared_datadir): + file = (shared_datadir / "test_generic_parser_schema" / 
file_name).read_text() + validation = GenericParserSchema().validate(json.loads(file)) + assert validation == expected diff --git a/tests/units/schemas/test_parser_schema.py b/tests/units/schemas/test_parser_schema.py new file mode 100644 index 00000000..9ab41315 --- /dev/null +++ b/tests/units/schemas/test_parser_schema.py @@ -0,0 +1,59 @@ +import json + +import pytest +from schemas.parser_schema import ParserSchema + + +@pytest.mark.parametrize( + "file_name, expected", + [ + pytest.param("correct.json", {}, id="test_parser_schema_with_correct_input"), + pytest.param( + "missing_fields.json", + { + "abstract": ["Missing data for required field."], + "title": ["Missing data for required field."], + "authors": ["Missing data for required field."], + }, + id="test_parser_schema_with_missing_fields", + ), + pytest.param( + "empty.json", + { + "journal_doctype": ["Missing data for required field."], + "dois": ["Missing data for required field."], + "arxiv_eprints": ["Missing data for required field."], + "page_nr": ["Missing data for required field."], + "abstract": ["Missing data for required field."], + "title": ["Missing data for required field."], + "classification_numbers": ["Missing data for required field."], + "authors": ["Missing data for required field."], + "collaborations": ["Missing data for required field."], + "journal_title": ["Missing data for required field."], + "journal_issue": ["Missing data for required field."], + "journal_volume": ["Missing data for required field."], + "journal_artid": ["Missing data for required field."], + "journal_fpage": ["Missing data for required field."], + "journal_lpage": ["Missing data for required field."], + "journal_year": ["Missing data for required field."], + "date_published": ["Missing data for required field."], + "related_article_doi": ["Missing data for required field."], + "copyright_holder": ["Missing data for required field."], + "copyright_year": ["Missing data for required field."], + "license": 
["Missing data for required field."], + "collections": ["Missing data for required field."], + "control_field": ["Missing data for required field."], + "free_keywords": ["Missing data for required field."], + "thesis_supervisor": ["Missing data for required field."], + "thesis": ["Missing data for required field."], + "urls": ["Missing data for required field."], + "local_files": ["Missing data for required field."], + }, + id="test_parser_schema_with_empty_obj", + ), + ], +) +def test_generic_parser_schema(file_name, expected, shared_datadir): + file = (shared_datadir / "test_parser_schema" / file_name).read_text() + validation = ParserSchema().validate(json.loads(file)) + assert validation == expected