Skip to content

Commit

Permalink
Schemas
Browse files Browse the repository at this point in the history
* Added schemas for publisher (Springer) parser
* Added schemas for generic parser
* Added schemas for enhancer output
* Added tests
* Added note about schemas in README
* ref:  cern-sis/issues-scoap3#65
  • Loading branch information
ErnestaP committed May 30, 2022
1 parent 8bcb144 commit 2e8b63f
Show file tree
Hide file tree
Showing 17 changed files with 711 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,9 @@ A Makefile has been created to ease this process. The available targets are the

Airflow UI will be running on localhost:8080.
More details about Airflow installation and running can be found [here](https://airflow.apache.org/docs/apache-airflow/stable/start/local.html)

## Schemas

1. Parser schema - describes the structure of the output of publisher parser, and input for generic parser
2. Generic parser schema - describes the structure of the output of generic parser, and input for enhancer
3. Enhancer schema - describes the structure of the output of enhancer
84 changes: 84 additions & 0 deletions dags/schemas/enhancement_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from marshmallow import Schema, fields


class Abstracts(Schema):
    """Schema for a single abstract entry.

    Both ``value`` and ``source`` are mandatory strings; each carries its
    own custom "required" error message.
    """

    value = fields.String(
        required=True,
        error_messages=dict(required="Value in abstracts is required"),
    )
    source = fields.String(
        required=True,
        error_messages=dict(required="Source in abstracts is required"),
    )


class AquisitionSource(Schema):
    """Schema for the acquisition-source object of an enhanced record.

    All four fields are mandatory: ``source``, ``method`` and
    ``submission_number`` are strings, ``date`` is a datetime.
    """

    source = fields.String(
        required=True,
        error_messages=dict(required="Source in aquisition source is required"),
    )
    method = fields.String(
        required=True,
        error_messages=dict(required="Method in aquisition source is required"),
    )
    date = fields.DateTime(
        required=True,
        error_messages=dict(required="Date in aquisition source is required"),
    )
    submission_number = fields.String(
        required=True,
        error_messages=dict(
            required="Submission number in aquisition source is required"
        ),
    )


class CopyRight(Schema):
    """Schema for a single copyright entry.

    ``holder``, ``statement`` and ``material`` are mandatory strings;
    ``year`` is a mandatory integer.
    """

    holder = fields.String(
        required=True,
        error_messages=dict(required="Holder in copy right source is required"),
    )
    year = fields.Integer(
        required=True,
        error_messages=dict(required="Year in copy right source is required"),
    )
    statement = fields.String(
        required=True,
        error_messages=dict(required="Statement in copy right source is required"),
    )
    material = fields.String(
        required=True,
        error_messages=dict(required="Material in copy right source is required"),
    )


class Imprints(Schema):
    """Schema for a single imprint entry.

    ``date`` is a mandatory date and ``publisher`` a mandatory string.
    Each field reports its own name when it is missing.
    """

    date = fields.Date(
        required=True,
        error_messages={"required": "Date in imprints source is required"},
    )
    publisher = fields.Str(
        required=True,
        # Previously this repeated the "Date ..." message (copy-paste slip),
        # which pointed users at the wrong missing field.
        error_messages={"required": "Publisher in imprints source is required"},
    )


class Titles(Schema):
    """Schema for a single title entry.

    ``title``, ``subtitle`` and ``source`` are all mandatory strings; each
    has a custom "required" message naming the missing field.
    """

    title = fields.Str(
        required=True,
        # Fixed typo: the message used to read "Tile in titles source ...".
        error_messages={"required": "Title in titles source is required"},
    )
    subtitle = fields.Str(
        required=True,
        # Fixed section name: the message used to say "subtitle source",
        # unlike its sibling fields which refer to the "titles source".
        error_messages={"required": "Subtitle in titles source is required"},
    )
    source = fields.Str(
        required=True,
        error_messages={"required": "Source in titles source is required"},
    )


class EnhancementSchema(Schema):
    """Top-level schema for the enhancer output.

    Every field is mandatory. List-valued fields hold nested entry schemas;
    ``acquisition_source`` is a single nested object.
    """

    # Lists of nested entries.
    abstracts = fields.List(
        fields.Nested(Abstracts()),
        required=True,
    )
    # Single nested object (not a list).
    acquisition_source = fields.Nested(
        AquisitionSource(),
        required=True,
    )
    copyright = fields.List(
        fields.Nested(CopyRight()),
        required=True,
    )
    imprints = fields.List(
        fields.Nested(Imprints()),
        required=True,
    )
    record_creation_date = fields.DateTime(required=True)
    titles = fields.List(
        fields.Nested(Titles()),
        required=True,
    )
57 changes: 57 additions & 0 deletions dags/schemas/generic_parser_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from marshmallow import Schema, fields
from schemas.parser_schema import Affiliations, Author, License, ValueDict


class ClassificationNumber(Schema):
    """Schema for one classification entry: a number plus its standard."""

    classification_number = fields.String(required=True)
    standard = fields.String(required=True)


class Collection(Schema):
    """Schema for one collection entry with a mandatory ``primary`` string."""

    primary = fields.String(required=True)


class FreeKeyword(Schema):
    """Schema for one free keyword: mandatory ``source`` and ``value``."""

    source = fields.String(required=True)
    value = fields.String(required=True)


class ThesisSupervisor(Schema):
    """Schema for one thesis supervisor.

    Requires a list of nested ``Affiliations`` entries and a ``full_name``.
    """

    affiliations = fields.List(
        fields.Nested(Affiliations()),
        required=True,
    )
    full_name = fields.String(required=True)


class PublicationInfo(Schema):
    """Schema for one publication-info entry.

    All fields are mandatory. Everything is a string except ``year``,
    which is an integer.
    """

    artid = fields.String(required=True)
    journal_issue = fields.String(required=True)
    journal_title = fields.String(required=True)
    journal_volume = fields.String(required=True)
    material = fields.String(required=True)
    page_end = fields.String(required=True)
    page_start = fields.String(required=True)
    year = fields.Integer(required=True)


class GenericParserSchema(Schema):
    """Top-level schema for the generic parser output.

    Every field is mandatory. Scalar metadata are plain strings (or a date
    for ``date_published``); most collections are lists of small nested
    schemas such as ``ValueDict``, ``Author`` or ``PublicationInfo``.
    """

    abstract = fields.Str(required=True)
    arxiv_eprints = fields.List(fields.Nested(ValueDict()), required=True)
    authors = fields.List(fields.Nested(Author()), required=True)
    classification_numbers = fields.List(
        fields.Nested(ClassificationNumber()), required=True
    )
    collaborations = fields.List(fields.Nested(ValueDict()), required=True)
    collections = fields.List(fields.Nested(Collection()), required=True)
    control_field = fields.Str(required=True)
    copyright_holder = fields.Str(required=True)
    copyright_year = fields.Str(required=True)
    date_published = fields.Date(required=True)
    dois = fields.List(fields.Nested(ValueDict()), required=True)
    free_keywords = fields.List(fields.Nested(FreeKeyword()), required=True)
    license = fields.List(fields.Nested(License()), required=True)
    local_files = fields.List(fields.Nested(ValueDict()), required=True)
    # `required` belongs on the List itself: on the inner Int it never fires
    # (list elements are never "missing"), which silently made page_nr
    # optional, unlike every other field in this schema.
    page_nr = fields.List(fields.Int(), required=True)
    publication_info = fields.List(fields.Nested(PublicationInfo()), required=True)
    thesis = fields.Str(required=True)
    thesis_supervisor = fields.List(fields.Nested(ThesisSupervisor()), required=True)
    title = fields.Str(required=True)
    urls = fields.List(fields.Nested(ValueDict()), required=True)
57 changes: 57 additions & 0 deletions dags/schemas/parser_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from marshmallow import Schema, fields


class ValueDict(Schema):
    """Schema for a one-key object holding a mandatory ``value`` string."""

    value = fields.String(required=True)


class Affiliations(Schema):
    """Schema for one affiliation: ``value``, ``organization``, ``country``.

    All three fields are mandatory strings.
    """

    value = fields.String(required=True)
    organization = fields.String(required=True)
    country = fields.String(required=True)


class Author(Schema):
    """Schema for one author entry.

    Name parts, e-mail and ``full_name`` are mandatory strings;
    ``affiliations`` is a list of nested ``Affiliations`` entries.
    """

    surname = fields.String(required=True)
    given_names = fields.String(required=True)
    email = fields.String(required=True)
    # NOTE(review): the only field here not marked required -- confirm
    # that optional affiliations are intentional.
    affiliations = fields.List(
        fields.Nested(Affiliations()),
    )
    full_name = fields.String(required=True)


class License(Schema):
    """Schema for one license entry: mandatory ``license`` and ``url``."""

    license = fields.String(required=True)
    url = fields.String(required=True)


class ParserSchema(Schema):
    """Top-level schema for the publisher parser output.

    Every field is mandatory. Most multi-valued fields are flat lists of
    strings; ``arxiv_eprints``, ``authors``, ``license`` and
    ``thesis_supervisor`` are lists of nested schemas.
    """

    journal_doctype = fields.String(required=True)
    dois = fields.List(fields.String(), required=True)
    arxiv_eprints = fields.List(
        fields.Nested(ValueDict()),
        required=True,
    )
    page_nr = fields.List(fields.Integer(), required=True)
    abstract = fields.String(required=True)
    title = fields.String(required=True)
    classification_numbers = fields.List(fields.String(), required=True)
    authors = fields.List(
        fields.Nested(Author()),
        required=True,
    )
    collaborations = fields.List(fields.String(), required=True)

    # Journal / publication details.
    journal_title = fields.String(required=True)
    journal_issue = fields.String(required=True)
    journal_volume = fields.String(required=True)
    journal_artid = fields.String(required=True)
    journal_fpage = fields.String(required=True)
    journal_lpage = fields.String(required=True)
    journal_year = fields.Integer(required=True)
    date_published = fields.Date(required=True)
    related_article_doi = fields.List(fields.String(), required=True)

    # Rights information.
    copyright_holder = fields.String(required=True)
    # NOTE(review): open question from the author -- is the copyright year
    # really a string?
    copyright_year = fields.String(required=True)
    license = fields.List(
        fields.Nested(License()),
        required=True,
    )

    collections = fields.List(fields.String(), required=True)
    control_field = fields.List(fields.String(), required=True)
    free_keywords = fields.List(fields.String(), required=True)
    # NOTE(review): open question from the author -- is a thesis supervisor
    # really shaped the same as an author?
    thesis_supervisor = fields.List(
        fields.Nested(Author()),
        required=True,
    )
    thesis = fields.List(fields.String(), required=True)
    urls = fields.List(fields.String(), required=True)
    local_files = fields.List(fields.String(), required=True)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ PyYAML==6.0
furl==2.1.3
structlog==21.5.0
bleach==4.1.0
marshmallow==3.15.0
20 changes: 20 additions & 0 deletions tests/units/schemas/data/test_enchancement_schema/correct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"abstracts": [{ "value": "this is abstracts", "source": "Springer" }],
"acquisition_source": {
"source": "Springer",
"method": "Springer",
"date": "2022-05-20T00:00:00",
"submission_number": "path/to/the/file"
},
"copyright": [
{
"holder": "copyright_holder",
"year": "2020",
"statement": "copyright_statement",
"material": "copyright_material"
}
],
"imprints": [{ "date": "2022-05-20", "publisher": "Springer" }],
"record_creation_date": "2022-05-20T00:00:00",
"titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"acquisition_source": {
"source": "Springer",
"method": "Springer",
"date": "2022-05-20T00:00:00",
"submission_number": "path/to/the/file"
},
"copyright": [
{
"holder": "copyright_holder",
"year": "2020",
"statement": "copyright_statement",
"material": "copyright_material"
}
],
"record_creation_date": "2022-05-20T00:00:00",
"titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }]
}
108 changes: 108 additions & 0 deletions tests/units/schemas/data/test_generic_parser_schema/correct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"abstract": "Test abstract",
"arxiv_eprints": [
{
"value": "Test Eprint"
}
],
"authors": [
{
"affiliations": [
{
"country": "Test country",
"organization": "Test org",
"value": "Test affiliation"
}
],
"email": "test@email.com",
"full_name": "Test Surname, Test names",
"given_names": "Test names",
"surname": "Test Surname"
}
],
"classification_numbers": [
{
"classification_number": "Test classification 1",
"standard": "PACS"
},
{
"classification_number": "Test classification 2",
"standard": "PACS"
}
],
"collaborations": [
{
"value": "Test collaboration"
}
],
"collections": [
{
"primary": "Test Collection"
}
],
"control_field": "Test control field",
"copyright_holder": "Test Copyright",
"copyright_year": "2019",
"date_published": "2019-02-06",
"dois": [
{
"value": "Test dois"
},
{
"value": "Test related article doi"
}
],
"free_keywords": [
{
"source": "author",
"value": "Test free 1"
},
{
"source": "author",
"value": "Test free 2"
}
],
"license": [
{
"license": "CC-BY-4.0",
"url": "https://creativecommons.org/licenses//by/4.0"
}
],
"local_files": [
{
"value": "Test local file"
}
],
"page_nr": [45],
"publication_info": [
{
"artid": "Test art-id",
"journal_issue": "2",
"journal_title": "Test title",
"journal_volume": "79",
"material": "article",
"page_end": "45",
"page_start": "1",
"year": 2019
}
],
"thesis": "Test thesis",
"thesis_supervisor": [
{
"affiliations": [
{
"country": "Test country",
"organization": "Test org",
"value": "Test affiliation"
}
],
"full_name": "Test Surname, Test names"
}
],
"title": "Test title",
"urls": [
{
"value": "test.com"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading

0 comments on commit 2e8b63f

Please sign in to comment.