Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

schema for Enhancement #30

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,9 @@ A Makefile has been created to ease this process. The available targets are the

Airflow UI will be running on localhost:8080.
More details about Airflow installation and running can be found [here](https://airflow.apache.org/docs/apache-airflow/stable/start/local.html)

## Schemas

1. Parser schema - describes the structure of the output of the publisher parser, which is the input for the generic parser
2. Generic parser schema - describes the structure of the output of the generic parser, which is the input for the enhancer
3. Enhancer schema - describes the structure of the output of the enhancer
84 changes: 84 additions & 0 deletions dags/schemas/enhancement_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from marshmallow import Schema, fields


class Abstracts(Schema):
    """Schema for a single item of the ``abstracts`` list.

    Both string fields are mandatory; each one reports its own message
    when the key is missing from the input.
    """

    value = fields.String(
        required=True,
        error_messages={"required": "Value in abstracts is required"},
    )
    source = fields.String(
        required=True,
        error_messages={"required": "Source in abstracts is required"},
    )


class AquisitionSource(Schema):
    """Schema for the nested ``acquisition_source`` object.

    Every field is mandatory and carries a dedicated "required" message.
    """

    # NOTE(review): the class name and the messages below spell
    # "aquisition"; they are kept as-is because both are visible to callers.
    source = fields.String(
        required=True,
        error_messages={
            "required": "Source in aquisition source is required",
        },
    )
    method = fields.String(
        required=True,
        error_messages={
            "required": "Method in aquisition source is required",
        },
    )
    # Full date-time value, unlike the date-only field used for imprints.
    date = fields.DateTime(
        required=True,
        error_messages={
            "required": "Date in aquisition source is required",
        },
    )
    submission_number = fields.String(
        required=True,
        error_messages={
            "required": "Submission number in aquisition source is required",
        },
    )


class CopyRight(Schema):
    """Schema for a single item of the ``copyright`` list.

    All four fields are mandatory; ``year`` is the only non-string one.
    """

    holder = fields.String(
        required=True,
        error_messages={
            "required": "Holder in copy right source is required",
        },
    )
    year = fields.Integer(
        required=True,
        error_messages={
            "required": "Year in copy right source is required",
        },
    )
    statement = fields.String(
        required=True,
        error_messages={
            "required": "Statement in copy right source is required",
        },
    )
    material = fields.String(
        required=True,
        error_messages={
            "required": "Material in copy right source is required",
        },
    )


class Imprints(Schema):
    """Schema for a single item of the ``imprints`` list.

    Both fields are mandatory. Each one reports a message naming the
    field that is actually missing.
    """

    # Date-only value (no time component).
    date = fields.Date(
        required=True,
        error_messages={"required": "Date in imprints source is required"},
    )
    # The message previously said "Date ..." (copy-paste slip), which pointed
    # users at the wrong key when ``publisher`` was the one missing.
    publisher = fields.String(
        required=True,
        error_messages={
            "required": "Publisher in imprints source is required",
        },
    )


class Titles(Schema):
    """Schema for a single item of the ``titles`` list.

    All three string fields are mandatory; each reports a message naming
    the missing field.
    """

    # Message fixed: it previously read "Tile in titles source is required".
    title = fields.String(
        required=True,
        error_messages={"required": "Title in titles source is required"},
    )
    # Message fixed: it previously referred to a "subtitle source", unlike
    # the other fields of this schema, which say "titles source".
    subtitle = fields.String(
        required=True,
        error_messages={"required": "Subtitle in titles source is required"},
    )
    source = fields.String(
        required=True,
        error_messages={"required": "Source in titles source is required"},
    )


class EnhancementSchema(Schema):
    """Top-level schema for an enhanced record.

    Every key is mandatory. List-valued keys validate each element against
    the corresponding nested schema.
    """

    abstracts = fields.List(
        fields.Nested(Abstracts()),
        required=True,
    )
    # Single nested object, not a list.
    acquisition_source = fields.Nested(
        AquisitionSource(),
        required=True,
    )
    copyright = fields.List(
        fields.Nested(CopyRight()),
        required=True,
    )
    imprints = fields.List(
        fields.Nested(Imprints()),
        required=True,
    )
    record_creation_date = fields.DateTime(required=True)
    titles = fields.List(
        fields.Nested(Titles()),
        required=True,
    )
57 changes: 57 additions & 0 deletions dags/schemas/generic_parser_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from marshmallow import Schema, fields
from schemas.parser_schema import Affiliations, Author, License, ValueDict


class ClassificationNumber(Schema):
    """Schema for a single item of ``classification_numbers``.

    Both string fields are mandatory.
    """

    classification_number = fields.String(required=True)
    standard = fields.String(required=True)


class Collection(Schema):
    """Schema for a single item of ``collections``: one mandatory string."""

    primary = fields.String(required=True)


class FreeKeyword(Schema):
    """Schema for a single item of ``free_keywords``.

    Both string fields are mandatory.
    """

    source = fields.String(required=True)
    value = fields.String(required=True)


class ThesisSupervisor(Schema):
    """Schema for a single item of ``thesis_supervisor``.

    Requires a list of affiliations (validated with the parser schema's
    ``Affiliations``) and a full name.
    """

    affiliations = fields.List(
        fields.Nested(Affiliations()),
        required=True,
    )
    full_name = fields.String(required=True)


class PublicationInfo(Schema):
    """Schema for a single item of ``publication_info``.

    Every field is mandatory. All are strings except ``year``.
    """

    artid = fields.String(required=True)
    journal_issue = fields.String(required=True)
    journal_title = fields.String(required=True)
    journal_volume = fields.String(required=True)
    material = fields.String(required=True)
    # Page boundaries are declared as strings, not integers.
    page_end = fields.String(required=True)
    page_start = fields.String(required=True)
    year = fields.Integer(required=True)


class GenericParserSchema(Schema):
    """Top-level schema for the output of the generic parser.

    Every key is mandatory. List-valued keys validate each element against
    the nested schema (or scalar field) given as the list's inner type.
    """

    abstract = fields.String(required=True)
    arxiv_eprints = fields.List(fields.Nested(ValueDict()), required=True)
    authors = fields.List(fields.Nested(Author()), required=True)
    classification_numbers = fields.List(
        fields.Nested(ClassificationNumber()), required=True
    )
    collaborations = fields.List(fields.Nested(ValueDict()), required=True)
    collections = fields.List(fields.Nested(Collection()), required=True)
    control_field = fields.String(required=True)
    copyright_holder = fields.String(required=True)
    copyright_year = fields.String(required=True)
    date_published = fields.Date(required=True)
    dois = fields.List(fields.Nested(ValueDict()), required=True)
    free_keywords = fields.List(fields.Nested(FreeKeyword()), required=True)
    license = fields.List(fields.Nested(License()), required=True)
    local_files = fields.List(fields.Nested(ValueDict()), required=True)
    # ``required`` belongs on the List itself. On the inner Int it is never
    # consulted (list elements cannot be "missing"), which left ``page_nr``
    # as the only optional key of this schema.
    page_nr = fields.List(fields.Integer(), required=True)
    publication_info = fields.List(
        fields.Nested(PublicationInfo()), required=True
    )
    thesis = fields.String(required=True)
    thesis_supervisor = fields.List(
        fields.Nested(ThesisSupervisor()), required=True
    )
    title = fields.String(required=True)
    urls = fields.List(fields.Nested(ValueDict()), required=True)
57 changes: 57 additions & 0 deletions dags/schemas/parser_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from marshmallow import Schema, fields


class ValueDict(Schema):
    """Schema for an object holding a single mandatory ``value`` string."""

    value = fields.String(required=True)


class Affiliations(Schema):
    """Schema for a single affiliation entry.

    All three string fields are mandatory.
    """

    value = fields.String(required=True)
    organization = fields.String(required=True)
    country = fields.String(required=True)


class Author(Schema):
    """Schema for a single author entry.

    The name and e-mail strings are mandatory; ``affiliations`` is the only
    key that may be omitted.
    """

    surname = fields.String(required=True)
    given_names = fields.String(required=True)
    email = fields.String(required=True)
    # NOTE(review): not marked required, unlike every other field here —
    # confirm this is intentional.
    affiliations = fields.List(
        fields.Nested(Affiliations()),
    )
    full_name = fields.String(required=True)


class License(Schema):
    """Schema for a single license entry.

    Both string fields are mandatory.
    """

    license = fields.String(required=True)
    url = fields.String(required=True)


class ParserSchema(Schema):
    """Top-level schema for the output of a publisher parser.

    Every key is mandatory. Most list-valued keys hold plain strings or
    integers; ``arxiv_eprints``, ``authors``, ``license`` and
    ``thesis_supervisor`` hold nested objects.
    """

    journal_doctype = fields.String(required=True)
    dois = fields.List(fields.String(), required=True)
    arxiv_eprints = fields.List(
        fields.Nested(ValueDict()),
        required=True,
    )
    page_nr = fields.List(fields.Integer(), required=True)
    abstract = fields.String(required=True)
    title = fields.String(required=True)
    classification_numbers = fields.List(fields.String(), required=True)
    authors = fields.List(
        fields.Nested(Author()),
        required=True,
    )
    collaborations = fields.List(fields.String(), required=True)

    # Journal-related keys.
    journal_title = fields.String(required=True)
    journal_issue = fields.String(required=True)
    journal_volume = fields.String(required=True)
    journal_artid = fields.String(required=True)
    journal_fpage = fields.String(required=True)
    journal_lpage = fields.String(required=True)
    journal_year = fields.Integer(required=True)

    date_published = fields.Date(required=True)
    related_article_doi = fields.List(fields.String(), required=True)
    copyright_holder = fields.String(required=True)
    # NOTE(review): open question — should the copyright year really be a
    # string? ``journal_year`` above is an integer.
    copyright_year = fields.String(required=True)
    license = fields.List(
        fields.Nested(License()),
        required=True,
    )
    collections = fields.List(fields.String(), required=True)
    control_field = fields.List(fields.String(), required=True)
    free_keywords = fields.List(fields.String(), required=True)
    # NOTE(review): open question — is a thesis supervisor really shaped
    # like an author?
    thesis_supervisor = fields.List(
        fields.Nested(Author()),
        required=True,
    )
    thesis = fields.List(fields.String(), required=True)
    urls = fields.List(fields.String(), required=True)
    local_files = fields.List(fields.String(), required=True)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ PyYAML==6.0
furl==2.1.3
structlog==21.5.0
bleach==4.1.0
marshmallow==3.15.0
20 changes: 20 additions & 0 deletions tests/units/schemas/data/test_enchancement_schema/correct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"abstracts": [{ "value": "this is abstracts", "source": "Springer" }],
"acquisition_source": {
"source": "Springer",
"method": "Springer",
"date": "2022-05-20T00:00:00",
"submission_number": "path/to/the/file"
},
"copyright": [
{
"holder": "copyright_holder",
"year": "2020",
"statement": "copyright_statement",
"material": "copyright_material"
}
],
"imprints": [{ "date": "2022-05-20", "publisher": "Springer" }],
"record_creation_date": "2022-05-20T00:00:00",
"titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"acquisition_source": {
"source": "Springer",
"method": "Springer",
"date": "2022-05-20T00:00:00",
"submission_number": "path/to/the/file"
},
"copyright": [
{
"holder": "copyright_holder",
"year": "2020",
"statement": "copyright_statement",
"material": "copyright_material"
}
],
"record_creation_date": "2022-05-20T00:00:00",
"titles": [{ "title": "title", "subtitle": "subtitle", "source": "Springer" }]
}
108 changes: 108 additions & 0 deletions tests/units/schemas/data/test_generic_parser_schema/correct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"abstract": "Test abstract",
"arxiv_eprints": [
{
"value": "Test Eprint"
}
],
"authors": [
{
"affiliations": [
{
"country": "Test country",
"organization": "Test org",
"value": "Test affiliation"
}
],
"email": "test@email.com",
"full_name": "Test Surname, Test names",
"given_names": "Test names",
"surname": "Test Surname"
}
],
"classification_numbers": [
{
"classification_number": "Test classification 1",
"standard": "PACS"
},
{
"classification_number": "Test classification 2",
"standard": "PACS"
}
],
"collaborations": [
{
"value": "Test collaboration"
}
],
"collections": [
{
"primary": "Test Collection"
}
],
"control_field": "Test control field",
"copyright_holder": "Test Copyright",
"copyright_year": "2019",
"date_published": "2019-02-06",
"dois": [
{
"value": "Test dois"
},
{
"value": "Test related article doi"
}
],
"free_keywords": [
{
"source": "author",
"value": "Test free 1"
},
{
"source": "author",
"value": "Test free 2"
}
],
"license": [
{
"license": "CC-BY-4.0",
"url": "https://creativecommons.org/licenses//by/4.0"
}
],
"local_files": [
{
"value": "Test local file"
}
],
"page_nr": [45],
"publication_info": [
{
"artid": "Test art-id",
"journal_issue": "2",
"journal_title": "Test title",
"journal_volume": "79",
"material": "article",
"page_end": "45",
"page_start": "1",
"year": 2019
}
],
"thesis": "Test thesis",
"thesis_supervisor": [
{
"affiliations": [
{
"country": "Test country",
"organization": "Test org",
"value": "Test affiliation"
}
],
"full_name": "Test Surname, Test names"
}
],
"title": "Test title",
"urls": [
{
"value": "test.com"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading