From 848ab0690aee331dbee1bd4da16b7853acedd022 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 22 Nov 2022 14:18:11 -0500 Subject: [PATCH 1/6] Preliminary frictionlessdata datapackage to describe data exports --- datapackage.json | 374 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 datapackage.json diff --git a/datapackage.json b/datapackage.json new file mode 100644 index 000000000..68acc0742 --- /dev/null +++ b/datapackage.json @@ -0,0 +1,374 @@ +{ + "profile": "tabular-data-package", + "title": "Princeton Geniza Project metadata", + "resources": [ + { + "path": "data/documents.csv", + "name": "documents", + "profile": "tabular-data-resource", + "scheme": "file", + "format": "csv", + "hashing": "md5", + "encoding": "utf-8-sig", + "schema": { + "fields": [ + { + "type": "integer", + "name": "pgpid", + "constraints": { + "required": true + }, + "description": "PGP identifier" + }, + { + "type": "string", + "name": "url", + "description": "permalink; public view of this document" + }, + { + "type": "string", + "name": "iiif_urls", + "description": "URLs for one or more IIIF manifests, if available for associated fragments" + }, + { + "type": "string", + "name": "fragment_urls", + "description": "URLs to access associated fragments at owning library or museum site, if available" + }, + { + "type": "string", + "name": "shelfmark", + "constraints": { + "required": true + }, + "description": "shelfmark or combined shelfmark for the fragments this document appears on" + }, + { + "name": "multifragment", + "type": "any", + "description": "Identifier for fragment part, if any associated fragments are part of a multifragment" + }, + { + "type": "string", + "name": "side", + "description": "sides of associated fragments where this document appears" + }, + { + "type": "string", + "name": "region", + "description": "Label for region of fragment that document text occupies, for each associated fragment" + }, + { + "type": 
"string", + "name": "type", + "description": "type of document" + }, + { + "type": "string", + "name": "tags", + "description": "tags describing the document" + }, + { + "type": "string", + "name": "description", + "description": "text description summarizing the document" + }, + { + "type": "string", + "name": "scholarship_records", + "description": "Short citations for known scholarship about this document" + }, + { + "type": "string", + "name": "shelfmarks_historic", + "description": "Historic shelfmarks, if any, for associated fragments" + }, + { + "type": "string", + "name": "languages_primary", + "description": "Primary languages used in the text of this document" + }, + { + "type": "string", + "name": "languages_secondary", + "description": "Additional languages used in the text of this document" + }, + { + "name": "language_note", + "type": "string", + "description": "Additional information about the languages or scripts used in this document" + }, + { + "type": "string", + "name": "doc_date_original", + "description": "Date on document — date in original calendar" + }, + { + "type": "string", + "name": "doc_date_calendar", + "description": "Calendar for document date" + }, + { + "type": "string", + "name": "doc_date_standard", + "description": "Date on document, standardized - CE date (convert to Julian before 1582, Gregorian after 1582)" + }, + { + "type": "string", + "name": "notes", + "description": "temporary - empty admin field" + }, + { + "type": "string", + "name": "needs_review", + "description": "temporary - empty admin field" + }, + { + "type": "string", + "name": "url_admin", + "description": "temporary - empty admin field" + }, + { + "type": "datetime", + "name": "initial_entry", + "description": "Date document was added to PGP" + }, + { + "type": "datetime", + "name": "last_modified", + "description": "Date document was last modified in PGP" + }, + { + "type": "string", + "name": "input_by", + "description": "Researchers who contributed to the 
metadata for this document" + }, + { + "type": "string", + "name": "status", + "description": "temporary - empty admin field" + }, + { + "type": "string", + "name": "library", + "description": "Library location/ownership for associated fragments" + }, + { + "type": "string", + "name": "collection", + "description": "Library collection for associated fragments" + }, + { + "type": "string", + "name": "has_transcription", + "description": "Boolean indicator if document has transcription available", + "constraints": { + "required": true + } + }, + { + "type": "string", + "name": "has_translation", + "description": "Boolean indicator if document has translation available", + "constraints": { + "required": true + } + } + ] + } + }, + { + "path": "data/footnotes.csv", + "name": "footnotes", + "profile": "tabular-data-resource", + "scheme": "file", + "format": "csv", + "hashing": "md5", + "encoding": "utf-8-sig", + "schema": { + "fields": [ + { + "type": "string", + "name": "document" + }, + { + "type": "integer", + "name": "document_id" + }, + { + "type": "string", + "name": "source" + }, + { + "type": "string", + "name": "location" + }, + { + "type": "string", + "name": "doc_relation" + }, + { + "type": "string", + "name": "notes" + }, + { + "type": "string", + "name": "url" + }, + { + "type": "string", + "name": "content" + }, + { + "name": "admin_url", + "type": "any", + "description": "temporary - empty admin field" + } + ] + } + }, + { + "path": "data/fragments.csv", + "name": "fragments", + "profile": "tabular-data-resource", + "scheme": "file", + "format": "csv", + "hashing": "md5", + "encoding": "utf-8-sig", + "schema": { + "fields": [ + { + "type": "string", + "name": "shelfmark" + }, + { + "type": "string", + "name": "pgpids" + }, + { + "type": "string", + "name": "old_shelfmarks" + }, + { + "type": "string", + "name": "collection" + }, + { + "type": "string", + "name": "library" + }, + { + "type": "string", + "name": "library_abbrev" + }, + { + "type": 
"string", + "name": "collection_name" + }, + { + "type": "string", + "name": "collection_abbrev" + }, + { + "name": "url", + "type": "any" + }, + { + "name": "iiif_url", + "type": "any" + }, + { + "type": "string", + "name": "is_multifragment" + }, + { + "type": "datetime", + "name": "created" + }, + { + "type": "datetime", + "name": "last_modified" + } + ] + } + }, + { + "path": "data/sources.csv", + "name": "sources", + "profile": "tabular-data-resource", + "scheme": "file", + "format": "csv", + "hashing": "md5", + "encoding": "utf-8-sig", + "schema": { + "fields": [ + { + "type": "string", + "name": "source_type" + }, + { + "type": "string", + "name": "authors" + }, + { + "type": "string", + "name": "title" + }, + { + "type": "string", + "name": "journal_book" + }, + { + "type": "string", + "name": "volume" + }, + { + "type": "integer", + "name": "issue" + }, + { + "type": "integer", + "name": "year" + }, + { + "type": "string", + "name": "place_published" + }, + { + "type": "string", + "name": "publisher" + }, + { + "type": "integer", + "name": "edition" + }, + { + "type": "string", + "name": "other_info" + }, + { + "type": "string", + "name": "page_range" + }, + { + "type": "string", + "name": "languages" + }, + { + "type": "string", + "name": "url" + }, + { + "type": "string", + "name": "notes" + }, + { + "type": "integer", + "name": "num_footnotes" + } + ] + } + } + ], + "profile": "data-package" +} From 1971e9942dc91e3cb72cd769ae41376966949f3e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 22 Nov 2022 14:26:24 -0500 Subject: [PATCH 2/6] Document foreign key between footnotes and documents --- datapackage.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datapackage.json b/datapackage.json index 68acc0742..04b67a1da 100644 --- a/datapackage.json +++ b/datapackage.json @@ -225,7 +225,16 @@ "description": "temporary - empty admin field" } ] - } + }, + "foreignKeys": [ + { + "fields": "document_id", + "reference": { + 
"resource": "documents", + "fields": "pgpid" + } + } + ] }, { "path": "data/fragments.csv", From 1dd53e23bbdee5607c9cc4aac91fef15b0a9c29f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 22 Nov 2022 14:26:44 -0500 Subject: [PATCH 3/6] Add frictionless validation workflow --- .github/workflows/validation.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/validation.yml diff --git a/.github/workflows/validation.yml b/.github/workflows/validation.yml new file mode 100644 index 000000000..59eb937d9 --- /dev/null +++ b/.github/workflows/validation.yml @@ -0,0 +1,23 @@ +name: validation + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + + # Validate + + validate: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Validate data + uses: frictionlessdata/repository@v2 + with: + packages: "datapackage.json" From 88430bbb8f1f7caf35c54f9dd2c1c45b45354108 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 22 Nov 2022 14:30:30 -0500 Subject: [PATCH 4/6] Add validation workflow status badge to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b3f2d6413..f94b954a6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # PGP Metadata +[![validation](https://github.com/princetongenizalab/pgp-metadata/actions/workflows/validation.yml/badge.svg)](https://github.com/princetongenizalab/pgp-metadata/actions/workflows/validation.yml) + This repository is used to version and publish data from the [Princeton Geniza Project](https://geniza.princeton.edu/) (PGP). The data files are automatically exported and synchronized from the PGP database. Commits include co-author information where possible, to credit the researchers who are working on the data. Files in the data directory should _NOT_ be edited or modified directly. 
From d7b3d89232ded61ff811fcb6bbe4248e87a51937 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 22 Nov 2022 14:49:06 -0500 Subject: [PATCH 5/6] Additional validation and constraints --- datapackage.json | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/datapackage.json b/datapackage.json index 04b67a1da..9571a0222 100644 --- a/datapackage.json +++ b/datapackage.json @@ -23,6 +23,7 @@ { "type": "string", "name": "url", + "format": "uri", "description": "permalink; public view of this document" }, { @@ -159,22 +160,28 @@ "description": "Library collection for associated fragments" }, { - "type": "string", + "type": "boolean", "name": "has_transcription", "description": "Boolean indicator if document has transcription available", + "trueValues": ["Y"], + "falseValues": ["N"], "constraints": { "required": true } }, { - "type": "string", + "type": "boolean", "name": "has_translation", "description": "Boolean indicator if document has translation available", + "trueValues": ["Y"], + "falseValues": ["N"], "constraints": { "required": true } } - ] + ], + "primaryKey": "pgpid", + "missingValues": [""] } }, { @@ -213,7 +220,8 @@ }, { "type": "string", - "name": "url" + "name": "url", + "format": "uri" }, { "type": "string", @@ -226,6 +234,7 @@ } - ] + ], + "missingValues": [""] }, "foreignKeys": [ { "fields": "document_id", @@ -280,11 +289,13 @@ }, { "name": "url", - "type": "any" + "type": "string", + "format": "uri" }, { "name": "iiif_url", - "type": "any" + "type": "string", + "format": "uri" }, { "type": "string", @@ -298,7 +309,8 @@ "type": "datetime", "name": "last_modified" } - ] + ], + "missingValues": [""] } }, { @@ -365,7 +377,8 @@ }, { "type": "string", - "name": "url" + "name": "url", + "format": "uri" }, { "type": "string", @@ -376,7 +389,8 @@ "name": "num_footnotes" } - ] + ], + "missingValues": [""] } } ], "profile": "data-package" From c3ddb3abd86977f0d1a9b10419338d71de91f22b Mon Sep 17 00:00:00 2001 From:
rlskoeser Date: Tue, 22 Nov 2022 14:50:51 -0500 Subject: [PATCH 6/6] Remove validation run on push to main; add cron validation --- .github/workflows/validation.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/validation.yml b/.github/workflows/validation.yml index 59eb937d9..28cb1f911 100644 --- a/.github/workflows/validation.yml +++ b/.github/workflows/validation.yml @@ -1,12 +1,12 @@ name: validation on: - push: - branches: - - main pull_request: branches: - main + schedule: # run automatically on main branch each Tuesday at 16:00 UTC (11am EST) + - cron: "0 16 * * 2" + jobs: