Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Views for Macrostrat schema and column-builder application #100

Merged
merged 9 commits into from
Nov 7, 2024
3 changes: 3 additions & 0 deletions cli/macrostrat/cli/database/migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ def run_migrations(
if name is not None and name == _name:
break

# Notify PostgREST to reload the schema cache
db.run_sql("NOTIFY pgrst, 'reload schema';")


def migration_has_been_run(*names: str):
db = get_db()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
Drop all views so they can be recreated.
TODO: automate this process.
*/
DROP VIEW IF EXISTS macrostrat_api.projects;
DROP VIEW IF EXISTS macrostrat_api.cols;
DROP VIEW IF EXISTS macrostrat_api.col_groups;
DROP VIEW IF EXISTS macrostrat_api.environs;
DROP VIEW IF EXISTS macrostrat_api.liths;
DROP VIEW IF EXISTS macrostrat_api.intervals;
DROP VIEW IF EXISTS macrostrat_api.timescales;
DROP VIEW IF EXISTS macrostrat_api.strat_tree;
DROP VIEW IF EXISTS macrostrat_api.refs;
DROP VIEW IF EXISTS macrostrat_api.units;
DROP VIEW IF EXISTS macrostrat_api.col_refs;
DROP VIEW IF EXISTS macrostrat_api.unit_environs;
DROP VIEW IF EXISTS macrostrat_api.unit_liths;
DROP VIEW IF EXISTS macrostrat_api.sections;
DROP VIEW IF EXISTS macrostrat_api.strat_names;
DROP VIEW IF EXISTS macrostrat_api.unit_strat_names;
DROP VIEW IF EXISTS macrostrat_api.units_strat_names;
DROP VIEW IF EXISTS macrostrat_api.strat_names_ref;
DROP VIEW IF EXISTS macrostrat_api.col_group_with_cols;
DROP VIEW IF EXISTS macrostrat_api.environ_unit;
DROP VIEW IF EXISTS macrostrat_api.econ_unit;
DROP VIEW IF EXISTS macrostrat_api.lith_attr_unit;
DROP VIEW IF EXISTS macrostrat_api.lith_unit;
DROP VIEW IF EXISTS macrostrat_api.unit_strat_name_expanded;
DROP VIEW IF EXISTS macrostrat_api.col_sections;
DROP VIEW IF EXISTS macrostrat_api.col_ref_expanded;
DROP VIEW IF EXISTS macrostrat_api.strat_names_meta;
DROP VIEW IF EXISTS macrostrat_api.unit_boundaries;
DROP VIEW IF EXISTS macrostrat_api.col_section_data;
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ ALTER TABLE macrostrat.col_refs
ADD CONSTRAINT col_refs_ref_fk FOREIGN KEY (ref_id) REFERENCES macrostrat.refs(id) ON DELETE CASCADE;

/* I had to make the id the primary key of intervals first before adding the foreign key */
ALTER TABLE macrostrat.intervals
ADD PRIMARY KEY (id);
ALTER TABLE macrostrat.intervals ADD PRIMARY KEY (id);

/* no issues
col_notes were not perserved, in mariaDB this is a separte table.
Expand Down Expand Up @@ -59,6 +58,8 @@ ALTER TABLE macrostrat.unit_econs
/*
23769 rows updated to have null ref_ids instead of 0...
*/
-- pg_loader apparently adds NOT NULL constraints in some cases
ALTER TABLE macrostrat.unit_environs ALTER COLUMN ref_id DROP NOT NULL;
UPDATE macrostrat.unit_environs
SET ref_id = NULL
WHERE ref_id = 0;
Expand All @@ -77,7 +78,8 @@ ALTER TABLE macrostrat.unit_liths
ADD CONSTRAINT unit_liths_units_fk FOREIGN KEY (unit_id) REFERENCES macrostrat.units(id) ON DELETE CASCADE;

/* no issues */
ALTER TABLE macrostrat.unit_lith_atts
-- now, we have renamed the table unit_lith_atts -> unit_liths_atts
ALTER TABLE macrostrat.unit_liths_atts
ADD CONSTRAINT unit_liths_atts_unit_liths_fk FOREIGN KEY (unit_lith_id) REFERENCES macrostrat.unit_liths(id) ON DELETE CASCADE,
ADD CONSTRAINT unit_liths_atts_lith_atts_fk FOREIGN KEY (lith_att_id) REFERENCES macrostrat.lith_atts(id) ON DELETE CASCADE;

Expand Down Expand Up @@ -108,11 +110,15 @@ ALTER TABLE macrostrat.unit_strat_names
ADD CONSTRAINT unit_strat_names_units_fk FOREIGN KEY (unit_id) REFERENCES macrostrat.units(id) ON DELETE CASCADE,
ADD CONSTRAINT unit_strat_names_strat_names_fk FOREIGN KEY (strat_name_id) REFERENCES macrostrat.strat_names(id) ON DELETE CASCADE;

-- Remove the NOT NULL constraint on concept_id
ALTER TABLE macrostrat.strat_names ALTER COLUMN concept_id DROP NOT NULL;
UPDATE macrostrat.strat_names
SET concept_id = NULL
WHERE concept_id = 0;

/* BREAKS!!! there is a non-null constraint on ref_id.. but 0 means none so. */
-- Remove the NOT NULL constraint on ref_id
ALTER TABLE macrostrat.strat_names ALTER COLUMN ref_id DROP NOT NULL;
UPDATE macrostrat.strat_names
SET ref_id = NULL
WHERE ref_id = 0;
Expand All @@ -123,6 +129,13 @@ WHERE sn.concept_id NOT IN (SELECT concept_id FROM macrostrat.strat_names_meta);
ALTER TABLE macrostrat.strat_names
ADD CONSTRAINT strat_names_strat_names_meta_fk FOREIGN KEY (concept_id) REFERENCES macrostrat.strat_names_meta(concept_id) ON DELETE CASCADE;

-- There are 6,000+ strat_name_meta entries with interval_id = 0

ALTER TABLE macrostrat.strat_names_meta ALTER COLUMN interval_id DROP NOT NULL;
UPDATE macrostrat.strat_names_meta
SET interval_id = NULL
WHERE interval_id = 0;

ALTER TABLE macrostrat.strat_names_meta
ADD CONSTRAINT strat_names_meta_intervals_fk FOREIGN KEY(interval_id) REFERENCES macrostrat.intervals(id) ON DELETE CASCADE,
ADD CONSTRAINT strat_names_meta_refs_fk FOREIGN KEY(ref_id) REFERENCES macrostrat.refs(id) ON DELETE CASCADE;
Expand All @@ -141,30 +154,109 @@ ALTER TABLE macrostrat.strat_names_places
DELETE FROM macrostrat.timescales_intervals
WHERE interval_id NOT IN (SELECT id from macrostrat.intervals);

/* I had to make the id the primary key of intervals first before adding the foreign key */
ALTER TABLE macrostrat.intervals
ADD PRIMARY KEY (id);

ALTER TABLE macrostrat.timescales_intervals
ADD CONSTRAINT timescales_intervals_timescales_fk FOREIGN KEY (timescale_id) REFERENCES macrostrat.timescales(id) ON DELETE CASCADE,
ADD CONSTRAINT timescales_intervals_intervals_fk FOREIGN KEY (interval_id) REFERENCES macrostrat.intervals(id) ON DELETE CASCADE;

/* 2 rows deleted for a col_id of 0,
one was a `test_delete_me`
the other was Lane Shale, unit_id 42143
*/
the other was Lane Shale, unit_id 42143 */
DELETE FROM macrostrat.units
WHERE col_id NOT IN (SELECT id FROM macrostrat.cols);
WHERE col_id NOT IN (SELECT id FROM macrostrat.cols)
-- Ensure that a maximum of two units are deleted, just for sanity.
AND ((SELECT count(*) FROM macrostrat.units WHERE col_id = 0) <= 2);


-- Create fk constraints on units_sections table
-- Delete bad units_sections record
DELETE FROM macrostrat.units_sections WHERE unit_id NOT IN (SELECT id FROM macrostrat.units);

ALTER TABLE macrostrat.units_sections
ADD CONSTRAINT units_sections_units_fk FOREIGN KEY (unit_id) REFERENCES macrostrat.units(id) ON DELETE CASCADE,
ADD CONSTRAINT units_sections_sections_fk FOREIGN KEY (section_id) REFERENCES macrostrat.sections(id) ON DELETE CASCADE;


/* Some units are not tied to the correct sections... */
/** New addition 2024-11-06
We want to keep the NOT NULL constraint on section_id (at least for now)
so we have to reconstruct sections in a few cases.

UPDATE macrostrat.units
set section_id = NULL
where section_id not in (select id from macrostrat.sections);
- 17 units exist with section_id = 0
- 941 units exist with a section_id that is not a valid section,
however all but 5 of those (all from New Zealand)
have a valid section_id in the units_sections table.

SELECT * FROM macrostrat.units
WHERE section_id NOT IN (select id from macrostrat.sections)
AND section_id NOT IN (SELECT id FROM macrostrat.sections);
*/

/** Add sections to contain a few stray units from New Zealand */
WITH units AS (
SELECT id, col_id, fo, lo, fo_h, lo_h
FROM macrostrat.units
WHERE id NOT IN (SELECT unit_id FROM macrostrat.units_sections)
AND section_id NOT IN (SELECT id FROM macrostrat.sections)
LIMIT 5 -- there should be only 5 units that match this criteria
), new_sections AS (
INSERT INTO macrostrat.sections (col_id, fo, lo, fo_h, lo_h)
SELECT
col_id, fo, lo, fo_h, lo_h
FROM units
GROUP BY col_id, fo, lo, fo_h, lo_h
RETURNING col_id, id section_id
), new_units_sections AS (
INSERT INTO macrostrat.units_sections (unit_id, section_id, col_id)
SELECT u.id, ns.section_id, ns.col_id
FROM units u
JOIN new_sections ns
ON u.col_id = ns.col_id
RETURNING unit_id, section_id, col_id
)
-- Update legacy unit.section_id relationships to mirror the macrostrat.units_sections links
UPDATE macrostrat.units u
SET section_id = us.section_id
FROM new_units_sections us
WHERE u.id = us.unit_id
AND u.col_id = us.col_id;


/** Update legacy section_id field for cases where it references a non-existent section.
TODO: we may want to delete this legacy field if it isn't needed.
*/
UPDATE macrostrat.units u
SET section_id = us.section_id
FROM macrostrat.units_sections us
WHERE u.id = us.unit_id
AND u.col_id = us.col_id
AND u.section_id NOT IN (SELECT id FROM macrostrat.sections);

/** Only a few units that are totally unlinked to sections
SELECT * FROM macrostrat.units
WHERE id NOT IN (SELECT unit_id FROM macrostrat.units_sections)
AND section_id NOT IN (SELECT id FROM macrostrat.sections);

-- fo, lo, fo_h and lo_h seem to match in all cases
*/


/* Reconstruct section_id field from units_sections table.
TODO: the col_id and section_id fields in the "units_columns" table
are the 'master' version of the link, and the others are around for
legacy purposes.
*/
UPDATE macrostrat.units u
SET section_id = us.section_id,
col_id = us.col_id
FROM macrostrat.units_sections us
WHERE u.id = us.unit_id
AND u.section_id NOT IN (SELECT id FROM macrostrat.sections);

ALTER TABLE macrostrat.units
ADD CONSTRAINT units_cols_fk FOREIGN KEY (col_id) REFERENCES macrostrat.cols(id) ON DELETE CASCADE,
ADD CONSTRAINT units_sections_fk FOREIGN KEY (section_id) REFERENCES macrostrat.sections(id) ON DELETE CASCADE,
ADD CONSTRAINT units_intervals_fo_fk FOREIGN KEY (fo) REFERENCES macrostrat.intervals(id) ON DELETE CASCADE,
ADD CONSTRAINT units_intervals_lo_fk FOREIGN KEY (lo) REFERENCES macrostrat.intervals(id) ON DELETE CASCADE;
ADD CONSTRAINT units_intervals_fo_fk FOREIGN KEY (fo) REFERENCES macrostrat.intervals(id) ON DELETE RESTRICT,
ADD CONSTRAINT units_intervals_lo_fk FOREIGN KEY (lo) REFERENCES macrostrat.intervals(id) ON DELETE RESTRICT;

ALTER TABLE macrostrat.sections
ADD CONSTRAINT sections_cols_fk FOREIGN KEY (col_id) REFERENCES macrostrat.cols(id) ON DELETE CASCADE;
Expand Down Expand Up @@ -198,6 +290,7 @@ SET project_id = c.project_id
FROM macrostrat.cols c
WHERE c.col_group_id = cg.id;


--
-- /* unit_boundaries table, needs a unit_id and ref_id fk
-- lots of 0's in the unit_id row... not sure why
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/** Useful queries to assess database structure for the Macrostrat schema */

/** Find units that are replicated elsewhere... */
SELECT * FROM macrostrat.units
WHERE id NOT IN (SELECT unit_id FROM macrostrat.units_sections)
AND section_id NOT IN (SELECT id FROM macrostrat.sections);

/** Find units in multiple sections */
SELECT * FROM macrostrat.units
WHERE id IN (SELECT unit_id FROM macrostrat.units_sections
GROUP BY unit_id
HAVING COUNT(unit_id) > 1);



/** Find units_sections entries correlated to units direct links to sections */
SELECT u.id, u.section_id, us.section_id, u.col_id, us.col_id
FROM macrostrat.units u
JOIN macrostrat.units_sections us
ON u.id = us.unit_id
WHERE u.section_id != us.section_id
OR u.col_id != us.col_id;

/** 258 units are in more than one section/column
TODO: we should decide if this will be allowed (i.e., do we put a unique constraint in units_sections?)
*/

SELECT unit_id, array_agg(col_id), array_agg(section_id) FROM macrostrat.units_sections
GROUP BY unit_id
HAVING COUNT(unit_id) > 1;

-- Find units that reference an invalid section
SELECT * FROM macrostrat.units
WHERE section_id NOT IN (SELECT id FROM macrostrat.sections);


Loading
Loading