Skip to content

Commit

Permalink
Update workspace evaluator to V2 and add more reports
Browse files Browse the repository at this point in the history
  • Loading branch information
akbog committed May 30, 2024
1 parent 9dcd8a3 commit 9f0d639
Show file tree
Hide file tree
Showing 24 changed files with 126 additions and 76 deletions.
17 changes: 7 additions & 10 deletions reports/classification/all_pii.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
-- Reports all PII classifiers
-- Note: Classifiers must start with "PII."

SELECT DISTINCT
t.table_ref,
c.column_name,
c.classifiers
FROM
sdf.information_schema.tables t
,
sdf.information_schema.columns c
SELECT
columns.table_id,
columns.column_name,
columns.classifiers
FROM
sdf.information_schema.columns
WHERE
t.table_ref = c.table_ref
AND c.classifiers LIKE '%PII.%'
cardinality(filter(classifiers, element -> contains(element, 'PII.'))) > 0;
Empty file.
11 changes: 11 additions & 0 deletions reports/classification/most_used_column_classifiers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- SDF STANDARD REPORT --
-- Reports the most used column classifiers in the workspace.
-- Each column's `classifiers` array is expanded to one row per classifier,
-- then the rows are counted per classifier and listed most-used first.

SELECT
    classifier,
    COUNT(*) AS usage_count
FROM sdf.information_schema.columns
CROSS JOIN UNNEST(classifiers) AS t(classifier)
GROUP BY classifier
-- The secondary sort key keeps classifiers with equal counts in a stable order.
ORDER BY usage_count DESC, classifier;
12 changes: 12 additions & 0 deletions reports/classification/most_used_table_classifiers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- SDF STANDARD REPORT --
-- Reports the most used table classifiers in the workspace.
-- Each table's `classifiers` array is expanded to one row per classifier,
-- then the rows are counted per classifier and listed most-used first.

SELECT
    classifier,
    COUNT(*) AS usage_count
FROM sdf.information_schema.tables
CROSS JOIN UNNEST(classifiers) AS t(classifier)
GROUP BY classifier
-- The secondary sort key keeps classifiers with equal counts in a stable order.
ORDER BY usage_count DESC, classifier;

4 changes: 0 additions & 4 deletions reports/classification/unique_classifiers.sql

This file was deleted.

6 changes: 6 additions & 0 deletions reports/classification/unique_column_classifiers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- SDF STANDARD REPORT --
-- Lists every distinct classifier attached to at least one column.
-- The per-column `classifiers` array is flattened to one row per element
-- before de-duplicating.

SELECT DISTINCT
    classifier
FROM
    sdf.information_schema.columns
    CROSS JOIN UNNEST(classifiers) AS expanded(classifier);
6 changes: 6 additions & 0 deletions reports/classification/unique_table_classifiers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- SDF STANDARD REPORT --
-- Lists every distinct classifier attached to at least one table.
-- The per-table `classifiers` array is flattened to one row per element
-- before de-duplicating.

SELECT DISTINCT
    classifier
FROM
    sdf.information_schema.tables
    CROSS JOIN UNNEST(classifiers) AS expanded(classifier);
3 changes: 1 addition & 2 deletions reports/classification/unused_classifiers.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
-- SDF STANDARD REPORT --
-- Reports all unused classifiers in the workspace

-- HARD TO GET WITH CURRENT INFO SCHEMA DESIGN
-- Requires some central table for all classifiers defined in workspace to be present
14 changes: 8 additions & 6 deletions reports/documentation/column_description_coverage.sql
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
-- SDF STANDARD REPORT --
-- Reports the percentage of columns that have descriptions


WITH columns_with_description AS (
SELECT
SELECT
1 as idx,
count(column_name) as c_desc
FROM sdf.information_schema.columns AS c
WHERE c.catalog_name != 'system'
WHERE c.table_purpose != 'system' AND c.table_purpose != 'external-system'
AND c.description IS NOT NULL
),
columns_with_no_description AS (
SELECT
1 as idx,
count(column_name) as c_no_desc
FROM sdf.information_schema.columns AS c
WHERE c.catalog_name != 'system'
WHERE c.table_purpose != 'system' AND c.table_purpose != 'external-system'
AND c.description IS NULL
)

SELECT
c_desc as cols_w_desc,
c_no_desc as cols_w_no_desc,
(CAST (c_desc AS REAL) / CAST(c_no_desc AS REAL)) as col_desc_pct,
CASE
    -- No columns at all: report full coverage instead of dividing by zero.
    WHEN c_desc + c_no_desc = 0 THEN 100
    -- Coverage is described columns over ALL columns (described + undescribed),
    -- so the result always lies between 0 and 100.
    ELSE (CAST(c_desc AS REAL) / CAST(c_desc + c_no_desc AS REAL)) * 100
END AS col_desc_pct,
c_desc + c_no_desc as total_cols
from columns_with_description cw join columns_with_no_description cd USING (idx)
from columns_with_description cw join columns_with_no_description cd USING (idx)
9 changes: 6 additions & 3 deletions reports/documentation/table_description_coverage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,24 @@ WITH tables_with_description AS (
1 as idx,
count(table_name) as t_desc
FROM sdf.information_schema.tables AS t
WHERE t.catalog_name != 'system'
WHERE t.purpose != 'system' AND t.purpose != 'external-system'
AND t.description IS NOT NULL
),
tables_with_no_description AS (
SELECT
1 as idx,
count(table_name) as t_no_desc
FROM sdf.information_schema.tables AS t
WHERE t.catalog_name != 'system'
WHERE t.purpose != 'system' AND t.purpose != 'external-system'
AND t.description IS NULL
)

SELECT
t_desc as tables_w_desc,
t_no_desc as tables_w_no_desc,
(CAST (t_desc AS REAL) / CAST(t_no_desc AS REAL)) as table_desc_pct,
CASE
    -- No tables at all: report full coverage instead of dividing by zero.
    WHEN t_desc + t_no_desc = 0 THEN 100
    -- Coverage is described tables over ALL tables (described + undescribed),
    -- so the result always lies between 0 and 100.
    ELSE (CAST(t_desc AS REAL) / CAST(t_desc + t_no_desc AS REAL)) * 100
END AS table_desc_pct,
t_desc + t_no_desc as total_tables
from tables_with_description tw join tables_with_no_description td USING (idx)
4 changes: 2 additions & 2 deletions reports/documentation/undocumented_columns.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
-- Reports all undocumented columns in the workspace

SELECT
table_ref,
table_id,
column_name
FROM sdf.information_schema.columns AS c
WHERE c.catalog_name != 'system'
WHERE c.table_purpose != 'system' AND c.table_purpose != 'external-system'
AND c.description IS NULL
4 changes: 2 additions & 2 deletions reports/documentation/undocumented_models.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
-- Reports all undocumented models in the workspace

SELECT
table_ref as table_ref
table_id
FROM sdf.information_schema.tables AS t
WHERE t.catalog_name != 'system'
WHERE t.purpose != 'system' AND t.purpose != 'external-system'
AND t.description IS NULL
39 changes: 17 additions & 22 deletions reports/global_optimization/dead_columns.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,21 @@
-- For all models in the workspace with both upstream & downstream dependencies,
-- find columns which are unused.

-- INEFFICIENT QUERY!!
WITH tables_with_lineage AS (
SELECT
tl.table_id
FROM
sdf.information_schema.tables tl
WHERE LENGTH_ARRAY(tl.depends_on) != 0
AND LENGTH_ARRAY(tl.depended_on_by) != 0
AND (tl.purpose != 'system' AND tl.purpose != 'external-system')
)


-- WITH prefiltered_tables AS (
-- SELECT * FROM sdf.information_schema.tables it
-- WHERE EXISTS (
-- SELECT 1
-- FROM sdf.information_schema.tables downstream_tables
-- WHERE downstream_tables.dependencies ILIKE '%' || it.table_ref || '%'
-- ) AND it.dependencies IS NOT NULL
-- )

-- SELECT ic.column_name, ic.table_ref
-- FROM sdf.information_schema.columns ic
-- JOIN prefiltered_tables it ON ic.table_ref = it.table_ref
-- WHERE NOT EXISTS (
-- SELECT 1
-- FROM sdf.information_schema.columns downstream
-- JOIN sdf.information_schema.tables downstream_tables ON downstream.table_ref = downstream_tables.table_ref
-- WHERE (downstream.lineage_copy ILIKE '%' || ic.column_name || '%'
-- OR downstream.lineage_modify ILIKE '%' || ic.column_name || '%'
-- OR downstream_tables.lineage_scan ILIKE '%' || ic.column_name || '%')
-- );
SELECT
table_id, from_column_id AS dead_column_id
FROM
tables_with_lineage
JOIN
sdf.information_schema.column_lineage cl ON cl.from_table_id = tables_with_lineage.table_id
WHERE
cl.to_table_id IS NULL;
16 changes: 0 additions & 16 deletions reports/global_optimization/dead_tables.sql

This file was deleted.

1 change: 1 addition & 0 deletions reports/global_optimization/max_dag_depth.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
-- Reports the maximum depth of the workspace dag
-- For example:
-- A -> B || Maximum depth of 1
-- Requires either views or table functions to compute
9 changes: 9 additions & 0 deletions reports/structure/island_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- SDF STANDARD REPORT --
-- Lists island tables: tables with neither upstream nor downstream
-- dependencies (both dependency arrays are empty).
-- Tables whose purpose is 'system' or 'external-system' are left out.

SELECT
    tbl.table_id
FROM sdf.information_schema.tables AS tbl
WHERE
    tbl.purpose NOT IN ('system', 'external-system')
    AND LENGTH_ARRAY(tbl.depends_on) = 0
    AND LENGTH_ARRAY(tbl.depended_on_by) = 0;
8 changes: 8 additions & 0 deletions reports/structure/leaf_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- SDF STANDARD REPORT --
-- Lists leaf tables: tables in the workspace that have no downstream
-- dependencies. These are taken from the table lineage rows whose
-- `to_table_id` is NULL.

SELECT
    lineage.from_table_id
FROM sdf.information_schema.table_lineage AS lineage
WHERE
    lineage.to_table_id IS NULL;
9 changes: 9 additions & 0 deletions reports/structure/middle_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- SDF STANDARD REPORT --
-- Lists middle tables: tables in the workspace that have both upstream
-- and downstream dependencies (both dependency arrays are non-empty).

SELECT
    tbl.table_id
FROM sdf.information_schema.tables AS tbl
WHERE
    LENGTH_ARRAY(tbl.depends_on) <> 0
    AND LENGTH_ARRAY(tbl.depended_on_by) <> 0;
3 changes: 0 additions & 3 deletions reports/structure/most_col_deps.sql

This file was deleted.

7 changes: 7 additions & 0 deletions reports/structure/most_immediate_col_deps.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- SDF STANDARD REPORT --
-- Ranks columns by their number of direct downstream dependencies,
-- largest first. The count is the length of each column's
-- `depended_on_by` array.

SELECT
    col.column_id,
    LENGTH_ARRAY(col.depended_on_by) AS downstream_dep_count
FROM sdf.information_schema.columns AS col
ORDER BY
    downstream_dep_count DESC;
7 changes: 7 additions & 0 deletions reports/structure/most_immeditate_table_deps.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- SDF STANDARD REPORT --
-- Reports all tables with the most direct downstream dependencies
-- in descending order. The count is the length of each table's
-- `depended_on_by` array.

SELECT
    table_id,
    LENGTH_ARRAY(depended_on_by) AS downstream_dep_count
FROM sdf.information_schema.tables
ORDER BY downstream_dep_count DESC;
3 changes: 0 additions & 3 deletions reports/structure/most_table_deps.sql

This file was deleted.

6 changes: 3 additions & 3 deletions reports/structure/num_columns.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
-- SDF STANDARD REPORT --
-- Reports the number of columns per table in descending order
select
distinct table_ref,
distinct table_id,
count(column_name) as num_columns
from sdf.information_schema.columns as c
where c.catalog_name != 'system'
group by table_ref
where c.table_purpose != 'system' and c.table_purpose != 'external-system'
group by table_id
order by num_columns desc;
4 changes: 4 additions & 0 deletions reports/structure/root_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
-- Reports all root tables in the workspace
-- Root tables are defined as: Tables which have no upstream dependencies,
-- but do have downstream dependents

-- Report the target side of table lineage rows that have no source table
-- (`from_table_id` IS NULL).
SELECT
    lineage.to_table_id
FROM sdf.information_schema.table_lineage AS lineage
WHERE
    lineage.from_table_id IS NULL;

0 comments on commit 9f0d639

Please sign in to comment.