From 3d987c3c2e0ae59f12979029bad74511dae6dc15 Mon Sep 17 00:00:00 2001 From: James McNeill <55981540+jpmmcneill@users.noreply.github.com> Date: Sat, 23 Jul 2022 22:54:57 +0100 Subject: [PATCH] Feat: refactor largest connected subgraph (#20) * rename macro to largest-connected-subgraph * make the largest_connected_subgraph macro make a table that is vertex focused rather than a graph * fix unit tests after refactor * fix readme error for largest connected subgraph --- README.md | 29 +++++++-------- ...st_connected_subgraph_1_subgraph_data.csv} | 0 ...cted_subgraph_2_subgraph_no_edge_data.csv} | 0 ...cted_subgraph_3_subgraph_no_edge_data.csv} | 0 ...st_connected_subgraph_4_subgraph_data.csv} | 0 ...ted_subgraph_graph_id_3_subgraph_data.csv} | 0 ...gest_connected_subgraph_graph_id_data.csv} | 0 ...rgest_connected_subgraph_no_data_data.csv} | 0 ...nected_subgraph_no_data_graph_id_data.csv} | 0 .../test_largest_connected_subgraph.yml} | 16 ++++----- ..._largest_connected_subgraph_1_subgraph.sql | 22 ++++++++++++ ..._connected_subgraph_2_subgraph_no_edge.sql | 29 +++++++++++++++ ..._connected_subgraph_3_subgraph_no_edge.sql | 20 +++++++++++ ..._largest_connected_subgraph_4_subgraph.sql | 26 ++++++++++++++ ...st_largest_connected_subgraph_graph_id.sql | 27 ++++++++++++++ ...connected_subgraph_graph_id_3_subgraph.sql | 36 +++++++++++++++++++ ...est_largest_connected_subgraph_no_data.sql | 19 ++++++++++ ...st_connected_subgraph_no_data_graph_id.sql | 20 +++++++++++ .../test_largest_conn_subgraph_1_subgraph.sql | 26 -------------- ...rgest_conn_subgraph_2_subgraph_no_edge.sql | 31 ---------------- ...rgest_conn_subgraph_3_subgraph_no_edge.sql | 20 ----------- .../test_largest_conn_subgraph_4_subgraph.sql | 24 ------------- .../test_largest_conn_subgraph_graph_id.sql | 25 ------------- ...gest_conn_subgraph_graph_id_3_subgraph.sql | 35 ------------------ .../test_largest_conn_subgraph_no_data.sql | 19 ---------- ...largest_conn_subgraph_no_data_graph_id.sql | 20 ----------- ...ier.sql => largest_connected_subgraph.sql} | 29 ++++++++------- tests/generic/graph_is_connected.sql | 2 +- 28 files changed, 240 insertions(+), 235 deletions(-) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_1_subgraph_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_2_subgraph_no_edge_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_3_subgraph_no_edge_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_4_subgraph_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_3_subgraph_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_data.csv} (100%) rename integration_tests/data/{test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_graph_id_data.csv => test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id_data.csv} (100%) rename integration_tests/models/{test_largest_connected_subgraph_identifier/test_largest_connected_subgraph_identifier.yml => test_largest_connected_subgraph/test_largest_connected_subgraph.yml} (76%) create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data.sql create mode 100644 integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_1_subgraph.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_2_subgraph_no_edge.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_3_subgraph_no_edge.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_4_subgraph.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id_3_subgraph.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data.sql delete mode 100644 integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data_graph_id.sql rename macros/{largest_connected_subgraph_identifier.sql => largest_connected_subgraph.sql} (88%) diff --git a/README.md b/README.md index 763bcdc..9c36ace 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ from the integration_tests folder. sqlfluff does not currently support macros, m - [graph_is_connected](#graph_is_connected) **[Macros](#macros)** - - [largest_connected_subgraph_identifier](#largest_connected_subgraph_identifier) + - [largest_connected_subgraph](#largest_connected_subgraph) **[Helper Macros](#helper-macros)** - [array_agg](#array_agg) @@ -238,7 +238,7 @@ flowchart ``` ## Macros -### [largest_connected_subgraph_identifier](macros/largest_connected_subgraph_identifier.sql) +### [largest_connected_subgraph](macros/largest_connected_subgraph.sql) Arguments: - input: the input node (inputted as `ref(...)` or `source(...)`) or CTE (inputted as a string) @@ -250,7 +250,7 @@ Arguments: **Usage:** ```sql with subgraphs as ( - {{ dbt_graph_theory.largest_connected_subgraph_identifier( + {{ dbt_graph_theory.largest_connected_subgraph( input=ref('example_model'), edge_id='edge_id_field_name', vertex_1='vertex_1_field_name', @@ -264,7 +264,7 @@ with subgraphs as ( ```sql ... subgraphs as ( - {{ dbt_graph_theory.largest_connected_subgraph_identifier( + {{ dbt_graph_theory.largest_connected_subgraph( input='example_cte', edge_id='different_edge_id_field_name', vertex_1='different_vertex_1_field_name', @@ -290,16 +290,17 @@ flowchart The following table is returned: -| edge_id | vertex_1 | vertex_2 | subgraph_id | subgraph_members | -|:-------:|:--------:|:--------:|:-----------:|:------------------:| -| 1 | A | B | 1 | ['A', 'B', 'C'] | -| 2 | B | C | 1 | ['A', 'B', 'C'] | -| 3 | D | E | 2 |['D', 'E', 'F', 'G']| -| 4 | E | F | 2 |['D', 'E', 'F', 'G']| -| 5 | D | F | 2 |['D', 'E', 'F', 'G']| -| 6 | E | G | 2 |['D', 'E', 'F', 'G']| - -subgraph_id is designed to be unique to both the graph and subgraph level. +| vertex | subgraph_id | subgraph_members | +|:--------:|:-----------:|:------------------:| +| A | 1 | ['A', 'B', 'C'] | +| B | 1 | ['A', 'B', 'C'] | +| C | 2 | ['A', 'B', 'C'] | +| D | 2 |['D', 'E', 'F', 'G']| +| E | 2 |['D', 'E', 'F', 'G']| +| F | 2 |['D', 'E', 'F', 'G']| +| G | 2 |['D', 'E', 'F', 'G']| + +subgraph_id is designed to be unique to both the graph and subgraph level. When graph_id is defined, the output is also at a graph_id level. ## Helper Macros Note that the below are designed for internal (ie. dbt-graph-theory) use only. Use them at your own risk! diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_1_subgraph_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_1_subgraph_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_2_subgraph_no_edge_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_2_subgraph_no_edge_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_3_subgraph_no_edge_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_3_subgraph_no_edge_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_4_subgraph_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_4_subgraph_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_3_subgraph_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_3_subgraph_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_graph_id_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_data.csv diff --git a/integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_graph_id_data.csv b/integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id_data.csv similarity index 100% rename from integration_tests/data/test_largest_conn_subgraph/test_largest_conn_subgraph_no_data_graph_id_data.csv rename to integration_tests/data/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id_data.csv diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_connected_subgraph_identifier.yml b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph.yml similarity index 76% rename from integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_connected_subgraph_identifier.yml rename to integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph.yml index f6be224..aed4277 100644 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_connected_subgraph_identifier.yml +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph.yml @@ -1,41 +1,41 @@ version: 2 models: - - name: test_largest_conn_subgraph_1_subgraph + - name: test_largest_connected_subgraph_1_subgraph description: Unit test checking a situation with 1 subgraph and no graph_id. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_4_subgraph + - name: test_largest_connected_subgraph_4_subgraph description: Unit test checking a situation with 4 subgraphs and no graph_id. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_2_subgraph_no_edge + - name: test_largest_connected_subgraph_2_subgraph_no_edge description: Unit test checking a situation with 1 subgraph, no connecting edges and no graph_id. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_3_subgraph_no_edge + - name: test_largest_connected_subgraph_3_subgraph_no_edge description: Unit test checking a situation with 3 subgraphs, no connecting edges and no graph_id. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_graph_id + - name: test_largest_connected_subgraph_graph_id description: Unit test checking a situation with 2 graph ids, one with two subgraphs and one with 1 subgraph. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_graph_id_3_subgraph + - name: test_largest_connected_subgraph_graph_id_3_subgraph description: Unit test checking a situation with 2 graph ids, with three subgraphs each. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_no_data + - name: test_largest_connected_subgraph_no_data description: Unit test checking a situation no data. tests: - dbt_expectations.expect_table_row_count_to_equal: value: 0 - - name: test_largest_conn_subgraph_no_data_graph_id + - name: test_largest_connected_subgraph_no_data_graph_id description: Unit test checking a situation no data but with graph_id defined. tests: - dbt_expectations.expect_table_row_count_to_equal: diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph.sql new file mode 100644 index 0000000..fe24640 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_1_subgraph.sql @@ -0,0 +1,22 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_1_subgraph_data') + ) }} +), + +subgraph_members as ( + select v.* from ( + values + ('A', '1', array['A', 'B', 'C', 'D', 'E']), + ('B', '1', array['A', 'B', 'C', 'D', 'E']), + ('C', '1', array['A', 'B', 'C', 'D', 'E']), + ('D', '1', array['A', 'B', 'C', 'D', 'E']), + ('E', '1', array['A', 'B', 'C', 'D', 'E']) + ) as v (vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge.sql new file mode 100644 index 0000000..c0bbd2b --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_2_subgraph_no_edge.sql @@ -0,0 +1,29 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_2_subgraph_no_edge_data') + ) }} +), + +-- recast because vertex_2 is all null in seed data, interpreted as int dtype +recast_computed as ( + select + vertex::text as vertex, + subgraph_id, + subgraph_members + from + computed +), + +subgraph_members as ( + select v.* from ( + values + ('A', '1', array['A']), + ('B', '2', array['B']) + ) as v (vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'recast_computed', + 'subgraph_members', + fields=["vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge.sql new file mode 100644 index 0000000..fbd8df6 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_3_subgraph_no_edge.sql @@ -0,0 +1,20 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_3_subgraph_no_edge_data') + ) }} +), + +subgraph_members as ( + select v.* from ( + values + ('A', '1', array['A']), + ('B', '2', array['B']), + ('C', '3', array['C']) + ) as v (vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph.sql new file mode 100644 index 0000000..e9ea959 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_4_subgraph.sql @@ -0,0 +1,26 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_4_subgraph_data') + ) }} +), + +subgraph_members as ( + select v.* from ( + values + ('A', '1', array['A', 'B', 'C', 'D']), + ('B', '1', array['A', 'B', 'C', 'D']), + ('C', '1', array['A', 'B', 'C', 'D']), + ('D', '1', array['A', 'B', 'C', 'D']), + ('E', '2', array['E', 'F']), + ('F', '2', array['E', 'F']), + ('G', '3', array['G']), + ('H', '4', array['H', 'I']), + ('I', '4', array['H', 'I']) + ) as v (vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id.sql new file mode 100644 index 0000000..b036c52 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id.sql @@ -0,0 +1,27 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_graph_id_data'), + graph_id='graph_id' + ) }} +), + +subgraph_members as ( + select v.* from ( + values + (1, 'A', '1__1', array['A', 'B', 'C', 'D']), + (1, 'B', '1__1', array['A', 'B', 'C', 'D']), + (1, 'C', '1__1', array['A', 'B', 'C', 'D']), + (1, 'D', '1__1', array['A', 'B', 'C', 'D']), + (1, 'E', '1__2', array['E']), + (2, 'A', '2__1', array['A', 'B', 'C', 'D']), + (2, 'B', '2__1', array['A', 'B', 'C', 'D']), + (2, 'C', '2__1', array['A', 'B', 'C', 'D']), + (2, 'D', '2__1', array['A', 'B', 'C', 'D']) + ) as v (graph_id, vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph.sql new file mode 100644 index 0000000..60fe15f --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_graph_id_3_subgraph.sql @@ -0,0 +1,36 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_graph_id_3_subgraph_data'), + graph_id='graph_id' + ) }} +), + +subgraph_members as ( + select v.* from ( + values + (1, 'A', '1__1', array['A', 'B', 'C', 'D']), + (1, 'B', '1__1', array['A', 'B', 'C', 'D']), + (1, 'C', '1__1', array['A', 'B', 'C', 'D']), + (1, 'D', '1__1', array['A', 'B', 'C', 'D']), + (1, 'E', '1__2', array['E', 'F', 'G', 'H']), + (1, 'F', '1__2', array['E', 'F', 'G', 'H']), + (1, 'G', '1__2', array['E', 'F', 'G', 'H']), + (1, 'H', '1__2', array['E', 'F', 'G', 'H']), + (1, 'I', '1__3', array['I', 'J']), + (1, 'J', '1__3', array['I', 'J']), + (2, 'A', '2__1', array['A', 'B', 'C']), + (2, 'B', '2__1', array['A', 'B', 'C']), + (2, 'C', '2__1', array['A', 'B', 'C']), + (2, 'D', '2__2', array['D']), + (2, 'E', '2__3', array['E', 'F', 'G', 'H']), + (2, 'F', '2__3', array['E', 'F', 'G', 'H']), + (2, 'G', '2__3', array['E', 'F', 'G', 'H']), + (2, 'H', '2__3', array['E', 'F', 'G', 'H']) + ) as v (graph_id, vertex, subgraph_id, subgraph_members) +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data.sql new file mode 100644 index 0000000..d8c4874 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data.sql @@ -0,0 +1,19 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_no_data_data') + ) }} +), + +subgraph_members as ( + select v.* from ( + values + (null::text, null::text, array[null]) + ) as v (vertex, subgraph_id, subgraph_members) + where false +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id.sql b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id.sql new file mode 100644 index 0000000..4f90385 --- /dev/null +++ b/integration_tests/models/test_largest_connected_subgraph/test_largest_connected_subgraph_no_data_graph_id.sql @@ -0,0 +1,20 @@ +with computed as ( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( + input=ref('test_largest_connected_subgraph_no_data_graph_id_data'), + graph_id='graph_id' + ) }} +), + +subgraph_members as ( + select v.* from ( + values + (null::integer, null::text, null::text, array[null]) + ) as v (graph_id, vertex, subgraph_id, subgraph_members) + where false +) + +select * from {{ cte_difference( + 'computed', + 'subgraph_members', + fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"] +) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_1_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_1_subgraph.sql deleted file mode 100644 index dea1aef..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_1_subgraph.sql +++ /dev/null @@ -1,26 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_1_subgraph_data') - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (1, 'A', 'B', '1', array['A', 'B', 'C', 'D', 'E']), - (2, 'B', 'C', '1', array['A', 'B', 'C', 'D', 'E']), - (3, 'C', 'D', '1', array['A', 'B', 'C', 'D', 'E']), - (4, 'B', 'D', '1', array['A', 'B', 'C', 'D', 'E']), - (5, 'A', 'C', '1', array['A', 'B', 'C', 'D', 'E']), - (6, 'B', 'E', '1', array['A', 'B', 'C', 'D', 'E']), - (7, 'E', 'D', '1', array['A', 'B', 'C', 'D', 'E']), - (8, 'A', null, '1', array['A', 'B', 'C', 'D', 'E']), - (9, 'E', null, '1', array['A', 'B', 'C', 'D', 'E']) - ) as v (id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_2_subgraph_no_edge.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_2_subgraph_no_edge.sql deleted file mode 100644 index 7163f05..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_2_subgraph_no_edge.sql +++ /dev/null @@ -1,31 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_2_subgraph_no_edge_data') - ) }} -), - --- recast because vertex_2 is all null in seed data, interpreted as int dtype -recast_computed as ( - select - id, - vertex_1, - vertex_2::text as vertex_2, - subgraph_id, - subgraph_members - from - computed -), - -subgraph_members as ( - select v.* from ( - values - (1, 'A', null, '1', array['A']), - (2, 'B', null, '2', array['B']) - ) as v (id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'recast_computed', - 'subgraph_members', - fields=["id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_3_subgraph_no_edge.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_3_subgraph_no_edge.sql deleted file mode 100644 index e8067d6..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_3_subgraph_no_edge.sql +++ /dev/null @@ -1,20 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_3_subgraph_no_edge_data') - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (1, 'A', null, '1', array['A']), - (2, 'B', null, '2', array['B']), - (3, null, 'C', '3', array['C']) - ) as v (id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_4_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_4_subgraph.sql deleted file mode 100644 index 4e509a7..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_4_subgraph.sql +++ /dev/null @@ -1,24 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_4_subgraph_data') - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (1, 'A', 'B', '1', array['A', 'B', 'C', 'D']), - (2, 'B', 'C', '1', array['A', 'B', 'C', 'D']), - (3, 'C', 'D', '1', array['A', 'B', 'C', 'D']), - (4, 'E', null, '2', array['E', 'F']), - (5, 'E', 'F', '2', array['E', 'F']), - (6, 'G', null, '3', array['G']), - (7, 'H', 'I', '4', array['H', 'I']) - ) as v (id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id.sql deleted file mode 100644 index 5ffda7a..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id.sql +++ /dev/null @@ -1,25 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_graph_id_data'), - graph_id='graph_id' - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (1, 1, 'A', 'B', '1__1', array['A', 'B', 'C', 'D']), - (1, 2, 'B', 'C', '1__1', array['A', 'B', 'C', 'D']), - (1, 3, 'C', 'D', '1__1', array['A', 'B', 'C', 'D']), - (1, 4, 'E', null, '1__2', array['E']), - (2, 1, 'A', 'B', '2__1', array['A', 'B', 'C', 'D']), - (2, 2, 'B', 'C', '2__1', array['A', 'B', 'C', 'D']), - (2, 3, 'C', 'D', '2__1', array['A', 'B', 'C', 'D']) - ) as v (graph_id, id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["graph_id", "id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id_3_subgraph.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id_3_subgraph.sql deleted file mode 100644 index 4761347..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_graph_id_3_subgraph.sql +++ /dev/null @@ -1,35 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_graph_id_3_subgraph_data'), - graph_id='graph_id' - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (1, 1, 'A', 'B', '1__1', array['A', 'B', 'C', 'D']), - (1, 2, 'B', 'A', '1__1', array['A', 'B', 'C', 'D']), - (1, 3, 'A', 'C', '1__1', array['A', 'B', 'C', 'D']), - (1, 4, 'C', 'D', '1__1', array['A', 'B', 'C', 'D']), - (1, 5, 'D', 'A', '1__1', array['A', 'B', 'C', 'D']), - (1, 6, 'E', 'F', '1__2', array['E', 'F', 'G', 'H']), - (1, 7, 'F', 'G', '1__2', array['E', 'F', 'G', 'H']), - (1, 8, 'G', 'E', '1__2', array['E', 'F', 'G', 'H']), - (1, 9, 'F', 'H', '1__2', array['E', 'F', 'G', 'H']), - (1, 10, 'I', 'J', '1__3', array['I', 'J']), - (2, 1, 'A', 'B', '2__1', array['A', 'B', 'C']), - (2, 2, 'B', 'C', '2__1', array['A', 'B', 'C']), - (2, 3, 'D', null, '2__2', array['D']), - (2, 4, 'E', 'F', '2__3', array['E', 'F', 'G', 'H']), - (2, 5, 'F', 'G', '2__3', array['E', 'F', 'G', 'H']), - (2, 6, 'G', 'E', '2__3', array['E', 'F', 'G', 'H']), - (2, 7, 'F', 'H', '2__3', array['E', 'F', 'G', 'H']) - ) as v (graph_id, id, vertex_1, vertex_2, subgraph_id, subgraph_members) -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["graph_id", "id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data.sql deleted file mode 100644 index 3c95fc0..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data.sql +++ /dev/null @@ -1,19 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_no_data_data') - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (null::integer, null::integer, null::integer, null::text, array[null]) - ) as v (id, vertex_1, vertex_2, subgraph_id, subgraph_members) - where false -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data_graph_id.sql b/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data_graph_id.sql deleted file mode 100644 index a99dc57..0000000 --- a/integration_tests/models/test_largest_connected_subgraph_identifier/test_largest_conn_subgraph_no_data_graph_id.sql +++ /dev/null @@ -1,20 +0,0 @@ -with computed as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( - input=ref('test_largest_conn_subgraph_no_data_graph_id_data'), - graph_id='graph_id' - ) }} -), - -subgraph_members as ( - select v.* from ( - values - (null::integer, null::integer, null::integer, null::integer, null::text, array[null]) - ) as v (graph_id, id, vertex_1, vertex_2, subgraph_id, subgraph_members) - where false -) - -select * from {{ cte_difference( - 'computed', - 'subgraph_members', - fields=["graph_id", "id", "vertex_1", "vertex_2", "subgraph_id", "subgraph_members"] -) }} diff --git a/macros/largest_connected_subgraph_identifier.sql b/macros/largest_connected_subgraph.sql similarity index 88% rename from macros/largest_connected_subgraph_identifier.sql rename to macros/largest_connected_subgraph.sql index ee0657a..624adff 100644 --- a/macros/largest_connected_subgraph_identifier.sql +++ b/macros/largest_connected_subgraph.sql @@ -1,4 +1,4 @@ -{% macro largest_connected_largest_conn_subgraph( +{% macro largest_connected_largest_connected_subgraph( input, edge_id='id', vertex_1='vertex_1', @@ -150,20 +150,25 @@ from node_subgraphs ), - join_detail as ( - select - _input.*, + largest_connected_subgraphs as ( + -- join in the input to preserve data types on graph_id and vertex. + select distinct + {{ '_input.' ~ graph_id ~ ',' if graph_id }} + _output.vertex, concat( - {{ '_input.'~graph_id if graph_id else "''" }}, + {{ '_output.graph_id' if graph_id else "''" }}, {{ "'__'," if graph_id }} - subgraphs.subgraph_id + subgraph_id ) as subgraph_id, - subgraphs.subgraph_members - from {{ input }} as _input - left join generate_subgraph_id as subgraphs on - coalesce(_input.{{ vertex_1 }}::text, _input.{{ vertex_2 }}::text) = subgraphs.vertex - {{ 'and _input.' ~ graph_id ~ '::text = subgraphs.graph_id' if graph_id }} + subgraph_members + from generate_subgraph_id as _output + left join {{ input }} as _input on + ( + _output.vertex = _input.{{ vertex_1 }}::text or + _output.vertex = _input.{{ vertex_2 }}::text + ) + {{ 'and _output.graph_id = _input.' ~ graph_id ~ '::text' if graph_id }} ) - select * from join_detail + select * from largest_connected_subgraphs {% endmacro %} diff --git a/tests/generic/graph_is_connected.sql b/tests/generic/graph_is_connected.sql index cfc19da..0d05ea2 100644 --- a/tests/generic/graph_is_connected.sql +++ b/tests/generic/graph_is_connected.sql @@ -7,7 +7,7 @@ ) %} with connected_subgraphs as ( - {{ dbt_graph_theory.largest_connected_largest_conn_subgraph( + {{ dbt_graph_theory.largest_connected_largest_connected_subgraph( input=model, edge_id=edge_id, vertex_1=vertex_1,