Skip to content

Commit

Permalink
Feat: refactor largest connected subgraph (#20)
Browse files Browse the repository at this point in the history
* rename macro to largest-connected-subgraph

* make the largest_connected_subgraph macro make a table that is vertex focused rather than a graph

* fix unit tests after refactor

* fix readme error for largest connected subgraph
  • Loading branch information
jpmmcneill authored Jul 23, 2022
1 parent 3f07b8d commit 3d987c3
Show file tree
Hide file tree
Showing 28 changed files with 240 additions and 235 deletions.
29 changes: 15 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ from the integration_tests folder. sqlfluff does not currently support macros, m
- [graph_is_connected](#graph_is_connected)

**[Macros](#macros)**
- [largest_connected_subgraph_identifier](#largest_connected_subgraph_identifier)
- [largest_connected_subgraph](#largest_connected_subgraph)

**[Helper Macros](#helper-macros)**
- [array_agg](#array_agg)
Expand Down Expand Up @@ -238,7 +238,7 @@ flowchart
```

## Macros
### [largest_connected_subgraph_identifier](macros/largest_connected_subgraph_identifier.sql)
### [largest_connected_subgraph](macros/largest_connected_subgraph.sql)

Arguments:
- input: the input node (inputted as `ref(...)` or `source(...)`) or CTE (inputted as a string)
Expand All @@ -250,7 +250,7 @@ Arguments:
**Usage:**
```sql
with subgraphs as (
{{ dbt_graph_theory.largest_connected_subgraph_identifier(
{{ dbt_graph_theory.largest_connected_subgraph(
input=ref('example_model'),
edge_id='edge_id_field_name',
vertex_1='vertex_1_field_name',
Expand All @@ -264,7 +264,7 @@ with subgraphs as (
```sql
...
subgraphs as (
{{ dbt_graph_theory.largest_connected_subgraph_identifier(
{{ dbt_graph_theory.largest_connected_subgraph(
input='example_cte',
edge_id='different_edge_id_field_name',
vertex_1='different_vertex_1_field_name',
Expand All @@ -290,16 +290,17 @@ flowchart

The following table is returned:

| edge_id | vertex_1 | vertex_2 | subgraph_id | subgraph_members |
|:-------:|:--------:|:--------:|:-----------:|:------------------:|
| 1 | A | B | 1 | ['A', 'B', 'C'] |
| 2 | B | C | 1 | ['A', 'B', 'C'] |
| 3 | D | E | 2 |['D', 'E', 'F', 'G']|
| 4 | E | F | 2 |['D', 'E', 'F', 'G']|
| 5 | D | F | 2 |['D', 'E', 'F', 'G']|
| 6 | E | G | 2 |['D', 'E', 'F', 'G']|

subgraph_id is designed to be unique to both the graph and subgraph level.
| vertex | subgraph_id | subgraph_members |
|:--------:|:-----------:|:------------------:|
| A | 1 | ['A', 'B', 'C'] |
| B | 1 | ['A', 'B', 'C'] |
| C | 1 | ['A', 'B', 'C'] |
| D | 2 |['D', 'E', 'F', 'G']|
| E | 2 |['D', 'E', 'F', 'G']|
| F | 2 |['D', 'E', 'F', 'G']|
| G | 2 |['D', 'E', 'F', 'G']|

subgraph_id is designed to be unique to both the graph and subgraph level. When graph_id is defined, the output is also at a graph_id level.

## Helper Macros
Note that the below are designed for internal (ie. dbt-graph-theory) use only. Use them at your own risk!
Expand Down
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
version: 2
models:
- name: test_largest_conn_subgraph_1_subgraph
- name: test_largest_connected_subgraph_1_subgraph
description: Unit test checking a situation with 1 subgraph and no graph_id.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_4_subgraph
- name: test_largest_connected_subgraph_4_subgraph
description: Unit test checking a situation with 4 subgraphs and no graph_id.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_2_subgraph_no_edge
- name: test_largest_connected_subgraph_2_subgraph_no_edge
description: Unit test checking a situation with 2 subgraphs, no connecting edges and no graph_id.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_3_subgraph_no_edge
- name: test_largest_connected_subgraph_3_subgraph_no_edge
description: Unit test checking a situation with 3 subgraphs, no connecting edges and no graph_id.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_graph_id
- name: test_largest_connected_subgraph_graph_id
description: Unit test checking a situation with 2 graph ids, one with two subgraphs and one with 1 subgraph.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_graph_id_3_subgraph
- name: test_largest_connected_subgraph_graph_id_3_subgraph
description: Unit test checking a situation with 2 graph ids, with three subgraphs each.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_no_data
- name: test_largest_connected_subgraph_no_data
description: Unit test checking a situation with no data.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: 0
- name: test_largest_conn_subgraph_no_data_graph_id
- name: test_largest_connected_subgraph_no_data_graph_id
description: Unit test checking a situation with no data but with graph_id defined.
tests:
- dbt_expectations.expect_table_row_count_to_equal:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Unit test: a single connected subgraph, no graph_id.
-- Runs the largest_connected_subgraph macro over the seed data and compares
-- its output with the hand-written expected rows below via cte_difference
-- (presumably yields the rows on which the two CTEs disagree; the schema test
-- for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_1_subgraph_data')
    ) }}
),

-- expected output: all five vertices share subgraph '1'
subgraph_members as (
    select
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        ('A', '1', array['A', 'B', 'C', 'D', 'E']),
        ('B', '1', array['A', 'B', 'C', 'D', 'E']),
        ('C', '1', array['A', 'B', 'C', 'D', 'E']),
        ('D', '1', array['A', 'B', 'C', 'D', 'E']),
        ('E', '1', array['A', 'B', 'C', 'D', 'E'])
    ) as v (vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Unit test: two single-vertex subgraphs, no connecting edges, no graph_id.
-- Runs the largest_connected_subgraph macro over the seed data and compares
-- its (recast) output with the hand-written expected rows via cte_difference
-- (presumably yields the rows on which the two CTEs disagree; the schema test
-- for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_2_subgraph_no_edge_data')
    ) }}
),

-- recast because vertex_2 is all null in seed data, interpreted as int dtype
recast_computed as (
    select
        vertex::text as vertex,
        subgraph_id,
        subgraph_members
    from
        computed
),

-- expected output: each vertex is its own subgraph
subgraph_members as (
    select
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        ('A', '1', array['A']),
        ('B', '2', array['B'])
    ) as v (vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'recast_computed',
    'subgraph_members',
    fields=["vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- Unit test: three single-vertex subgraphs, no connecting edges, no graph_id.
-- Runs the largest_connected_subgraph macro over the seed data and compares
-- its output with the hand-written expected rows via cte_difference
-- (presumably yields the rows on which the two CTEs disagree; the schema test
-- for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_3_subgraph_no_edge_data')
    ) }}
),

-- expected output: each vertex is its own subgraph
subgraph_members as (
    select
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        ('A', '1', array['A']),
        ('B', '2', array['B']),
        ('C', '3', array['C'])
    ) as v (vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Unit test: four subgraphs, no graph_id.
-- Runs the largest_connected_subgraph macro over the seed data and compares
-- its output with the hand-written expected rows via cte_difference
-- (presumably yields the rows on which the two CTEs disagree; the schema test
-- for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_4_subgraph_data')
    ) }}
),

-- expected output: subgraphs {A,B,C,D}, {E,F}, {G} and {H,I}
subgraph_members as (
    select
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        ('A', '1', array['A', 'B', 'C', 'D']),
        ('B', '1', array['A', 'B', 'C', 'D']),
        ('C', '1', array['A', 'B', 'C', 'D']),
        ('D', '1', array['A', 'B', 'C', 'D']),
        ('E', '2', array['E', 'F']),
        ('F', '2', array['E', 'F']),
        ('G', '3', array['G']),
        ('H', '4', array['H', 'I']),
        ('I', '4', array['H', 'I'])
    ) as v (vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Unit test: two graph ids; graph 1 has two subgraphs, graph 2 has one.
-- Runs the largest_connected_subgraph macro (with graph_id) over the seed
-- data and compares its output with the hand-written expected rows via
-- cte_difference (presumably yields the rows on which the two CTEs disagree;
-- the schema test for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_graph_id_data'),
        graph_id='graph_id'
    ) }}
),

-- expected output: subgraph_id values carry the graph id as a prefix
subgraph_members as (
    select
        v.graph_id,
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        (1, 'A', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'B', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'C', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'D', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'E', '1__2', array['E']),
        (2, 'A', '2__1', array['A', 'B', 'C', 'D']),
        (2, 'B', '2__1', array['A', 'B', 'C', 'D']),
        (2, 'C', '2__1', array['A', 'B', 'C', 'D']),
        (2, 'D', '2__1', array['A', 'B', 'C', 'D'])
    ) as v (graph_id, vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- Unit test: two graph ids with three subgraphs each.
-- Runs the largest_connected_subgraph macro (with graph_id) over the seed
-- data and compares its output with the hand-written expected rows via
-- cte_difference (presumably yields the rows on which the two CTEs disagree;
-- the schema test for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_graph_id_3_subgraph_data'),
        graph_id='graph_id'
    ) }}
),

-- expected output: subgraph_id values carry the graph id as a prefix
subgraph_members as (
    select
        v.graph_id,
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        (1, 'A', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'B', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'C', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'D', '1__1', array['A', 'B', 'C', 'D']),
        (1, 'E', '1__2', array['E', 'F', 'G', 'H']),
        (1, 'F', '1__2', array['E', 'F', 'G', 'H']),
        (1, 'G', '1__2', array['E', 'F', 'G', 'H']),
        (1, 'H', '1__2', array['E', 'F', 'G', 'H']),
        (1, 'I', '1__3', array['I', 'J']),
        (1, 'J', '1__3', array['I', 'J']),
        (2, 'A', '2__1', array['A', 'B', 'C']),
        (2, 'B', '2__1', array['A', 'B', 'C']),
        (2, 'C', '2__1', array['A', 'B', 'C']),
        (2, 'D', '2__2', array['D']),
        (2, 'E', '2__3', array['E', 'F', 'G', 'H']),
        (2, 'F', '2__3', array['E', 'F', 'G', 'H']),
        (2, 'G', '2__3', array['E', 'F', 'G', 'H']),
        (2, 'H', '2__3', array['E', 'F', 'G', 'H'])
    ) as v (graph_id, vertex, subgraph_id, subgraph_members)
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Unit test: empty input, no graph_id.
-- Runs the largest_connected_subgraph macro over the seed data and compares
-- its output with an empty expected relation via cte_difference (presumably
-- yields the rows on which the two CTEs disagree; the schema test for this
-- model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_no_data_data')
    ) }}
),

-- expected output: no rows; the typed null row only fixes the column types
-- and is removed again by "where false"
subgraph_members as (
    select
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        (null::text, null::text, array[null])
    ) as v (vertex, subgraph_id, subgraph_members)
    where false
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["vertex", "subgraph_id", "subgraph_members"]
) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- Unit test: empty input with graph_id defined.
-- Runs the largest_connected_subgraph macro (with graph_id) over the seed
-- data and compares its output with an empty expected relation via
-- cte_difference (presumably yields the rows on which the two CTEs disagree;
-- the schema test for this model expects zero rows).
-- The macro is called by its documented name, largest_connected_subgraph.
with computed as (
    {{ dbt_graph_theory.largest_connected_subgraph(
        input=ref('test_largest_connected_subgraph_no_data_graph_id_data'),
        graph_id='graph_id'
    ) }}
),

-- expected output: no rows; the typed null row only fixes the column types
-- and is removed again by "where false"
subgraph_members as (
    select
        v.graph_id,
        v.vertex,
        v.subgraph_id,
        v.subgraph_members
    from (
        values
        (null::integer, null::text, null::text, array[null])
    ) as v (graph_id, vertex, subgraph_id, subgraph_members)
    where false
)

select * from {{ cte_difference(
    'computed',
    'subgraph_members',
    fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
) }}

This file was deleted.

This file was deleted.

Loading

0 comments on commit 3d987c3

Please sign in to comment.