Merge pull request #335 from cynkra/f-3-compound

- `dm_add_pk()` and `dm_add_fk()` support compound keys via the `c()` notation, e.g. `dm_add_pk(dm, table, c(col1, col2))`. `dm_nycflights13()` returns a data model with compound keys by default. Use `compound = FALSE` to return the data model from dm 0.1.13 or earlier (#3). - `dm_get_all_fks()` includes `parent_pk_cols` column that describes the primary key columns of the parent table (#335). - `dm_examine_constraints()` and other check functions count the number of rows that violate constraints for primary and foreign keys (#335). - `copy_dm_to(set_key_constraints = FALSE)` downgrades unique indexess to regular indexes (#335). - `rows_truncate()` implemented for data frames (#335). - `dm_enum_fk_candidates()` enumerates column in the order they apper in the table (#335).
cynkra · Apr 28, 2021 · c2d4b5f · c2d4b5f
2 parents 3ab1b67 + 1184568
commit c2d4b5f
Show file tree

Hide file tree

Showing 59 changed files with 1,215 additions and 578 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -33,6 +33,7 @@ S3method(distinct,zoomed_dm)
 S3method(filter,dm)
 S3method(filter,zoomed_dm)
 S3method(format,dm)
+S3method(format,dm_keys)
 S3method(format,zoomed_df)
 S3method(format,zoomed_dm)
 S3method(full_join,dm)
@@ -73,6 +74,7 @@ S3method(rename,zoomed_dm)
 S3method(right_join,dm)
 S3method(right_join,zoomed_dm)
 S3method(rows_insert,tbl_dbi)
+S3method(rows_truncate,data.frame)
 S3method(rows_truncate,tbl_sql)
 S3method(rows_update,tbl_dbi)
 S3method(select,dm)
@@ -126,6 +128,9 @@ S3method(ungroup,dm)
 S3method(ungroup,zoomed_dm)
 S3method(unite,dm)
 S3method(unite,zoomed_dm)
+S3method(vec_cast,dm_keys.dm_keys)
+S3method(vec_proxy_compare,dm_keys)
+S3method(vec_ptype2,dm_keys.dm_keys)
 S3method(vec_ptype_abbr,dm_keys)
 export("%>%")
 export(anti_join)
@@ -310,5 +315,9 @@ importFrom(utils,head)
 importFrom(utils,packageVersion)
 importFrom(utils,str)
 importFrom(utils,tail)
+importFrom(vctrs,vec_cast)
+importFrom(vctrs,vec_data)
+importFrom(vctrs,vec_proxy_compare)
+importFrom(vctrs,vec_ptype2)
 importFrom(vctrs,vec_ptype_abbr)
 importFrom(vctrs,vec_slice)
diff --git a/R/db-helpers.R b/R/db-helpers.R
@@ -25,7 +25,7 @@ get_pid <- function() {
 }
 
 # Internal copy helper functions
-build_copy_data <- function(dm, dest, table_names) {
+build_copy_data <- function(dm, dest, table_names, set_key_constraints) {
  source <-
  dm %>%
  dm_apply_filters() %>%
@@ -42,6 +42,13 @@ build_copy_data <- function(dm, dest, table_names) {
  dm_get_all_pks_impl(dm) %>%
  transmute(source_name = table, column = pk_col, pk = TRUE)
 
+ # FIXME: COMPOUND: Need support for multiple primary keys, https://github.com/r-dbi/DBI/pull/351
+ # Discard primary keys of length > 1 for now
+ pks_flat <-
+ pks %>%
+ filter(lengths(column) == 1) %>%
+ mutate(column = as.character(column))
+
  fks <-
  dm_get_all_fks_impl(dm) %>%
  transmute(source_name = child_table, column = child_fk_cols, fk = TRUE)
@@ -55,27 +62,39 @@ build_copy_data <- function(dm, dest, table_names) {
  mutate(type = map(df, ~ map_chr(., ~ DBI::dbDataType(dest_con, .)))) %>%
  select(-df) %>%
  unnest(c(column, type)) %>%
- left_join(pks, by = c("source_name", "column")) %>%
+ left_join(pks_flat, by = c("source_name", "column")) %>%
  mutate(full_type = paste0(type, if_else(pk, " NOT NULL PRIMARY KEY", "", ""))) %>%
  group_by(source_name) %>%
  summarize(types = list(deframe(tibble(column, full_type))))
 
  copy_data_unique_indexes <-
  pks %>%
- transmute(source_name, unique_indexes = map(as.list(column), list))
+ group_by(source_name) %>%
+ summarize(unique_indexes = list(column)) %>%
+ ungroup()
 
- copy_data_indexes <-
+ copy_data_non_unique_indexes <-
  fks %>%
  select(source_name, column) %>%
  group_by(source_name) %>%
- summarize(indexes = map(list(column), as.list))
+ summarize(indexes = list(column))
+
+ copy_data_indexes <-
+ full_join(copy_data_unique_indexes, copy_data_non_unique_indexes, by = "source_name") %>%
+ mutate(indexes = map2(indexes, unique_indexes, setdiff))
+
+ # Downgrade unique indexes to non-unique indexes
+ if (!set_key_constraints) {
+ copy_data_indexes <-
+ copy_data_indexes %>%
+ mutate(indexes = map2(indexes, unique_indexes, c)) %>%
+ mutate(unique_indexes = list(new_keys()))
+ }
 
  copy_data <-
  copy_data_base %>%
  inner_join(copy_data_types, by = "source_name") %>%
- left_join(copy_data_unique_indexes, by = "source_name") %>%
- left_join(copy_data_indexes, by = "source_name") %>%
- mutate(indexes = map2(indexes, unique_indexes, setdiff))
+ left_join(copy_data_indexes, by = "source_name")
  } else {
  copy_data <-
  copy_data_base

diff --git a/R/db-interface.R b/R/db-interface.R
@@ -15,9 +15,11 @@
 #' @param dest An object of class `"src"` or `"DBIConnection"`.
 #' @param dm A `dm` object.
 #' @param overwrite,types,indexes,unique_indexes Must remain `NULL`.
-#' @param set_key_constraints Boolean variable, if `TRUE` will mirror `dm` key constraints on a database.
+#' @param set_key_constraints If `TRUE` will mirror `dm` primary and foreign key constraints on a database
+#' and create unique indexes.
+#' Set to `FALSE` if your data model currently does not satisfy primary or foreign key constraints.
 #' @param unique_table_names Deprecated.
-#' @param temporary Boolean variable, if `TRUE`, only temporary tables will be created.
+#' @param temporary If `TRUE`, only temporary tables will be created.
 #' These tables will vanish when disconnecting from the database.
 #' @param schema Name of schema to copy the `dm` to.
 #' If `schema` is provided, an error will be thrown if `temporary = FALSE` or
@@ -181,7 +183,7 @@ copy_dm_to <- function(dest, dm, ...,
  return(dm)
  }
 
- copy_data <- build_copy_data(dm, dest, table_names_out)
+ copy_data <- build_copy_data(dm, dest, table_names_out, set_key_constraints)
 
  new_tables <- copy_list_of_tables_to(
  dest,

diff --git a/R/disambiguate.R b/R/disambiguate.R
@@ -47,14 +47,21 @@ get_table_colnames <- function(dm, tables = NULL) {
  select(table, column) %>%
  unnest(column)
 
- pks <-
- dm_get_all_pks_impl(dm) %>%
- rename(column = pk_col)
+ pks <- dm_get_all_pks_def_impl(def)
+
+ if (nrow(pks) == 0) {
+ return(table_colnames)
+ }
+
+ keep_colnames <-
+ pks %>%
+ rename(column = pk_col) %>%
+ unnest(column)
 
  table_colnames %>%
  # in case of flattening, the primary key columns will never be responsible for the name
  # of the resulting column in the end, so they do not need to be disambiguated
- anti_join(pks, by = c("table", "column"))
+ anti_join(keep_colnames, by = c("table", "column"))
 }
 
 compute_disambiguate_cols_recipe <- function(table_colnames, sep) {

diff --git a/R/dm.R b/R/dm.R
@@ -301,14 +301,6 @@ dm_get_def <- function(x) {
  unclass(x)$def
 }
 
-dm_get_data_model_pks <- function(x) {
- # FIXME: Obliterate
-
- dm_get_def(x) %>%
- select(table, pks) %>%
- unnest_pks()
-}
-
 unnest_pks <- function(def) {
  # Optimized
  pk_df <- tibble(
@@ -320,39 +312,11 @@ unnest_pks <- function(def) {
  # FIXME: Should work better with dplyr 0.9.0
  if (!("column" %in% names(pk_df))) {
  pk_df$column <- character()
- } else {
- # This is expected to break with compound keys
- pk_df$column <- flatten_chr(pk_df$column)
  }
 
  pk_df
 }
 
-dm_get_data_model_fks <- function(x) {
- # FIXME: Obliterate
-
- fk_df <-
- dm_get_def(x) %>%
- select(ref = table, fks, pks) %>%
- filter(map_lgl(fks, has_length)) %>%
- unnest(pks)
-
- if (nrow(fk_df) == 0) {
- return(tibble(
- table = character(), column = character(),
- ref = character(), ref_col = character()
- ))
- }
-
- fk_df %>%
- # This is expected to break with compound keys
- mutate(ref_col = flatten_chr(column)) %>%
- select(-column) %>%
- unnest(fks) %>%
- mutate(column = flatten_chr(column)) %>%
- select(ref, column, table, ref_col)
-}
-
 #' Get filter expressions
 #'
 #' `dm_get_filters()` returns the filter expressions that have been applied to a `dm` object.

diff --git a/R/dplyr.R b/R/dplyr.R
@@ -147,7 +147,7 @@ slice.zoomed_dm <- function(.data, ..., .keep_pk = NULL) {
  orig_pk <- dm_get_pk_impl(.data, orig_name_zoomed(.data))
  tracked_cols <- get_tracked_cols(.data)
  if (is_null(.keep_pk)) {
- if (has_length(orig_pk) && orig_pk %in% tracked_cols) {
+ if (has_length(orig_pk) && any(unlist(orig_pk) %in% tracked_cols)) {
  message(
  paste(
  "Keeping PK column, but `slice.zoomed_dm()` can potentially damage the uniqueness of PK columns (duplicated indices).",
@@ -566,3 +566,17 @@ safe_count <- function(x, ..., wt = NULL, sort = FALSE, name = NULL, .drop = gro
  }
  ungroup(out)
 }
+
+new_tracked_cols <- function(dm, selected) {
+ tracked_cols <- get_tracked_cols(dm)
+ old_tracked_names <- names(tracked_cols)
+ # the new tracked keys need to be the remaining original column names
+ # and their name needs to be the newest one (tidyselect-syntax)
+ # `intersect(selected, old_tracked_names)` is empty, return `NULL`
+
+ selected_match <- selected[selected %in% old_tracked_names]
+ set_names(
+ tracked_cols[selected_match],
+ names(selected_match)
+ )
+}
diff --git a/R/draw-dm.R b/R/draw-dm.R
@@ -100,14 +100,18 @@ dm_get_data_model <- function(x, column_types) {
  stringsAsFactors = FALSE
  )
 
- references_for_columns <- dm_get_data_model_fks(x)
+ references_for_columns <-
+ dm_get_all_fks_impl(x) %>%
+ transmute(table = child_table, column = format(child_fk_cols), ref = parent_table, ref_col = format(parent_pk_cols))
 
  references <-
  references_for_columns %>%
  mutate(ref_id = row_number(), ref_col_num = 1L)
 
  keys <-
- dm_get_data_model_pks(x) %>%
+ dm_get_all_pks_impl(x) %>%
+ mutate(column = format(pk_col)) %>%
+ select(-pk_col) %>%
  mutate(key = 1L)
 
  if (column_types) {
@@ -118,9 +122,9 @@ dm_get_data_model <- function(x, column_types) {
 
  columns <-
  types %>%
- left_join(keys, by = c("table", "column")) %>%
+ full_join(keys, by = c("table", "column")) %>%
+ full_join(references_for_columns, by = c("table", "column")) %>%
  mutate(key = coalesce(key, 0L)) %>%
- left_join(references_for_columns, by = c("table", "column")) %>%
  # for compatibility with print method from {datamodelr}
  as.data.frame()
 

diff --git a/R/examine-constraints.R b/R/examine-constraints.R
@@ -90,20 +90,23 @@ check_pk_constraints <- function(dm) {
  pks <- dm_get_all_pks_impl(dm)
  if (nrow(pks) == 0) {
  return(tibble(
- table = character(0),
- kind = character(0),
- column = character(0),
- ref_table = NA_character_,
- is_key = logical(0),
- problem = character(0)
+ table = character(),
+ kind = character(),
+ column = new_keys(),
+ ref_table = character(),
+ is_key = logical(),
+ problem = character()
  ))
  }
  table_names <- pull(pks, table)
 
  tbls <- map(set_names(table_names), ~ tbl(dm, .))
 
- tbl_is_pk <- map2_dfr(tbls, pks$pk_col, enum_pk_candidates_impl) %>%
- mutate(table = table_names) %>%
+ tbl_is_pk <-
+ tibble(table = table_names, tbls, column = pks$pk_col) %>%
+ mutate(candidate = map2(tbls, column, ~ enum_pk_candidates_impl(.x, list(.y)))) %>%
+ select(-column, -tbls) %>%
+ unnest(candidate) %>%
  rename(is_key = candidate, problem = why)
 
  tibble(
@@ -121,11 +124,11 @@ check_fk_constraints <- function(dm) {
  cts <- pull(fks, child_table) %>% map(tbl, src = dm)
  fks_tibble <- mutate(fks, t1 = cts, t2 = pts) %>%
  select(t1, t1_name = child_table, colname = child_fk_cols, t2, t2_name = parent_table, pk = pk_col)
- mutate(
- fks_tibble,
- problem = pmap_chr(fks_tibble, check_fk),
- is_key = if_else(problem == "", TRUE, FALSE),
- kind = "FK"
- ) %>%
+ fks_tibble %>%
+ mutate(
+  problem = pmap_chr(fks_tibble, check_fk),
+  is_key = if_else(problem == "", TRUE, FALSE),
+  kind = "FK"
+  ) %>%
  select(table = t1_name, kind, column = colname, ref_table = t2_name, is_key, problem)
 }
diff --git a/R/filter-helpers.R b/R/filter-helpers.R
@@ -34,8 +34,8 @@ get_by <- function(dm, lhs_name, rhs_name) {
  if (length(lhs_col) > 1 || length(rhs_col) > 1) abort_no_cycles(create_graph_from_dm(dm))
  # Construct a `by` argument of the form `c("lhs_col[1]" = "rhs_col[1]", ...)`
  # as required by `*_join()`
- by <- rhs_col
- names(by) <- lhs_col
+ by <- get_key_cols(rhs_col)
+ names(by) <- get_key_cols(lhs_col)
  by
 }