-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
New command: dbt clone
#7258
New command: dbt clone
#7258
Changes from 4 commits
ce1dc81
7e94302
df4ff4c
2543d5d
5f08d70
a08eea3
0b25b9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
kind: Features | ||
body: 'New command: ''dbt clone''' | ||
time: 2023-04-01T19:36:14.622217+02:00 | ||
custom: | ||
Author: jtcohen6 | ||
Issue: "7256" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1362,6 +1362,20 @@ def this(self) -> Optional[RelationProxy]: | |
return None | ||
return self.db_wrapper.Relation.create_from(self.config, self.model) | ||
|
||
@contextproperty | ||
def state_relation(self) -> Optional[RelationProxy]: | ||
""" | ||
For commands which add information about this node's corresponding | ||
production version (via a --state artifact), access the Relation | ||
object for that stateful other | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. other feels vague (well, really, just Latinate/French in phrasing :) ). "source node", etc.? |
||
""" | ||
if getattr(self.model, "state_relation", None): | ||
return self.db_wrapper.Relation.create_from_node( | ||
self.config, self.model.state_relation # type: ignore | ||
) | ||
else: | ||
return None | ||
|
||
|
||
# This is called by '_context_for', used in 'render_with_context' | ||
def generate_parser_model_context( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,7 @@ | |
GraphMemberNode, | ||
ResultNode, | ||
BaseNode, | ||
RelationalNode, | ||
) | ||
from dbt.contracts.graph.unparsed import SourcePatch | ||
from dbt.contracts.files import SourceFile, SchemaSourceFile, FileHash, AnySourceFile | ||
|
@@ -1032,6 +1033,30 @@ def merge_from_artifact( | |
sample = list(islice(merged, 5)) | ||
fire_event(MergedFromState(num_merged=len(merged), sample=sample)) | ||
|
||
# Called by CloneTask.defer_to_manifest | ||
def add_from_artifact( | ||
self, | ||
other: "WritableManifest", | ||
) -> None: | ||
"""Update this manifest by *adding* information about each node's location | ||
in the other manifest. | ||
|
||
Only non-ephemeral refable nodes are examined. | ||
""" | ||
refables = set(NodeType.refable()) | ||
for unique_id, node in other.nodes.items(): | ||
current = self.nodes.get(unique_id) | ||
if current and (node.resource_type in refables and not node.is_ephemeral): | ||
other_node = other.nodes[unique_id] | ||
state_relation = RelationalNode( | ||
other_node.database, other_node.schema, other_node.alias | ||
) | ||
self.nodes[unique_id] = current.replace(state_relation=state_relation) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're storing information about each node's production-state counterpart, right on the node entry in the manifest. I'm open to discussing whether this is the right approach. It feels better than passing the entire other manifest around into other methods/runners. |
||
|
||
# Rebuild the flat_graph, which powers the 'graph' context variable, | ||
# now that we've deferred some nodes | ||
self.build_flat_graph() | ||
|
||
# Methods that were formerly in ParseResult | ||
|
||
def add_macro(self, source_file: SourceFile, macro: Macro): | ||
|
@@ -1232,6 +1257,8 @@ def __post_serialize__(self, dct): | |
for unique_id, node in dct["nodes"].items(): | ||
if "config_call_dict" in node: | ||
del node["config_call_dict"] | ||
if "state_relation" in node: | ||
del node["state_relation"] | ||
Comment on lines
+1344
to
+1345
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't feel like |
||
return dct | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -229,16 +229,20 @@ def add_node(self, value: str): | |
|
||
|
||
@dataclass | ||
class ParsedNodeMandatory(GraphNode, HasRelationMetadata, Replaceable): | ||
class RelationalNode(HasRelationMetadata): | ||
alias: str | ||
checksum: FileHash | ||
config: NodeConfig = field(default_factory=NodeConfig) | ||
|
||
@property | ||
def identifier(self): | ||
return self.alias | ||
|
||
|
||
@dataclass | ||
class ParsedNodeMandatory(GraphNode, RelationalNode, Replaceable): | ||
checksum: FileHash | ||
config: NodeConfig = field(default_factory=NodeConfig) | ||
|
||
|
||
# This needs to be in all ManifestNodes and also in SourceDefinition, | ||
# because of "source freshness" | ||
@dataclass | ||
|
@@ -310,7 +314,7 @@ def __post_serialize__(self, dct): | |
@classmethod | ||
def _deserialize(cls, dct: Dict[str, int]): | ||
# The serialized ParsedNodes do not differ from each other | ||
# in fields that would allow 'from_dict' to distinguis | ||
# in fields that would allow 'from_dict' to distinguish | ||
# between them. | ||
resource_type = dct["resource_type"] | ||
if resource_type == "model": | ||
|
@@ -567,6 +571,7 @@ class HookNode(CompiledNode): | |
class ModelNode(CompiledNode): | ||
resource_type: NodeType = field(metadata={"restrict": [NodeType.Model]}) | ||
access: AccessType = AccessType.Protected | ||
state_relation: Optional[RelationalNode] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the approach of storing stateful information on each node about its prod counterpart, we do need a place on the node object to do that. For now, I'm adding a new attribute (default |
||
|
||
|
||
# TODO: rm? | ||
|
@@ -593,6 +598,7 @@ class SeedNode(ParsedNode): # No SQLDefaults! | |
# and we need the root_path to load the seed later | ||
root_path: Optional[str] = None | ||
depends_on: MacroDependsOn = field(default_factory=MacroDependsOn) | ||
state_relation: Optional[RelationalNode] = None | ||
|
||
def same_seeds(self, other: "SeedNode") -> bool: | ||
# for seeds, we check the hashes. If the hashes are different types, | ||
|
@@ -787,6 +793,7 @@ class IntermediateSnapshotNode(CompiledNode): | |
class SnapshotNode(CompiledNode): | ||
resource_type: NodeType = field(metadata={"restrict": [NodeType.Snapshot]}) | ||
config: SnapshotConfig | ||
state_relation: Optional[RelationalNode] = None | ||
|
||
|
||
# ==================================== | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
{% macro can_clone_tables() %} | ||
{{ return(adapter.dispatch('can_clone_tables', 'dbt')()) }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro default__can_clone_tables() %} | ||
{{ return(False) }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro snowflake__can_clone_tables() %} | ||
{{ return(True) }} | ||
{% endmacro %} | ||
Comment on lines
+1
to
+13
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This kind of True/False conditional behavior might be better as an adapter property/method (Python), since it's really a property of the adapter / data platform, rather than something a specific user wants to reimplement. Comparable to the "boolean macros" we defined for logic around There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to state the obvious: any There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I concur with both comments |
||
|
||
|
||
{% macro get_pointer_sql(to_relation) %} | ||
{{ return(adapter.dispatch('get_pointer_sql', 'dbt')(to_relation)) }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro default__get_pointer_sql(to_relation) %} | ||
{% set pointer_sql %} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel uncomfortable with this use of pointer. I'm not exactly sure why were drawing that comparison, when perhaps we should use something closer to reference or allude to the fact that there's a shadow copy. In my mind pointer is just a very specific thing and somewhat anachronistic in a SQL context. |
||
select * from {{ to_relation }} | ||
{% endset %} | ||
{{ return(pointer_sql) }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro get_clone_table_sql(this_relation, state_relation) %} | ||
{{ return(adapter.dispatch('get_clone_table_sql', 'dbt')(this_relation, state_relation)) }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro default__get_clone_table_sql(this_relation, state_relation) %} | ||
create or replace table {{ this_relation }} clone {{ state_relation }} | ||
{% endmacro %} | ||
|
||
|
||
{% macro snowflake__get_clone_table_sql(this_relation, state_relation) %} | ||
create or replace | ||
{{ "transient" if config.get("transient", true) }} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Snowflake requires that table clones match the transience/permanence of the table they're cloning. We determine if the table is a table based on the cache result from the other/prod schema, but here we're just using the current (dev) configuration for |
||
table {{ this_relation }} | ||
clone {{ state_relation }} | ||
{{ "copy grants" if config.get("copy_grants", false) }} | ||
{% endmacro %} | ||
|
||
|
||
{%- materialization clone, default -%} | ||
|
||
{%- set relations = {'relations': []} -%} | ||
|
||
{%- set existing_relation = load_cached_relation(this) -%} | ||
{%- set other_existing_relation = load_cached_relation(state_relation) -%} | ||
|
||
{%- if existing_relation and not flags.FULL_REFRESH -%} | ||
-- noop! | ||
{{ return(relations) }} | ||
{%- endif -%} | ||
|
||
-- If this is a database that can do zero-copy cloning of tables, and the other relation is a table, then this will be a table | ||
-- Otherwise, this will be a view | ||
Comment on lines
+68
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
{% set can_clone_tables = can_clone_tables() %} | ||
|
||
{%- if other_existing_relation and other_existing_relation.type == 'table' and can_clone_tables -%} | ||
|
||
{%- set target_relation = this.incorporate(type='table') -%} | ||
{% if existing_relation is not none and not existing_relation.is_table %} | ||
{{ log("Dropping relation " ~ existing_relation ~ " because it is of type " ~ existing_relation.type) }} | ||
{{ drop_relation_if_exists(existing_relation) }} | ||
{% endif %} | ||
|
||
-- as a general rule, data platforms that can clone tables can also do atomic 'create or replace' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is me being a little bit lazy, but it does hold true for Snowflake/BigQuery/Databricks. My goal is for all of our data platforms to be able to use this materialization "as is," and only reimplement a targeted set of macros where the behavior genuinely differs. |
||
{% call statement('main') %} | ||
{{ get_clone_table_sql(target_relation, state_relation) }} | ||
{% endcall %} | ||
|
||
{% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} | ||
{% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} | ||
{% do persist_docs(target_relation, model) %} | ||
Comment on lines
+86
to
+88
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm thinking we should still apply grants & table/column-level comments. I have a suspicion that whether these things are copied over, during cloning, varies by data platform; I should really look to confirm/reject that suspicion. It's also possible that the user has defined conditional logic for these that differs between dev & prod, especially |
||
|
||
{{ return({'relations': [target_relation]}) }} | ||
|
||
{%- else -%} | ||
|
||
{%- set target_relation = this.incorporate(type='view') -%} | ||
|
||
-- TODO: this should probably be illegal | ||
-- I'm just doing it out of convenience to reuse the 'view' materialization logic | ||
{%- do context.update({ | ||
'sql': get_pointer_sql(state_relation), | ||
'compiled_code': get_pointer_sql(state_relation) | ||
}) -%} | ||
|
||
-- reuse the view materialization | ||
-- TODO: support actual dispatch for materialization macros | ||
{% set search_name = "materialization_view_" ~ adapter.type() %} | ||
{% if not search_name in context %} | ||
{% set search_name = "materialization_view_default" %} | ||
{% endif %} | ||
{% set materialization_macro = context[search_name] %} | ||
{% set relations = materialization_macro() %} | ||
Comment on lines
+96
to
+110
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is some of the most inspired (read: horrifying) code I've written in years. Big idea:
Specific things to call out:
|
||
{{ return(relations) }} | ||
|
||
{%- endif -%} | ||
|
||
{%- endmaterialization -%} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Open to naming suggestions! This will only be available in the context for the
clone
command currentlyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is "relation" being used as a byword for table/view. Relation convene, so many things, and I feel like we want an additional term or a better term that can be more specific about what you're trying to achieve. Perhaps "stateful_db_relation". I really don't know the subtleties here though, so you might have picked the best one.