From d15c0c6126f77a0bab291702c3e222b4f00905ab Mon Sep 17 00:00:00 2001
From: Drew Banin
Date: Wed, 24 May 2017 20:53:04 -0400
Subject: [PATCH] First time flow (#439)

* clone sample repo, better output after dbt init
* add default target to initial profiles file
* change starter project to fishtown repo
* pep8
* add `dbt debug --config-dir` cmd
* pep8
* sample file tweaks
* add overwrite check + unit test
---
 dbt/clients/git.py              |  13 +++-
 dbt/clients/system.py           |  26 +++++++
 dbt/main.py                     |   7 ++
 dbt/task/debug.py               |  29 ++++++-
 dbt/task/init.py                | 133 +++++++++++++++++++++-----------
 sample.dbt_project.yml          |  21 ++++-
 sample.profiles.yml             |  62 +++++++++++----
 sample.schema.yml               |   3 +-
 test/unit/test_system_client.py |  47 +++++++++++
 9 files changed, 270 insertions(+), 71 deletions(-)
 create mode 100644 test/unit/test_system_client.py

diff --git a/dbt/clients/git.py b/dbt/clients/git.py
index 85c20be5fec..26831398918 100644
--- a/dbt/clients/git.py
+++ b/dbt/clients/git.py
@@ -18,8 +18,13 @@ def run_cmd(cwd, cmd):
     return proc.communicate()
 
 
-def clone(repo, cwd):
-    return run_cmd(cwd, ['git', 'clone', '--depth', '1', repo])
+def clone(repo, cwd, dirname=None):
+    clone_cmd = ['git', 'clone', '--depth', '1', repo]
+
+    if dirname is not None:
+        clone_cmd.append(dirname)
+
+    return run_cmd(cwd, clone_cmd)
 
 
 def checkout(cwd, branch=None):
@@ -39,3 +44,7 @@ def get_current_sha(cwd):
     out, err = run_cmd(cwd, ['git', 'rev-parse', 'HEAD'])
 
     return out.decode('utf-8')
+
+
+def remove_remote(cwd):
+    return run_cmd(cwd, ['git', 'remote', 'rm', 'origin'])
diff --git a/dbt/clients/system.py b/dbt/clients/system.py
index 6c7b71c83d4..533c902a416 100644
--- a/dbt/clients/system.py
+++ b/dbt/clients/system.py
@@ -2,6 +2,7 @@
 import fnmatch
 import os
 import os.path
+import sys
 
 
 def find_matching(root_path,
@@ -70,3 +71,28 @@ def make_directory(path):
             pass
         else:
             raise e
+
+
+def make_file(path, contents='', overwrite=False):
+    """
+    Make a file at `path` assuming that the directory it resides in already
+    exists. The file is saved with contents `contents`
+    """
+    if overwrite or not os.path.exists(path):
+        with open(path, 'w') as fh:
+            fh.write(contents)
+        return True
+
+    return False
+
+
+def open_dir_cmd():
+    # https://docs.python.org/2/library/sys.html#sys.platform
+    if sys.platform == 'win32':
+        return 'start'
+
+    elif sys.platform == 'darwin':
+        return 'open'
+
+    else:
+        return 'xdg-open'
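A minimal sketch of how the two new helpers behave, assuming dbt.clients.system is importable as patched above; the temporary path used here is illustrative only and is not something the patch itself creates:

    import dbt.clients.system as system

    # make_file() only writes when the file is absent, unless overwrite=True.
    assert system.make_file('/tmp/example_profiles.yml', contents='default:\n') is True
    assert system.make_file('/tmp/example_profiles.yml', contents='ignored') is False
    assert system.make_file('/tmp/example_profiles.yml', contents='default:\n', overwrite=True) is True

    # open_dir_cmd() picks the platform-appropriate "open a folder" command:
    # 'start' on Windows, 'open' on macOS, 'xdg-open' elsewhere.
    print(system.open_dir_cmd())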
diff --git a/dbt/main.py b/dbt/main.py
index cd43d892bb0..54f6dfdb16b 100644
--- a/dbt/main.py
+++ b/dbt/main.py
@@ -291,6 +291,13 @@ def parse_args(args):
     )
 
     sub = subs.add_parser('debug', parents=[base_subparser])
+    sub.add_argument(
+        '--config-dir',
+        action='store_true',
+        help="""
+        If specified, DBT will show path information for this project
+        """
+    )
     sub.set_defaults(cls=debug_task.DebugTask, which='debug')
 
     sub = subs.add_parser('deps', parents=[base_subparser])
diff --git a/dbt/task/debug.py b/dbt/task/debug.py
index 482bfdbbd3a..23ed736d0cf 100644
--- a/dbt/task/debug.py
+++ b/dbt/task/debug.py
@@ -1,5 +1,12 @@
 import pprint
+
 from dbt.logger import GLOBAL_LOGGER as logger
+import dbt.clients.system
+import dbt.project
+
+PROFILE_DIR_MESSAGE = """To view your profiles.yml file, run:
+
+{open_cmd} {profiles_dir}"""
 
 
 class DebugTask:
@@ -7,9 +14,25 @@ def __init__(self, args, project):
         self.args = args
         self.project = project
 
-    def run(self):
+    def path_info(self):
+        open_cmd = dbt.clients.system.open_dir_cmd()
+        profiles_dir = dbt.project.default_profiles_dir
+
+        message = PROFILE_DIR_MESSAGE.format(
+            open_cmd=open_cmd,
+            profiles_dir=profiles_dir
+        )
+
+        logger.info(message)
+
+    def diag(self):
         logger.info("args: {}".format(self.args))
         logger.info("project: ")
-
-        # TODO: switch this out for a log statement
         pprint.pprint(self.project)
+
+    def run(self):
+
+        if self.args.config_dir:
+            self.path_info()
+        else:
+            self.diag()
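Taken together with the argparse change in dbt/main.py, `dbt debug --config-dir` now routes to DebugTask.path_info() instead of the pretty-printed diagnostics. A rough illustration, assuming a hand-built argparse.Namespace stands in for the parsed CLI arguments:

    import argparse
    from dbt.task.debug import DebugTask

    # --config-dir set: logs PROFILE_DIR_MESSAGE with the platform's open
    # command and the default profiles directory; the project is not touched.
    DebugTask(argparse.Namespace(config_dir=True), project=None).run()

    # --config-dir omitted: falls back to diag(), which logs the args and
    # pretty-prints the parsed project, so a real project object is needed
    # (my_project below is a placeholder, not something this patch defines).
    # DebugTask(argparse.Namespace(config_dir=False), project=my_project).run()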
diff --git a/dbt/task/init.py b/dbt/task/init.py
index 9dea7a77551..2f2c5d00421 100644
--- a/dbt/task/init.py
+++ b/dbt/task/init.py
@@ -1,66 +1,111 @@
-import pprint
 import os
 
-SAMPLE_CONFIG = """
-name: 'package_name'
-version: '1.0'
-
-source-paths: ["models"]        # paths with source code to compile
-analysis-paths: ["analysis"]    # path with analysis files which are compiled, but
-                                # not run
-target-path: "target"           # path for compiled code
-clean-targets: ["target"]       # directories removed by the clean task
-test-paths: ["test"]            # where to store test results
-data-paths: ["data"]            # load CSVs from this directory with `dbt seed`
-
-# specify per-model configs
-#models:
-#  package_name:                # define configs for this package (called
-#                               # "package_name" above)
-#    pardot:                    # assuming pardot is listed in models/
-#      enabled: false           # disable all pardot models except where overriden
-#      pardot_emails:           # override the configs for the pardot_emails model
-#        enabled: true          # enable this specific model
-#        materialized: true     # create a table instead of a view
-
-# uncomment below and add real repositories to add dependencies to this project
-#repositories:
-#  - "git@github.com:[your-org]/[some-repo-1]"
-#  - "git@github.com:[your-org]/[some-repo-2]"
-"""
+import dbt.project
+import dbt.clients.git
+import dbt.clients.system
+
+from dbt.logger import GLOBAL_LOGGER as logger
+
+STARTER_REPO = 'https://github.com/fishtown-analytics/dbt-starter-project.git'
+DOCS_URL = 'https://dbt.readme.io/docs/configure-your-profile'
+SAMPLE_PROFILES_YML_FILE = 'https://github.com/fishtown-analytics/dbt/blob/master/sample.profiles.yml'  # noqa
+
+ON_COMPLETE_MESSAGE = """
+Your new dbt project "{project_name}" was created! If this is your first time
+using dbt, you'll need to set up your profiles.yml file -- this file will
+tell dbt how to connect to your database. You can find this file by running:
+
+  {open_cmd} {profiles_path}
 
-GIT_IGNORE = """
-target/
-dbt_modules/
+For more information on how to configure the profiles.yml file,
+please consult the dbt documentation here:
+
+  {docs_url}
+
+One more thing:
+
+Need help? Don't hesitate to reach out to us via GitHub issues or on Slack --
+There's a link to our Slack group in the GitHub Readme. Happy modeling!
 """
 
+STARTER_PROFILE = """
+# For more information on how to configure this file, please see:
+# {profiles_sample}
+
+default:
+  outputs:
+    dev:
+      type: redshift
+      threads: 1
+      host: 127.0.0.1
+      port: 5439
+      user: alice
+      pass: pa55word
+      dbname: warehouse
+      schema: dbt_alice
+    prod:
+      type: redshift
+      threads: 1
+      host: 127.0.0.1
+      port: 5439
+      user: alice
+      pass: pa55word
+      dbname: warehouse
+      schema: analytics
+  target: dev
+""".format(profiles_sample=SAMPLE_PROFILES_YML_FILE)
+
 
 class InitTask:
     def __init__(self, args, project=None):
         self.args = args
         self.project = project
 
-    def __write(self, path, filename, contents):
-        file_path = os.path.join(path, filename)
+    def clone_starter_repo(self, project_name):
+        dbt.clients.git.clone(STARTER_REPO, '.', project_name)
+        dbt.clients.git.remove_remote(project_name)
+
+    def create_profiles_dir(self, profiles_dir):
+        if not os.path.exists(profiles_dir):
+            dbt.clients.system.make_directory(profiles_dir)
+            return True
+        return False
+
+    def create_profiles_file(self, profiles_file):
+        if not os.path.exists(profiles_file):
+            dbt.clients.system.make_file(profiles_file, STARTER_PROFILE)
+            return True
+        return False
 
-        with open(file_path, 'w') as fh:
-            fh.write(contents)
+    def get_addendum(self, project_name, profiles_path):
+        open_cmd = dbt.clients.system.open_dir_cmd()
+
+        return ON_COMPLETE_MESSAGE.format(
+            open_cmd=open_cmd,
+            project_name=project_name,
+            profiles_path=profiles_path,
+            docs_url=DOCS_URL
+        )
 
     def run(self):
         project_dir = self.args.project_name
 
+        profiles_dir = dbt.project.default_profiles_dir
+        profiles_file = os.path.join(profiles_dir, 'profiles.yml')
+
+        self.create_profiles_dir(profiles_dir)
+        self.create_profiles_file(profiles_file)
+
+        msg = "Creating dbt configuration folder at {}"
+        logger.info(msg.format(profiles_dir))
+
         if os.path.exists(project_dir):
             raise RuntimeError("directory {} already exists!".format(
                 project_dir
             ))
 
-        os.mkdir(project_dir)
-
-        project_dir = self.args.project_name
-        self.__write(project_dir, 'dbt_project.yml', SAMPLE_CONFIG)
-        self.__write(project_dir, '.gitignore', GIT_IGNORE)
+        self.clone_starter_repo(project_dir)
 
-        dirs = ['models', 'analysis', 'tests', 'data']
-        for dir_name in dirs:
-            dir_path = os.path.join(project_dir, dir_name)
-            os.mkdir(dir_path)
+        addendum = self.get_addendum(project_dir, profiles_dir)
+        logger.info(addendum)
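For reference, the end-to-end behavior of the reworked InitTask is roughly the following (a sketch, not the literal implementation; `my_project` is a placeholder project name, as if the user had run `dbt init my_project`):

    import argparse
    from dbt.task.init import InitTask

    # Seeds the default profiles directory (dbt.project.default_profiles_dir)
    # with STARTER_PROFILE when profiles.yml is missing, refuses to overwrite
    # an existing ./my_project directory, clones the starter repo into
    # ./my_project, strips its 'origin' remote, and logs ON_COMPLETE_MESSAGE.
    InitTask(argparse.Namespace(project_name='my_project')).run()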
diff --git a/sample.dbt_project.yml b/sample.dbt_project.yml
index 35fa7bda8a4..726e3db22b8 100644
--- a/sample.dbt_project.yml
+++ b/sample.dbt_project.yml
@@ -16,7 +16,7 @@ name: 'your_package_name'
 
 # version: Required. This indicates the current version of your package and
 # should conform to semantic versioning. The field is currently unused
-version: '0.0.1'
+version: '0.1.0'
 
 
 
@@ -40,7 +40,7 @@ target-path: "target"
 
 # test-paths: Optional. Specify which path(s) dbt should look in to find data
 # test definitions.
-test-paths: ["test"]
+test-paths: ["tests"]
 
 # data-paths: Optional. Specify which path(s) dbt should look in to find CSV
 # files. Running `dbt seed` will load these CSVs as tables in your warehouse
@@ -191,5 +191,18 @@ repositories:
 
 # Archival
 #
-# Data archival is a powerful feature intended for advanced dbt users. For more
-# information, consult: https://dbt.readme.io/reference#archives
+# dbt's archival process records snapshots of specified tables so that
+# you can analyze how these tables change over time. In the example below,
+# the public.users table is configured for archival. When the `updated_at`
+# value changes for a given user record (identified by the `id` field), dbt
+# will record a new record in the `users_archived` table which reflects the
+# changed state of that row. For more information on this command, consult
+# the dbt documentation: https://dbt.readme.io/reference#archive
+archive:
+  - source_schema: public
+    target_schema: public
+    tables:
+      - source_table: users
+        target_table: users_archived
+        updated_at: updated_at
+        unique_key: id
diff --git a/sample.profiles.yml b/sample.profiles.yml
index 7d539bb6a53..d461ac35936 100644
--- a/sample.profiles.yml
+++ b/sample.profiles.yml
@@ -7,22 +7,50 @@
 # can be configured when dbt is invoked with the --profiles-dir option:
 #
 # $ dbt run --profiles-dir /opt/dbt/
-#
-# Profiles should adhere to the structure defined below:
+# Top-level configs that apply to all profiles are set here
+config:
+    send_anonymous_usage_stats: True
+    use_colors: True
+
+# Profile configurations should adhere to the structure defined below:
+#
+# Postgres / Redshift
+# -------------------
+#
+# [profile-name]:
+#   outputs:
+#     [target-name]:
+#       type: {redshift, postgres}
+#       threads: [1 - 8]
+#       host: [host ip or fully qualified domain name]
+#       port: [port]
+#       user: [user]
+#       pass: [password]
+#       dbname: [dbname]
+#       schema: [schema name]
+#   target: [target-name]
+#
+#
+# Snowflake
+# -------------------
+#
+# [profile-name]:
+#   outputs:
+#     [target-name]:
+#       type: snowflake
+#       threads: [1 - 8]
+#       account: [url prefix for your snowflake connection]
+#
+#       user: [user]
+#       password: [password]
+#       role: [optional, the snowflake role you want to use]
+#
+#       database: [db name]
+#       warehouse: [warehouse]
+#       schema: [schema name]
+#   target: [target-name]
 #
 #
-# [profile-name]:
-#   outputs:
-#     [target-name]:
-#       type: {redshift, postgres}
-#       threads: [1 - 8]
-#       host: [host ip or fully qualified domain name]
-#       port: [port]
-#       user: [user]
-#       pass: [password]
-#       dbname: [dbname]
-#       schema: [schema name]
-#   target: [target-name]
 #
 # Commonly, it's helpful to define multiple targets for a profile. For example,
 # these targets might be `dev` and `prod`. Whereas the `dev` target points to
@@ -30,13 +58,13 @@
 # prod schema (eg. analytics). Analytical/BI tools should point to the
 # prod schema so that local development does not interfere with analysis.
 #
-# In practice, a profile might look like this:
+# The following are some examples of well-formatted profile configurations:
 
 evil-corp:
   outputs:
     dev:                      # specify the dev connection
       type: redshift
-      threads: 8
+      threads: 1
       host: 12.34.56.78
       port: 5439
       user: elliot
@@ -71,7 +99,7 @@ mr-robot:
   outputs:
     dev:                      # specify the dev connection
       type: postgres
-      threads: 6
+      threads: 2
       host: 87.65.43.21
       port: 5439
       user: mr_robot
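Since profiles.yml is plain YAML, a quick way to sanity-check a hand-edited profile before invoking dbt is simply to parse it. A small sketch, assuming PyYAML is available and the file is read from the current directory:

    import yaml

    with open('sample.profiles.yml') as fh:
        profiles = yaml.safe_load(fh)

    # List each configured profile, its default target, and its output names,
    # skipping the top-level `config` block.
    for name, profile in profiles.items():
        if name == 'config':
            continue
        print(name, profile.get('target'), list(profile.get('outputs', {})))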
diff --git a/sample.schema.yml b/sample.schema.yml
index 156114ea7bd..217c25313f8 100644
--- a/sample.schema.yml
+++ b/sample.schema.yml
@@ -54,8 +54,9 @@ people:
       - id
 
     # count number of rows where `people.account_id` points to a nonexistent `accounts.id` (should be 0)
+    # Note: the 'accounts' model needs to be `ref`d here
     relationships:
-      - {from: account_id, to: accounts, field: id}
+      - {from: account_id, to: ref('accounts'), field: id}
 
 accounts:
   constraints:
diff --git a/test/unit/test_system_client.py b/test/unit/test_system_client.py
new file mode 100644
index 00000000000..2e87e7f5b50
--- /dev/null
+++ b/test/unit/test_system_client.py
@@ -0,0 +1,47 @@
+import os
+import unittest
+
+import dbt.clients.system
+
+if os.name == 'nt':
+    TMPDIR = 'c:/Windows/TEMP'
+else:
+    TMPDIR = '/tmp'
+
+profiles_path = '{}/profiles.yml'.format(TMPDIR)
+
+class SystemClient(unittest.TestCase):
+
+    def set_up_profile(self):
+        with open(profiles_path, 'w') as f:
+            f.write('ORIGINAL_TEXT')
+
+    def get_profile_text(self):
+        with open(profiles_path, 'r') as f:
+            return f.read()
+
+    def tearDown(self):
+        try:
+            os.remove(profiles_path)
+        except:
+            pass
+
+    def test__make_file_when_exists(self):
+        self.set_up_profile()
+        written = dbt.clients.system.make_file(profiles_path, contents='NEW_TEXT')
+
+        self.assertFalse(written)
+        self.assertEqual(self.get_profile_text(), 'ORIGINAL_TEXT')
+
+    def test__make_file_when_not_exists(self):
+        written = dbt.clients.system.make_file(profiles_path, contents='NEW_TEXT')
+
+        self.assertTrue(written)
+        self.assertEqual(self.get_profile_text(), 'NEW_TEXT')
+
+    def test__make_file_with_overwrite(self):
+        self.set_up_profile()
+        written = dbt.clients.system.make_file(profiles_path, contents='NEW_TEXT', overwrite=True)
+
+        self.assertTrue(written)
+        self.assertEqual(self.get_profile_text(), 'NEW_TEXT')
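To exercise the new unit tests locally without relying on the repository's exact tox or CI configuration, plain unittest discovery should be enough. A sketch, assuming it is run from the repository root and that test/unit is importable:

    import unittest

    # Discover and run test/unit/test_system_client.py; roughly equivalent to
    # `python -m unittest discover -s test/unit -p test_system_client.py`.
    suite = unittest.defaultTestLoader.discover('test/unit', pattern='test_system_client.py')
    unittest.TextTestRunner(verbosity=2).run(suite)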