From f4b93c8a5a5efc30421849eb065514bf9b9bffe1 Mon Sep 17 00:00:00 2001 From: Jacob Beck Date: Wed, 17 Jun 2020 12:03:01 -0600 Subject: [PATCH 1/4] handle long paths in windows, add a test --- CHANGELOG.md | 1 + core/dbt/clients/system.py | 47 +++++++++++++- .../test_docs_generate.py | 64 +++++++++++++++++++ test/integration/base.py | 5 +- 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6e7ddcaec9..ba8aeff50b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ - `dbt deps` now respects the `--project-dir` flag, so using `dbt deps --project-dir=/some/path` and then `dbt run --project-dir=/some/path` will properly find dependencies ([#2519](https://github.com/fishtown-analytics/dbt/issues/2519), [#2534](https://github.com/fishtown-analytics/dbt/pull/2534)) - `packages.yml` revision/version fields can be float-like again (`revision: '1.0'` is valid). ([#2518](https://github.com/fishtown-analytics/dbt/issues/2518), [#2535](https://github.com/fishtown-analytics/dbt/pull/2535)) - Parallel RPC requests no longer step on each others' arguments ([[#2484](https://github.com/fishtown-analytics/dbt/issues/2484), [#2554](https://github.com/fishtown-analytics/dbt/pull/2554)]) +- On windows (depending upon OS support), dbt no longer fails with errors when writing artifacts ([#2558](https://github.com/fishtown-analytics/dbt/issues/2558), [#2566](https://github.com/fishtown-analytics/dbt/pull/2566)) ## dbt 0.17.0 (June 08, 2020) diff --git a/core/dbt/clients/system.py b/core/dbt/clients/system.py index b167d192e4e..eebe6fe8945 100644 --- a/core/dbt/clients/system.py +++ b/core/dbt/clients/system.py @@ -66,6 +66,7 @@ def find_matching( def load_file_contents(path: str, strip: bool = True) -> str: + path = convert_path(path) with open(path, 'rb') as handle: to_return = handle.read().decode('utf-8') @@ -81,6 +82,7 @@ def make_directory(path: str) -> None: exist. This function handles the case where two threads try to create a directory at once. """ + path = convert_path(path) if not os.path.exists(path): # concurrent writes that try to create the same dir can fail try: @@ -99,6 +101,7 @@ def make_file(path: str, contents: str = '', overwrite: bool = False) -> bool: exists. The file is saved with contents `contents` """ if overwrite or not os.path.exists(path): + path = convert_path(path) with open(path, 'w') as fh: fh.write(contents) return True @@ -121,6 +124,7 @@ def supports_symlinks() -> bool: def write_file(path: str, contents: str = '') -> bool: + path = convert_path(path) make_directory(os.path.dirname(path)) with open(path, 'w', encoding='utf-8') as f: f.write(str(contents)) @@ -163,7 +167,7 @@ def rmdir(path: str) -> None: different permissions on Windows. Otherwise, removing directories (eg. cloned via git) can cause rmtree to throw a PermissionError exception """ - logger.debug("DEBUG** Window rmdir sys.platform: {}".format(sys.platform)) + path = convert_path(path) if sys.platform == 'win32': onerror = _windows_rmdir_readonly else: @@ -172,15 +176,49 @@ def rmdir(path: str) -> None: shutil.rmtree(path, onerror=onerror) +def convert_path(path: str) -> str: + """Convert a path that dbt has, which might be >260 characters long, to one + that will be writable/readable on Windows. + + On other platforms, this is a no-op. + """ + if sys.platform != 'win32': + return path + prefix = '\\\\?\\' + # Nothing to do + if path.startswith(prefix): + return path + path = os.path.normpath(path) + if path.startswith('\\'): + # if a path starts with '\\', splitdrive() on it will return '' for the + # drive, but the prefix requires a drive letter. So let's add the drive + # letter back in. + curdrive = os.path.splitdrive(os.getcwd())[0] + path = curdrive + path + + # now our path is either an absolute UNC path or relative to the current + # directory. If it's relative, we need to make it absolute or the prefix + # won't work. + if not os.path.splitdrive(path)[0]: + path = os.path.join(os.getcwd(), path) + + if not path.startswith(prefix): + path = prefix + path + return path + + def remove_file(path: str) -> None: + path = convert_path(path) os.remove(path) def path_exists(path: str) -> bool: + path = convert_path(path) return os.path.lexists(path) def path_is_symlink(path: str) -> bool: + path = convert_path(path) return os.path.islink(path) @@ -326,6 +364,7 @@ def run_cmd( def download(url: str, path: str, timeout: Union[float, tuple] = None) -> None: + path = convert_path(path) connection_timeout = timeout or float(os.getenv('DBT_HTTP_TIMEOUT', 10)) response = requests.get(url, timeout=connection_timeout) with open(path, 'wb') as handle: @@ -334,6 +373,8 @@ def download(url: str, path: str, timeout: Union[float, tuple] = None) -> None: def rename(from_path: str, to_path: str, force: bool = False) -> None: + from_path = convert_path(from_path) + to_path = convert_path(to_path) is_symlink = path_is_symlink(to_path) if os.path.exists(to_path) and force: @@ -348,6 +389,7 @@ def rename(from_path: str, to_path: str, force: bool = False) -> None: def untar_package( tar_path: str, dest_dir: str, rename_to: Optional[str] = None ) -> None: + tar_path = convert_path(tar_path) tar_dir_name = None with tarfile.open(tar_path, 'r') as tarball: tarball.extractall(dest_dir) @@ -384,6 +426,8 @@ def move(src, dst): This is almost identical to the real shutil.move, except it uses our rmtree and skips handling non-windows OSes since the existing one works ok there. """ + src = convert_path(src) + dst = convert_path(dst) if os.name != 'nt': return shutil.move(src, dst) @@ -418,4 +462,5 @@ def rmtree(path): """Recursively remove path. On permissions errors on windows, try to remove the read-only flag and try again. """ + path = convert_path(path) return shutil.rmtree(path, onerror=chmod_and_retry) diff --git a/test/integration/029_docs_generate_tests/test_docs_generate.py b/test/integration/029_docs_generate_tests/test_docs_generate.py index 213ace98b67..de0ba561eaa 100644 --- a/test/integration/029_docs_generate_tests/test_docs_generate.py +++ b/test/integration/029_docs_generate_tests/test_docs_generate.py @@ -2,15 +2,19 @@ import json import os import random +import shutil +import tempfile import time from datetime import datetime from unittest.mock import ANY, patch +from pytest import mark from test.integration.base import DBTIntegrationTest, use_profile, AnyFloat, \ AnyString, AnyStringWith, normalize, Normalized from dbt.exceptions import CompilationException + def _read_file(path): with open(path, 'r') as fp: return fp.read().replace('\r', '').replace('\\r', '') @@ -3180,3 +3184,63 @@ def test_postgres_override_used(self): self.run_dbt(['docs', 'generate']) self.assertIn('rejected: no catalogs for you', str(exc.exception)) + + +@mark.skipif(os.name != 'nt', reason='This is only relevant on windows') +class TestDocsGenerateLongWindowsPaths(DBTIntegrationTest): + def _generate_test_root_dir(self): + assert os.name == 'nt' + magic_prefix = '\\\\?\\' + + # tempfile.mkdtemp doesn't use `\\?\` by default so we have to + # get a tiny bit creative. + temp_dir = tempfile.gettempdir() + if not temp_dir.startswith(magic_prefix): + temp_dir = magic_prefix + temp_dir + outer = tempfile.mkdtemp(prefix='dbt-int-test-', dir=temp_dir) + # then inside _that_ directory make a new one that gets us to just + # barely 260 total. I picked 250 to account for the '\' and anything + # else. The key is that len(inner) + len('target\\compiled\\...') will + # be >260 chars + new_length = 250 - len(outer) + inner = os.path.join(outer, 'a'*new_length) + os.mkdir(inner) + return normalize(inner) + + def _symlink_test_folders(self): + # dbt's normal symlink behavior breaks this test, so special-case it + for entry in os.listdir(self.test_original_source_path): + src = os.path.join(self.test_original_source_path, entry) + tst = os.path.join(self.test_root_dir, entry) + if entry == 'trivial_models': + shutil.copytree(src, tst) + elif entry == 'local_dependency': + continue + elif os.path.isdir(entry) or entry.endswith('.sql'): + os.symlink(src, tst) + + @property + def schema(self): + return 'docs_generate_029' + + @staticmethod + def dir(path): + return normalize(path) + + @property + def models(self): + return self.dir("trivial_models") + + def run_and_generate(self): + self.assertEqual(len(self.run_dbt(['run'])), 1) + os.remove(normalize('target/manifest.json')) + os.remove(normalize('target/run_results.json')) + self.run_dbt(['docs', 'generate']) + + @use_profile('postgres') + def test_postgres_long_paths(self): + self.run_and_generate() + # this doesn't use abspath, so all should be well here + manifest = _read_json('./target/manifest.json') + self.assertIn('nodes', manifest) + assert os.path.exists('./target/run/test/trivial_models/model.sql') diff --git a/test/integration/base.py b/test/integration/base.py index 7122d1a6c86..0ca24bf8b94 100644 --- a/test/integration/base.py +++ b/test/integration/base.py @@ -348,6 +348,9 @@ def test_root_realpath(self): else: return self.test_root_dir + def _generate_test_root_dir(self): + return normalize(tempfile.mkdtemp(prefix='dbt-int-test-')) + def setUp(self): self.dbt_core_install_root = os.path.dirname(dbt.__file__) log_manager.reset_handlers() @@ -357,7 +360,7 @@ def setUp(self): self._logs_dir = os.path.join(self.initial_dir, 'logs', self.prefix) _really_makedirs(self._logs_dir) self.test_original_source_path = _pytest_get_test_root() - self.test_root_dir = normalize(tempfile.mkdtemp(prefix='dbt-int-test-')) + self.test_root_dir = self._generate_test_root_dir() os.chdir(self.test_root_dir) try: From ff272c797a5314116caedc30ad97cb58069c0940 Mon Sep 17 00:00:00 2001 From: Jacob Beck Date: Thu, 18 Jun 2020 15:17:27 -0600 Subject: [PATCH 2/4] make sure dbt clean still works... --- test/integration/029_docs_generate_tests/test_docs_generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/integration/029_docs_generate_tests/test_docs_generate.py b/test/integration/029_docs_generate_tests/test_docs_generate.py index de0ba561eaa..18066105620 100644 --- a/test/integration/029_docs_generate_tests/test_docs_generate.py +++ b/test/integration/029_docs_generate_tests/test_docs_generate.py @@ -3244,3 +3244,5 @@ def test_postgres_long_paths(self): manifest = _read_json('./target/manifest.json') self.assertIn('nodes', manifest) assert os.path.exists('./target/run/test/trivial_models/model.sql') + self.run_dbt(['clean']) + assert not os.path.exists('./target/run') From 8e63da2f04e59cc8203099fd11a4cd62141de5b0 Mon Sep 17 00:00:00 2001 From: Jacob Beck Date: Thu, 18 Jun 2020 17:11:15 -0600 Subject: [PATCH 3/4] Be pickier about when to mangle paths, handle the case where we still hit path length limits --- core/dbt/clients/system.py | 95 +++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/core/dbt/clients/system.py b/core/dbt/clients/system.py index eebe6fe8945..2193e6e7bc9 100644 --- a/core/dbt/clients/system.py +++ b/core/dbt/clients/system.py @@ -19,6 +19,12 @@ from dbt.logger import GLOBAL_LOGGER as logger +if sys.platform == 'win32': + from ctypes import WinDLL, c_bool +else: + WinDLL = None + c_bool = None + def find_matching( root_path: str, @@ -125,10 +131,24 @@ def supports_symlinks() -> bool: def write_file(path: str, contents: str = '') -> bool: path = convert_path(path) - make_directory(os.path.dirname(path)) - with open(path, 'w', encoding='utf-8') as f: - f.write(str(contents)) - + try: + make_directory(os.path.dirname(path)) + with open(path, 'w', encoding='utf-8') as f: + f.write(str(contents)) + except FileNotFoundError as exc: + if ( + os.name == 'nt' and + getattr(exc, 'winerror', 0) == 3 and + len(path) >= 260 + ): + # all our hard work and the path was still too long. Log and + # continue. + logger.debug( + f'Could not write to path {path}: Path was too long ' + f'({len(path)} characters)' + ) + else: + raise return True @@ -176,32 +196,73 @@ def rmdir(path: str) -> None: shutil.rmtree(path, onerror=onerror) +def _win_prepare_path(path: str) -> str: + """Given a windows path, prepare it for use by making sure it is absolute + and normalized. + """ + path = os.path.normpath(path) + + # if a path starts with '\', splitdrive() on it will return '' for the + # drive, but the prefix requires a drive letter. So let's add the drive + # letter back in. + # Unless it starts with '\\'. In that case, the path is a UNC mount point + # and splitdrive will be fine. + if not path.startswith('\\\\') and path.startswith('\\'): + curdrive = os.path.splitdrive(os.getcwd())[0] + path = curdrive + path + + # now our path is either an absolute UNC path or relative to the current + # directory. If it's relative, we need to make it absolute or the prefix + # won't work. `ntpath.abspath` allegedly doesn't always play nice with long + # paths, so do this instead. + if not os.path.splitdrive(path)[0]: + path = os.path.join(os.getcwd(), path) + + return path + + +def _supports_long_paths() -> bool: + if sys.platform != 'win32': + return True + # Eryk Sun says to use `WinDLL('ntdll')` instead of `windll.ntdll` because + # of pointer caching in a comment here: + # https://stackoverflow.com/a/35097999/11262881 + # I don't know exaclty what he means, but I am inclined to believe him as + # he's pretty active on Python windows bugs! + try: + dll = WinDLL('ntdll') + except OSError: # I don't think this happens? you need ntdll to run python + return False + # not all windows versions have it at all + if not hasattr(dll, 'RtlAreLongPathsEnabled'): + return False + # tell windows we want to get back a single unsigned byte (a bool). + dll.RtlAreLongPathsEnabled.restype = c_bool + return dll.RtlAreLongPathsEnabled() + + def convert_path(path: str) -> str: """Convert a path that dbt has, which might be >260 characters long, to one that will be writable/readable on Windows. On other platforms, this is a no-op. """ - if sys.platform != 'win32': + # some parts of python seem to append '\*.*' to strings, better safe than + # sorry. + if len(path) < 250: + return path + if _supports_long_paths(): return path + prefix = '\\\\?\\' # Nothing to do if path.startswith(prefix): return path - path = os.path.normpath(path) - if path.startswith('\\'): - # if a path starts with '\\', splitdrive() on it will return '' for the - # drive, but the prefix requires a drive letter. So let's add the drive - # letter back in. - curdrive = os.path.splitdrive(os.getcwd())[0] - path = curdrive + path - # now our path is either an absolute UNC path or relative to the current - # directory. If it's relative, we need to make it absolute or the prefix - # won't work. - if not os.path.splitdrive(path)[0]: - path = os.path.join(os.getcwd(), path) + path = _win_prepare_path(path) + # add the prefix. The check is just in case os.getcwd() does something + # unexpected - I believe this if-state should always be True though! if not path.startswith(prefix): path = prefix + path return path From db50880399b743792e3a60a2085243b55de68372 Mon Sep 17 00:00:00 2001 From: Jacob Beck Date: Fri, 19 Jun 2020 07:26:40 -0600 Subject: [PATCH 4/4] fix this test for spawn() vs fork() --- test/integration/048_rpc_test/test_rpc.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/test/integration/048_rpc_test/test_rpc.py b/test/integration/048_rpc_test/test_rpc.py index 70320681750..7c29dc006b0 100644 --- a/test/integration/048_rpc_test/test_rpc.py +++ b/test/integration/048_rpc_test/test_rpc.py @@ -768,11 +768,9 @@ def test_timeout_postgres(self): self.assertIn('message', error_data) self.assertEqual(error_data['message'], 'RPC timed out after 1.0s') self.assertIn('logs', error_data) - if sys.platform == 'darwin': - # because fork() without exec() is broken on macos, we use 'spawn' - # so on macos we don't get logs back because we didn't fork() - return - self.assertTrue(len(error_data['logs']) > 0) + # because fork() without exec() is broken, we use 'spawn' so we don't + # get logs back because we didn't fork() + return @mark.flaky(rerun_filter=addr_in_use)