Updating disdat, luigi, docker to latest (#2)

* updating disdat, luigi, docker to latest * Updated the dependencies here to move to Disdat 1.1.3, * Moved Docker and Luigi dependencies forward. * Updated tests to use Mock >5, as well as use standard pytest conftest.py file. * changed few logger.warn to logger.warning * sdist creation now uses '_' for canonical naming * tests require fastparquet, because pyarrow's s3fs implementation side-steps Moto. * removing debug code --------- Co-authored-by: Ken Yocum <ken_yocum@intuit.com>
kyocum · Jul 2, 2024 · 6b32cab · 6b32cab
1 parent bd6a973
commit 6b32cab
Show file tree

Hide file tree

Showing 23 changed files with 104 additions and 79 deletions.
diff --git a/build-dist.sh b/build-dist.sh
@@ -7,14 +7,14 @@ echo "Building Disdat package for local installation or PyPi . . ."
 # and then: git tag <version>
 
 # Remove the prior tar ball from the context.template
-rm -rf disdatluigi/infrastructure/dockerizer/context.template/disdat-luigi-*.tar.gz
-rm -rf dist/disdat-luigi-*.tar.gz
+rm -rf disdatluigi/infrastructure/dockerizer/context.template/disdat_luigi-*.tar.gz
+rm -rf dist/disdat_luigi-*.tar.gz
 
 # Create a new sdist
 python setup.py sdist
 
 # Copy over to the context.template.
-cp dist/disdat-luigi-*.tar.gz disdatluigi/infrastructure/dockerizer/context.template/.
+cp dist/disdat_luigi-*.tar.gz disdatluigi/infrastructure/dockerizer/context.template/.
 
 # Create a new sdist that will have that tar.gz in the template
 python setup.py sdist
@@ -25,7 +25,7 @@ if false; then
  #twine upload --repository-url https://test.pypi.org/legacy/ dist/disdat-*.tar.gz
  # Test: pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple disdat
  # now do it for real
- twine upload dist/disdat-luigi-*.tar.gz
+ twine upload dist/disdat_luigi-*.tar.gz
 fi
 
 echo "Finished"

diff --git a/disdatluigi/apply.py b/disdatluigi/apply.py
@@ -69,10 +69,10 @@ def apply(output_bundle, pipe_params, pipe_cls, input_tags, output_tags, force,
  _logger.debug("incremental_pull {}".format(incremental_pull))
 
  if incremental_push:
- _logger.warn("incremental_push {}".format(incremental_push))
+ _logger.warning("incremental_push {}".format(incremental_push))
 
  if incremental_pull:
- _logger.warn("incremental_pull {}".format(incremental_pull))
+ _logger.warning("incremental_pull {}".format(incremental_pull))
 
  pfs = fs.DisdatFS()
 

diff --git a/disdatluigi/entrypoints/sagemaker_ep.py b/disdatluigi/entrypoints/sagemaker_ep.py
@@ -90,7 +90,7 @@ def train():
  _logger.error("Disdat SageMaker train entrypoint failed.")
  sys.exit(os.EX_IOERR)
  elif args.purpose == 'serve':
- _logger.warn("Disdat does not yet support SageMaker serve.")
+ _logger.warning("Disdat does not yet support SageMaker serve.")
  sys.exit(os.EX_UNAVAILABLE)
  else:
  _logger.error("Disdat SageMaker invoked entrypoint with {}, not 'train' or 'serve'".format(args.purpose))

diff --git a/disdatluigi/utility/aws_batch.py b/disdatluigi/utility/aws_batch.py
@@ -164,7 +164,7 @@ def ecr_create_fq_respository_name(repository_name, policy_resource_package=None
  )
  repository_metadata = response['repositories'][0]
  elif e.response['Error']['Code'] == 'AccessDeniedException':
- _logger.warn("Error [AccessDeniedException] when creating repo {}, trying to continue...".format(repository_name))
+ _logger.warning("Error [AccessDeniedException] when creating repo {}, trying to continue...".format(repository_name))
  else:
  raise e
  return repository_metadata['repositoryUri']

diff --git a/setup.py b/setup.py
@@ -90,10 +90,10 @@
  # If <= means higher versions broke something.
 
  install_requires=[
- 'disdat>=1.0.0',
- 'luigi>=3.0,<=3.1',
+ 'disdat>=1.1.3,<1.2',
+ 'luigi>=3.0,<3.6',
  'boto3>=1.14.49,<2.0',
- 'docker>=4.1.0,<4.4.0',
+ 'docker>=7.0.0,<7.2.0',
  ],
 
  # List additional groups of dependencies here (e.g. development
@@ -108,7 +108,8 @@
  'pylint',
  'coverage',
  'tox',
- 'moto',
+ 'moto>=5',
+ 'fastparquet',
  's3fs<=0.4.2' # 0.5.0 breaks with aiobotocore and missing AWS headers
  ],
  'rel': [

diff --git a/tests/bundles/test_arg_capture.py b/tests/bundles/test_arg_capture.py
@@ -17,7 +17,7 @@
 import disdat.api as dsdt_api
 from disdatluigi.pipe import PipeTask
 import disdatluigi.api as api
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 test_luigi_args_data = {'str_arg': 'some string',
  'int_arg': 10,
@@ -39,7 +39,7 @@ def pipe_run(self):
  return True
 
 
-def test_luigi_args(run_test):
+def test_luigi_args():
  """ Create a task, store args, retrieve from bundle api.
  Pass in python objects as the values for Luigi parameters.
  Stored as serialized json objects. Bundle presents the parameters

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,58 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import pytest
+import disdat.api as api
+
+TEST_CONTEXT = '___test_context___'
+
+"""
+NOTE: When testing we try to use Moto to mock-out the s3 calls. Pyarrow sidesteps boto by using its own S3FileSystem. This is a problem
+because that c++ code is no longer accessing s3 URLs through the botocore Python library. So you get errors about INVALID CREDENTIALS. 
+The primary solution for testing is to simply use fastparquet instead of pyarrow. The other alternative is to use the moto_server or, as the 
+comment below suggests, use fsspec's s3fs. At this time (7-2024) we simply use fastparquet for testing.
+
+From: https://issues.apache.org/jira/browse/ARROW-16437?page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel&focusedCommentId=17530795#comment-17530795
+
+ I don't think you can use the "mock_s3" method of moto directly with our S3 filesystem integration (as Antoine said, we don't use boto). 
+ A possible way to do this is to use fsspec's s3fs, and then pass this filesystem object (and pyarrow will wrap that in a PyFileSystem), 
+ but that adds another layer of indirection which you don't need here (and which can be another source of failures).
+ Another option is to use the "moto_server" feature of moto (http://docs.getmoto.org/en/latest/docs/getting_started.html#stand-alone-server-mode), 
+ which gives you an endpoint url, that can be used to construct a pyarrow S3FileSystem that interacts with the moto server.
+ This is basically the approach that we use ourselves (but with MinIO instead of moto), and eg also dask switched from mock_s3 to 
+ moto_server in their tests (see eg https://github.com/dask/dask/blob/4d6a5f08c45be56302f696ca4ef6038a1cd1e734/dask/bytes/tests/test_s3.py#L84)
+
+"""
+
+@pytest.fixture(autouse=True, scope='module')
+def aws_credentials():
+ """Mocked AWS Credentials for moto."""
+ _aws_credentials()
+
+def _aws_credentials():
+ os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+ os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+ os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+ os.environ["AWS_SECURITY_TOKEN"] = "testing"
+ os.environ["AWS_SESSION_TOKEN"] = "testing"
+
+@pytest.fixture(autouse=True, scope='function')
+def run_test():
+ # Remove test context before running test
+ if TEST_CONTEXT in api.ls_contexts():
+ api.delete_context(context_name=TEST_CONTEXT)
+ api.context(context_name=TEST_CONTEXT)
+ yield
+ api.delete_context(context_name=TEST_CONTEXT)
diff --git a/tests/functional/common.py b/tests/functional/common.py
diff --git a/tests/functional/test_api_exit.py b/tests/functional/test_api_exit.py
@@ -16,11 +16,10 @@
 
 from disdatluigi.pipe import PipeTask
 from disdatluigi.common import ApplyError
-import disdat.api as disdat_api
 import disdatluigi.api as api
 import luigi
 
-from tests.functional.common import TEST_CONTEXT, run_test
+from tests.conftest import TEST_CONTEXT
 
 TEST_NAME = 'test_bundle'
 

diff --git a/tests/functional/test_api_run.py b/tests/functional/test_api_run.py
@@ -19,8 +19,7 @@
 import docker
 import pytest
 
-
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 from tests.functional.common_tasks import COMMON_DEFAULT_ARGS
 import disdat.api as disdat_api
 import disdatluigi.api as api
@@ -106,7 +105,7 @@ def test_run_local_container(run_test, build_container_setup_only):
  assert b_a_f.uuid != b_a_f2.uuid
 
 
-#@moto.mock_s3
+#@moto.mock_aws
 def manual_test_run_aws_batch(run_test, build_container_setup_only):
  """ Incomplete test. The container code itself needs to have
  its S3 access mocked out. Here we are testing manually

diff --git a/tests/functional/test_external_bundle.py b/tests/functional/test_external_bundle.py
@@ -20,7 +20,7 @@
 import disdat.api as api
 import disdatluigi.api as dlapi
 
-from tests.functional.common import run_test, TEST_CONTEXT # autouse fixture to setup / tear down context
+from tests.conftest import TEST_CONTEXT
 
 
 def test(run_test):

diff --git a/tests/functional/test_external_dep.py b/tests/functional/test_external_dep.py
@@ -19,7 +19,7 @@
 import disdat.api as api
 import disdatluigi.api as dlapi
 from disdatluigi.common import ApplyError
-from tests.functional.common import run_test, TEST_CONTEXT # autouse fixture to setup / tear down context
+from tests.conftest import TEST_CONTEXT
 
 EXT_BUNDLE_NAME='ext_bundle_human_name'
 BUNDLE_CONTENTS=list(range(9))

diff --git a/tests/functional/test_force_one_and_all.py b/tests/functional/test_force_one_and_all.py
@@ -17,7 +17,7 @@
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 TEST_NAME = 'test_bundle'
 

diff --git a/tests/functional/test_inc_pull.py b/tests/functional/test_inc_pull.py
@@ -27,7 +27,7 @@
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 
 TEST_REMOTE = '__test_remote_context__'
@@ -82,11 +82,11 @@ def pipe_run(self, b=None):
  return {'file': [target.path]}
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_add_with_treat_as_bundle():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
-
+ 
  # Setup moto s3 resources
  s3_client = boto3.client('s3')
  s3_resource = boto3.resource('s3')

diff --git a/tests/functional/test_inc_push.py b/tests/functional/test_inc_push.py
@@ -27,7 +27,7 @@
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 TEST_REMOTE = '__test_remote_context__'
 TEST_BUCKET = 'test-bucket'
@@ -79,7 +79,7 @@ def pipe_run(self, b=None):
  raise Exception
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_add_with_treat_as_bundle():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)

diff --git a/tests/functional/test_managed.py b/tests/functional/test_managed.py
@@ -21,14 +21,17 @@
 """
 import boto3
 import moto
+from moto import mock_aws
 import pandas as pd
 import pytest
 import os
 
+from tests.conftest import _aws_credentials
+
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 TEST_REMOTE = '__test_remote_context__'
 TEST_BUCKET = 'test-bucket'
@@ -132,7 +135,7 @@ def test_non_managed_local():
  'Local file should be present in bundle'
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_remote_push_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -165,7 +168,7 @@ def test_remote_push_managed_s3():
  assert output_file in keys, 'Pipeline should have pushed file'
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_remote_push_non_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -205,7 +208,7 @@ def test_remote_push_non_managed_s3():
  assert output_file in keys, 'Pipeline should have pushed file'
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_remote_no_push_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -226,7 +229,7 @@ def test_remote_no_push_managed_s3():
  dlapi.apply(TEST_CONTEXT, ManagedS3)
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_remote_no_push_non_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -266,7 +269,7 @@ def test_no_remote_push_managed_s3():
  dlapi.apply(TEST_CONTEXT, ManagedS3, incremental_push=True)
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_no_remote_push_non_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -301,7 +304,7 @@ def test_no_remote_no_push_managed_s3():
  dlapi.apply(TEST_CONTEXT, ManagedS3)
 
 
-@moto.mock_s3
+@moto.mock_aws
 def test_no_remote_no_push_non_managed_s3():
  api.delete_context(TEST_CONTEXT)
  api.context(context_name=TEST_CONTEXT)
@@ -328,7 +331,6 @@ def test_no_remote_no_push_non_managed_s3():
 
 
 if __name__ == '__main__':
- #test_remote_push_managed_s3()
  pytest.main([__file__])
 
 

diff --git a/tests/functional/test_mark_force.py b/tests/functional/test_mark_force.py
@@ -16,7 +16,7 @@
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 TEST_NAME = 'test_bundle'
 

diff --git a/tests/functional/test_pipeline.py b/tests/functional/test_pipeline.py
@@ -17,7 +17,7 @@
 from disdatluigi.pipe import PipeTask
 import disdat.api as api
 import disdatluigi.api as dlapi
-from tests.functional.common import run_test, TEST_CONTEXT
+from tests.conftest import TEST_CONTEXT
 
 
 class A(PipeTask):