From 1551e3435a7d05df26046c9a45b63b185c1e6a50 Mon Sep 17 00:00:00 2001 From: Dustin Luong Date: Wed, 22 Jul 2020 12:10:00 -0700 Subject: [PATCH] Removed custom rules pick 4a4265c6 Unit test passes squash 345b8a74 Updated unit tests, working on integration tests --- components/aws/sagemaker/common/_utils.py | 15 +- .../aws/sagemaker/common/train.template.yaml | 3 - .../tests/unit_tests/tests/test_train.py | 4 +- .../debugger_component_demo/README.md | 50 +++--- .../debugger-component-demo.py | 151 ++++++++++++++++++ .../debugger_component_demo.py | 25 --- .../training-pipeline.py | 148 +++++++++++++++++ .../training-pipeline.tar.gz | Bin 0 -> 1950 bytes .../training-pipeline.py | 37 +---- 9 files changed, 336 insertions(+), 97 deletions(-) create mode 100644 samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py create mode 100644 samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py create mode 100644 samples/contrib/aws-samples/debugger_component_demo/training-pipeline.tar.gz diff --git a/components/aws/sagemaker/common/_utils.py b/components/aws/sagemaker/common/_utils.py index a6b9a050940..8ede32a9226 100644 --- a/components/aws/sagemaker/common/_utils.py +++ b/components/aws/sagemaker/common/_utils.py @@ -210,21 +210,16 @@ def create_training_job_request(args): if 'CollectionConfigurations' in args['debug_hook_config']: logging.into('Existing CollectionConfigurations in debug_hook_config will be overwritten. Move and reformat into collection_config parameter') raise Exception('Could not create job request') - if 'S3OutputPath' not in args['debug_hook_config']: - logging.info('DebugHookConfig requires an S3OutputPath to be defined.') - raise Exception('Could not create job request') request['DebugHookConfig'] = args['debug_hook_config'] request['DebugHookConfig']['CollectionConfigurations'] = [] - else: - request.pop('DebugHookConfig') if args['collection_config']: - if 'DebugHookConfig' not in request: - logging.info('CollectionConfigurations requires a debug hook to be configured.') - raise Exception('Could not create job request') for key, val in args['collection_config'].items(): request['DebugHookConfig']['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val}) + if not args['debug_hook_config'] and not args['collection_config']: + request.pop('DebugHookConfig') + if args['debug_rule_config']: request['DebugRuleConfigurations'] = args['debug_rule_config'] else: @@ -287,6 +282,7 @@ def wait_for_debug_rules(client, training_job_name, poll_interval=31): print_debug_rule_status(response) time.sleep(poll_interval) + def debug_rules_errored(response): if 'DebugRuleEvaluationStatuses' in response: for debug_rule in response['DebugRuleEvaluationStatuses']: @@ -295,7 +291,6 @@ def debug_rules_errored(response): return False - def debug_rules_completed(response): if 'DebugRuleEvaluationStatuses' in response: for debug_rule in response['DebugRuleEvaluationStatuses']: @@ -308,7 +303,7 @@ def print_debug_rule_status(response, verbose=False): for debug_rule in response['DebugRuleEvaluationStatuses']: logging.info(" - {}: {}".format(debug_rule['RuleConfigurationName'], debug_rule['RuleEvaluationStatus'])) if verbose and 'StatusDetails' in debug_rule: - logging.info(" - {}".format(debug_rule['StatusDetails'])) + logging.info(" - {}".format(debug_rule['StatusDetails']).rstrip()) diff --git a/components/aws/sagemaker/common/train.template.yaml b/components/aws/sagemaker/common/train.template.yaml index 36eb5d0188b..7532c7faaa1 100644 --- a/components/aws/sagemaker/common/train.template.yaml +++ b/components/aws/sagemaker/common/train.template.yaml @@ -23,9 +23,6 @@ StoppingCondition: MaxWaitTimeInSeconds: 86400 DebugHookConfig: CollectionConfigurations: [] - HookParameters: {} - LocalPath: '' - S3OutputPath: '' DebugRuleConfigurations: [] CheckpointConfig: S3Uri: '' diff --git a/components/aws/sagemaker/tests/unit_tests/tests/test_train.py b/components/aws/sagemaker/tests/unit_tests/tests/test_train.py index 73f5f483f5d..d8a7d2e6844 100644 --- a/components/aws/sagemaker/tests/unit_tests/tests/test_train.py +++ b/components/aws/sagemaker/tests/unit_tests/tests/test_train.py @@ -387,11 +387,9 @@ def test_hook_good_args(self): }]) def test_hook_bad_args(self): - no_s3_uri_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"LocalPath": "/opt/ml/output/tensors/"}']) config_in_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"S3OutputPath": "s3://fake-uri/", "CollectionConfigurations": [{"CollectionName": "collection1", "CollectionParameters": {"key1": "value1"}}]}']) - no_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{}', '--collection_config', '{"collection1": {"key1": "value1"}}']) - for arg in [no_s3_uri_args, no_hook_args]: + for arg in [config_in_hook_args]: with self.assertRaises(Exception): _utils.create_training_job_request(vars(arg)) diff --git a/samples/contrib/aws-samples/debugger_component_demo/README.md b/samples/contrib/aws-samples/debugger_component_demo/README.md index c7b8c4faa3a..21bc4f43b5c 100644 --- a/samples/contrib/aws-samples/debugger_component_demo/README.md +++ b/samples/contrib/aws-samples/debugger_component_demo/README.md @@ -3,49 +3,55 @@ An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train). The training component has -## Prerequisites +## Prerequisites This pipeline uses the exact same setup as [simple_training_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline). -## Steps -1. Compile the pipeline: - `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz` +## Steps +1. Compile the pipeline: + `dsl-compile --py debugger-component-demo.py --output debugger-component-demo.tar.gz` 2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run. 3. Once the pipeline completes, you can view the results of each debugger rule under 'Logs'. -Inputs format to `debug_hook_config`, `collection_config`, and `debug_rule_config` : +Inputs format to `debug_hook_config` and `debug_rule_config` : ```buildoutcfg debug_hook_config = { "S3OutputPath": "s3:///path/for/data/emission/", "LocalPath": "/local/path/for/data/emission/", + "CollectionConfigurations": [ + { + "CollectionName": "losses", + "CollectionParameters": { + "start_step": "25", + "end_step": "150" + } + }, { + "CollectionName": "gradient", + "CollectionParameters": { + "start_step": "5", + "end_step": "100" + } + } + ], "HookParameters": { "save_interval": "10" } } -collection_config = { - "collection_name_1": { - "include_regex": ".*" - }, - "collection_name_2: { - "include_regex": ".*" - } -} -debug_rule_config = { + +debug_rule_config = { "RuleConfigurationName": "rule_name" - "RuleEvaluatorImage": "123456789011.dkr.ecr..amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3" + "RuleEvaluatorImage": "503895931360.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-debugger-rules:latest" "RuleParameters": { "rule_to_invoke": "LossNotDecreasing", "tensor_regex": ".*" } } ``` -The provided demo pipeline `training_pipeline.py` uses a built-in rule. When using a built-in rule, `RuleConfigurationName` and `RuleParameters` will take on values listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) and `RuleEvaluatorImage`'s possible values are listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-built-in-registry-ids) will depend on the specified region. - -In the case of using and writing your own custom rule, `RuleEvaluatorImage` will take on a value from [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-custom-rule-registry-ids). An example of a custom debugger rule can be found [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html#debugger-custom-rules-api). # Resources -* [Amazon SageMaker Debugger] (https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) -* [Debugger Built-In Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) -* [Pre-built Docker Images for Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html) -* [Debugger API Examples] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html) +* [Amazon SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) +* [Available Frameworks to Use Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html#debugger-supported-aws-containers) +* [Debugger Built-In Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) +* [Debugger Custom Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-custom-rules.html) +* [Debugger API Examples](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html) diff --git a/samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py b/samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py new file mode 100644 index 00000000000..cc33abb5b8b --- /dev/null +++ b/samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +# Uncomment the apply(use_aws_secret()) below if you are not using OIDC +# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md + +import kfp +import json +import os +import copy +from kfp import components +from kfp import dsl +from kfp.aws import use_aws_secret + + +cur_file_dir = os.path.dirname(__file__) +components_dir = os.path.join(cur_file_dir, '../../../../components/aws/sagemaker/') + +sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml') + +debug_hook = { + 'S3OutputPath':'s3://kubeflow-pipeline-data/mnist_kmeans_example/hook_config' +} + +debug_hook['CollectionConfigurations'] = [] + +collection_list = { + 'feature_importance' : { + 'save_interval': '5' + }, + 'losses' : { + 'save_interval': '10' + }, + 'average_shap': { + 'save_interval': '5' + }, + 'metrics': { + 'save_interval': '5' + } +} + +for key, val in collection_list.items(): + debug_hook['CollectionConfigurations'].append({'CollectionName': key, 'CollectionParameters': val}) + +loss_rule = { + 'RuleConfigurationName': 'LossNotDecreasing', + 'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest', + 'RuleParameters': { + 'rule_to_invoke': 'LossNotDecreasing', + 'tensor_regex': '.*' + } +} + +overtraining_rule = { + 'RuleConfigurationName': 'Overtraining', + 'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest', + 'RuleParameters': { + 'rule_to_invoke': 'Overtraining', + 'patience_train': '10', + 'patience_validation': '20' + } +} + +debug_rule_configurations=[loss_rule, overtraining_rule] + +bad_hyperparameters = { + 'objective': 'reg:squarederror', + 'max_depth': '5', + 'eta': '0', + 'gamma': '4', + 'min_child_weight': '6', + 'silent': '0', + 'subsample': '0.7', + 'num_round': '50' +} + +channelObjList = [] + +channelObj = { + 'ChannelName': '', + 'DataSource': { + 'S3DataSource': { + 'S3Uri': '', + 'S3DataType': 'S3Prefix', + 'S3DataDistributionType': 'FullyReplicated' + } + }, + 'ContentType': 'text/csv', + 'CompressionType': 'None', + 'RecordWrapperType': 'None', + 'InputMode': 'File' +} + +channelObj['ChannelName'] = 'train' +channelObj['DataSource']['S3DataSource']['S3Uri'] = 's3://kubeflow-pipeline-data/mnist_kmeans_example/input/valid_data.csv' +channelObjList.append(copy.deepcopy(channelObj)) + + +@dsl.pipeline( + name='xgboost-mnist-debugger', + description='SageMaker training job test with debugger' +) +def training( + region='us-east-1', + endpoint_url='', + image='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3', + training_input_mode='File', + hyperparameters=bad_hyperparameters, + channels=channelObjList, + instance_type='ml.m5.2xlarge', + instance_count=1, + volume_size=50, + max_run_time=3600, + model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/output/model', + output_encryption_key='', + network_isolation=True, + traffic_encryption=False, + spot_instance=False, + max_wait_time=3600, + checkpoint_config={}, + debug_hook_config=debug_hook, + debug_rule_config=debug_rule_configurations, + tensorboard_output_config={}, + role='' + ): + training = sagemaker_train_op( + region=region, + endpoint_url=endpoint_url, + image=image, + training_input_mode=training_input_mode, + hyperparameters=hyperparameters, + channels=channels, + instance_type=instance_type, + instance_count=instance_count, + volume_size=volume_size, + max_run_time=max_run_time, + model_artifact_path=model_artifact_path, + output_encryption_key=output_encryption_key, + network_isolation=network_isolation, + traffic_encryption=traffic_encryption, + spot_instance=spot_instance, + max_wait_time=max_wait_time, + checkpoint_config=checkpoint_config, + debug_hook_config=debug_hook_config, + debug_rule_config=debug_rule_config, + tensorboard_output_config=tensorboard_output_config, + role=role, + )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + + +if __name__ == '__main__': + kfp.compiler.Compiler().compile(training, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py b/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py index 6faae070d8d..53972867b89 100644 --- a/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py +++ b/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py @@ -17,13 +17,8 @@ sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml') -<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py debug_hook = { 'S3OutputPath':'s3://dusluong-bucket0/xgboost-debugger/hookconfig' -======= -debugger_hook_config = { - "S3OutputPath":"s3://kubeflow-pipeline-data/mnist_kmeans_example/hookconfig", ->>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py } debug_hook['CollectionConfigurations'] = [] @@ -43,7 +38,6 @@ } } -<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py for key, val in collection_list.items(): debug_hook['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val}) @@ -65,25 +59,6 @@ 'RuleParameters': { 'rule_to_invoke': 'LossNotDecreasing', 'tensor_regex': '.*' -======= -bad_hyperparameters = { - "max_depth": "5", - "alpha": "100", - "eta": "0.5", - "gamma": "4", - "min_child_weight": "6", - "silent": "0", - "subsample": "0.7", - "num_round": "50" -} - -loss_rule = { - "RuleConfigurationName": "LossNotDecreasing", - "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", - "RuleParameters": { - "rule_to_invoke": "LossNotDecreasing", - "tensor_regex": ".*" ->>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py } } diff --git a/samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py b/samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py new file mode 100644 index 00000000000..49d6525294a --- /dev/null +++ b/samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +# Uncomment the apply(use_aws_secret()) below if you are not using OIDC +# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md + +import kfp +import json +import os +import copy +from kfp import components +from kfp import dsl +from kfp.aws import use_aws_secret + + +cur_file_dir = os.path.dirname(__file__) +components_dir = os.path.join(cur_file_dir, '../../../../components/aws/sagemaker/') + +sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml') + +debugger_hook_config = { + "S3OutputPath":"s3://dusluong-bucket0/xgboost-debugger/hookconfig" +} + +collection_list = { + "feature_importance" : { + "save_interval": "5" + }, + "losses" : { + "save_interval": "500" + }, + "average_shap": { + "save_interval": "5" + }, + "metrics": { + "save_interval": "5" + }, + "gradient": { + "save_interval": "5" + } +} + +bad_hyperparameters = { + "max_depth": "5", + "eta": "0", + "gamma": "4", + "min_child_weight": "6", + "silent": "0", + "subsample": "0.7", + "num_round": "50" +} + +loss_rule = { + "RuleConfigurationName": "LossNotDecreasing", + "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", + "RuleParameters": { + "rule_to_invoke": "LossNotDecreasing", + "tensor_regex": ".*" + } +} + +gradient_rule = { + "RuleConfigurationName": "VanishingGradient", + "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", + "RuleParameters": { + "rule_to_invoke": "VanishingGradient", + "tensor_regex": ".*" + } +} + + +debug_rule_configurations=[loss_rule, gradient_rule] + +channelObjList = [] + +channelObj = { + 'ChannelName': '', + 'DataSource': { + 'S3DataSource': { + 'S3Uri': '', + 'S3DataType': 'S3Prefix', + 'S3DataDistributionType': 'FullyReplicated' + } + }, + 'ContentType': "text/csv", + 'CompressionType': 'None', + 'RecordWrapperType': 'None', + 'InputMode': 'File' +} + +channelObj['ChannelName'] = 'train' +channelObj['DataSource']['S3DataSource']['S3Uri'] = 's3://dusluong-bucket0/mnist_kmeans_example/input/valid_data.csv' +channelObjList.append(copy.deepcopy(channelObj)) + + +@dsl.pipeline( + name='xgboost-mnist-debugger', + description='SageMaker training job test with debugger' +) +def training( + region='us-east-1', + endpoint_url='', + image='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3', + training_input_mode='File', + hyperparameters=bad_hyperparameters, + channels=channelObjList, + instance_type='ml.m5.2xlarge', + instance_count=1, + volume_size=50, + max_run_time=3600, + model_artifact_path='s3://dusluong-bucket0/mnist_kmeans_example/output/model', + output_encryption_key='', + network_isolation=True, + traffic_encryption=False, + spot_instance=False, + max_wait_time=3600, + checkpoint_config={}, + debug_hook_config=debugger_hook_config, + collection_config=collection_list, + debug_rule_config=debug_rule_configurations, + role='arn:aws:iam::169544399729:role/kfp-example-sagemaker-execution-role' + ): + training = sagemaker_train_op( + region=region, + endpoint_url=endpoint_url, + image=image, + training_input_mode=training_input_mode, + hyperparameters=hyperparameters, + channels=channels, + instance_type=instance_type, + instance_count=instance_count, + volume_size=volume_size, + max_run_time=max_run_time, + model_artifact_path=model_artifact_path, + output_encryption_key=output_encryption_key, + network_isolation=network_isolation, + traffic_encryption=traffic_encryption, + spot_instance=spot_instance, + max_wait_time=max_wait_time, + checkpoint_config=checkpoint_config, + debug_hook_config=debug_hook_config, + collection_config=collection_config, + debug_rule_config=debug_rule_config, + role=role, + )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + + +if __name__ == '__main__': + kfp.compiler.Compiler().compile(training, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/debugger_component_demo/training-pipeline.tar.gz b/samples/contrib/aws-samples/debugger_component_demo/training-pipeline.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..6ccc31fd3e12bf128d7ee942b78cd9c9ac9254b7 GIT binary patch literal 1950 zcmV;P2VwXhiwFqlb{Jm*|8#O;X>Mt5XDx7PaAj<1Ze=cXVR8WNS>JElHW1Ep{tCYt z7$(_{lQi+$kS@c3tt;BDeHaFTk!YExrVUHZV7kU>rf{GH&3GEFfL&P9P^r48t1j;R#tVKONYhhY+ISI@zOY8Z}F zn2dfknLMJbgm(xjdTs6U&YjyH%>oY`ym z0~T>teNtzo8m=8x0l$U=^`4bH)}KK6{j%fL@0^$`5X;qUmXsnZ8Tk95jJH_MXZrj_ z+Z-b*BD~WJnd!}#!F)&(spA{Q;<2(_nfd^GV1mD2zU6pLc5SWKfXvB5DdB|HBjDS8 znPvNTxX4Hh!2r&AepW;77?t4JQ)G#Ea(1wP$MS+>Aq>bjjNO+8O z%=d-b;$VySy@`cl`2>kEAcAE`o}{hm2VUZ-ogLO|61R+4L+()~tTCcsGH5noSY0{x z2@yHYv+)MU+bWI5jIK#)ryncAbzToPjBOqIDpr2eo_vd%jMcq6O~GhN#Y~}Apr?t2 zWf_k3rntTAS65)r5;HJ z-gso&{+UDC6^uu~B=F9hbNF4Ao%m;llPmB1!2Z`KAyOf>%vDoz#_`@@SMFlc4~3FND$ z2qHWZv9QuErnJ*OvEP^8yVMvCVKDgS3h{3S}wd zL~in)*;E+}=;s<(t=de{GGm}M#5dBmqZaXX%&3DYoAc`?#tt+~*+jB#)!VU7ZS5m{h8{LJ8D)xPNQXyKGp_Rh59i}-8K^NrX11aSl5fa+%USVpS3=) zVIiq*G-lP&fKdf>%Ro9RYNfR+Z#PKxU6k`eW2kEyYFCE3s3At*bM@7Ko78m`*4y9r z-5B(=@O@3_a5F#DSC8q`JMR%Ym~CHoJ=m=eU-r2!L~5f4l6*jb(9sHY7#xXg<))L9 znP&%9T#}%CbRCr;b(EO!j@*;%)Z=2J+KY$j<{~u=szu% zT6ZS*oaM)M=3u)&9rfC=4(Bnx##tOnq`kI}r>PaC7Fl*tWcFUB|n& zr6(C$H!!0J^_V0&2G>~`$7JCV2HX^7Od20H(8H+4)Pb=EI%&~BAsNQ*?revP&@i&I ziF4JkVMwd*9V~n!5FPE=P}zI|ve`cl)ZAg*?%r&uUXR1=;JV|A{RsHJK5z_Q7&g(< zca0?+!$t;rPbX^`Mz(LL99(Avka49n#E*vY9KF;iay5+J*WQk!XQOPmZ>ReI%WD|C z^KxbcIn;H$zWVG{^!<(RZ~n*r#&NrU=EaK_FJ8QO@#4jc7cXACc=6)Jix)3mym;~A k#fuj&Uc7km;>C*>FJ8QO@#4jcm%k%_0pE12i~vvo0ASY2kN^Mx literal 0 HcmV?d00001 diff --git a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py index 8275482bb49..4e0de3ac8e9 100644 --- a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py +++ b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py @@ -17,31 +17,6 @@ sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml') -debugger_hook_config = { - "S3OutputPath":"s3://dusluong-bucket0/emission-demo", - "LocalPath":"/opt/ml/output/tensors", - "HookParameters": { - "save_interval": "10" - } -} - -collection_list = { - "gradients" : { - "save_interval": "50" - }, - "losses" : { - "save_interval": "10" - } -} - -debug_rule_configurations=[{ - "RuleConfigurationName": "Amazon-VanishingGradient", - "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", - "RuleParameters": { - "rule_to_invoke": "Overfit" - } -}] - channelObjList = [] channelObj = { @@ -59,7 +34,7 @@ } channelObj['ChannelName'] = 'train' -channelObj['DataSource']['S3DataSource']['S3Uri'] = 's3://dusluong-bucket0/mnist_kmeans_example/training-data' +channelObj['DataSource']['S3DataSource']['S3Uri'] = 's3://kubeflow-pipeline-data/mnist_kmeans_example/data' channelObjList.append(copy.deepcopy(channelObj)) @@ -78,17 +53,14 @@ def training( instance_count=1, volume_size=50, max_run_time=3600, - model_artifact_path='s3://dusluong-bucket0/mnist_kmeans_example/output/model', + model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/data', output_encryption_key='', network_isolation=True, traffic_encryption=False, spot_instance=False, max_wait_time=3600, checkpoint_config={}, - debug_hook_config=debugger_hook_config, - collection_config=collection_list, - debug_rule_config=debug_rule_configurations, - role='arn:aws:iam::169544399729:role/kfp-example-sagemaker-execution-role' + role='' ): training = sagemaker_train_op( region=region, @@ -108,9 +80,6 @@ def training( spot_instance=spot_instance, max_wait_time=max_wait_time, checkpoint_config=checkpoint_config, - debug_hook_config=debug_hook_config, - collection_config=collection_config, - debug_rule_config=debug_rule_config, role=role, )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))