Removed custom rules

pick 4a4265c Unit test passes squash 345b8a7 Updated unit tests, working on integration tests
dstnluong · Jul 28, 2020 · 1551e34 · 1551e34
1 parent 11c872b
commit 1551e34
Show file tree

Hide file tree

Showing 9 changed files with 336 additions and 97 deletions.
diff --git a/components/aws/sagemaker/common/_utils.py b/components/aws/sagemaker/common/_utils.py
@@ -210,21 +210,16 @@ def create_training_job_request(args):
         if 'CollectionConfigurations' in args['debug_hook_config']:
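-            logging.into('Existing CollectionConfigurations in debug_hook_config will be overwritten. Move and reformat into collection_config parameter')
+            logging.info('Existing CollectionConfigurations in debug_hook_config will be overwritten. Move and reformat into collection_config parameter')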
             raise Exception('Could not create job request')
-        if 'S3OutputPath' not in args['debug_hook_config']:
-            logging.info('DebugHookConfig requires an S3OutputPath to be defined.')
-            raise Exception('Could not create job request')
         request['DebugHookConfig'] = args['debug_hook_config']
         request['DebugHookConfig']['CollectionConfigurations'] = []
-    else:
-        request.pop('DebugHookConfig')
 
     if args['collection_config']:
-        if 'DebugHookConfig' not in request:
-            logging.info('CollectionConfigurations requires a debug hook to be configured.')
-            raise Exception('Could not create job request')
         for key, val in args['collection_config'].items():
             request['DebugHookConfig']['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val})
 
+    if not args['debug_hook_config'] and not args['collection_config']:
+        request.pop('DebugHookConfig')
+
     if args['debug_rule_config']:
         request['DebugRuleConfigurations'] = args['debug_rule_config']
     else:
@@ -287,6 +282,7 @@ def wait_for_debug_rules(client, training_job_name, poll_interval=31):
         print_debug_rule_status(response)
         time.sleep(poll_interval)
 
+
 def debug_rules_errored(response):
     if 'DebugRuleEvaluationStatuses' in response:
         for debug_rule in response['DebugRuleEvaluationStatuses']:
@@ -295,7 +291,6 @@ def debug_rules_errored(response):
     return False
 
 
-
 def debug_rules_completed(response):
     if 'DebugRuleEvaluationStatuses' in response:
         for debug_rule in response['DebugRuleEvaluationStatuses']:
@@ -308,7 +303,7 @@ def print_debug_rule_status(response, verbose=False):
     for debug_rule in response['DebugRuleEvaluationStatuses']:
         logging.info(" - {}: {}".format(debug_rule['RuleConfigurationName'], debug_rule['RuleEvaluationStatus']))
         if verbose and 'StatusDetails' in debug_rule:
-            logging.info("   - {}".format(debug_rule['StatusDetails']))
+            logging.info("   - {}".format(debug_rule['StatusDetails']).rstrip())
 
 
 

diff --git a/components/aws/sagemaker/common/train.template.yaml b/components/aws/sagemaker/common/train.template.yaml
@@ -23,9 +23,6 @@ StoppingCondition:
   MaxWaitTimeInSeconds: 86400
 DebugHookConfig:
   CollectionConfigurations: []
-  HookParameters: {}
-  LocalPath: ''
-  S3OutputPath: ''
 DebugRuleConfigurations: []
 CheckpointConfig:
   S3Uri: ''

diff --git a/components/aws/sagemaker/tests/unit_tests/tests/test_train.py b/components/aws/sagemaker/tests/unit_tests/tests/test_train.py
@@ -387,11 +387,9 @@ def test_hook_good_args(self):
       }])
 
   def test_hook_bad_args(self):
-      no_s3_uri_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"LocalPath": "/opt/ml/output/tensors/"}'])
       config_in_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"S3OutputPath": "s3://fake-uri/", "CollectionConfigurations": [{"CollectionName": "collection1", "CollectionParameters": {"key1": "value1"}}]}'])
-      no_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{}', '--collection_config', '{"collection1": {"key1": "value1"}}'])
 
-      for arg in [no_s3_uri_args, no_hook_args]:
+      for arg in [config_in_hook_args]:
           with self.assertRaises(Exception):
               _utils.create_training_job_request(vars(arg))
 

diff --git a/samples/contrib/aws-samples/debugger_component_demo/README.md b/samples/contrib/aws-samples/debugger_component_demo/README.md
@@ -3,49 +3,55 @@
 An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train). The training component has
 
 
-## Prerequisites 
+## Prerequisites
 
 This pipeline uses the exact same setup as [simple_training_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline).
 
-## Steps 
-1. Compile the pipeline:  
-   `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz`
+## Steps
+1. Compile the pipeline:
+   `dsl-compile --py debugger-component-demo.py --output debugger-component-demo.tar.gz`
 2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
 3. Once the pipeline completes, you can view the results of each debugger rule under 'Logs'.
 
-Inputs format to `debug_hook_config`, `collection_config`, and `debug_rule_config` :
+Input format for `debug_hook_config` and `debug_rule_config`:
 ```buildoutcfg
 debug_hook_config = {
     "S3OutputPath": "s3://<your_bucket_name>/path/for/data/emission/",
     "LocalPath": "/local/path/for/data/emission/",
+    "CollectionConfigurations": [
+        {
+            "CollectionName": "losses",
+            "CollectionParameters": {
+                "start_step": "25",
+                "end_step": "150"
+            }
+        }, {
+            "CollectionName": "gradient",
+            "CollectionParameters": {
+                "start_step": "5",
+                "end_step": "100"
+            }
+        }
+    ],
     "HookParameters": {
         "save_interval": "10"
     }
 }
-collection_config = {
-    "collection_name_1": {
-        "include_regex": ".*"
-    },
-    "collection_name_2: {
-        "include_regex": ".*"
-    }
-}
-debug_rule_config = { 
+
+debug_rule_config = {
-    "RuleConfigurationName": "rule_name"
-    "RuleEvaluatorImage": "123456789011.dkr.ecr.<region>.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3"
+    "RuleConfigurationName": "rule_name",
+    "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
     "RuleParameters": {
         "rule_to_invoke": "LossNotDecreasing",
         "tensor_regex": ".*"
     }
 }
 ```
-The provided demo pipeline `training_pipeline.py` uses a built-in rule. When using a built-in rule, `RuleConfigurationName` and `RuleParameters` will take on values listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html) and `RuleEvaluatorImage`'s possible values are listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-built-in-registry-ids) will depend on the specified region.
-
-In the case of using and writing your own custom rule, `RuleEvaluatorImage` will take on a value from [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-custom-rule-registry-ids). An example of a custom debugger rule can be found [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html#debugger-custom-rules-api).
 
 # Resources
-* [Amazon SageMaker Debugger] (https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html)
-* [Debugger Built-In Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html)
-* [Pre-built Docker Images for Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html)
-* [Debugger API Examples] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html)
+* [Amazon SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html)
+* [Available Frameworks to Use Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html#debugger-supported-aws-containers)
+* [Debugger Built-In Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html)
+* [Debugger Custom Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-custom-rules.html)
+* [Debugger API Examples](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html)
 
diff --git a/samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py b/samples/contrib/aws-samples/debugger_component_demo/debugger-component-demo.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
+# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
+
+import kfp
+import json
+import os
+import copy
+from kfp import components
+from kfp import dsl
+from kfp.aws import use_aws_secret
+
+
+cur_file_dir = os.path.dirname(__file__)
+components_dir = os.path.join(cur_file_dir, '../../../../components/aws/sagemaker/')
+
+sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml')
+
+debug_hook = {
+    'S3OutputPath':'s3://kubeflow-pipeline-data/mnist_kmeans_example/hook_config'
+}
+
+debug_hook['CollectionConfigurations'] = []
+
+collection_list = {
+    'feature_importance' : {
+        'save_interval': '5'
+    }, 
+    'losses' : {
+        'save_interval': '10'
+    },
+    'average_shap': {
+        'save_interval': '5'
+    },
+    'metrics': {
+        'save_interval': '5'
+    }
+}
+
+for key, val in collection_list.items():
+    debug_hook['CollectionConfigurations'].append({'CollectionName': key, 'CollectionParameters': val})
+
+loss_rule = {
+    'RuleConfigurationName': 'LossNotDecreasing',
+    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
+    'RuleParameters': {
+        'rule_to_invoke': 'LossNotDecreasing',
+        'tensor_regex': '.*'
+    }
+}
+
+overtraining_rule = {
+    'RuleConfigurationName': 'Overtraining',
+    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
+    'RuleParameters': {
+        'rule_to_invoke': 'Overtraining',
+        'patience_train': '10',
+        'patience_validation': '20'
+    }
+}
+
+debug_rule_configurations=[loss_rule, overtraining_rule]
+
+bad_hyperparameters = {
+    'objective': 'reg:squarederror',
+    'max_depth': '5', 
+    'eta': '0', 
+    'gamma': '4', 
+    'min_child_weight': '6', 
+    'silent': '0', 
+    'subsample': '0.7', 
+    'num_round': '50'
+}
+
+channelObjList = []
+
+channelObj = {
+    'ChannelName': '',
+    'DataSource': {
+        'S3DataSource': {
+            'S3Uri': '',
+            'S3DataType': 'S3Prefix',
+            'S3DataDistributionType': 'FullyReplicated'
+        }
+    },
+    'ContentType': 'text/csv',
+    'CompressionType': 'None',
+    'RecordWrapperType': 'None',
+    'InputMode': 'File'
+}
+
+channelObj['ChannelName'] = 'train'
+channelObj['DataSource']['S3DataSource']['S3Uri'] = 's3://kubeflow-pipeline-data/mnist_kmeans_example/input/valid_data.csv'
+channelObjList.append(copy.deepcopy(channelObj))
+
+
+@dsl.pipeline(
+    name='xgboost-mnist-debugger',
+    description='SageMaker training job test with debugger'
+)
+def training(
+        region='us-east-1',
+        endpoint_url='',
+        image='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3',
+        training_input_mode='File',
+        hyperparameters=bad_hyperparameters,
+        channels=channelObjList,
+        instance_type='ml.m5.2xlarge',
+        instance_count=1,
+        volume_size=50,
+        max_run_time=3600,
+        model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/output/model',
+        output_encryption_key='',
+        network_isolation=True,
+        traffic_encryption=False,
+        spot_instance=False,
+        max_wait_time=3600,
+        checkpoint_config={},
+        debug_hook_config=debug_hook,
+        debug_rule_config=debug_rule_configurations,
+        tensorboard_output_config={},
+        role=''
+        ):
+    training = sagemaker_train_op(
+        region=region,
+        endpoint_url=endpoint_url,
+        image=image,
+        training_input_mode=training_input_mode,
+        hyperparameters=hyperparameters,
+        channels=channels,
+        instance_type=instance_type,
+        instance_count=instance_count,
+        volume_size=volume_size,
+        max_run_time=max_run_time,
+        model_artifact_path=model_artifact_path,
+        output_encryption_key=output_encryption_key,
+        network_isolation=network_isolation,
+        traffic_encryption=traffic_encryption,
+        spot_instance=spot_instance,
+        max_wait_time=max_wait_time,
+        checkpoint_config=checkpoint_config,
+        debug_hook_config=debug_hook_config,
+        debug_rule_config=debug_rule_config,
+        tensorboard_output_config=tensorboard_output_config,
+        role=role,
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+
+
+if __name__ == '__main__':
+    kfp.compiler.Compiler().compile(training, __file__ + '.zip')
diff --git a/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py b/samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py
@@ -17,13 +17,8 @@
 
 sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml')
 
-<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py
 debug_hook = {
     'S3OutputPath':'s3://dusluong-bucket0/xgboost-debugger/hookconfig'
-=======
-debugger_hook_config = {
-    "S3OutputPath":"s3://kubeflow-pipeline-data/mnist_kmeans_example/hookconfig",
->>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py
 }
 
 debug_hook['CollectionConfigurations'] = []
@@ -43,7 +38,6 @@
     }
 }
 
-<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py
 for key, val in collection_list.items():
     debug_hook['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val})
 
@@ -65,25 +59,6 @@
     'RuleParameters': {
         'rule_to_invoke': 'LossNotDecreasing',
         'tensor_regex': '.*'
-=======
-bad_hyperparameters = {
-    "max_depth": "5", 
-    "alpha": "100", 
-    "eta": "0.5", 
-    "gamma": "4", 
-    "min_child_weight": "6", 
-    "silent": "0", 
-    "subsample": "0.7", 
-    "num_round": "50"
-}
-
-loss_rule = {
-    "RuleConfigurationName": "LossNotDecreasing",
-    "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
-    "RuleParameters": {
-        "rule_to_invoke": "LossNotDecreasing",
-        "tensor_regex": ".*"
->>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py
     }
 }