Skip to content

Commit

Permalink
Removed custom rules
Browse files Browse the repository at this point in the history
    pick 4a4265c Unit test passes
    squash 345b8a7 Updated unit tests, working on integration tests
  • Loading branch information
dstnluong committed Jul 28, 2020
1 parent 11c872b commit 1551e34
Show file tree
Hide file tree
Showing 9 changed files with 336 additions and 97 deletions.
15 changes: 5 additions & 10 deletions components/aws/sagemaker/common/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,21 +210,16 @@ def create_training_job_request(args):
if 'CollectionConfigurations' in args['debug_hook_config']:
logging.into('Existing CollectionConfigurations in debug_hook_config will be overwritten. Move and reformat into collection_config parameter')
raise Exception('Could not create job request')
if 'S3OutputPath' not in args['debug_hook_config']:
logging.info('DebugHookConfig requires an S3OutputPath to be defined.')
raise Exception('Could not create job request')
request['DebugHookConfig'] = args['debug_hook_config']
request['DebugHookConfig']['CollectionConfigurations'] = []
else:
request.pop('DebugHookConfig')

if args['collection_config']:
if 'DebugHookConfig' not in request:
logging.info('CollectionConfigurations requires a debug hook to be configured.')
raise Exception('Could not create job request')
for key, val in args['collection_config'].items():
request['DebugHookConfig']['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val})

if not args['debug_hook_config'] and not args['collection_config']:
request.pop('DebugHookConfig')

if args['debug_rule_config']:
request['DebugRuleConfigurations'] = args['debug_rule_config']
else:
Expand Down Expand Up @@ -287,6 +282,7 @@ def wait_for_debug_rules(client, training_job_name, poll_interval=31):
print_debug_rule_status(response)
time.sleep(poll_interval)


def debug_rules_errored(response):
if 'DebugRuleEvaluationStatuses' in response:
for debug_rule in response['DebugRuleEvaluationStatuses']:
Expand All @@ -295,7 +291,6 @@ def debug_rules_errored(response):
return False



def debug_rules_completed(response):
if 'DebugRuleEvaluationStatuses' in response:
for debug_rule in response['DebugRuleEvaluationStatuses']:
Expand All @@ -308,7 +303,7 @@ def print_debug_rule_status(response, verbose=False):
for debug_rule in response['DebugRuleEvaluationStatuses']:
logging.info(" - {}: {}".format(debug_rule['RuleConfigurationName'], debug_rule['RuleEvaluationStatus']))
if verbose and 'StatusDetails' in debug_rule:
logging.info(" - {}".format(debug_rule['StatusDetails']))
logging.info(" - {}".format(debug_rule['StatusDetails']).rstrip())



Expand Down
3 changes: 0 additions & 3 deletions components/aws/sagemaker/common/train.template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ StoppingCondition:
MaxWaitTimeInSeconds: 86400
DebugHookConfig:
CollectionConfigurations: []
HookParameters: {}
LocalPath: ''
S3OutputPath: ''
DebugRuleConfigurations: []
CheckpointConfig:
S3Uri: ''
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,11 +387,9 @@ def test_hook_good_args(self):
}])

def test_hook_bad_args(self):
no_s3_uri_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"LocalPath": "/opt/ml/output/tensors/"}'])
config_in_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{"S3OutputPath": "s3://fake-uri/", "CollectionConfigurations": [{"CollectionName": "collection1", "CollectionParameters": {"key1": "value1"}}]}'])
no_hook_args = self.parser.parse_args(required_args + ['--debug_hook_config', '{}', '--collection_config', '{"collection1": {"key1": "value1"}}'])

for arg in [no_s3_uri_args, no_hook_args]:
for arg in [config_in_hook_args]:
with self.assertRaises(Exception):
_utils.create_training_job_request(vars(arg))

Expand Down
50 changes: 28 additions & 22 deletions samples/contrib/aws-samples/debugger_component_demo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,49 +3,55 @@
An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train). The training component has


## Prerequisites
## Prerequisites

This pipeline uses the exact same setup as [simple_training_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline).

## Steps
1. Compile the pipeline:
`dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz`
## Steps
1. Compile the pipeline:
`dsl-compile --py debugger-component-demo.py --output debugger-component-demo.tar.gz`
2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
3. Once the pipeline completes, you can view the results of each debugger rule under 'Logs'.

Inputs format to `debug_hook_config`, `collection_config`, and `debug_rule_config` :
Input format for `debug_hook_config` and `debug_rule_config`:
```buildoutcfg
debug_hook_config = {
"S3OutputPath": "s3://<your_bucket_name>/path/for/data/emission/",
"LocalPath": "/local/path/for/data/emission/",
"CollectionConfigurations": [
{
"CollectionName": "losses",
"CollectionParameters": {
"start_step": "25",
"end_step": "150"
}
}, {
"CollectionName": "gradient",
"CollectionParameters": {
"start_step": "5",
"end_step": "100"
}
}
],
"HookParameters": {
"save_interval": "10"
}
}
collection_config = {
"collection_name_1": {
"include_regex": ".*"
},
"collection_name_2": {
"include_regex": ".*"
}
}
debug_rule_config = {
debug_rule_config = {
"RuleConfigurationName": "rule_name",
"RuleEvaluatorImage": "123456789011.dkr.ecr.<region>.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3",
"RuleEvaluatorImage": "503895931360.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {
"rule_to_invoke": "LossNotDecreasing",
"tensor_regex": ".*"
}
}
```
The provided demo pipeline `training_pipeline.py` uses a built-in rule. When using a built-in rule, `RuleConfigurationName` and `RuleParameters` take on the values listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html), and the possible values of `RuleEvaluatorImage`, listed [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-built-in-registry-ids), depend on the specified region.

In the case of using and writing your own custom rule, `RuleEvaluatorImage` will take on a value from [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html#debuger-custom-rule-registry-ids). An example of a custom debugger rule can be found [here](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html#debugger-custom-rules-api).

# Resources
* [Amazon SageMaker Debugger] (https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html)
* [Debugger Built-In Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html)
* [Pre-built Docker Images for Rules] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-docker-images-rules.html)
* [Debugger API Examples] (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html)
* [Amazon SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html)
* [Available Frameworks to Use Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html#debugger-supported-aws-containers)
* [Debugger Built-In Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html)
* [Debugger Custom Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-custom-rules.html)
* [Debugger API Examples](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html)

Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md

import kfp
import json
import os
import copy
from kfp import components
from kfp import dsl
from kfp.aws import use_aws_secret


# Locate the SageMaker components directory relative to this script's location.
cur_file_dir = os.path.dirname(__file__)
components_dir = os.path.join(cur_file_dir, '../../../../components/aws/sagemaker/')

# components_dir already ends with a separator, so concatenating
# '/train/component.yaml' onto it produced a doubled slash. Joining the
# segments with os.path.join yields a clean path to the train component spec.
sagemaker_train_op = components.load_component_from_file(
    os.path.join(components_dir, 'train', 'component.yaml')
)

# Tensor collections the debugger hook should save, keyed by collection name.
# Each value is forwarded verbatim as that collection's CollectionParameters.
collection_list = {
    'feature_importance': {
        'save_interval': '5',
    },
    'losses': {
        'save_interval': '10',
    },
    'average_shap': {
        'save_interval': '5',
    },
    'metrics': {
        'save_interval': '5',
    },
}

# Debugger hook configuration passed to the pipeline as `debug_hook_config`.
# The collection entries are derived from `collection_list` in a single
# expression instead of appending to an initially empty list.
debug_hook = {
    'S3OutputPath': 's3://kubeflow-pipeline-data/mnist_kmeans_example/hook_config',
    'CollectionConfigurations': [
        {'CollectionName': name, 'CollectionParameters': params}
        for name, params in collection_list.items()
    ],
}

# Debugger rule that invokes `LossNotDecreasing` over every tensor ('.*').
loss_rule = {
    'RuleConfigurationName': 'LossNotDecreasing',
    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
    'RuleParameters': {
        'rule_to_invoke': 'LossNotDecreasing',
        'tensor_regex': '.*',
    },
}

# Debugger rule that invokes `Overtraining` with explicit patience settings.
overtraining_rule = {
    'RuleConfigurationName': 'Overtraining',
    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
    'RuleParameters': {
        'rule_to_invoke': 'Overtraining',
        'patience_train': '10',
        'patience_validation': '20',
    },
}

# Rules passed to the pipeline as `debug_rule_config`.
debug_rule_configurations = [loss_rule, overtraining_rule]

# Hyperparameters used as the pipeline default.
# NOTE(review): the name and the 'eta' of '0' suggest these are deliberately
# poor so that the debugger rules fire -- confirm.
bad_hyperparameters = {
    'objective': 'reg:squarederror',
    'max_depth': '5',
    'eta': '0',
    'gamma': '4',
    'min_child_weight': '6',
    'silent': '0',
    'subsample': '0.7',
    'num_round': '50',
}

# Input channel definition for the 'train' channel, written out directly
# rather than filled in by mutating blank placeholder fields afterwards.
channelObj = {
    'ChannelName': 'train',
    'DataSource': {
        'S3DataSource': {
            'S3Uri': 's3://kubeflow-pipeline-data/mnist_kmeans_example/input/valid_data.csv',
            'S3DataType': 'S3Prefix',
            'S3DataDistributionType': 'FullyReplicated',
        },
    },
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'RecordWrapperType': 'None',
    'InputMode': 'File',
}

# The list holds an independent deep copy, so later edits to `channelObj`
# do not leak into the channels handed to the pipeline.
channelObjList = [copy.deepcopy(channelObj)]


@dsl.pipeline(
    name='xgboost-mnist-debugger',
    description='SageMaker training job test with debugger'
)
def training(
    region='us-east-1',
    endpoint_url='',
    image='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3',
    training_input_mode='File',
    hyperparameters=bad_hyperparameters,
    channels=channelObjList,
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    volume_size=50,
    max_run_time=3600,
    model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/output/model',
    output_encryption_key='',
    network_isolation=True,
    traffic_encryption=False,
    spot_instance=False,
    max_wait_time=3600,
    checkpoint_config={},
    debug_hook_config=debug_hook,
    debug_rule_config=debug_rule_configurations,
    tensorboard_output_config={},
    role=''
):
    """Define a single-step pipeline that runs the SageMaker train component.

    Every pipeline parameter is forwarded unchanged to the train component
    under the same name; the defaults come from the module-level
    configuration above (hyperparameters, channels, debugger hook and rules).
    """
    # NOTE(review): the dict/list defaults are kept as-is to preserve the
    # pipeline's interface; presumably KFP only reads them as parameter
    # defaults rather than mutating them -- confirm.
    #
    # The result is intentionally not bound to a local: the previous
    # `training = ...` shadowed this function's own name and was never read.
    sagemaker_train_op(
        region=region,
        endpoint_url=endpoint_url,
        image=image,
        training_input_mode=training_input_mode,
        hyperparameters=hyperparameters,
        channels=channels,
        instance_type=instance_type,
        instance_count=instance_count,
        volume_size=volume_size,
        max_run_time=max_run_time,
        model_artifact_path=model_artifact_path,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation,
        traffic_encryption=traffic_encryption,
        spot_instance=spot_instance,
        max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config,
        debug_hook_config=debug_hook_config,
        debug_rule_config=debug_rule_config,
        tensorboard_output_config=tensorboard_output_config,
        role=role,
    )  # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == '__main__':
    # When run as a script, compile the pipeline into an archive written
    # next to this file, named after it with a '.zip' suffix.
    output_package = __file__ + '.zip'
    kfp.compiler.Compiler().compile(training, output_package)
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,8 @@

sagemaker_train_op = components.load_component_from_file(components_dir + '/train/component.yaml')

<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py
debug_hook = {
'S3OutputPath':'s3://dusluong-bucket0/xgboost-debugger/hookconfig'
=======
debugger_hook_config = {
"S3OutputPath":"s3://kubeflow-pipeline-data/mnist_kmeans_example/hookconfig",
>>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py
}

debug_hook['CollectionConfigurations'] = []
Expand All @@ -43,7 +38,6 @@
}
}

<<<<<<< HEAD:samples/contrib/aws-samples/debugger_component_demo/training-pipeline.py
for key, val in collection_list.items():
debug_hook['CollectionConfigurations'].append({"CollectionName": key, "CollectionParameters": val})

Expand All @@ -65,25 +59,6 @@
'RuleParameters': {
'rule_to_invoke': 'LossNotDecreasing',
'tensor_regex': '.*'
=======
bad_hyperparameters = {
"max_depth": "5",
"alpha": "100",
"eta": "0.5",
"gamma": "4",
"min_child_weight": "6",
"silent": "0",
"subsample": "0.7",
"num_round": "50"
}

loss_rule = {
"RuleConfigurationName": "LossNotDecreasing",
"RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest",
"RuleParameters": {
"rule_to_invoke": "LossNotDecreasing",
"tensor_regex": ".*"
>>>>>>> eafc91fe... Refactored wait_for_debug_rules, added unit tests, updated readme for debugger demo, fixed typos and small errors:samples/contrib/aws-samples/debugger_component_demo/debugger_component_demo.py
}
}

Expand Down
Loading

0 comments on commit 1551e34

Please sign in to comment.