From b19759412be421b9daa7d188530fddcfd5bc9afc Mon Sep 17 00:00:00 2001
From: Carolyn Nguyen
Date: Wed, 1 Sep 2021 16:57:55 -0700
Subject: [PATCH 1/8] Support placeholders for transform step

---
 src/stepfunctions/steps/sagemaker.py |  43 ++++++----
 tests/integ/test_sagemaker_steps.py  |  87 ++++++++++++++++++++
 tests/unit/test_sagemaker_steps.py   | 118 +++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 18 deletions(-)

diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py
index 9530478..0c2c903 100644
--- a/src/stepfunctions/steps/sagemaker.py
+++ b/src/stepfunctions/steps/sagemaker.py
@@ -181,36 +181,39 @@ def __merge_hyperparameters(self, training_step_hyperparameters, estimator_hyper
             merged_hyperparameters[key] = value
         return merged_hyperparameters
 
+
 class TransformStep(Task):
 
     """
     Creates a Task State to execute a `SageMaker Transform Job `_.
     """
 
-    def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None, input_filter=None, output_filter=None, join_source=None, **kwargs):
+    def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None,
+                 compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None,
+                 input_filter=None, output_filter=None, join_source=None, **kwargs):
         """
         Args:
             state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
             transformer (sagemaker.transformer.Transformer): The SageMaker transformer to use in the TransformStep.
             job_name (str or Placeholder): Specify a transform job name. We recommend to use :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
             model_name (str or Placeholder): Specify a model name for the transform job to use. We recommend to use :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
-            data (str): Input data location in S3.
-            data_type (str): What the S3 location defines (default: 'S3Prefix').
+            data (str or Placeholder): Input data location in S3.
+            data_type (str or Placeholder): What the S3 location defines (default: 'S3Prefix').
                 Valid values:
                     * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will be used as inputs for the transform job.
                     * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object to use as an input for the transform job.
-            content_type (str): MIME type of the input data (default: None).
-            compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
-            split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
-            experiment_config (dict, optional): Specify the experiment config for the transform. (Default: None)
+            content_type (str or Placeholder): MIME type of the input data (default: None).
+            compression_type (str or Placeholder): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
+            split_type (str or Placeholder): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
+            experiment_config (dict or Placeholder, optional): Specify the experiment config for the transform. (Default: None)
             wait_for_completion(bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True)
-            tags (list[dict], optional): `List to tags `_ to associate with the resource.
-            input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None).
-            output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None).
-            join_source (str): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
+            tags (list[dict] or Placeholder, optional): `List of tags `_ to associate with the resource.
+            input_filter (str or Placeholder): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None).
+            output_filter (str or Placeholder): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None).
+            join_source (str or Placeholder): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
""" if wait_for_completion: """ @@ -229,7 +232,7 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= SageMakerApi.CreateTransformJob) if isinstance(job_name, str): - parameters = transform_config( + transform_parameters = transform_config( transformer=transformer, data=data, data_type=data_type, @@ -242,7 +245,7 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= join_source=join_source ) else: - parameters = transform_config( + transform_parameters = transform_config( transformer=transformer, data=data, data_type=data_type, @@ -255,17 +258,21 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= ) if isinstance(job_name, Placeholder): - parameters['TransformJobName'] = job_name + transform_parameters['TransformJobName'] = job_name - parameters['ModelName'] = model_name + transform_parameters['ModelName'] = model_name if experiment_config is not None: - parameters['ExperimentConfig'] = experiment_config + transform_parameters['ExperimentConfig'] = experiment_config if tags: - parameters['Tags'] = tags_dict_to_kv_list(tags) + transform_parameters['Tags'] = tags if isinstance(tags, Placeholder) else tags_dict_to_kv_list(tags) - kwargs[Field.Parameters.value] = parameters + if Field.Parameters.value in kwargs and isinstance(kwargs[Field.Parameters.value], dict): + # Update processing_parameters with input parameters + merge_dicts(transform_parameters, kwargs[Field.Parameters.value]) + + kwargs[Field.Parameters.value] = transform_parameters super(TransformStep, self).__init__(state_id, **kwargs) diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index f840302..2937a92 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -175,6 +175,93 @@ def test_transform_step(trained_estimator, sfn_client, sfn_role_arn): state_machine_delete_wait(sfn_client, workflow.state_machine_arn) # End of Cleanup +def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role_arn): + # Create transformer from previously created estimator + job_name = generate_job_name() + pca_transformer = trained_estimator.transformer(instance_count=INSTANCE_COUNT, instance_type=INSTANCE_TYPE) + + # Create a model step to save the model + model_step = ModelStep('create_model_step', model=trained_estimator.create_model(), model_name=job_name) + + # Upload data for transformation to S3 + data_path = os.path.join(DATA_DIR, "one_p_mnist") + transform_input_path = os.path.join(data_path, "transform_input.csv") + transform_input_key_prefix = "integ-test-data/one_p_mnist/transform" + transform_input = pca_transformer.sagemaker_session.upload_data( + path=transform_input_path, key_prefix=transform_input_key_prefix + ) + + execution_input = ExecutionInput(schema={ + 'data': str, + 'content_type': str, + 'split_type': str, + 'job_name': str, + 'model_name': str, + 'instance_count': int, + 'instance_type': str, + 'strategy': str, + 'max_concurrent_transforms': int, + 'max_payload': int, + }) + + parameters = { + 'BatchStrategy': execution_input['strategy'], + 'TransformInput': { + 'SplitType': execution_input['split_type'], + }, + 'TransformResources': { + 'InstanceCount': execution_input['instance_count'], + 'InstanceType': execution_input['instance_type'], + }, + 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], + 'MaxPayloadInMB': execution_input['max_payload'] + } + + # Build workflow definition + transform_step = TransformStep( + 
'create_transform_job_step', + pca_transformer, + job_name=execution_input['job_name'], + model_name=execution_input['model_name'], + data=execution_input['data'], + content_type=execution_input['content_type'], + parameters=parameters + ) + workflow_graph = Chain([model_step, transform_step]) + + with timeout(minutes=DEFAULT_TIMEOUT_MINUTES): + # Create workflow and check definition + workflow = create_workflow_and_check_definition( + workflow_graph=workflow_graph, + workflow_name=unique_name_from_base("integ-test-transform-step-workflow"), + sfn_client=sfn_client, + sfn_role_arn=sfn_role_arn + ) + + execution_input = { + 'job_name': job_name, + 'model_name': job_name, + 'data': transform_input, + 'content_type': "text/csv", + 'instance_count': 1, + 'instance_type': "ml.m5.large", + 'split_type': 'Line', + 'strategy': 'SingleRecord', + 'max_concurrent_transforms': 2, + 'max_payload': 5 + } + + # Execute workflow + execution = workflow.execute(inputs=execution_input) + execution_output = execution.get_output(wait=True) + + # Check workflow output + assert execution_output.get("TransformJobStatus") == "Completed" + + # Cleanup + state_machine_delete_wait(sfn_client, workflow.state_machine_arn) + + def test_endpoint_config_step(trained_estimator, sfn_client, sagemaker_session, sfn_role_arn): # Setup: Create model for trained estimator in SageMaker model = trained_estimator.create_model() diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index 664a498..e873c79 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -745,6 +745,124 @@ def test_transform_step_creation(pca_transformer): } +@patch.object(boto3.session.Session, 'region_name', 'us-east-1') +def test_transform_step_creation_with_placeholder(pca_transformer): + execution_input = ExecutionInput(schema={ + 'data': str, + 'data_type': str, + 'content_type': str, + 'compression_type': str, + 'split_type': str, + 'input_filter': str, + 'output_filter': str, + 'join_source': str, + 'job_name': str, + 'model_name': str, + 'instance_count': int, + 'strategy': str, + 'assemble_with': str, + 'output_path': str, + 'output_kms_key': str, + 'accept': str, + 'max_concurrent_transforms': int, + 'max_payload': int, + 'tags': [{str: str}], + 'env': str, + 'volume_kms_key': str, + }) + + step_input = StepInput(schema={ + 'instance_type': str + }) + + parameters = { + 'BatchStrategy': execution_input['strategy'], + 'TransformOutput': { + 'Accept': execution_input['accept'], + 'AssembleWith': execution_input['assemble_with'], + 'KmsKeyId': execution_input['output_kms_key'], + 'S3OutputPath': execution_input['output_path'] + }, + 'TransformResources': { + 'InstanceCount': execution_input['instance_count'], + 'InstanceType': step_input['instance_type'], + 'VolumeKmsKeyId': execution_input['volume_kms_key'] + }, + 'Tags': execution_input['tags'], + 'Environment': execution_input['env'], + 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], + 'MaxPayloadInMB': execution_input['max_concurrent_transforms'] + } + + step = TransformStep('Inference', + transformer=pca_transformer, + data=execution_input['data'], + data_type=execution_input['data_type'], + content_type=execution_input['content_type'], + compression_type=execution_input['compression_type'], + split_type=execution_input['split_type'], + job_name=execution_input['job_name'], + model_name=execution_input['model_name'], + experiment_config={ + 'ExperimentName': 'pca_experiment', + 'TrialName': 'pca_trial', + 
'TrialComponentDisplayName': 'Transform' + }, + tags=execution_input['tags'], + join_source=execution_input['join_source'], + output_filter=execution_input['output_filter'], + input_filter=execution_input['input_filter'], + parameters=parameters + ) + + assert step.to_dict() == { + 'Type': 'Task', + 'Parameters': { + 'BatchStrategy.$': "$$.Execution.Input['strategy']", + 'ModelName.$': "$$.Execution.Input['model_name']", + 'TransformInput': { + 'CompressionType.$': "$$.Execution.Input['compression_type']", + 'ContentType.$': "$$.Execution.Input['content_type']", + 'DataSource': { + 'S3DataSource': { + 'S3DataType.$': "$$.Execution.Input['data_type']", + 'S3Uri.$': "$$.Execution.Input['data']" + } + }, + 'SplitType.$': "$$.Execution.Input['split_type']" + }, + 'TransformOutput': { + 'Accept.$': "$$.Execution.Input['accept']", + 'AssembleWith.$': "$$.Execution.Input['assemble_with']", + 'KmsKeyId.$': "$$.Execution.Input['output_kms_key']", + 'S3OutputPath.$': "$$.Execution.Input['output_path']" + }, + 'TransformJobName.$': "$$.Execution.Input['job_name']", + 'TransformResources': { + 'InstanceCount.$': "$$.Execution.Input['instance_count']", + 'InstanceType.$': "$['instance_type']", + 'VolumeKmsKeyId.$': "$$.Execution.Input['volume_kms_key']" + }, + 'ExperimentConfig': { + 'ExperimentName': 'pca_experiment', + 'TrialName': 'pca_trial', + 'TrialComponentDisplayName': 'Transform' + }, + 'DataProcessing': { + 'InputFilter.$': "$$.Execution.Input['input_filter']", + 'OutputFilter.$': "$$.Execution.Input['output_filter']", + 'JoinSource.$': "$$.Execution.Input['join_source']", + }, + 'Tags.$': "$$.Execution.Input['tags']", + 'Environment.$': "$$.Execution.Input['env']", + 'MaxConcurrentTransforms.$': "$$.Execution.Input['max_concurrent_transforms']", + 'MaxPayloadInMB.$': "$$.Execution.Input['max_concurrent_transforms']", + }, + 'Resource': 'arn:aws:states:::sagemaker:createTransformJob.sync', + 'End': True + } + + @patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call) @patch.object(boto3.session.Session, 'region_name', 'us-east-1') def test_get_expected_model(pca_estimator): From 9ff76245899c315c3e7ff3344db683b33f4ab9a7 Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen Date: Wed, 1 Sep 2021 17:34:12 -0700 Subject: [PATCH 2/8] Use max_payload in execution input --- tests/unit/test_sagemaker_steps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index e873c79..e4dc7c6 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -791,7 +791,7 @@ def test_transform_step_creation_with_placeholder(pca_transformer): 'Tags': execution_input['tags'], 'Environment': execution_input['env'], 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], - 'MaxPayloadInMB': execution_input['max_concurrent_transforms'] + 'MaxPayloadInMB': execution_input['max_payload'] } step = TransformStep('Inference', @@ -856,7 +856,7 @@ def test_transform_step_creation_with_placeholder(pca_transformer): 'Tags.$': "$$.Execution.Input['tags']", 'Environment.$': "$$.Execution.Input['env']", 'MaxConcurrentTransforms.$': "$$.Execution.Input['max_concurrent_transforms']", - 'MaxPayloadInMB.$': "$$.Execution.Input['max_concurrent_transforms']", + 'MaxPayloadInMB.$': "$$.Execution.Input['max_payload']", }, 'Resource': 'arn:aws:states:::sagemaker:createTransformJob.sync', 'End': True From 4ad008f8ee38616e8b020ab2f97ce34e36cc055d Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen Date: 
Thu, 2 Sep 2021 22:48:03 -0700
Subject: [PATCH 3/8] Adjust number of concurrent transforms

---
 tests/integ/test_sagemaker_steps.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py
index 2937a92..bc7ff76 100644
--- a/tests/integ/test_sagemaker_steps.py
+++ b/tests/integ/test_sagemaker_steps.py
@@ -243,11 +243,11 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role
         'model_name': job_name,
         'data': transform_input,
         'content_type': "text/csv",
-        'instance_count': 1,
-        'instance_type': "ml.m5.large",
+        'instance_count': INSTANCE_COUNT,
+        'instance_type': INSTANCE_TYPE,
         'split_type': 'Line',
         'strategy': 'SingleRecord',
-        'max_concurrent_transforms': 2,
+        'max_concurrent_transforms': 1,
         'max_payload': 5
     }

From bbe0f9eec8a2d5441f8b87453c6e07723f22d829 Mon Sep 17 00:00:00 2001
From: Carolyn Nguyen
Date: Fri, 3 Sep 2021 15:12:48 -0700
Subject: [PATCH 4/8] Added parameters arg doc

---
 src/stepfunctions/steps/sagemaker.py |  3 +++
 tests/integ/test_sagemaker_steps.py  | 12 +++---------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py
index 0c2c903..3b1930a 100644
--- a/src/stepfunctions/steps/sagemaker.py
+++ b/src/stepfunctions/steps/sagemaker.py
@@ -214,6 +214,9 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
             input_filter (str or Placeholder): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None).
             output_filter (str or Placeholder): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None).
             join_source (str or Placeholder): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
+            parameters(dict, optional): The value of this field is merged with other arguments to become the request payload for SageMaker `CreateTransformJob`_.
+                You can use `parameters` to override the value provided by other arguments and specify any field's value dynamically using `Placeholders`_.
+ """ if wait_for_completion: """ diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index bc7ff76..d5f6127 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -199,9 +199,7 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'model_name': str, 'instance_count': int, 'instance_type': str, - 'strategy': str, - 'max_concurrent_transforms': int, - 'max_payload': int, + 'strategy': str }) parameters = { @@ -212,9 +210,7 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'TransformResources': { 'InstanceCount': execution_input['instance_count'], 'InstanceType': execution_input['instance_type'], - }, - 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], - 'MaxPayloadInMB': execution_input['max_payload'] + } } # Build workflow definition @@ -246,9 +242,7 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'instance_count': INSTANCE_COUNT, 'instance_type': INSTANCE_TYPE, 'split_type': 'Line', - 'strategy': 'SingleRecord', - 'max_concurrent_transforms': 1, - 'max_payload': 5 + 'strategy': 'SingleRecord' } # Execute workflow From 58b1685c14a5e3037aa03866303877d668e1d452 Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen Date: Mon, 13 Sep 2021 09:26:50 -0700 Subject: [PATCH 5/8] Add retry to integ test --- tests/integ/test_sagemaker_steps.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index 29a8d8d..9d237b7 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -179,6 +179,7 @@ def test_transform_step(trained_estimator, sfn_client, sfn_role_arn): state_machine_delete_wait(sfn_client, workflow.state_machine_arn) # End of Cleanup + def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role_arn): # Create transformer from previously created estimator job_name = generate_job_name() @@ -186,6 +187,7 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role # Create a model step to save the model model_step = ModelStep('create_model_step', model=trained_estimator.create_model(), model_name=job_name) + model_step.add_retry(SAGEMAKER_RETRY_STRATEGY) # Upload data for transformation to S3 data_path = os.path.join(DATA_DIR, "one_p_mnist") @@ -227,6 +229,7 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role content_type=execution_input['content_type'], parameters=parameters ) + transform_step.add_retry(SAGEMAKER_RETRY_STRATEGY) workflow_graph = Chain([model_step, transform_step]) with timeout(minutes=DEFAULT_TIMEOUT_MINUTES): From d5755187acb4ec62f31d22bd1d9c729dab422720 Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen Date: Tue, 14 Sep 2021 17:11:16 -0700 Subject: [PATCH 6/8] Updated tests --- tests/integ/test_sagemaker_steps.py | 12 +++-- tests/unit/test_sagemaker_steps.py | 81 ++++++++++++++--------------- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index 9d237b7..e3f5724 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -205,7 +205,9 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'model_name': str, 'instance_count': int, 'instance_type': str, - 'strategy': str + 'strategy': str, + 'max_concurrent_transforms': int, + 'max_payload': int, }) parameters = { 
@@ -216,7 +218,9 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'TransformResources': { 'InstanceCount': execution_input['instance_count'], 'InstanceType': execution_input['instance_type'], - } + }, + 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], + 'MaxPayloadInMB': execution_input['max_payload'] } # Build workflow definition @@ -249,7 +253,9 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role 'instance_count': INSTANCE_COUNT, 'instance_type': INSTANCE_TYPE, 'split_type': 'Line', - 'strategy': 'SingleRecord' + 'strategy': 'SingleRecord', + 'max_concurrent_transforms': 2, + 'max_payload': 5 } # Execute workflow diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index 6b687de..7038f98 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -971,51 +971,46 @@ def test_transform_step_creation_with_placeholder(pca_transformer): parameters=parameters ) - assert step.to_dict() == { - 'Type': 'Task', - 'Parameters': { - 'BatchStrategy.$': "$$.Execution.Input['strategy']", - 'ModelName.$': "$$.Execution.Input['model_name']", - 'TransformInput': { - 'CompressionType.$': "$$.Execution.Input['compression_type']", - 'ContentType.$': "$$.Execution.Input['content_type']", - 'DataSource': { - 'S3DataSource': { - 'S3DataType.$': "$$.Execution.Input['data_type']", - 'S3Uri.$': "$$.Execution.Input['data']" - } - }, - 'SplitType.$': "$$.Execution.Input['split_type']" - }, - 'TransformOutput': { - 'Accept.$': "$$.Execution.Input['accept']", - 'AssembleWith.$': "$$.Execution.Input['assemble_with']", - 'KmsKeyId.$': "$$.Execution.Input['output_kms_key']", - 'S3OutputPath.$': "$$.Execution.Input['output_path']" - }, - 'TransformJobName.$': "$$.Execution.Input['job_name']", - 'TransformResources': { - 'InstanceCount.$': "$$.Execution.Input['instance_count']", - 'InstanceType.$': "$['instance_type']", - 'VolumeKmsKeyId.$': "$$.Execution.Input['volume_kms_key']" - }, - 'ExperimentConfig': { - 'ExperimentName': 'pca_experiment', - 'TrialName': 'pca_trial', - 'TrialComponentDisplayName': 'Transform' - }, - 'DataProcessing': { - 'InputFilter.$': "$$.Execution.Input['input_filter']", - 'OutputFilter.$': "$$.Execution.Input['output_filter']", - 'JoinSource.$': "$$.Execution.Input['join_source']", + assert step.to_dict()['Parameters'] == { + 'BatchStrategy.$': "$$.Execution.Input['strategy']", + 'ModelName.$': "$$.Execution.Input['model_name']", + 'TransformInput': { + 'CompressionType.$': "$$.Execution.Input['compression_type']", + 'ContentType.$': "$$.Execution.Input['content_type']", + 'DataSource': { + 'S3DataSource': { + 'S3DataType.$': "$$.Execution.Input['data_type']", + 'S3Uri.$': "$$.Execution.Input['data']" + } }, - 'Tags.$': "$$.Execution.Input['tags']", - 'Environment.$': "$$.Execution.Input['env']", - 'MaxConcurrentTransforms.$': "$$.Execution.Input['max_concurrent_transforms']", - 'MaxPayloadInMB.$': "$$.Execution.Input['max_payload']", + 'SplitType.$': "$$.Execution.Input['split_type']" }, - 'Resource': 'arn:aws:states:::sagemaker:createTransformJob.sync', - 'End': True + 'TransformOutput': { + 'Accept.$': "$$.Execution.Input['accept']", + 'AssembleWith.$': "$$.Execution.Input['assemble_with']", + 'KmsKeyId.$': "$$.Execution.Input['output_kms_key']", + 'S3OutputPath.$': "$$.Execution.Input['output_path']" + }, + 'TransformJobName.$': "$$.Execution.Input['job_name']", + 'TransformResources': { + 'InstanceCount.$': 
"$$.Execution.Input['instance_count']", + 'InstanceType.$': "$['instance_type']", + 'VolumeKmsKeyId.$': "$$.Execution.Input['volume_kms_key']" + }, + 'ExperimentConfig': { + 'ExperimentName': 'pca_experiment', + 'TrialName': 'pca_trial', + 'TrialComponentDisplayName': 'Transform' + }, + 'DataProcessing': { + 'InputFilter.$': "$$.Execution.Input['input_filter']", + 'OutputFilter.$': "$$.Execution.Input['output_filter']", + 'JoinSource.$': "$$.Execution.Input['join_source']", + }, + 'Tags.$': "$$.Execution.Input['tags']", + 'Environment.$': "$$.Execution.Input['env']", + 'MaxConcurrentTransforms.$': "$$.Execution.Input['max_concurrent_transforms']", + 'MaxPayloadInMB.$': "$$.Execution.Input['max_payload']" } From 3d7970adab4da1015c4c440f38a0c9d101ddbe9d Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen <83104894+ca-nguyen@users.noreply.github.com> Date: Fri, 15 Oct 2021 10:28:00 -0700 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Eliot V Scott Co-authored-by: Shiv Lakshminarayan --- src/stepfunctions/steps/sagemaker.py | 2 +- tests/integ/test_sagemaker_steps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py index e209605..6d8ccc1 100644 --- a/src/stepfunctions/steps/sagemaker.py +++ b/src/stepfunctions/steps/sagemaker.py @@ -276,7 +276,7 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= transform_parameters['Tags'] = tags if isinstance(tags, Placeholder) else tags_dict_to_kv_list(tags) if Field.Parameters.value in kwargs and isinstance(kwargs[Field.Parameters.value], dict): - # Update processing_parameters with input parameters + # Update transform_parameters with input parameters merge_dicts(transform_parameters, kwargs[Field.Parameters.value]) kwargs[Field.Parameters.value] = transform_parameters diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index e3f5724..0230f69 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -181,7 +181,7 @@ def test_transform_step(trained_estimator, sfn_client, sfn_role_arn): def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role_arn): - # Create transformer from previously created estimator + # Create transformer from supplied estimator job_name = generate_job_name() pca_transformer = trained_estimator.transformer(instance_count=INSTANCE_COUNT, instance_type=INSTANCE_TYPE) From 1422243911104b6ba2a1ba2847a5a796aed36a81 Mon Sep 17 00:00:00 2001 From: Carolyn Nguyen Date: Fri, 15 Oct 2021 10:42:03 -0700 Subject: [PATCH 8/8] Correct indentation and use placeholder for experiment_config in unit test --- tests/integ/test_sagemaker_steps.py | 22 +++++++++++----------- tests/unit/test_sagemaker_steps.py | 10 ++++------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/integ/test_sagemaker_steps.py b/tests/integ/test_sagemaker_steps.py index 0230f69..a04f110 100644 --- a/tests/integ/test_sagemaker_steps.py +++ b/tests/integ/test_sagemaker_steps.py @@ -211,17 +211,17 @@ def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role }) parameters = { - 'BatchStrategy': execution_input['strategy'], - 'TransformInput': { - 'SplitType': execution_input['split_type'], - }, - 'TransformResources': { - 'InstanceCount': execution_input['instance_count'], - 'InstanceType': execution_input['instance_type'], - }, - 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], - 
'MaxPayloadInMB': execution_input['max_payload'] - } + 'BatchStrategy': execution_input['strategy'], + 'TransformInput': { + 'SplitType': execution_input['split_type'], + }, + 'TransformResources': { + 'InstanceCount': execution_input['instance_count'], + 'InstanceType': execution_input['instance_type'], + }, + 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], + 'MaxPayloadInMB': execution_input['max_payload'] + } # Build workflow definition transform_step = TransformStep( diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index 7038f98..cc17bae 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -925,6 +925,7 @@ def test_transform_step_creation_with_placeholder(pca_transformer): 'tags': [{str: str}], 'env': str, 'volume_kms_key': str, + 'experiment_config': str, }) step_input = StepInput(schema={ @@ -944,10 +945,11 @@ def test_transform_step_creation_with_placeholder(pca_transformer): 'InstanceType': step_input['instance_type'], 'VolumeKmsKeyId': execution_input['volume_kms_key'] }, + 'ExperimentConfig': execution_input['experiment_config'], 'Tags': execution_input['tags'], 'Environment': execution_input['env'], 'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'], - 'MaxPayloadInMB': execution_input['max_payload'] + 'MaxPayloadInMB': execution_input['max_payload'], } step = TransformStep('Inference', @@ -997,11 +999,7 @@ def test_transform_step_creation_with_placeholder(pca_transformer): 'InstanceType.$': "$['instance_type']", 'VolumeKmsKeyId.$': "$$.Execution.Input['volume_kms_key']" }, - 'ExperimentConfig': { - 'ExperimentName': 'pca_experiment', - 'TrialName': 'pca_trial', - 'TrialComponentDisplayName': 'Transform' - }, + 'ExperimentConfig.$': "$$.Execution.Input['experiment_config']", 'DataProcessing': { 'InputFilter.$': "$$.Execution.Input['input_filter']", 'OutputFilter.$': "$$.Execution.Input['output_filter']",
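
Taken together, this patch series lets every major `TransformStep` argument be supplied as a `Placeholder` and adds a `parameters` kwarg whose contents are merged over the generated request. A minimal usage sketch of the resulting API, modeled on the integ test above (the `transformer` object and the step name here are illustrative, not taken from the patches):

```python
from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import TransformStep

# Values resolved at execution time rather than when the workflow is defined.
execution_input = ExecutionInput(schema={
    'job_name': str,
    'model_name': str,
    'data': str,
    'instance_type': str,
})

# Fields without a named argument (or ones to override) go through `parameters`
# and are merged into the generated CreateTransformJob request payload.
transform_step = TransformStep(
    'transform-with-placeholders',
    transformer=transformer,  # assumed: an existing sagemaker.transformer.Transformer
    job_name=execution_input['job_name'],
    model_name=execution_input['model_name'],
    data=execution_input['data'],
    content_type='text/csv',
    parameters={
        'TransformResources': {'InstanceType': execution_input['instance_type']},
        'MaxPayloadInMB': 5,
    }
)
```

Each placeholder serializes to a `.$` field in the state's Parameters (e.g. `"TransformJobName.$": "$$.Execution.Input['job_name']"`), so `workflow.execute(inputs={...})` fills it in per execution; as the unit test shows, a `StepInput` placeholder instead resolves against the state's own input (`"$['instance_type']"`).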
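The override behavior rests on the SDK's `merge_dicts` helper (its import is not shown in this diff), which lays the user-supplied `parameters` over the config built from the named arguments. A sketch of the assumed merge semantics, as an illustration rather than the library's actual code:

```python
def merge_dicts(target, source):
    """Recursively merge `source` into `target`, mutating `target` in place."""
    for key, value in source.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            # Both sides are dicts: recurse so sibling keys in `target` survive.
            merge_dicts(target[key], value)
        else:
            # Otherwise the user-supplied value wins outright.
            target[key] = value
```

Under these semantics the integ test can override only `TransformInput.SplitType` while the generated `TransformInput.DataSource` block from the transformer config stays intact.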