From 21631fa620b3e109a0720e7ec8ea8b9ea3536b71 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Thu, 11 Feb 2021 15:24:13 -0800
Subject: [PATCH] Format all working *.py and *.ipynb
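
Normalize formatting across all working *.py files and *.ipynb code cells:
strings become double-quoted, spacing around operators and arguments is made
consistent, and long call sites are wrapped one argument per line. The
formatter configuration is added in the new pyproject.toml.

The output is consistent with black (an inference from the new pyproject.toml
and the quote/wrapping style; the exact tool and options are whatever
pyproject.toml pins). Roughly:

    black .          # format *.py sources
    nbqa black .     # format code cells inside *.ipynb (assumes nbqa)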
---
00_quickstart/01_Setup_Dependencies.ipynb | 4 +-
00_quickstart/02_Copy_TSV_To_S3.ipynb | 24 +-
00_quickstart/03_Create_Athena_Database.ipynb | 27 +-
.../04_Register_S3_TSV_With_Athena.ipynb | 59 +-
...onvert_S3_TSV_To_Parquet_With_Athena.ipynb | 59 +-
.../06_Visualize_Reviews_Dataset.ipynb | 340 +++--
.../07_Prepare_Dataset_Bias_Analysis.ipynb | 84 +-
.../08_Run_Data_Bias_Analysis_AdHoc.ipynb | 40 +-
...Run_Data_Bias_Analysis_ProcessingJob.ipynb | 73 +-
...eate_SageMaker_Pipeline_BERT_Reviews.ipynb | 563 ++++-----
.../11_Evaluate_Pipeline_Execution.ipynb | 73 +-
00_quickstart/12_Register_Deploy_Model.ipynb | 107 +-
00_quickstart/13_Cleanup.ipynb | 4 +-
00_quickstart/evaluate_model_metrics.py | 218 ++--
...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
00_quickstart/src/inference.py | 113 +-
00_quickstart/src/tf_bert_reviews.py | 676 +++++-----
01_setup/01_Setup_Dependencies.ipynb | 4 +-
01_setup/02_Check_Environment.ipynb | 57 +-
01_setup/03_Create_S3_Bucket.ipynb | 10 +-
.../04_Update_IAM_Roles_And_Policies.ipynb | 362 +++---
02_usecases/01_Setup.ipynb | 2 +-
02_usecases/03_Celebrity_Recognition.ipynb | 127 +-
02_usecases/04_Content_Moderation.ipynb | 73 +-
.../05_Inappropriate_Text_Detection.ipynb | 158 ++-
..._Text_Classification_Prepare_Dataset.ipynb | 104 +-
.../07_Text_Classification_Train_Model.ipynb | 232 ++--
.../08_Text_Classification_Predict.ipynb | 67 +-
.../archive/05_Celebrity_Detection.ipynb | 126 +-
03_automl/01_Prepare_Dataset_Autopilot.ipynb | 156 ++-
03_automl/02_Train_Reviews_Autopilot.ipynb | 502 +++++---
03_automl/03_Predict_Reviews_Autopilot.ipynb | 52 +-
.../candidate_data_processors/dpp0.py | 27 +-
.../candidate_data_processors/dpp1.py | 28 +-
.../candidate_data_processors/dpp2.py | 27 +-
.../sagemaker_serve.py | 86 +-
03_automl/generated_module/setup.py | 16 +-
...AutopilotCandidateDefinitionNotebook.ipynb | 245 ++--
.../notebooks/sagemaker_automl/common.py | 71 +-
.../notebooks/sagemaker_automl/config.py | 39 +-
.../sagemaker_automl/interactive_runner.py | 46 +-
.../sagemaker_automl/local_candidate.py | 84 +-
03_automl/notebooks/sagemaker_automl/steps.py | 46 +-
04_ingest/01_Copy_TSV_To_S3.ipynb | 72 +-
04_ingest/02_Create_Athena_Database.ipynb | 27 +-
.../03_Register_S3_TSV_With_Athena.ipynb | 59 +-
...onvert_S3_TSV_To_Parquet_With_Athena.ipynb | 59 +-
.../05_Query_Data_With_AWS_DataWrangler.ipynb | 76 +-
05_explore/01_Visualize_Reviews_Dataset.ipynb | 338 +++--
.../02_Prepare_Dataset_Bias_Analysis.ipynb | 84 +-
.../03_Run_Data_Bias_Analysis_AdHoc.ipynb | 40 +-
...Run_Data_Bias_Analysis_ProcessingJob.ipynb | 77 +-
...e_Data_Quality_ProcessingJob_PySpark.ipynb | 105 +-
...GENERATED_Data_Wrangler_Job_Notebook.ipynb | 35 +-
05_explore/99_GENERATED_Python_Code.py | 151 ++-
...TED_SageMaker_Feature_Store_Notebook.ipynb | 215 ++--
...ENERATED_SageMaker_Pipeline_Notebook.ipynb | 95 +-
.../01_Visualize_Reviews_Dataset.ipynb | 402 +++---
.../archive/02_Explore_Redshift_Data.ipynb | 113 +-
05_explore/preprocess-deequ-pyspark.py | 166 ++-
...taset_BERT_Scikit_AdHoc_FeatureStore.ipynb | 268 ++--
..._BERT_Scikit_ScriptMode_FeatureStore.ipynb | 250 ++--
.../data-wrangler/DataWranglerJob_Antje.ipynb | 35 +-
.../DataWrangler_To_FeatureStore_Antje.ipynb | 219 ++--
.../DataWrangler_To_Pipeline_Antje.ipynb | 110 +-
.../data-wrangler/data_wrangler_antje.py | 148 ++-
...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
...s_BERT_Transformers_TensorFlow_AdHoc.ipynb | 197 ++-
...T_Transformers_TensorFlow_ScriptMode.ipynb | 367 +++---
...T_Transformers_TensorFlow_To_PyTorch.ipynb | 105 +-
07_train/04_Evaluate_Model_Metrics.ipynb | 161 +--
.../00_Prepare_Dataset_BERT.ipynb | 143 ++-
.../00_setup_eks/00_01_Setup_EKS.ipynb | 3 +-
.../00_setup_eks/00_04_Setup_FSX.ipynb | 51 +-
.../01_Develop_Code_Notebook.ipynb | 413 +++---
.../03_Run_ML_Training_SageMaker.ipynb | 155 ++-
07_train/container-demo/code/train.py | 417 +++---
07_train/evaluate_model_metrics.py | 218 ++--
07_train/src/inference.py | 113 +-
07_train/src/tf_bert_reviews.py | 676 +++++-----
...meter_Tuning_Reviews_BERT_TensorFlow.ipynb | 233 ++--
...meter_Tuning_Reviews_BERT_TensorFlow.ipynb | 300 +++--
08_optimize/src/inference.py | 113 +-
08_optimize/src/tf_bert_reviews.py | 676 +++++-----
...ageMaker_Autopilot_Model_From_Athena.ipynb | 140 ++-
...y_Reviews_BERT_PyTorch_REST_Endpoint.ipynb | 87 +-
...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 105 +-
...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 93 +-
...views_BERT_TensorFlow_REST_Endpoints.ipynb | 389 +++---
09_deploy/code-pytorch/inference.py | 96 +-
09_deploy/code/inference.py | 113 +-
09_deploy/common/docker_utils.py | 52 +-
09_deploy/common/env_utils.py | 72 +-
09_deploy/common/markdown_helper.py | 23 +-
09_deploy/common/misc.py | 90 +-
.../common/sagemaker_rl/coach_launcher.py | 158 +--
.../common/sagemaker_rl/configuration_list.py | 27 +-
09_deploy/common/sagemaker_rl/docker_utils.py | 10 +-
09_deploy/common/sagemaker_rl/mpi_launcher.py | 104 +-
09_deploy/common/sagemaker_rl/onnx_utils.py | 28 +-
.../clients/ddb/experiment_db_client.py | 124 +-
.../clients/ddb/join_db_client.py | 98 +-
.../clients/ddb/model_db_client.py | 126 +-
.../exceptions/ddb_client_exceptions.py | 4 +-
.../exceptions/workflow_exceptions.py | 8 +-
.../orchestrator/resource_manager.py | 350 +++---
.../orchestrator/utils/cloudwatch_logger.py | 173 +--
.../workflow/datatypes/experiment_record.py | 39 +-
.../workflow/datatypes/join_job_record.py | 73 +-
.../workflow/datatypes/model_record.py | 103 +-
.../workflow/manager/experiment_manager.py | 1116 +++++++++--------
.../workflow/manager/join_manager.py | 271 ++--
.../workflow/manager/model_manager.py | 314 ++---
09_deploy/common/sagemaker_rl/ray_launcher.py | 99 +-
.../sagemaker_rl/sage_cluster_communicator.py | 21 +-
.../sagemaker_rl/stable_baselines_launcher.py | 89 +-
.../common/sagemaker_rl/tf_serving_utils.py | 17 +-
09_deploy/src/eval-cfa-vw.py | 38 +-
09_deploy/src/io_utils.py | 14 +-
09_deploy/src/train-vw.py | 47 +-
09_deploy/src/vw_model.py | 39 +-
09_deploy/src/vw_utils.py | 2 +-
...eate_SageMaker_Pipeline_BERT_Reviews.ipynb | 567 ++++-----
.../02_Evaluate_Pipeline_Execution.ipynb | 77 +-
10_pipeline/03_Register_Deploy_Model.ipynb | 156 +--
10_pipeline/airflow/00_Create_S3_Bucket.ipynb | 34 +-
.../01_Setup_Airflow_Dependencies.ipynb | 152 ++-
.../02_Create_Airflow_Environment.ipynb | 74 +-
.../03_Trigger_Airflow_Environment.ipynb | 52 +-
10_pipeline/airflow/dags/bert_reviews.py | 83 +-
10_pipeline/airflow/dags/config.py | 29 +-
10_pipeline/airflow/dags/pipeline/prepare.py | 65 +-
.../airflow/dags/pipeline/preprocess.py | 79 +-
10_pipeline/airflow/src/config.py | 29 +-
.../dag_ml_pipeline_amazon_video_reviews.py | 91 +-
10_pipeline/evaluate_model_metrics.py | 218 ++--
10_pipeline/human/00_Overview.ipynb | 2 +-
.../01_Setup_Augmented_AI_Workflow.ipynb | 64 +-
...om_Comprehend_Custom_Text_Classifier.ipynb | 103 +-
10_pipeline/kubeflow/00_00_Setup_EKS.ipynb | 8 +-
..._05_Launch_Kubeflow_Jupyter_Notebook.ipynb | 4 +-
.../02_Kubeflow_Pipeline_Simple.ipynb | 38 +-
...flow_Pipeline_Reviews_BERT_SageMaker.ipynb | 334 ++---
...LIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb | 2 +-
10_pipeline/kubeflow/code/inference.py | 113 +-
10_pipeline/kubeflow/code/tf_bert_reviews.py | 676 +++++-----
.../kubeflow/evaluate_model_metrics.py | 218 ++--
...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
...ageMaker_Pipeline_BERT_Reviews_MLOps.ipynb | 389 +++---
.../dsoaws/evaluate_model_metrics.py | 218 ++--
.../pipelines/dsoaws/inference.py | 113 +-
.../pipelines/dsoaws/pipeline.py | 458 +++----
...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
.../pipelines/dsoaws/tf_bert_reviews.py | 676 +++++-----
.../pipelines/run_pipeline.py | 137 +-
.../sagemaker-project-modelbuild/setup.py | 8 +-
.../test/test.py | 4 +-
...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
10_pipeline/src/inference.py | 113 +-
10_pipeline/src/tf_bert_reviews.py | 676 +++++-----
...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 34 +-
...y_Reviews_BERT_TensorFlow_S3_Trigger.ipynb | 716 +++++------
...d_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb | 291 +++--
11_stream/01_Setup_IAM.ipynb | 211 +---
...02_Create_Lambda_To_Invoke_SageMaker.ipynb | 135 +-
.../03_Create_Kinesis_Data_Firehose.ipynb | 162 ++-
11_stream/04_Create_Kinesis_Data_Stream.ipynb | 59 +-
...Create_Lambda_Destination_CloudWatch.ipynb | 89 +-
.../06_Create_Lambda_Destination_SNS.ipynb | 104 +-
...07_Create_Kinesis_Data_Analytics_App.ipynb | 227 ++--
...Put_Reviews_On_Kinesis_Data_Firehose.ipynb | 217 ++--
.../archive/11_stream.orig/00_Overview.ipynb | 2 +-
.../archive/11_stream.orig/01_Setup_IAM.ipynb | 192 +--
.../02_Create_Kinesis_Data_Firehose.ipynb | 89 +-
.../03_Create_Kinesis_Data_Stream.ipynb | 61 +-
.../04_Create_Lambda_Destination.ipynb | 84 +-
...05_Create_Kinesis_Data_Analytics_App.ipynb | 201 ++-
...Put_Reviews_On_Kinesis_Data_Firehose.ipynb | 202 +--
.../11_stream.orig/src/lambda_function.py | 48 +-
.../src/deliver_metrics_to_cloudwatch.py | 48 +-
.../src/invoke_sm_endpoint_from_kinesis.py | 75 +-
11_stream/src/push_notification_to_sns.py | 43 +-
12_security/01_Secrets_Manager.ipynb | 29 +-
12_security/02_Insecure_DataAccess_S3.ipynb | 4 +-
...cure_DataAccess_S3_BucketPolicy_Role.ipynb | 32 +-
...ecure_DataAccess_S3_BucketPolicy_VPC.ipynb | 51 +-
..._Secure_DataAccess_S3_IAMPolicy_Role.ipynb | 48 +-
...a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb | 57 +-
...5_Secure_SageMaker_Notebook_Instance.ipynb | 16 +-
12_security/07_Insecure_Train.ipynb | 185 +--
.../08_Secure_Train_IAMPolicy_Role.ipynb | 251 ++--
.../08a_Secure_Train_IAMPolicy_VPC.ipynb | 419 ++++---
...ure_Train_IAMPolicy_VPC_ConditionKey.ipynb | 451 ++++---
...09_Secure_Train_EncryptionAtRest_KMS.ipynb | 193 +--
.../10_Secure_Train_EncryptionInTransit.ipynb | 183 +--
.../11_Secure_Train_NetworkIsolation.ipynb | 187 +--
12_security/src/inference.py | 113 +-
12_security/src/tf_bert_reviews.py | 639 +++++-----
99_cleanup/01_Cleanup.ipynb | 28 +-
pyproject.toml | 17 +
200 files changed, 16361 insertions(+), 16815 deletions(-)
create mode 100644 pyproject.toml
diff --git a/00_quickstart/01_Setup_Dependencies.ipynb b/00_quickstart/01_Setup_Dependencies.ipynb
index f4194b22..1755c0c8 100644
--- a/00_quickstart/01_Setup_Dependencies.ipynb
+++ b/00_quickstart/01_Setup_Dependencies.ipynb
@@ -95,7 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!conda install -y pytorch==1.6.0 -c pytorch "
+ "!conda install -y pytorch==1.6.0 -c pytorch"
]
},
{
@@ -260,7 +260,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_dependencies_passed=True"
+ "setup_dependencies_passed = True"
]
},
{
diff --git a/00_quickstart/02_Copy_TSV_To_S3.ipynb b/00_quickstart/02_Copy_TSV_To_S3.ipynb
index a9aa8bfc..a6664daf 100644
--- a/00_quickstart/02_Copy_TSV_To_S3.ipynb
+++ b/00_quickstart/02_Copy_TSV_To_S3.ipynb
@@ -72,13 +72,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -99,9 +99,9 @@
"try:\n",
" setup_dependencies_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -126,7 +126,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'"
+ "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\""
]
},
{
@@ -151,7 +151,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n",
+ "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n",
"print(s3_private_path_tsv)"
]
},
@@ -223,7 +223,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Bucket'.format(region, account_id, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Bucket'.format(\n",
+ " region, account_id, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/00_quickstart/03_Create_Athena_Database.ipynb b/00_quickstart/03_Create_Athena_Database.ipynb
index 80a857be..d4a918f9 100644
--- a/00_quickstart/03_Create_Athena_Database.ipynb
+++ b/00_quickstart/03_Create_Athena_Database.ipynb
@@ -29,7 +29,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -62,10 +62,10 @@
"try:\n",
" s3_public_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -95,10 +95,10 @@
"try:\n",
" s3_private_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -141,7 +141,7 @@
"metadata": {},
"outputs": [],
"source": [
- "database_name = 'dsoaws'"
+ "database_name = \"dsoaws\""
]
},
{
@@ -160,7 +160,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -178,7 +178,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'CREATE DATABASE IF NOT EXISTS {}'.format(database_name)\n",
+ "statement = \"CREATE DATABASE IF NOT EXISTS {}\".format(database_name)\n",
"print(statement)"
]
},
@@ -189,6 +189,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"pd.read_sql(statement, conn)"
]
},
@@ -205,7 +206,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW DATABASES'\n",
+ "statement = \"SHOW DATABASES\"\n",
"\n",
"df_show = pd.read_sql(statement, conn)\n",
"df_show.head(5)"
diff --git a/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb b/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb
index e8b935b4..33fff7c9 100644
--- a/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb
+++ b/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb
@@ -31,7 +31,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -64,9 +64,9 @@
"try:\n",
" ingest_create_athena_db_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -85,11 +85,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_db_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]') "
+ " print(\"[OK]\")"
]
},
{
@@ -110,10 +110,10 @@
"try:\n",
" s3_private_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -179,7 +179,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -189,8 +189,8 @@
"outputs": [],
"source": [
"# Set Athena parameters\n",
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'"
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\""
]
},
{
@@ -226,7 +226,9 @@
" review_body string,\n",
" review_date string\n",
") ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\\\t' LINES TERMINATED BY '\\\\n' LOCATION '{}'\n",
- "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(database_name, table_name_tsv, s3_private_path_tsv)\n",
+ "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(\n",
+ " database_name, table_name_tsv, s3_private_path_tsv\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -238,6 +240,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"pd.read_sql(statement, conn)"
]
},
@@ -254,7 +257,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW TABLES in {}'.format(database_name)\n",
+ "statement = \"SHOW TABLES in {}\".format(database_name)\n",
"\n",
"df_show = pd.read_sql(statement, conn)\n",
"df_show.head(5)"
@@ -292,10 +295,12 @@
"metadata": {},
"outputs": [],
"source": [
- "product_category = 'Digital_Software'\n",
+ "product_category = \"Digital_Software\"\n",
"\n",
"statement = \"\"\"SELECT * FROM {}.{}\n",
- " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_tsv, product_category)\n",
+ " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n",
+ " database_name, table_name_tsv, product_category\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -317,11 +322,11 @@
"outputs": [],
"source": [
"if not df.empty:\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"else:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -339,7 +344,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review AWS Glue Catalog'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review AWS Glue Catalog'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb b/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
index 454235c0..a41e834f 100644
--- a/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
+++ b/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
@@ -30,7 +30,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -63,9 +63,9 @@
"try:\n",
" ingest_create_athena_table_tsv_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -84,11 +84,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_tsv_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -123,12 +123,12 @@
"outputs": [],
"source": [
"# Set S3 path to Parquet data\n",
- "s3_path_parquet = 's3://{}/amazon-reviews-pds/parquet'.format(bucket)\n",
+ "s3_path_parquet = \"s3://{}/amazon-reviews-pds/parquet\".format(bucket)\n",
"\n",
"# Set Athena parameters\n",
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'\n",
- "table_name_parquet = 'amazon_reviews_parquet'"
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\"\n",
+ "table_name_parquet = \"amazon_reviews_parquet\""
]
},
{
@@ -138,7 +138,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -185,7 +185,9 @@
" CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,\n",
" DATE(review_date) AS review_date,\n",
" product_category\n",
- "FROM {}.{}\"\"\".format(database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv)\n",
+ "FROM {}.{}\"\"\".format(\n",
+ " database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -221,7 +223,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'MSCK REPAIR TABLE {}.{}'.format(database_name, table_name_parquet)\n",
+ "statement = \"MSCK REPAIR TABLE {}.{}\".format(database_name, table_name_parquet)\n",
"\n",
"print(statement)"
]
@@ -233,6 +235,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"df = pd.read_sql(statement, conn)\n",
"df.head(5)"
]
@@ -250,7 +253,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW PARTITIONS {}.{}'.format(database_name, table_name_parquet)\n",
+ "statement = \"SHOW PARTITIONS {}.{}\".format(database_name, table_name_parquet)\n",
"\n",
"print(statement)"
]
@@ -278,7 +281,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW TABLES in {}'.format(database_name)"
+ "statement = \"SHOW TABLES in {}\".format(database_name)"
]
},
{
@@ -323,10 +326,12 @@
"metadata": {},
"outputs": [],
"source": [
- "product_category = 'Digital_Software'\n",
+ "product_category = \"Digital_Software\"\n",
"\n",
"statement = \"\"\"SELECT * FROM {}.{}\n",
- " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_parquet, product_category)\n",
+ " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n",
+ " database_name, table_name_parquet, product_category\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -348,11 +353,11 @@
"outputs": [],
"source": [
"if not df.empty:\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"else:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -370,7 +375,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review AWS Glue Catalog'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review AWS Glue Catalog'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/00_quickstart/06_Visualize_Reviews_Dataset.ipynb b/00_quickstart/06_Visualize_Reviews_Dataset.ipynb
index 616653c8..bbd9ffbd 100644
--- a/00_quickstart/06_Visualize_Reviews_Dataset.ipynb
+++ b/00_quickstart/06_Visualize_Reviews_Dataset.ipynb
@@ -39,7 +39,7 @@
"import sagemaker\n",
"import boto3\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -63,9 +63,9 @@
"try:\n",
" ingest_create_athena_table_parquet_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -84,11 +84,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_parquet_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -97,9 +97,9 @@
"metadata": {},
"outputs": [],
"source": [
- "# Set Athena database & table \n",
- "database_name = 'dsoaws'\n",
- "table_name = 'amazon_reviews_parquet'"
+ "# Set Athena database & table\n",
+ "database_name = \"dsoaws\"\n",
+ "table_name = \"amazon_reviews_parquet\""
]
},
{
@@ -118,7 +118,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -148,6 +148,7 @@
"import seaborn as sns\n",
"\n",
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'"
]
@@ -158,23 +159,27 @@
"metadata": {},
"outputs": [],
"source": [
- "sns.set_style = 'seaborn-whitegrid'\n",
- "\n",
- "sns.set(rc={\"font.style\":\"normal\",\n",
- " \"axes.facecolor\":\"white\",\n",
- " 'grid.color': '.8',\n",
- " 'grid.linestyle': '-',\n",
- " \"figure.facecolor\":\"white\",\n",
- " \"figure.titlesize\":20,\n",
- " \"text.color\":\"black\",\n",
- " \"xtick.color\":\"black\",\n",
- " \"ytick.color\":\"black\",\n",
- " \"axes.labelcolor\":\"black\",\n",
- " \"axes.grid\":True,\n",
- " 'axes.labelsize':10,\n",
- " 'xtick.labelsize':10,\n",
- " 'font.size':10,\n",
- " 'ytick.labelsize':10})"
+ "sns.set_style = \"seaborn-whitegrid\"\n",
+ "\n",
+ "sns.set(\n",
+ " rc={\n",
+ " \"font.style\": \"normal\",\n",
+ " \"axes.facecolor\": \"white\",\n",
+ " \"grid.color\": \".8\",\n",
+ " \"grid.linestyle\": \"-\",\n",
+ " \"figure.facecolor\": \"white\",\n",
+ " \"figure.titlesize\": 20,\n",
+ " \"text.color\": \"black\",\n",
+ " \"xtick.color\": \"black\",\n",
+ " \"ytick.color\": \"black\",\n",
+ " \"axes.labelcolor\": \"black\",\n",
+ " \"axes.grid\": True,\n",
+ " \"axes.labelsize\": 10,\n",
+ " \"xtick.labelsize\": 10,\n",
+ " \"font.size\": 10,\n",
+ " \"ytick.labelsize\": 10,\n",
+ " }\n",
+ ")"
]
},
{
@@ -195,7 +200,7 @@
" for p in ax.patches:\n",
" _x = p.get_x() + p.get_width() + float(space)\n",
" _y = p.get_y() + p.get_height()\n",
- " value = round(float(p.get_width()),2)\n",
+ " value = round(float(p.get_width()), 2)\n",
" ax.text(_x, _y, value, ha=\"left\")\n",
"\n",
" if isinstance(axs, np.ndarray):\n",
@@ -224,7 +229,9 @@
"FROM {}.{} \n",
"GROUP BY product_category \n",
"ORDER BY avg_star_rating DESC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -271,16 +278,16 @@
"outputs": [],
"source": [
"# Create plot\n",
- "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df, saturation=1)\n",
"\n",
"if num_categories < 10:\n",
- " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
- " \n",
- "# Set title and x-axis ticks \n",
- "plt.title('Average Rating by Product Category')\n",
- "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n",
+ " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
"\n",
- "# Helper code to show actual values afters bars \n",
+ "# Set title and x-axis ticks\n",
+ "plt.title(\"Average Rating by Product Category\")\n",
+ "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
+ "\n",
+ "# Helper code to show actual values afters bars\n",
"show_values_barplot(barplot, 0.1)\n",
"\n",
"plt.xlabel(\"Average Rating\")\n",
@@ -323,7 +330,9 @@
"FROM {}.{}\n",
"GROUP BY product_category \n",
"ORDER BY count_star_rating DESC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -345,10 +354,10 @@
"outputs": [],
"source": [
"# Store counts\n",
- "count_ratings = df['count_star_rating']\n",
+ "count_ratings = df[\"count_star_rating\"]\n",
"\n",
"# Store max ratings\n",
- "max_ratings = df['count_star_rating'].max()\n",
+ "max_ratings = df[\"count_star_rating\"].max()\n",
"print(max_ratings)"
]
},
@@ -366,20 +375,20 @@
"outputs": [],
"source": [
"# Create Seaborn barplot\n",
- "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
"\n",
"if num_categories < 10:\n",
- " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
+ " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
"\n",
"# Set title\n",
"plt.title(\"Number of Ratings per Product Category for Subset of Product Categories\")\n",
"\n",
- "# Set x-axis ticks to match scale \n",
+ "# Set x-axis ticks to match scale\n",
"if max_ratings > 200000:\n",
- " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+ " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
" plt.xlim(0, 20000000)\n",
"elif max_ratings <= 200000:\n",
- " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '150K', '200K'])\n",
+ " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"150K\", \"200K\"])\n",
" plt.xlim(0, 200000)\n",
"\n",
"plt.xlabel(\"Number of Ratings\")\n",
@@ -417,13 +426,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category, MIN(review_date) AS first_review_date\n",
"FROM {}.{}\n",
"GROUP BY product_category\n",
"ORDER BY first_review_date \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -446,7 +457,8 @@
"source": [
"# Convert date strings (e.g. 2014-10-18) to datetime\n",
"import datetime as datetime\n",
- "dates = pd.to_datetime(df['first_review_date'])\n"
+ "\n",
+ "dates = pd.to_datetime(df[\"first_review_date\"])"
]
},
{
@@ -457,16 +469,18 @@
"source": [
"# See: https://stackoverflow.com/questions/60761410/how-to-graph-events-on-a-timeline\n",
"\n",
+ "\n",
"def modify_dataframe(df):\n",
" \"\"\" Modify dataframe to include new columns \"\"\"\n",
- " df['year'] = pd.to_datetime(df['first_review_date'], format='%Y-%m-%d').dt.year\n",
+ " df[\"year\"] = pd.to_datetime(df[\"first_review_date\"], format=\"%Y-%m-%d\").dt.year\n",
" return df\n",
"\n",
+ "\n",
"def get_x_y(df):\n",
" \"\"\" Get X and Y coordinates; return tuple \"\"\"\n",
- " series = df['year'].value_counts().sort_index()\n",
+ " series = df[\"year\"].value_counts().sort_index()\n",
" # new_series = series.reindex(range(1,21)).fillna(0).astype(int)\n",
- " return series.index, series.values\n"
+ " return series.index, series.values"
]
},
{
@@ -494,20 +508,20 @@
"metadata": {},
"outputs": [],
"source": [
- "fig = plt.figure(figsize=(12,5))\n",
+ "fig = plt.figure(figsize=(12, 5))\n",
"ax = plt.gca()\n",
"\n",
- "ax.set_title('Number Of First Product Category Reviews Per Year for Subset of Categories')\n",
- "ax.set_xlabel('Year')\n",
- "ax.set_ylabel('Count')\n",
+ "ax.set_title(\"Number Of First Product Category Reviews Per Year for Subset of Categories\")\n",
+ "ax.set_xlabel(\"Year\")\n",
+ "ax.set_ylabel(\"Count\")\n",
"\n",
"ax.plot(X, Y, color=\"black\", linewidth=2, marker=\"o\")\n",
- "ax.fill_between(X, [0]*len(X), Y, facecolor='lightblue')\n",
+ "ax.fill_between(X, [0] * len(X), Y, facecolor=\"lightblue\")\n",
"\n",
"ax.locator_params(integer=True)\n",
"\n",
"ax.set_xticks(range(1995, 2016, 1))\n",
- "ax.set_yticks(range(0, max(Y)+2, 1))\n",
+ "ax.set_yticks(range(0, max(Y) + 2, 1))\n",
"\n",
"plt.xticks(rotation=45)\n",
"\n",
@@ -538,7 +552,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category,\n",
" star_rating,\n",
@@ -546,7 +560,9 @@
"FROM {}.{}\n",
"GROUP BY product_category, star_rating\n",
"ORDER BY product_category ASC, star_rating DESC, count_reviews\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -575,14 +591,14 @@
"outputs": [],
"source": [
"# Create grouped DataFrames by category and by star rating\n",
- "grouped_category = df.groupby('product_category')\n",
- "grouped_star = df.groupby('star_rating')\n",
+ "grouped_category = df.groupby(\"product_category\")\n",
+ "grouped_star = df.groupby(\"star_rating\")\n",
"\n",
"# Create sum of ratings per star rating\n",
- "df_sum = df.groupby(['star_rating']).sum()\n",
+ "df_sum = df.groupby([\"star_rating\"]).sum()\n",
"\n",
"# Calculate total number of star ratings\n",
- "total = df_sum['count_reviews'].sum()\n",
+ "total = df_sum[\"count_reviews\"].sum()\n",
"print(total)"
]
},
@@ -595,17 +611,17 @@
"# Create dictionary of product categories and array of star rating distribution per category\n",
"distribution = {}\n",
"count_reviews_per_star = []\n",
- "i=0\n",
- " \n",
+ "i = 0\n",
+ "\n",
"for category, ratings in grouped_category:\n",
" count_reviews_per_star = []\n",
- " for star in ratings['star_rating']:\n",
- " count_reviews_per_star.append(ratings.at[i, 'count_reviews'])\n",
- " i=i+1;\n",
+ " for star in ratings[\"star_rating\"]:\n",
+ " count_reviews_per_star.append(ratings.at[i, \"count_reviews\"])\n",
+ " i = i + 1\n",
" distribution[category] = count_reviews_per_star\n",
"\n",
"# Check if distribution has been created succesfully\n",
- "print(distribution)\n"
+ "print(distribution)"
]
},
{
@@ -644,8 +660,8 @@
"# Sort distribution by highest average rating per category\n",
"sorted_distribution = {}\n",
"\n",
- "average_star_ratings.iloc[:,0]\n",
- "for index, value in average_star_ratings.iloc[:,0].items():\n",
+ "average_star_ratings.iloc[:, 0]\n",
+ "for index, value in average_star_ratings.iloc[:, 0].items():\n",
" sorted_distribution[value] = distribution[value]"
]
},
@@ -706,7 +722,7 @@
"proportion_star5 = np.true_divide(star5, total) * 100\n",
"\n",
"# Add colors\n",
- "colors = ['red', 'purple','blue','orange','green']\n",
+ "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n",
"\n",
"# The position of the bars on the x-axis\n",
"r = range(len(categories))\n",
@@ -714,21 +730,53 @@
"\n",
"# Plot bars\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
- "\n",
- "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n",
- "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n",
- "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n",
- "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n",
- "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
+ "\n",
+ "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n",
+ "ax4 = plt.barh(\n",
+ " r,\n",
+ " proportion_star4,\n",
+ " left=proportion_star5,\n",
+ " color=colors[3],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"4-Star Ratings\",\n",
+ ")\n",
+ "ax3 = plt.barh(\n",
+ " r,\n",
+ " proportion_star3,\n",
+ " left=proportion_star5 + proportion_star4,\n",
+ " color=colors[2],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"3-Star Ratings\",\n",
+ ")\n",
+ "ax2 = plt.barh(\n",
+ " r,\n",
+ " proportion_star2,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3,\n",
+ " color=colors[1],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"2-Star Ratings\",\n",
+ ")\n",
+ "ax1 = plt.barh(\n",
+ " r,\n",
+ " proportion_star1,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n",
+ " color=colors[0],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"1-Star Ratings\",\n",
+ ")\n",
"\n",
- "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n",
- "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n",
- "plt.yticks(r, categories, fontweight='regular')\n",
+ "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n",
+ "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n",
+ "plt.yticks(r, categories, fontweight=\"regular\")\n",
"\n",
- "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n",
+ "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n",
"plt.gca().invert_yaxis()\n",
"plt.tight_layout()\n",
"\n",
@@ -759,14 +807,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT star_rating,\n",
" COUNT(*) AS count_reviews\n",
"FROM {}.{}\n",
"GROUP BY star_rating\n",
"ORDER BY star_rating DESC, count_reviews \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -797,15 +847,12 @@
"metadata": {},
"outputs": [],
"source": [
- "chart = df.plot.bar(x='star_rating', \n",
- " y='count_reviews', \n",
- " rot='0',\n",
- " figsize=(10,5), \n",
- " title='Review Count by Star Ratings', \n",
- " legend=False)\n",
+ "chart = df.plot.bar(\n",
+ " x=\"star_rating\", y=\"count_reviews\", rot=\"0\", figsize=(10, 5), title=\"Review Count by Star Ratings\", legend=False\n",
+ ")\n",
"\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")\n",
"\n",
"plt.show(chart)"
]
@@ -842,13 +889,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT year, ROUND(AVG(star_rating),4) AS avg_rating\n",
"FROM {}.{}\n",
"GROUP BY year\n",
"ORDER BY year\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -869,7 +918,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year\n"
+ "df[\"year\"] = pd.to_datetime(df[\"year\"], format=\"%Y\").dt.year"
]
},
{
@@ -886,21 +935,21 @@
"outputs": [],
"source": [
"fig = plt.gcf()\n",
- "fig.set_size_inches(12,5)\n",
+ "fig.set_size_inches(12, 5)\n",
"\n",
- "fig.suptitle('Average Star Rating Over Time (Across Subset of Product Categories)')\n",
+ "fig.suptitle(\"Average Star Rating Over Time (Across Subset of Product Categories)\")\n",
"\n",
"ax = plt.gca()\n",
- "#ax = plt.gca().set_xticks(df['year'])\n",
+ "# ax = plt.gca().set_xticks(df['year'])\n",
"ax.locator_params(integer=True)\n",
- "ax.set_xticks(df['year'].unique())\n",
+ "ax.set_xticks(df[\"year\"].unique())\n",
"\n",
- "df.plot(kind='line',x='year',y='avg_rating', color='red', ax=ax)\n",
+ "df.plot(kind=\"line\", x=\"year\", y=\"avg_rating\", color=\"red\", ax=ax)\n",
"\n",
- "#plt.xticks(range(1995, 2016, 1))\n",
- "#plt.yticks(range(0,6,1))\n",
- "plt.xlabel('Years')\n",
- "plt.ylabel('Average Star Rating')\n",
+ "# plt.xticks(range(1995, 2016, 1))\n",
+ "# plt.yticks(range(0,6,1))\n",
+ "plt.xlabel(\"Years\")\n",
+ "plt.ylabel(\"Average Star Rating\")\n",
"plt.xticks(rotation=45)\n",
"\n",
"# fig.savefig('average-rating.png', dpi=300)\n",
@@ -930,13 +979,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category, year, ROUND(AVG(star_rating), 4) AS avg_rating_category\n",
"FROM {}.{}\n",
"GROUP BY product_category, year\n",
"ORDER BY year \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -965,11 +1016,20 @@
"outputs": [],
"source": [
"def plot_categories(df):\n",
- " df_categories = df['product_category'].unique()\n",
+ " df_categories = df[\"product_category\"].unique()\n",
" for category in df_categories:\n",
" # print(category)\n",
- " df_plot = df.loc[df['product_category'] == category]\n",
- " df_plot.plot(kind='line',x='year',y='avg_rating_category', c=np.random.rand(3,), ax=ax, label=category)"
+ " df_plot = df.loc[df[\"product_category\"] == category]\n",
+ " df_plot.plot(\n",
+ " kind=\"line\",\n",
+ " x=\"year\",\n",
+ " y=\"avg_rating_category\",\n",
+ " c=np.random.rand(\n",
+ " 3,\n",
+ " ),\n",
+ " ax=ax,\n",
+ " label=category,\n",
+ " )"
]
},
{
@@ -979,19 +1039,19 @@
"outputs": [],
"source": [
"fig = plt.gcf()\n",
- "fig.set_size_inches(12,5)\n",
+ "fig.set_size_inches(12, 5)\n",
+ "\n",
+ "fig.suptitle(\"Average Star Rating Over Time Across Subset Of Categories\")\n",
"\n",
- "fig.suptitle('Average Star Rating Over Time Across Subset Of Categories')\n",
- " \n",
"ax = plt.gca()\n",
"\n",
"ax.locator_params(integer=True)\n",
- "ax.set_xticks(df['year'].unique())\n",
+ "ax.set_xticks(df[\"year\"].unique())\n",
"\n",
"plot_categories(df)\n",
"\n",
- "plt.xlabel('Year')\n",
- "plt.ylabel('Average Star Rating')\n",
+ "plt.xlabel(\"Year\")\n",
+ "plt.ylabel(\"Average Star Rating\")\n",
"plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n",
"\n",
"# fig.savefig('average_rating_category_all_data.png', dpi=300)\n",
@@ -1021,14 +1081,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT star_rating,\n",
" AVG(helpful_votes) AS avg_helpful_votes\n",
"FROM {}.{}\n",
"GROUP BY star_rating\n",
"ORDER BY star_rating ASC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1066,10 +1128,12 @@
"metadata": {},
"outputs": [],
"source": [
- "chart = df.plot.bar(x='star_rating', y='avg_helpful_votes', rot='0', figsize=(10,5), title='Helpfulness Of Star Ratings', legend=False )\n",
+ "chart = df.plot.bar(\n",
+ " x=\"star_rating\", y=\"avg_helpful_votes\", rot=\"0\", figsize=(10, 5), title=\"Helpfulness Of Star Ratings\", legend=False\n",
+ ")\n",
"\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Average Helpful Votes')\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Average Helpful Votes\")\n",
"\n",
"# chart.get_figure().savefig('helpful-votes.png', dpi=300)\n",
"plt.show(chart)"
@@ -1098,7 +1162,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_title,\n",
" helpful_votes,\n",
@@ -1107,7 +1171,9 @@
" SUBSTR(review_body, 1, 100) AS review_body_substr\n",
"FROM {}.{}\n",
"ORDER BY helpful_votes DESC LIMIT 10 \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1145,7 +1211,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT (CAST(positive_review_count AS DOUBLE) / CAST(negative_review_count AS DOUBLE)) AS positive_to_negative_sentiment_ratio\n",
"FROM (\n",
@@ -1157,7 +1223,9 @@
" FROM {}.{}\n",
" WHERE star_rating < 4\n",
")\n",
- "\"\"\".format(database_name, table_name, database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name, database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1195,7 +1263,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT customer_id, product_category, product_title, \n",
"ROUND(AVG(star_rating),4) AS avg_star_rating, COUNT(*) AS review_count \n",
@@ -1204,7 +1272,9 @@
"HAVING COUNT(*) > 1 \n",
"ORDER BY review_count DESC\n",
"LIMIT 5\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1265,7 +1335,7 @@
"metadata": {},
"outputs": [],
"source": [
- "summary = df['num_words'].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n",
+ "summary = df[\"num_words\"].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n",
"summary"
]
},
@@ -1275,9 +1345,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df['num_words'].plot.hist(xticks=[0, 16, 32, 64, 128, 256], \n",
- " bins=100,\n",
- " range=[0, 256]).axvline(x=summary['80%'], c='red')"
+ "df[\"num_words\"].plot.hist(xticks=[0, 16, 32, 64, 128, 256], bins=100, range=[0, 256]).axvline(\n",
+ " x=summary[\"80%\"], c=\"red\"\n",
+ ")"
]
},
{
@@ -1332,4 +1402,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb b/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb
index c1d4cf89..d206f883 100644
--- a/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb
+++ b/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb
@@ -44,7 +44,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -85,10 +85,12 @@
"source": [
"import csv\n",
"\n",
- "df_giftcards = pd.read_csv('./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_giftcards = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_giftcards.shape"
]
},
@@ -109,10 +111,12 @@
"source": [
"import csv\n",
"\n",
- "df_software = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_software = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_software.shape"
]
},
@@ -133,10 +137,12 @@
"source": [
"import csv\n",
"\n",
- "df_videogames = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_videogames = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_videogames.shape"
]
},
@@ -163,12 +169,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_giftcards[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_giftcards[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -178,12 +187,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_software[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_software[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -193,12 +205,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_videogames[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_videogames[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -270,7 +285,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -286,7 +301,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n",
+ "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n",
"df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n",
"df_balanced.shape"
]
@@ -299,7 +314,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -331,7 +346,7 @@
"metadata": {},
"outputs": [],
"source": [
- "path = './data-clarify/amazon_reviews_us_giftcards_software_videogames.csv'\n",
+ "path = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\"\n",
"df.to_csv(path, index=False, header=True)"
]
},
@@ -357,7 +372,7 @@
"metadata": {},
"outputs": [],
"source": [
- "path_balanced = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv'\n",
+ "path_balanced = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv\"\n",
"df_balanced.to_csv(path_balanced, index=False, header=True)"
]
},
@@ -374,8 +389,8 @@
"metadata": {},
"outputs": [],
"source": [
- "path_jsonlines = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl'\n",
- "df_balanced.to_json(path_or_buf=path_jsonlines, orient='records', lines=True)"
+ "path_jsonlines = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl\"\n",
+ "df_balanced.to_json(path_or_buf=path_jsonlines, orient=\"records\", lines=True)"
]
},
{
@@ -392,9 +407,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path)\n",
+ "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path)\n",
"bias_data_s3_uri"
]
},
@@ -413,7 +429,9 @@
"metadata": {},
"outputs": [],
"source": [
- "balanced_bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_balanced)\n",
+ "balanced_bias_data_s3_uri = sess.upload_data(\n",
+ " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_balanced\n",
+ ")\n",
"balanced_bias_data_s3_uri"
]
},
@@ -432,7 +450,9 @@
"metadata": {},
"outputs": [],
"source": [
- "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_jsonlines)\n",
+ "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(\n",
+ " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_jsonlines\n",
+ ")\n",
"balanced_bias_data_jsonlines_s3_uri"
]
},
diff --git a/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb b/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb
index cb27a9c5..03d6d536 100644
--- a/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb
+++ b/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb
@@ -114,7 +114,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n",
+ "df = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n",
"df.shape"
]
},
@@ -132,7 +132,7 @@
},
"outputs": [],
"source": [
- "sns.countplot(data=df, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -166,11 +166,9 @@
},
"outputs": [],
"source": [
- "facet_column = report.FacetColumn(name='product_category')\n",
- "label_column = report.LabelColumn(name='star_rating', \n",
- " data=df['star_rating'], \n",
- " positive_label_values=[5, 4])\n",
- "group_variable = df['product_category']"
+ "facet_column = report.FacetColumn(name=\"product_category\")\n",
+ "label_column = report.LabelColumn(name=\"star_rating\", data=df[\"star_rating\"], positive_label_values=[5, 4])\n",
+ "group_variable = df[\"product_category\"]"
]
},
{
@@ -194,11 +192,9 @@
},
"outputs": [],
"source": [
- "report.bias_report(df, \n",
- " facet_column, \n",
- " label_column, \n",
- " stage_type=report.StageType.PRE_TRAINING, \n",
- " group_variable=group_variable)"
+ "report.bias_report(\n",
+ " df, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n",
+ ")"
]
},
{
@@ -214,7 +210,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n",
+ "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n",
"df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n",
"df_balanced.shape"
]
@@ -227,7 +223,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -255,12 +251,10 @@
"source": [
"from smclarify.bias import report\n",
"\n",
- "facet_column = report.FacetColumn(name='product_category')\n",
- "label_column = report.LabelColumn(name='star_rating',\n",
- " data=df_balanced['star_rating'],\n",
- " positive_label_values=[5, 4])\n",
+ "facet_column = report.FacetColumn(name=\"product_category\")\n",
+ "label_column = report.LabelColumn(name=\"star_rating\", data=df_balanced[\"star_rating\"], positive_label_values=[5, 4])\n",
"\n",
- "group_variable = df_balanced['product_category']"
+ "group_variable = df_balanced[\"product_category\"]"
]
},
{
@@ -276,11 +270,9 @@
"metadata": {},
"outputs": [],
"source": [
- "report.bias_report(df_balanced,\n",
- " facet_column,\n",
- " label_column,\n",
- " stage_type=report.StageType.PRE_TRAINING,\n",
- " group_variable=group_variable)"
+ "report.bias_report(\n",
+ " df_balanced, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n",
+ ")"
]
},
{
diff --git a/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb b/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb
index 9635551f..d37e1e1e 100644
--- a/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb
+++ b/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb
@@ -20,12 +20,12 @@
"import pandas as pd\n",
"import numpy as np\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -72,7 +72,7 @@
"source": [
"import pandas as pd\n",
"\n",
- "data = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n",
+ "data = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n",
"data.head()"
]
},
@@ -101,7 +101,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=data, x='star_rating', hue='product_category')"
+ "sns.countplot(data=data, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -121,10 +121,9 @@
"source": [
"from sagemaker import clarify\n",
"\n",
- "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n",
- " instance_count=1,\n",
- " instance_type='ml.c5.2xlarge',\n",
- " sagemaker_session=sess)"
+ "clarify_processor = clarify.SageMakerClarifyProcessor(\n",
+ " role=role, instance_count=1, instance_type=\"ml.c5.2xlarge\", sagemaker_session=sess\n",
+ ")"
]
},
{
@@ -151,13 +150,15 @@
"metadata": {},
"outputs": [],
"source": [
- "bias_report_output_path = 's3://{}/clarify'.format(bucket)\n",
+ "bias_report_output_path = \"s3://{}/clarify\".format(bucket)\n",
"\n",
- "bias_data_config = clarify.DataConfig(s3_data_input_path=bias_data_s3_uri,\n",
- " s3_output_path=bias_report_output_path,\n",
- " label='star_rating',\n",
- " headers=data.columns.to_list(),\n",
- " dataset_type='text/csv')"
+ "bias_data_config = clarify.DataConfig(\n",
+ " s3_data_input_path=bias_data_s3_uri,\n",
+ " s3_output_path=bias_report_output_path,\n",
+ " label=\"star_rating\",\n",
+ " headers=data.columns.to_list(),\n",
+ " dataset_type=\"text/csv\",\n",
+ ")"
]
},
{
@@ -176,9 +177,9 @@
"metadata": {},
"outputs": [],
"source": [
- "bias_config = clarify.BiasConfig(label_values_or_threshold=[5, 4],\n",
- " facet_name='product_category',\n",
- " group_name='product_category')"
+ "bias_config = clarify.BiasConfig(\n",
+ " label_values_or_threshold=[5, 4], facet_name=\"product_category\", group_name=\"product_category\"\n",
+ ")"
]
},
{
@@ -195,11 +196,8 @@
"outputs": [],
"source": [
"clarify_processor.run_pre_training_bias(\n",
- " data_config=bias_data_config,\n",
- " data_bias_config=bias_config,\n",
- " methods='all',\n",
- " wait=False,\n",
- " logs=False)"
+ " data_config=bias_data_config, data_bias_config=bias_config, methods=\"all\", wait=False, logs=False\n",
+ ")"
]
},
{
@@ -220,7 +218,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Processing Job'.format(region, run_pre_training_bias_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Job'.format(\n",
+ " region, run_pre_training_bias_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -231,7 +235,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, run_pre_training_bias_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, run_pre_training_bias_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -242,7 +252,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, run_pre_training_bias_processing_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n",
+ " bucket, run_pre_training_bias_processing_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -251,8 +267,9 @@
"metadata": {},
"outputs": [],
"source": [
- "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=run_pre_training_bias_processing_job_name,\n",
- " sagemaker_session=sess)\n",
+ "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n",
+ " processing_job_name=run_pre_training_bias_processing_job_name, sagemaker_session=sess\n",
+ ")\n",
"\n",
"processing_job_description = running_processor.describe()\n",
"\n",
@@ -302,7 +319,7 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Bias Report'))\n"
+ "display(HTML('Review Bias Report'))"
]
},
{
diff --git a/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb b/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
index 41439e6d..516a2567 100644
--- a/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
+++ b/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
@@ -32,12 +32,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -54,9 +54,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "pipeline_name = 'BERT-pipeline-{}'.format(timestamp)"
+ "pipeline_name = \"BERT-pipeline-{}\".format(timestamp)"
]
},
{
@@ -95,12 +96,13 @@
"from smexperiments.experiment import Experiment\n",
"\n",
"pipeline_experiment = Experiment.create(\n",
- " experiment_name=pipeline_name,\n",
- " description='Amazon Customer Reviews BERT Pipeline Experiment', \n",
- " sagemaker_boto_client=sm)\n",
+ " experiment_name=pipeline_name,\n",
+ " description=\"Amazon Customer Reviews BERT Pipeline Experiment\",\n",
+ " sagemaker_boto_client=sm,\n",
+ ")\n",
"\n",
"pipeline_experiment_name = pipeline_experiment.experiment_name\n",
- "print('Pipeline experiment name: {}'.format(pipeline_experiment_name))"
+ "print(\"Pipeline experiment name: {}\".format(pipeline_experiment_name))"
]
},
{
@@ -128,12 +130,12 @@
"import time\n",
"from smexperiments.trial import Trial\n",
"\n",
- "pipeline_trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n",
- " experiment_name=pipeline_experiment_name,\n",
- " sagemaker_boto_client=sm)\n",
+ "pipeline_trial = Trial.create(\n",
+ " trial_name=\"trial-{}\".format(timestamp), experiment_name=pipeline_experiment_name, sagemaker_boto_client=sm\n",
+ ")\n",
"\n",
"pipeline_trial_name = pipeline_trial.trial_name\n",
- "print('Trial name: {}'.format(pipeline_trial_name))"
+ "print(\"Trial name: {}\".format(pipeline_trial_name))"
]
},
{
@@ -238,7 +240,7 @@
"metadata": {},
"outputs": [],
"source": [
- "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n",
+ "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n",
"print(raw_input_data_s3_uri)"
]
},
@@ -258,6 +260,7 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
"input_data = ParameterString(\n",
@@ -265,15 +268,9 @@
" default_value=raw_input_data_s3_uri,\n",
")\n",
"\n",
- "processing_instance_count = ParameterInteger(\n",
- " name=\"ProcessingInstanceCount\",\n",
- " default_value=1\n",
- ")\n",
+ "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n",
"\n",
- "processing_instance_type = ParameterString(\n",
- " name=\"ProcessingInstanceType\",\n",
- " default_value=\"ml.c5.2xlarge\"\n",
- ")\n",
+ "processing_instance_type = ParameterString(name=\"ProcessingInstanceType\", default_value=\"ml.c5.2xlarge\")\n",
"\n",
"max_seq_length = ParameterInteger(\n",
" name=\"MaxSeqLength\",\n",
@@ -284,7 +281,7 @@
" name=\"BalanceDataset\",\n",
" default_value=\"True\",\n",
")\n",
- " \n",
+ "\n",
"train_split_percentage = ParameterFloat(\n",
" name=\"TrainSplitPercentage\",\n",
" default_value=0.90,\n",
@@ -305,10 +302,7 @@
" default_value=\"reviews-feature-store-\" + str(timestamp),\n",
")\n",
"\n",
- "feature_group_name = ParameterString(\n",
- " name=\"FeatureGroupName\",\n",
- " default_value=\"reviews-feature-group-\" + str(timestamp)\n",
- ")"
+ "feature_group_name = ParameterString(name=\"FeatureGroupName\", default_value=\"reviews-feature-group-\" + str(timestamp))"
]
},
{
@@ -341,12 +335,13 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " env={'AWS_DEFAULT_REGION': region}, \n",
- " )"
+ "processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " env={\"AWS_DEFAULT_REGION\": region},\n",
+ ")"
]
},
{
@@ -358,45 +353,56 @@
"from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
"from sagemaker.workflow.steps import ProcessingStep\n",
"\n",
- "processing_inputs=[\n",
- " ProcessingInput(\n",
- " input_name='raw-input-data',\n",
- " source=input_data,\n",
- " destination='/opt/ml/processing/input/data/',\n",
- " s3_data_distribution_type='ShardedByS3Key'\n",
- " )\n",
+ "processing_inputs = [\n",
+ " ProcessingInput(\n",
+ " input_name=\"raw-input-data\",\n",
+ " source=input_data,\n",
+ " destination=\"/opt/ml/processing/input/data/\",\n",
+ " s3_data_distribution_type=\"ShardedByS3Key\",\n",
+ " )\n",
"]\n",
"\n",
- "processing_outputs=[\n",
- " ProcessingOutput(output_name='bert-train',\n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/bert/train',\n",
- " ),\n",
- " ProcessingOutput(output_name='bert-validation',\n",
- " s3_upload_mode='EndOfJob', \n",
- " source='/opt/ml/processing/output/bert/validation',\n",
- " ),\n",
- " ProcessingOutput(output_name='bert-test',\n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/bert/test',\n",
- " ),\n",
- "] \n",
+ "processing_outputs = [\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-train\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/train\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-validation\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/validation\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-test\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/test\",\n",
+ " ),\n",
+ "]\n",
"\n",
"processing_step = ProcessingStep(\n",
- " name='Processing', \n",
- " code='preprocess-scikit-text-to-bert-feature-store.py',\n",
+ " name=\"Processing\",\n",
+ " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n",
" processor=processor,\n",
" inputs=processing_inputs,\n",
" outputs=processing_outputs,\n",
- " job_arguments=['--train-split-percentage', str(train_split_percentage.default_value), \n",
- " '--validation-split-percentage', str(validation_split_percentage.default_value),\n",
- " '--test-split-percentage', str(test_split_percentage.default_value),\n",
- " '--max-seq-length', str(max_seq_length.default_value),\n",
- " '--balance-dataset', str(balance_dataset.default_value),\n",
- " '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),\n",
- " '--feature-group-name', str(feature_group_name.default_value)\n",
- " ]\n",
- ") \n",
+ " job_arguments=[\n",
+ " \"--train-split-percentage\",\n",
+ " str(train_split_percentage.default_value),\n",
+ " \"--validation-split-percentage\",\n",
+ " str(validation_split_percentage.default_value),\n",
+ " \"--test-split-percentage\",\n",
+ " str(test_split_percentage.default_value),\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length.default_value),\n",
+ " \"--balance-dataset\",\n",
+ " str(balance_dataset.default_value),\n",
+ " \"--feature-store-offline-prefix\",\n",
+ " str(feature_store_offline_prefix.default_value),\n",
+ " \"--feature-group-name\",\n",
+ " str(feature_group_name.default_value),\n",
+ " ],\n",
+ ")\n",
"\n",
"print(processing_step)"
]
@@ -439,15 +445,9 @@
"metadata": {},
"outputs": [],
"source": [
- "train_instance_type = ParameterString(\n",
- " name=\"TrainingInstanceType\",\n",
- " default_value=\"ml.c5.9xlarge\"\n",
- ")\n",
+ "train_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.c5.9xlarge\")\n",
"\n",
- "train_instance_count = ParameterInteger(\n",
- " name=\"TrainingInstanceCount\",\n",
- " default_value=1\n",
- ")"
+ "train_instance_count = ParameterInteger(name=\"TrainingInstanceCount\", default_value=1)"
]
},
{
@@ -464,56 +464,26 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs = ParameterInteger(\n",
- " name=\"Epochs\",\n",
- " default_value=1\n",
- ")\n",
- " \n",
- "learning_rate = ParameterFloat(\n",
- " name=\"LearningRate\",\n",
- " default_value=0.00001\n",
- ") \n",
- " \n",
- "epsilon = ParameterFloat(\n",
- " name=\"Epsilon\",\n",
- " default_value=0.00000001\n",
- ")\n",
- " \n",
- "train_batch_size = ParameterInteger(\n",
- " name=\"TrainBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "validation_batch_size = ParameterInteger(\n",
- " name=\"ValidationBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "test_batch_size = ParameterInteger(\n",
- " name=\"TestBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "train_steps_per_epoch = ParameterInteger(\n",
- " name=\"TrainStepsPerEpoch\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "validation_steps = ParameterInteger(\n",
- " name=\"ValidationSteps\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "test_steps = ParameterInteger(\n",
- " name=\"TestSteps\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "train_volume_size = ParameterInteger(\n",
- " name=\"TrainVolumeSize\",\n",
- " default_value=1024\n",
- ") \n",
- " \n",
+ "epochs = ParameterInteger(name=\"Epochs\", default_value=1)\n",
+ "\n",
+ "learning_rate = ParameterFloat(name=\"LearningRate\", default_value=0.00001)\n",
+ "\n",
+ "epsilon = ParameterFloat(name=\"Epsilon\", default_value=0.00000001)\n",
+ "\n",
+ "train_batch_size = ParameterInteger(name=\"TrainBatchSize\", default_value=128)\n",
+ "\n",
+ "validation_batch_size = ParameterInteger(name=\"ValidationBatchSize\", default_value=128)\n",
+ "\n",
+ "test_batch_size = ParameterInteger(name=\"TestBatchSize\", default_value=128)\n",
+ "\n",
+ "train_steps_per_epoch = ParameterInteger(name=\"TrainStepsPerEpoch\", default_value=50)\n",
+ "\n",
+ "validation_steps = ParameterInteger(name=\"ValidationSteps\", default_value=50)\n",
+ "\n",
+ "test_steps = ParameterInteger(name=\"TestSteps\", default_value=50)\n",
+ "\n",
+ "train_volume_size = ParameterInteger(name=\"TrainVolumeSize\", default_value=1024)\n",
+ "\n",
"use_xla = ParameterString(\n",
" name=\"UseXLA\",\n",
" default_value=\"True\",\n",
@@ -523,7 +493,7 @@
" name=\"UseAMP\",\n",
" default_value=\"True\",\n",
")\n",
- " \n",
+ "\n",
"freeze_bert_layer = ParameterString(\n",
" name=\"FreezeBERTLayer\",\n",
" default_value=\"False\",\n",
@@ -533,7 +503,7 @@
" name=\"EnableSageMakerDebugger\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"enable_checkpointing = ParameterString(\n",
" name=\"EnableCheckpointing\",\n",
" default_value=\"False\",\n",
@@ -543,7 +513,7 @@
" name=\"EnableTensorboard\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"input_mode = ParameterString(\n",
" name=\"InputMode\",\n",
" default_value=\"File\",\n",
@@ -558,7 +528,7 @@
" name=\"RunTest\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"run_sample_predictions = ParameterString(\n",
" name=\"RunSamplePredictions\",\n",
" default_value=\"False\",\n",
@@ -579,10 +549,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -618,36 +588,39 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size, \n",
- " py_version='py37',\n",
- " framework_version='2.3.1',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " metric_definitions=metrics_definitions,\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py37\",\n",
+ " framework_version=\"2.3.1\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " metric_definitions=metrics_definitions,\n",
+ ")"
]
},
{
@@ -669,27 +642,21 @@
"from sagemaker.workflow.steps import TrainingStep\n",
"\n",
"training_step = TrainingStep(\n",
- " name='Train',\n",
+ " name=\"Train\",\n",
" estimator=estimator,\n",
" inputs={\n",
- " 'train': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-train'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
+ " \"train\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-train\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " \"validation\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-validation\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
" ),\n",
- " 'validation': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-validation'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
+ " \"test\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-test\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
" ),\n",
- " 'test': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-test'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
- " ) \n",
" },\n",
")\n",
"\n",
@@ -743,12 +710,14 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "evaluation_processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " env={'AWS_DEFAULT_REGION': region},\n",
- " max_runtime_in_seconds=7200)"
+ "evaluation_processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " env={\"AWS_DEFAULT_REGION\": region},\n",
+ " max_runtime_in_seconds=7200,\n",
+ ")"
]
},
{
@@ -759,7 +728,7 @@
},
"outputs": [],
"source": [
- "!pygmentize evaluate_model_metrics.py\n"
+ "!pygmentize evaluate_model_metrics.py"
]
},
{
@@ -779,11 +748,7 @@
"source": [
"from sagemaker.workflow.properties import PropertyFile\n",
"\n",
- "evaluation_report = PropertyFile(\n",
- " name='EvaluationReport',\n",
- " output_name='metrics',\n",
- " path='evaluation.json'\n",
- ")"
+ "evaluation_report = PropertyFile(name=\"EvaluationReport\", output_name=\"metrics\", path=\"evaluation.json\")"
]
},
{
@@ -793,27 +758,28 @@
"outputs": [],
"source": [
"evaluation_step = ProcessingStep(\n",
- " name='EvaluateModel',\n",
+ " name=\"EvaluateModel\",\n",
" processor=evaluation_processor,\n",
- " code='evaluate_model_metrics.py',\n",
+ " code=\"evaluate_model_metrics.py\",\n",
" inputs=[\n",
" ProcessingInput(\n",
" source=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
- " destination='/opt/ml/processing/input/model'\n",
+ " destination=\"/opt/ml/processing/input/model\",\n",
" ),\n",
" ProcessingInput(\n",
- " source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri,\n",
- " destination='/opt/ml/processing/input/data'\n",
- " )\n",
+ " source=processing_step.properties.ProcessingInputs[\"raw-input-data\"].S3Input.S3Uri,\n",
+ " destination=\"/opt/ml/processing/input/data\",\n",
+ " ),\n",
" ],\n",
" outputs=[\n",
- " ProcessingOutput(output_name='metrics', \n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/metrics/'),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"metrics\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/metrics/\"\n",
+ " ),\n",
" ],\n",
" job_arguments=[\n",
- " '--max-seq-length', str(max_seq_length.default_value),\n",
- " ],\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length.default_value),\n",
+ " ],\n",
" property_files=[evaluation_report],\n",
")"
]
@@ -831,14 +797,14 @@
"metadata": {},
"outputs": [],
"source": [
- "from sagemaker.model_metrics import MetricsSource, ModelMetrics \n",
+ "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n",
"\n",
"model_metrics = ModelMetrics(\n",
" model_statistics=MetricsSource(\n",
" s3_uri=\"{}/evaluation.json\".format(\n",
" evaluation_step.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n",
" ),\n",
- " content_type=\"application/json\"\n",
+ " content_type=\"application/json\",\n",
" )\n",
")\n",
"\n",
@@ -870,20 +836,11 @@
"metadata": {},
"outputs": [],
"source": [
- "model_approval_status = ParameterString(\n",
- " name=\"ModelApprovalStatus\",\n",
- " default_value=\"PendingManualApproval\"\n",
- ")\n",
+ "model_approval_status = ParameterString(name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\")\n",
"\n",
- "deploy_instance_type = ParameterString(\n",
- " name=\"DeployInstanceType\",\n",
- " default_value=\"ml.m5.4xlarge\"\n",
- ")\n",
+ "deploy_instance_type = ParameterString(name=\"DeployInstanceType\", default_value=\"ml.m5.4xlarge\")\n",
"\n",
- "deploy_instance_count = ParameterInteger(\n",
- " name=\"DeployInstanceCount\",\n",
- " default_value=1\n",
- ")"
+ "deploy_instance_count = ParameterInteger(name=\"DeployInstanceCount\", default_value=1)"
]
},
{
@@ -909,7 +866,7 @@
" version=\"2.3.1\",\n",
" py_version=\"py37\",\n",
" instance_type=deploy_instance_type,\n",
- " image_scope=\"inference\"\n",
+ " image_scope=\"inference\",\n",
")\n",
"print(inference_image_uri)"
]
@@ -924,10 +881,10 @@
"\n",
"register_step = RegisterModel(\n",
" name=\"RegisterModel\",\n",
- "# entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n",
- "# source_dir='src',\n",
+ " # entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n",
+ " # source_dir='src',\n",
" estimator=estimator,\n",
- " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n",
+ " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n",
" model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
" content_types=[\"application/json\"],\n",
" response_types=[\"application/json\"],\n",
@@ -935,7 +892,7 @@
" transform_instances=[\"ml.c5.18xlarge\"],\n",
" model_package_group_name=model_package_group_name,\n",
" approval_status=model_approval_status,\n",
- " model_metrics=model_metrics\n",
+ " model_metrics=model_metrics,\n",
")"
]
},
@@ -956,7 +913,7 @@
"source": [
"from sagemaker.model import Model\n",
"\n",
- "model_name = 'bert-model-{}'.format(timestamp)\n",
+ "model_name = \"bert-model-{}\".format(timestamp)\n",
"\n",
"model = Model(\n",
" name=model_name,\n",
@@ -976,7 +933,7 @@
"from sagemaker.inputs import CreateModelInput\n",
"\n",
"create_inputs = CreateModelInput(\n",
- " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n",
+ " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n",
")"
]
},
@@ -1018,10 +975,7 @@
"metadata": {},
"outputs": [],
"source": [
- "min_accuracy_value = ParameterFloat(\n",
- " name=\"MinAccuracyValue\",\n",
- " default_value=0.01\n",
- ")"
+ "min_accuracy_value = ParameterFloat(name=\"MinAccuracyValue\", default_value=0.01)"
]
},
{
@@ -1042,14 +996,14 @@
" property_file=evaluation_report,\n",
" json_path=\"metrics.accuracy.value\",\n",
" ),\n",
- " right=min_accuracy_value # accuracy\n",
+ " right=min_accuracy_value, # accuracy\n",
")\n",
"\n",
"minimum_accuracy_condition_step = ConditionStep(\n",
" name=\"AccuracyCondition\",\n",
" conditions=[minimum_accuracy_condition],\n",
- " if_steps=[register_step, create_step], # success, continue with model registration\n",
- " else_steps=[], # fail, end the pipeline\n",
+ " if_steps=[register_step, create_step], # success, continue with model registration\n",
+ " else_steps=[], # fail, end the pipeline\n",
")"
]
},
@@ -1125,7 +1079,7 @@
" min_accuracy_value,\n",
" model_approval_status,\n",
" deploy_instance_type,\n",
- " deploy_instance_count\n",
+ " deploy_instance_count,\n",
" ],\n",
" steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],\n",
" sagemaker_session=sess,\n",
@@ -1213,16 +1167,16 @@
" parameters=dict(\n",
" InputData=raw_input_data_s3_uri,\n",
" ProcessingInstanceCount=1,\n",
- " ProcessingInstanceType='ml.c5.2xlarge',\n",
+ " ProcessingInstanceType=\"ml.c5.2xlarge\",\n",
" MaxSeqLength=64,\n",
- " BalanceDataset='True',\n",
+ " BalanceDataset=\"True\",\n",
" TrainSplitPercentage=0.9,\n",
" ValidationSplitPercentage=0.05,\n",
" TestSplitPercentage=0.05,\n",
- " FeatureStoreOfflinePrefix='reviews-feature-store-'+str(timestamp),\n",
- " FeatureGroupName='reviews-feature-group-'+str(timestamp),\n",
+ " FeatureStoreOfflinePrefix=\"reviews-feature-store-\" + str(timestamp),\n",
+ " FeatureGroupName=\"reviews-feature-group-\" + str(timestamp),\n",
" LearningRate=0.000012,\n",
- " TrainingInstanceType='ml.c5.9xlarge',\n",
+ " TrainingInstanceType=\"ml.c5.9xlarge\",\n",
" TrainingInstanceCount=1,\n",
" Epochs=1,\n",
" Epsilon=0.00000001,\n",
@@ -1233,20 +1187,20 @@
" ValidationSteps=50,\n",
" TestSteps=50,\n",
" TrainVolumeSize=1024,\n",
- " UseXLA='True',\n",
- " UseAMP='True',\n",
- " FreezeBERTLayer='False',\n",
- " EnableSageMakerDebugger='False',\n",
- " EnableCheckpointing='False',\n",
- " EnableTensorboard='False',\n",
- " InputMode='File',\n",
- " RunValidation='True',\n",
- " RunTest='Fasle',\n",
- " RunSamplePredictions='False', \n",
+ " UseXLA=\"True\",\n",
+ " UseAMP=\"True\",\n",
+ " FreezeBERTLayer=\"False\",\n",
+ " EnableSageMakerDebugger=\"False\",\n",
+ " EnableCheckpointing=\"False\",\n",
+ " EnableTensorboard=\"False\",\n",
+ " InputMode=\"File\",\n",
+ " RunValidation=\"True\",\n",
+ " RunTest=\"Fasle\",\n",
+ " RunSamplePredictions=\"False\",\n",
" MinAccuracyValue=0.01,\n",
- " ModelApprovalStatus='PendingManualApproval', \n",
- " DeployInstanceType='ml.m5.4xlarge',\n",
- " DeployInstanceCount=1 \n",
+ " ModelApprovalStatus=\"PendingManualApproval\",\n",
+ " DeployInstanceType=\"ml.m5.4xlarge\",\n",
+ " DeployInstanceCount=1,\n",
" )\n",
")\n",
"\n",
@@ -1287,7 +1241,7 @@
"metadata": {},
"outputs": [],
"source": [
- "execution_run_name = execution_run['PipelineExecutionDisplayName']\n",
+ "execution_run_name = execution_run[\"PipelineExecutionDisplayName\"]\n",
"print(execution_run_name)"
]
},
@@ -1297,7 +1251,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = execution_run['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = execution_run[\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -1349,18 +1303,18 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -1377,7 +1331,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -1387,7 +1341,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -1411,7 +1365,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -1441,8 +1395,8 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_name=None\n",
- "training_job_name=None"
+ "processing_job_name = None\n",
+ "training_job_name = None"
]
},
{
@@ -1456,15 +1410,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -1486,7 +1440,7 @@
"outputs": [],
"source": [
"# -aws-processing-job is the default name assigned by ProcessingJob\n",
- "processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)\n",
+ "processing_job_tc = \"{}-aws-processing-job\".format(processing_job_name)\n",
"print(processing_job_tc)"
]
},
@@ -1514,10 +1468,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = sm.associate_trial_component(\n",
- " TrialComponentName=processing_job_tc,\n",
- " TrialName=pipeline_trial_name\n",
- ")"
+ "response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=pipeline_trial_name)"
]
},
{
@@ -1527,7 +1478,7 @@
"outputs": [],
"source": [
"# -aws-training-job is the default name assigned by TrainingJob\n",
- "training_job_tc = '{}-aws-training-job'.format(training_job_name)\n",
+ "training_job_tc = \"{}-aws-training-job\".format(training_job_name)\n",
"print(training_job_tc)"
]
},
@@ -1537,10 +1488,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = sm.associate_trial_component(\n",
- " TrialComponentName=training_job_tc,\n",
- " TrialName=pipeline_trial_name\n",
- ")"
+ "response = sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=pipeline_trial_name)"
]
},
{
@@ -1560,9 +1508,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"balance_dataset\": str(balance_dataset), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"balance_dataset\": str(balance_dataset),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1574,9 +1524,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"train_split_percentage\": str(train_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"train_split_percentage\": str(train_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1588,9 +1540,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"validation_split_percentage\": str(validation_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"validation_split_percentage\": str(validation_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1602,9 +1556,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"test_split_percentage\": str(test_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"test_split_percentage\": str(test_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1616,9 +1572,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"max_seq_length\": str(max_seq_length), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"max_seq_length\": str(max_seq_length),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1630,11 +1588,13 @@
"metadata": {},
"outputs": [],
"source": [
- "time.sleep(5) # avoid throttling exception \n",
+ "time.sleep(5) # avoid throttling exception\n",
"\n",
- "processing_job_tracker.log_parameters({\n",
- " \"feature_store_offline_prefix\": str(feature_store_offline_prefix), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"feature_store_offline_prefix\": str(feature_store_offline_prefix),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1646,11 +1606,13 @@
"metadata": {},
"outputs": [],
"source": [
- "time.sleep(5) # avoid throttling exception \n",
+ "time.sleep(5) # avoid throttling exception\n",
"\n",
- "processing_job_tracker.log_parameters({\n",
- " \"feature_group_name\": str(feature_group_name), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"feature_group_name\": str(feature_group_name),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1671,9 +1633,10 @@
"source": [
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
- "time.sleep(30) # avoid throttling exception\n",
+ "time.sleep(30) # avoid throttling exception\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
diff --git a/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb b/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb
index bbe0e00f..d281310e 100644
--- a/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb
+++ b/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb
@@ -24,12 +24,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -79,18 +79,18 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -107,7 +107,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -117,7 +117,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -147,14 +147,16 @@
"metadata": {},
"outputs": [],
"source": [
- "#for execution_step in reversed(execution.list_steps()):\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
- " if execution_step['StepName'] == 'EvaluateModel':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ "# for execution_step in reversed(execution.list_steps()):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
+ " if execution_step[\"StepName\"] == \"EvaluateModel\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
"\n",
"describe_evaluation_processing_job_response = sm.describe_processing_job(ProcessingJobName=processing_job_name)\n",
"\n",
- "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n",
+ "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response[\"ProcessingOutputConfig\"][\"Outputs\"][0][\n",
+ " \"S3Output\"\n",
+ "][\"S3Uri\"]\n",
"evaluation_metrics_s3_uri"
]
},
@@ -167,9 +169,7 @@
"import json\n",
"from pprint import pprint\n",
"\n",
- "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(\n",
- " evaluation_metrics_s3_uri\n",
- "))\n",
+ "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(evaluation_metrics_s3_uri))\n",
"\n",
"pprint(json.loads(evaluation_json))"
]
@@ -187,15 +187,15 @@
"metadata": {},
"outputs": [],
"source": [
- "training_job_arn=None\n",
+ "training_job_arn = None\n",
"\n",
- "for execution_step in steps['PipelineExecutionSteps']:\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
" if execution_step[\"StepName\"] == \"Train\":\n",
" training_job_arn = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"]\n",
- " \n",
+ "\n",
" break\n",
- " \n",
- "training_job_name = training_job_arn.split('/')[-1]\n",
+ "\n",
+ "training_job_name = training_job_arn.split(\"/\")[-1]\n",
"print(training_job_name)"
]
},
@@ -205,7 +205,7 @@
"metadata": {},
"outputs": [],
"source": [
- "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)['ModelArtifacts']['S3ModelArtifacts']"
+ "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)[\"ModelArtifacts\"][\"S3ModelArtifacts\"]"
]
},
{
@@ -223,8 +223,8 @@
"metadata": {},
"outputs": [],
"source": [
- "!mkdir -p ./model \n",
- "!tar -zxvf model.tar.gz -C ./model "
+ "!mkdir -p ./model\n",
+ "!tar -zxvf model.tar.gz -C ./model"
]
},
{
@@ -261,8 +261,8 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_name=None\n",
- "training_job_name=None"
+ "processing_job_name = None\n",
+ "training_job_name = None"
]
},
{
@@ -276,15 +276,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -307,9 +307,10 @@
"source": [
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
- "time.sleep(30) # avoid throttling exception\n",
+ "time.sleep(30) # avoid throttling exception\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
diff --git a/00_quickstart/12_Register_Deploy_Model.ipynb b/00_quickstart/12_Register_Deploy_Model.ipynb
index cee60eea..d72054e1 100644
--- a/00_quickstart/12_Register_Deploy_Model.ipynb
+++ b/00_quickstart/12_Register_Deploy_Model.ipynb
@@ -28,12 +28,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -65,20 +65,20 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
- "# print('Executions for our pipeline...')\n",
- "# print(pipeline_execution_status)\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
+ " # print('Executions for our pipeline...')\n",
+ " # print(pipeline_execution_status)\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -95,7 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -105,7 +105,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -135,9 +135,9 @@
"metadata": {},
"outputs": [],
"source": [
- "for execution_step in steps['PipelineExecutionSteps']:\n",
- " if execution_step['StepName'] == 'RegisterModel':\n",
- " model_package_arn = execution_step['Metadata']['RegisterModel']['Arn']\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
+ " if execution_step[\"StepName\"] == \"RegisterModel\":\n",
+ " model_package_arn = execution_step[\"Metadata\"][\"RegisterModel\"][\"Arn\"]\n",
" break\n",
"print(model_package_arn)"
]
@@ -150,7 +150,7 @@
"source": [
"model_package_update_response = sm.update_model_package(\n",
" ModelPackageArn=model_package_arn,\n",
- " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n",
+ " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n",
")"
]
},
@@ -167,13 +167,13 @@
"metadata": {},
"outputs": [],
"source": [
- "for execution_step in steps['PipelineExecutionSteps']:\n",
- " if execution_step['StepName'] == 'CreateModel':\n",
- " model_arn = execution_step['Metadata']['Model']['Arn']\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
+ " if execution_step[\"StepName\"] == \"CreateModel\":\n",
+ " model_arn = execution_step[\"Metadata\"][\"Model\"][\"Arn\"]\n",
" break\n",
"print(model_arn)\n",
"\n",
- "model_name = model_arn.split('/')[-1]\n",
+ "model_name = model_arn.split(\"/\")[-1]\n",
"print(model_name)"
]
},
@@ -192,13 +192,14 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "model_from_registry_name = 'bert-model-from-registry-{}'.format(timestamp)\n",
+ "model_from_registry_name = \"bert-model-from-registry-{}\".format(timestamp)\n",
"print(\"Model from registry name : {}\".format(model_from_registry_name))\n",
"\n",
"model_registry_package_container = {\n",
- " 'ModelPackageName': model_package_arn,\n",
+ " \"ModelPackageName\": model_package_arn,\n",
"}"
]
},
@@ -208,12 +209,10 @@
"metadata": {},
"outputs": [],
"source": [
- "from pprint import pprint \n",
+ "from pprint import pprint\n",
"\n",
"create_model_from_registry_respose = sm.create_model(\n",
- " ModelName = model_from_registry_name,\n",
- " ExecutionRoleArn = role,\n",
- " PrimaryContainer = model_registry_package_container\n",
+ " ModelName=model_from_registry_name, ExecutionRoleArn=role, PrimaryContainer=model_registry_package_container\n",
")\n",
"pprint(create_model_from_registry_respose)"
]
@@ -224,7 +223,7 @@
"metadata": {},
"outputs": [],
"source": [
- "model_from_registry_arn = create_model_from_registry_respose['ModelArn']\n",
+ "model_from_registry_arn = create_model_from_registry_respose[\"ModelArn\"]\n",
"model_from_registry_arn"
]
},
@@ -234,17 +233,21 @@
"metadata": {},
"outputs": [],
"source": [
- "endpoint_config_name = 'bert-model-from-registry-epc-{}'.format(timestamp)\n",
+ "endpoint_config_name = \"bert-model-from-registry-epc-{}\".format(timestamp)\n",
"print(endpoint_config_name)\n",
"\n",
"create_endpoint_config_response = sm.create_endpoint_config(\n",
- " EndpointConfigName = endpoint_config_name,\n",
- " ProductionVariants=[{\n",
- " 'InstanceType':'ml.m5.4xlarge',\n",
- " 'InitialVariantWeight':1,\n",
- " 'InitialInstanceCount':1,\n",
- " 'ModelName': model_name,\n",
- " 'VariantName':'AllTraffic'}])"
+ " EndpointConfigName=endpoint_config_name,\n",
+ " ProductionVariants=[\n",
+ " {\n",
+ " \"InstanceType\": \"ml.m5.4xlarge\",\n",
+ " \"InitialVariantWeight\": 1,\n",
+ " \"InitialInstanceCount\": 1,\n",
+ " \"ModelName\": model_name,\n",
+ " \"VariantName\": \"AllTraffic\",\n",
+ " }\n",
+ " ],\n",
+ ")"
]
},
{
@@ -253,13 +256,13 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_endpoint_name = 'bert-model-from-registry-ep-{}'.format(timestamp)\n",
+ "pipeline_endpoint_name = \"bert-model-from-registry-ep-{}\".format(timestamp)\n",
"print(\"EndpointName={}\".format(pipeline_endpoint_name))\n",
"\n",
"create_endpoint_response = sm.create_endpoint(\n",
- " EndpointName=pipeline_endpoint_name,\n",
- " EndpointConfigName=endpoint_config_name)\n",
- "print(create_endpoint_response['EndpointArn'])"
+ " EndpointName=pipeline_endpoint_name, EndpointConfigName=endpoint_config_name\n",
+ ")\n",
+ "print(create_endpoint_response[\"EndpointArn\"])"
]
},
{
@@ -270,7 +273,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, pipeline_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, pipeline_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -297,7 +306,7 @@
"source": [
"%%time\n",
"\n",
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=pipeline_endpoint_name)"
]
},
@@ -348,20 +357,20 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
" display(viz.show(pipeline_execution_step=execution_step))\n",
- " time.sleep(5)\n"
+ " time.sleep(5)"
]
},
{
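
Aside: once the `endpoint_in_service` waiter above returns, the endpoint can be smoke-tested with the jsonlines contract that `src/inference.py` (formatted later in this patch) expects. A minimal sketch, assuming `pipeline_endpoint_name` from the notebook is in scope and the default boto3 region matches the deployment:

    import json

    import boto3

    runtime = boto3.client("sagemaker-runtime")

    # Each jsonline carries a "features" array; features[0] is the review_body
    # that input_handler in src/inference.py reads.
    payload = "\n".join(
        json.dumps({"features": [review]})
        for review in ["I loved it! I will recommend this to everyone.", "It's OK."]
    )

    response = runtime.invoke_endpoint(
        EndpointName=pipeline_endpoint_name,  # from the notebook cells above
        ContentType="application/jsonlines",
        Accept="application/jsonlines",
        Body=payload,
    )
    print(response["Body"].read().decode("utf-8"))
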
diff --git a/00_quickstart/13_Cleanup.ipynb b/00_quickstart/13_Cleanup.ipynb
index d724df10..6dbe74c8 100644
--- a/00_quickstart/13_Cleanup.ipynb
+++ b/00_quickstart/13_Cleanup.ipynb
@@ -31,9 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sm.delete_endpoint(\n",
- " EndpointName=pipeline_endpoint_name\n",
- ")"
+ "sm.delete_endpoint(EndpointName=pipeline_endpoint_name)"
]
}
],
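
Aside: `delete_endpoint` alone leaves the endpoint config and the model from 12_Register_Deploy_Model behind. A minimal cleanup sketch, assuming `sm`, `endpoint_config_name`, and `model_name` from the earlier notebooks are still in scope:

    # Remove the remaining resources so the account is left clean.
    sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
    sm.delete_model(ModelName=model_name)
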
diff --git a/00_quickstart/evaluate_model_metrics.py b/00_quickstart/evaluate_model_metrics.py
index 024afdec..f3523174 100644
--- a/00_quickstart/evaluate_model_metrics.py
+++ b/00_quickstart/evaluate_model_metrics.py
@@ -4,13 +4,16 @@
from datetime import datetime
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
import pandas as pd
import os
import re
@@ -33,99 +36,99 @@
from sklearn.utils import resample
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
CLASSES = [1, 2, 3, 4, 5]
-config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
+config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+)
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--input-model', type=str,
- default='/opt/ml/processing/input/model',
+ parser.add_argument(
+ "--input-model",
+ type=str,
+ default="/opt/ml/processing/input/model",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
-
+ )
+
return parser.parse_args()
-
+
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- print('input_data: {}'.format(args.input_data))
- print('input_model: {}'.format(args.input_model))
-
- print('Listing contents of input model dir: {}'.format(args.input_model))
+ print("Current host: {}".format(args.current_host))
+
+ print("input_data: {}".format(args.input_data))
+ print("input_model: {}".format(args.input_model))
+
+ print("Listing contents of input model dir: {}".format(args.input_model))
input_files = os.listdir(args.input_model)
for file in input_files:
print(file)
- model_tar_path = '{}/model.tar.gz'.format(args.input_model)
+ model_tar_path = "{}/model.tar.gz".format(args.input_model)
model_tar = tarfile.open(model_tar_path)
model_tar.extractall(args.input_model)
- model_tar.close()
+ model_tar.close()
- model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model))
+ model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model))
print(model)
-
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=args.max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -133,81 +136,86 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
-
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
###########################################################################################
# TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
- ###########################################################################################
-# evaluation_data_path = '/opt/ml/processing/input/data/'
-
- print('Listing contents of input data dir: {}'.format(args.input_data))
+ ###########################################################################################
+ # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+ print("Listing contents of input data dir: {}".format(args.input_data))
input_files = os.listdir(args.input_data)
- test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
- print('Using only {} to evaluate.'.format(test_data_path))
- df_test_reviews = pd.read_csv(test_data_path,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+ print("Using only {} to evaluate.".format(test_data_path))
+ df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+ ["review_body", "star_rating"]
+ ]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
- y_test = df_test_reviews['review_body'].map(predict)
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
- y_actual = df_test_reviews['star_rating']
+ y_actual = df_test_reviews["star_rating"]
y_actual
    print(classification_report(y_true=y_actual, y_pred=y_test))
-    accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
-    print('Test accuracy: ', accuracy)
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+    print("Test accuracy: ", accuracy)
def plot_conf_mat(cm, classes, title, cmap):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
cm = confusion_matrix(y_true=y_test, y_pred=y_actual)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=CLASSES,
- title='Confusion Matrix',
- cmap=plt.cm.Greens)
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
- # Model Output
- metrics_path = os.path.join(args.output_data, 'metrics/')
+ # Model Output
+ metrics_path = os.path.join(args.output_data, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
report_dict = {
"metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
evaluation_path = "{}/evaluation.json".format(metrics_path)
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))
-
- print('Listing contents of output dir: {}'.format(args.output_data))
+
+ print("Listing contents of output dir: {}".format(args.output_data))
output_files = os.listdir(args.output_data)
for file in output_files:
print(file)
- print('Listing contents of output/metrics dir: {}'.format(metrics_path))
- output_files = os.listdir('{}'.format(metrics_path))
+ print("Listing contents of output/metrics dir: {}".format(metrics_path))
+ output_files = os.listdir("{}".format(metrics_path))
for file in output_files:
print(file)
- print('Complete')
-
-
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
- process(args)
+ process(args)
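
Aside on the `y_true`/`y_pred` ordering corrected above: `accuracy_score` is symmetric in its arguments, but `classification_report` is not, so swapping them transposes per-class precision and recall. A self-contained check of the sklearn convention:

    from sklearn.metrics import accuracy_score, classification_report

    y_actual = [5, 5, 1, 3]     # ground-truth star ratings
    y_predicted = [5, 4, 1, 3]  # model predictions

    # y_true must be the ground truth; y_pred the model output.
    print(classification_report(y_true=y_actual, y_pred=y_predicted))
    print("accuracy:", accuracy_score(y_true=y_actual, y_pred=y_predicted))
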
diff --git a/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py b/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
+++ b/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess
## PIP INSTALLS ##
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
+# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
# use anaconda and anaconda only supports 2.3.0 at this time
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"])
import pandas as pd
import re
import sagemaker
@@ -40,51 +42,55 @@
FeatureTypeEnum,
)
-region = os.environ['AWS_DEFAULT_REGION']
-print('Region: {}'.format(region))
+region = os.environ["AWS_DEFAULT_REGION"]
+print("Region: {}".format(region))
#############################
## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc.
## Role and Bucket are malformed if we do this later.
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region)
+sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region)
caller_identity = sts.get_caller_identity()
-print('caller_identity: {}'.format(caller_identity))
+print("caller_identity: {}".format(caller_identity))
-assumed_role_arn = caller_identity['Arn']
-print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn))
+assumed_role_arn = caller_identity["Arn"]
+print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn))
-assumed_role_name = assumed_role_arn.split('/')[-2]
+assumed_role_name = assumed_role_arn.split("/")[-2]
-iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region)
-get_role_response = iam.get_role(RoleName=assumed_role_name)
-print('get_role_response {}'.format(get_role_response))
-role = get_role_response['Role']['Arn']
-print('role {}'.format(role))
+iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region)
+get_role_response = iam.get_role(RoleName=assumed_role_name)
+print("get_role_response {}".format(get_role_response))
+role = get_role_response["Role"]["Arn"]
+print("role {}".format(role))
bucket = sagemaker.Session().default_bucket()
-print('The DEFAULT BUCKET is {}'.format(bucket))
+print("The DEFAULT BUCKET is {}".format(bucket))
#############################
-sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region)
+sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region)
-featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region)
+featurestore_runtime = boto3.Session(region_name=region).client(
+ service_name="sagemaker-featurestore-runtime", region_name=region
+)
-s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region)
+s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region)
-sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region),
- sagemaker_client=sm,
- sagemaker_featurestore_runtime_client=featurestore_runtime)
+sagemaker_session = sagemaker.Session(
+ boto_session=boto3.Session(region_name=region),
+ sagemaker_client=sm,
+ sagemaker_featurestore_runtime_client=featurestore_runtime,
+)
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-REVIEW_BODY_COLUMN = 'review_body'
-REVIEW_ID_COLUMN = 'review_id'
+REVIEW_BODY_COLUMN = "review_body"
+REVIEW_ID_COLUMN = "review_id"
# DATE_COLUMN = 'date'
-LABEL_COLUMN = 'star_rating'
+LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]
-
+
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
label_map[label] = i
@@ -92,94 +98,88 @@
def cast_object_to_string(data_frame):
for label in data_frame.columns:
- if data_frame.dtypes[label] == 'object':
+ if data_frame.dtypes[label] == "object":
data_frame[label] = data_frame[label].astype("str").astype("string")
return data_frame
-
+
def wait_for_feature_group_creation_complete(feature_group):
try:
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
while status == "Creating":
print("Waiting for Feature Group Creation")
time.sleep(5)
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
if status != "Created":
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
raise RuntimeError(f"Failed to create feature group {feature_group.name}")
print(f"FeatureGroup {feature_group.name} successfully created.")
except:
- print('No feature group created yet.')
-
-
+ print("No feature group created yet.")
+
+
def create_or_load_feature_group(prefix, feature_group_name):
# Feature Definitions for our records
- feature_definitions= [
- FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),
- FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),
-# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING)
+ feature_definitions = [
+ FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
+ FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
+ # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]
-
+
feature_group = FeatureGroup(
- name=feature_group_name,
- feature_definitions=feature_definitions,
- sagemaker_session=sagemaker_session)
-
- print('Feature Group: {}'.format(feature_group))
-
- try:
- print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+ name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+ )
+
+ print("Feature Group: {}".format(feature_group))
+
+ try:
+ print(
+ "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+ )
wait_for_feature_group_creation_complete(feature_group)
except Exception as e:
- print('Before CREATE FG wait exeption: {}'.format(e))
-# pass
-
+ print("Before CREATE FG wait exeption: {}".format(e))
+ # pass
+
try:
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"
-
- print('Creating Feature Group with role {}...'.format(role))
+
+ print("Creating Feature Group with role {}...".format(role))
feature_group.create(
s3_uri=f"s3://{bucket}/{prefix}",
record_identifier_name=record_identifier_feature_name,
event_time_feature_name=event_time_feature_name,
role_arn=role,
- enable_online_store=True
+ enable_online_store=True,
)
- print('Creating Feature Group. Completed.')
-
- print('Waiting for new Feature Group to become available...')
+ print("Creating Feature Group. Completed.")
+
+ print("Waiting for new Feature Group to become available...")
wait_for_feature_group_creation_complete(feature_group)
- print('Feature Group available.')
+ print("Feature Group available.")
feature_group.describe()
-
+
except Exception as e:
- print('Exception: {}'.format(e))
-
+ print("Exception: {}".format(e))
+
return feature_group
-
+
class InputFeatures(object):
- """BERT feature vectors."""
-
- def __init__(self,
- input_ids,
- input_mask,
- segment_ids,
- label_id,
- review_id,
- date,
- label):
-# review_body):
+ """BERT feature vectors."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+ # review_body):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
self.review_id = review_id
self.date = date
self.label = label
+
+
# self.review_body = review_body
-
-
+
+
class Input(object):
- """A single training/test input for sequence classification."""
-
- def __init__(self, text, review_id, date, label=None):
- """Constructs an Input.
- Args:
- text: string. The untokenized text of the first sequence. For single
- sequence tasks, only this sequence must be specified.
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.text = text
- self.review_id = review_id
- self.date = date
- self.label = label
-
-
+ """A single training/test input for sequence classification."""
+
+ def __init__(self, text, review_id, date, label=None):
+ """Constructs an Input.
+ Args:
+ text: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.text = text
+ self.review_id = review_id
+ self.date = date
+ self.label = label
+
+
def convert_input(the_input, max_seq_length):
# First, we need to preprocess our data so that it matches the data BERT was trained on:
#
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- #
+ #
# Fortunately, the Transformers tokenizer does this for us!
#
- tokens = tokenizer.tokenize(the_input.text)
+ tokens = tokenizer.tokenize(the_input.text)
# Next, we need to do the following:
#
@@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length):
#
# Again, the Transformers tokenizer does this for us!
#
- encode_plus_tokens = tokenizer.encode_plus(the_input.text,
- pad_to_max_length=True,
- max_length=max_seq_length,
-# truncation=True
- )
+ encode_plus_tokens = tokenizer.encode_plus(
+ the_input.text,
+ pad_to_max_length=True,
+ max_length=max_seq_length,
+ # truncation=True
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
-
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ input_ids = encode_plus_tokens["input_ids"]
+
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
# Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
segment_ids = [0] * max_seq_length
@@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length):
label_id=label_id,
review_id=the_input.review_id,
date=the_input.date,
- label=the_input.label)
-# review_body=the_input.text)
-
-# print('**input_ids**\n{}\n'.format(features.input_ids))
-# print('**input_mask**\n{}\n'.format(features.input_mask))
-# print('**segment_ids**\n{}\n'.format(features.segment_ids))
-# print('**label_id**\n{}\n'.format(features.label_id))
-# print('**review_id**\n{}\n'.format(features.review_id))
-# print('**date**\n{}\n'.format(features.date))
-# print('**label**\n{}\n'.format(features.label))
-# print('**review_body**\n{}\n'.format(features.review_body))
+ label=the_input.label,
+ )
+ # review_body=the_input.text)
+
+ # print('**input_ids**\n{}\n'.format(features.input_ids))
+ # print('**input_mask**\n{}\n'.format(features.input_mask))
+ # print('**segment_ids**\n{}\n'.format(features.segment_ids))
+ # print('**label_id**\n{}\n'.format(features.label_id))
+ # print('**review_id**\n{}\n'.format(features.review_id))
+ # print('**date**\n{}\n'.format(features.date))
+ # print('**label**\n{}\n'.format(features.label))
+ # print('**review_body**\n{}\n'.format(features.review_body))
return features
-def transform_inputs_to_tfrecord(inputs,
- output_file,
- max_seq_length):
+def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
"""Convert a set of `Input`s to a TFRecord file."""
records = []
tf_record_writer = tf.io.TFRecordWriter(output_file)
-
+
for (input_idx, the_input) in enumerate(inputs):
if input_idx % 10000 == 0:
- print('Writing input {} of {}\n'.format(input_idx, len(inputs)))
+ print("Writing input {} of {}\n".format(input_idx, len(inputs)))
features = convert_input(the_input, max_seq_length)
all_features = collections.OrderedDict()
- all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
- all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
- all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
- all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
+ all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
+ all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
+ all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
+ all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
tf_record_writer.write(tf_record.SerializeToString())
- records.append({#'tf_record': tf_record.SerializeToString(),
- 'input_ids': features.input_ids,
- 'input_mask': features.input_mask,
- 'segment_ids': features.segment_ids,
- 'label_id': features.label_id,
- 'review_id': the_input.review_id,
- 'date': the_input.date,
- 'label': features.label,
-# 'review_body': features.review_body
- })
+ records.append(
+ { #'tf_record': tf_record.SerializeToString(),
+ "input_ids": features.input_ids,
+ "input_mask": features.input_mask,
+ "segment_ids": features.segment_ids,
+ "label_id": features.label_id,
+ "review_id": the_input.review_id,
+ "date": the_input.date,
+ "label": features.label,
+ # 'review_body': features.review_body
+ }
+ )
#####################################
####### TODO: REMOVE THIS BREAK #######
- #####################################
+ #####################################
# break
-
+
tf_record_writer.close()
-
+
return records
-
+
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--train-split-percentage', type=float,
+ parser.add_argument(
+ "--train-split-percentage",
+ type=float,
default=0.90,
)
- parser.add_argument('--validation-split-percentage', type=float,
- default=0.05,
- )
- parser.add_argument('--test-split-percentage', type=float,
+ parser.add_argument(
+ "--validation-split-percentage",
+ type=float,
default=0.05,
)
- parser.add_argument('--balance-dataset', type=eval,
- default=True
+ parser.add_argument(
+ "--test-split-percentage",
+ type=float,
+ default=0.05,
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument("--balance-dataset", type=eval, default=True)
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
- parser.add_argument('--feature-store-offline-prefix', type=str,
+ )
+ parser.add_argument(
+ "--feature-store-offline-prefix",
+ type=str,
default=None,
- )
- parser.add_argument('--feature-group-name', type=str,
+ )
+ parser.add_argument(
+ "--feature-group-name",
+ type=str,
default=None,
- )
-
+ )
+
return parser.parse_args()
-
-def _transform_tsv_to_tfrecord(file,
- max_seq_length,
- balance_dataset,
- prefix,
- feature_group_name):
- print('file {}'.format(file))
- print('max_seq_length {}'.format(max_seq_length))
- print('balance_dataset {}'.format(balance_dataset))
- print('prefix {}'.format(prefix))
- print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+ print("file {}".format(file))
+ print("max_seq_length {}".format(max_seq_length))
+ print("balance_dataset {}".format(balance_dataset))
+ print("prefix {}".format(prefix))
+ print("feature_group_name {}".format(feature_group_name))
    # need to re-load since we can't pass the feature_group object through functools.partial for some reason
feature_group = create_or_load_feature_group(prefix, feature_group_name)
-
+
filename_without_extension = Path(Path(file).stem).stem
- df = pd.read_csv(file,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')
+ df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")
df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)
- print('Shape of dataframe {}'.format(df.shape))
+ print("Shape of dataframe {}".format(df.shape))
- if balance_dataset:
+ if balance_dataset:
# Balance the dataset down to the minority class
from sklearn.utils import resample
- five_star_df = df.query('star_rating == 5')
- four_star_df = df.query('star_rating == 4')
- three_star_df = df.query('star_rating == 3')
- two_star_df = df.query('star_rating == 2')
- one_star_df = df.query('star_rating == 1')
-
- minority_count = min(five_star_df.shape[0],
- four_star_df.shape[0],
- three_star_df.shape[0],
- two_star_df.shape[0],
- one_star_df.shape[0])
-
- five_star_df = resample(five_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- four_star_df = resample(four_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- three_star_df = resample(three_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- two_star_df = resample(two_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- one_star_df = resample(one_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
+ five_star_df = df.query("star_rating == 5")
+ four_star_df = df.query("star_rating == 4")
+ three_star_df = df.query("star_rating == 3")
+ two_star_df = df.query("star_rating == 2")
+ one_star_df = df.query("star_rating == 1")
+
+ minority_count = min(
+ five_star_df.shape[0],
+ four_star_df.shape[0],
+ three_star_df.shape[0],
+ two_star_df.shape[0],
+ one_star_df.shape[0],
+ )
+
+ five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)
df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])
- df_balanced = df_balanced.reset_index(drop=True)
- print('Shape of balanced dataframe {}'.format(df_balanced.shape))
- print(df_balanced['star_rating'].head(100))
+ df_balanced = df_balanced.reset_index(drop=True)
+ print("Shape of balanced dataframe {}".format(df_balanced.shape))
+ print(df_balanced["star_rating"].head(100))
df = df_balanced
-
- print('Shape of dataframe before splitting {}'.format(df.shape))
-
- print('train split percentage {}'.format(args.train_split_percentage))
- print('validation split percentage {}'.format(args.validation_split_percentage))
- print('test split percentage {}'.format(args.test_split_percentage))
-
+
+ print("Shape of dataframe before splitting {}".format(df.shape))
+
+ print("train split percentage {}".format(args.train_split_percentage))
+ print("validation split percentage {}".format(args.validation_split_percentage))
+ print("test split percentage {}".format(args.test_split_percentage))
+
holdout_percentage = 1.00 - args.train_split_percentage
- print('holdout percentage {}'.format(holdout_percentage))
- df_train, df_holdout = train_test_split(df,
- test_size=holdout_percentage,
- stratify=df['star_rating'])
+ print("holdout percentage {}".format(holdout_percentage))
+ df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])
test_holdout_percentage = args.test_split_percentage / holdout_percentage
- print('test holdout percentage {}'.format(test_holdout_percentage))
- df_validation, df_test = train_test_split(df_holdout,
- test_size=test_holdout_percentage,
- stratify=df_holdout['star_rating'])
-
+ print("test holdout percentage {}".format(test_holdout_percentage))
+ df_validation, df_test = train_test_split(
+ df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
+ )
+
df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
- print('Shape of train dataframe {}'.format(df_train.shape))
- print('Shape of validation dataframe {}'.format(df_validation.shape))
- print('Shape of test dataframe {}'.format(df_test.shape))
+ print("Shape of train dataframe {}".format(df_train.shape))
+ print("Shape of validation dataframe {}".format(df_validation.shape))
+ print("Shape of test dataframe {}".format(df_test.shape))
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)
- train_inputs = df_train.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- validation_inputs = df_validation.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- test_inputs = df_test.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
+ train_inputs = df_train.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ validation_inputs = df_validation.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ test_inputs = df_test.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
# Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
- #
- #
+ #
+ #
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
- #
+ #
# We don't have to worry about these details. The Transformers tokenizer does this for us.
- #
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
+ #
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
# Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow.
- train_records = transform_inputs_to_tfrecord(train_inputs,
- '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- validation_records = transform_inputs_to_tfrecord(validation_inputs,
- '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- test_records = transform_inputs_to_tfrecord(test_inputs,
- '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension),
- max_seq_length)
-
+ train_records = transform_inputs_to_tfrecord(
+ train_inputs,
+ "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ validation_records = transform_inputs_to_tfrecord(
+ validation_inputs,
+ "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ test_records = transform_inputs_to_tfrecord(
+ test_inputs,
+ "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
df_train_records = pd.DataFrame.from_dict(train_records)
- df_train_records['split_type'] = 'train'
- df_train_records.head()
-
+ df_train_records["split_type"] = "train"
+ df_train_records.head()
+
df_validation_records = pd.DataFrame.from_dict(validation_records)
- df_validation_records['split_type'] = 'validation'
- df_validation_records.head()
+ df_validation_records["split_type"] = "validation"
+ df_validation_records.head()
df_test_records = pd.DataFrame.from_dict(test_records)
- df_test_records['split_type'] = 'test'
- df_test_records.head()
-
- # Add record to feature store
+ df_test_records["split_type"] = "test"
+ df_test_records.head()
+
+ # Add record to feature store
df_fs_train_records = cast_object_to_string(df_train_records)
df_fs_validation_records = cast_object_to_string(df_validation_records)
df_fs_test_records = cast_object_to_string(df_test_records)
- print('Ingesting Features...')
- feature_group.ingest(
- data_frame=df_fs_train_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_validation_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_test_records, max_workers=3, wait=True
- )
- print('Feature ingest completed.')
+ print("Ingesting Features...")
+ feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True)
+ print("Feature ingest completed.")
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
+ print("Current host: {}".format(args.current_host))
+
+ feature_group = create_or_load_feature_group(
+ prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name
+ )
feature_group.describe()
-
+
print(feature_group.as_hive_ddl())
-
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
-
- transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord,
- max_seq_length=args.max_seq_length,
- balance_dataset=args.balance_dataset,
- prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
-
- input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data))
+
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
+
+ transform_tsv_to_tfrecord = functools.partial(
+ _transform_tsv_to_tfrecord,
+ max_seq_length=args.max_seq_length,
+ balance_dataset=args.balance_dataset,
+ prefix=args.feature_store_offline_prefix,
+ feature_group_name=args.feature_group_name,
+ )
+
+ input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
num_cpus = multiprocessing.cpu_count()
- print('num_cpus {}'.format(num_cpus))
+ print("num_cpus {}".format(num_cpus))
p = multiprocessing.Pool(num_cpus)
p.map(transform_tsv_to_tfrecord, input_files)
- print('Listing contents of {}'.format(args.output_data))
+ print("Listing contents of {}".format(args.output_data))
dirs_output = os.listdir(args.output_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(train_data))
+ print("Listing contents of {}".format(train_data))
dirs_output = os.listdir(train_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(validation_data))
+ print("Listing contents of {}".format(validation_data))
dirs_output = os.listdir(validation_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(test_data))
+ print("Listing contents of {}".format(test_data))
dirs_output = os.listdir(test_data)
for file in dirs_output:
print(file)
-
+
offline_store_contents = None
- while (offline_store_contents is None):
- objects_in_bucket = s3.list_objects(Bucket=bucket,
- Prefix=args.feature_store_offline_prefix)
- if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
- offline_store_contents = objects_in_bucket['Contents']
+ while offline_store_contents is None:
+ objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix)
+ if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
+ offline_store_contents = objects_in_bucket["Contents"]
else:
- print('Waiting for data in offline store...\n')
+ print("Waiting for data in offline store...\n")
sleep(60)
- print('Data available.')
-
- print('Complete')
-
-
+ print("Data available.")
+
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
process(args)
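
Aside: the records written by `transform_inputs_to_tfrecord` above can be verified by parsing them back with the same four int64 features the writer emits (this mirrors the schema `src/tf_bert_reviews.py` uses below). A minimal read-back sketch; the filename is hypothetical:

    import tensorflow as tf

    max_seq_length = 64
    feature_spec = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    # Hypothetical output file produced by the processing job.
    dataset = tf.data.TFRecordDataset("bert/train/part-algo-1-example.tfrecord")
    for raw_record in dataset.take(1):
        example = tf.io.parse_single_example(raw_record, feature_spec)
        print(example["label_ids"].numpy())
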
diff --git a/00_quickstart/src/inference.py b/00_quickstart/src/inference.py
index 2975dc2d..53196737 100644
--- a/00_quickstart/src/inference.py
+++ b/00_quickstart/src/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e. 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
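
Aside: `input_handler` above only needs an object with a `read()` method, so it can be exercised locally without a deployed endpoint. A hedged sketch, assuming the module-level installs and tokenizer download in `inference.py` have already succeeded:

    import io
    import json

    class FakeRequest:
        """Stub standing in for the request body object SageMaker passes in."""

        def __init__(self, body: bytes):
            self._buf = io.BytesIO(body)

        def read(self):
            return self._buf.read()

    body = json.dumps({"features": ["I loved it! I will recommend this to everyone."]}).encode("utf-8")
    # context is unused by input_handler, so None is fine here.
    print(input_handler(FakeRequest(body), None))
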
diff --git a/00_quickstart/src/tf_bert_reviews.py b/00_quickstart/src/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/00_quickstart/src/tf_bert_reviews.py
+++ b/00_quickstart/src/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
-    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
-        latest_checkpoint_file,
-        config=config)
-    print('loaded_model {}'.format(loaded_model))
-    print('initial_epoch_number {}'.format(initial_epoch_number))
-
+    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+    print("loaded_model {}".format(loaded_model))
+    print("initial_epoch_number {}".format(initial_epoch_number))
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
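+    # Note: type=eval turns hyperparameter strings like "False" into real
+    # booleans (argparse's type=bool would not); it is convenient here but
+    # evaluates arbitrary input, so it is only safe for trusted job configs.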
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
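+    # SM_INPUT_DATA_CONFIG holds the channel configuration as a JSON string;
+    # a substring search for "Pipe" is a simple way to detect Pipe-mode channels.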
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
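+    # The trailing "0" is a numeric version directory, which TensorFlow
+    # Serving expects when it loads a SavedModel.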
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to incompatibility with transformers library (possibly)
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
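+            # Custom head over the DistilBERT encoder: BiLSTM -> global max pool
+            # -> dense -> 5-way output. Note: the sigmoid output paired with the
+            # from_logits=True loss below is questionable; a linear (logits)
+            # layer would match that loss exactly.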
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+            print("Successfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
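+            # A random 1-30s sleep staggers retries across instances so
+            # simultaneous launches do not all re-request the model at once.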
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
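+        # save_weights_only=False writes the full model each epoch, which is
+        # what load_checkpoint_model() reloads on restart.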
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
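+    # SparseCategoricalCrossentropy consumes the integer label_ids directly,
+    # so the 5 star-rating classes never need one-hot encoding.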
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
    model.summary()
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
    # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
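+    # include_optimizer=False keeps the artifact inference-only, and
+    # save_format="tf" writes the SavedModel layout that TF Serving loads.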
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+    print(glob(os.path.join(inference_path, "*")))  # list the copied files
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
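+        # encode_plus returns a dict of tensors; this two-input model only
+        # needs input_ids and attention_mask from it.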
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
-    return prediction[0]['label']
-    print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
+    return prediction[0]["label"]
+
+    print(
+        """I loved it! I will recommend this to everyone.""",
+        predict("""I loved it! I will recommend this to everyone."""),
+    )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
    print(df_test_reviews.shape)
    print(df_test_reviews.head())
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
    print(y_test)
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
    print(y_actual)
from sklearn.metrics import classification_report
+
    print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+                color="white" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
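+    # Note: with interactive backends, plt.show() above can clear the figure
+    # before it is saved; calling savefig() before show() is the safer order.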
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/01_setup/01_Setup_Dependencies.ipynb b/01_setup/01_Setup_Dependencies.ipynb
index d74392a0..66967a9e 100644
--- a/01_setup/01_Setup_Dependencies.ipynb
+++ b/01_setup/01_Setup_Dependencies.ipynb
@@ -95,7 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!conda install -y pytorch==1.6.0 -c pytorch "
+ "!conda install -y pytorch==1.6.0 -c pytorch"
]
},
{
@@ -260,7 +260,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_dependencies_passed=True"
+ "setup_dependencies_passed = True"
]
},
{
diff --git a/01_setup/02_Check_Environment.ipynb b/01_setup/02_Check_Environment.ipynb
index 45ff455e..90925adb 100644
--- a/01_setup/02_Check_Environment.ipynb
+++ b/01_setup/02_Check_Environment.ipynb
@@ -18,8 +18,8 @@
"region = boto3.Session().region_name\n",
"session = boto3.session.Session()\n",
"\n",
- "ec2 = boto3.Session().client(service_name='ec2', region_name=region)\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)\n",
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -36,21 +36,22 @@
"outputs": [],
"source": [
"import json\n",
+ "\n",
"notebook_instance_name = None\n",
"\n",
"try:\n",
- " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n",
+ " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n",
" data = json.load(notebook_info)\n",
- " domain_id = data['DomainId']\n",
- " resource_arn = data['ResourceArn']\n",
- " region = resource_arn.split(':')[3]\n",
- " name = data['ResourceName']\n",
- " print('DomainId: {}'.format(domain_id))\n",
- " print('Name: {}'.format(name)) \n",
+ " domain_id = data[\"DomainId\"]\n",
+ " resource_arn = data[\"ResourceArn\"]\n",
+ " region = resource_arn.split(\":\")[3]\n",
+ " name = data[\"ResourceName\"]\n",
+ " print(\"DomainId: {}\".format(domain_id))\n",
+ " print(\"Name: {}\".format(name))\n",
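+    "    # resource-metadata.json exists on SageMaker notebook/Studio instances\n",
+    "    # and identifies the Studio domain and app for this environment.\n",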
"except:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR]: COULD NOT RETRIEVE THE METADATA.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR]: COULD NOT RETRIEVE THE METADATA.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -59,10 +60,8 @@
"metadata": {},
"outputs": [],
"source": [
- "describe_domain_response = sm.describe_domain(\n",
- " DomainId=domain_id\n",
- ")\n",
- "print(describe_domain_response['Status'])"
+ "describe_domain_response = sm.describe_domain(DomainId=domain_id)\n",
+ "print(describe_domain_response[\"Status\"])"
]
},
{
@@ -73,7 +72,7 @@
"source": [
"try:\n",
" get_status_response = sm.get_sagemaker_servicecatalog_portfolio_status()\n",
- " print(get_status_response['Status'])\n",
+ " print(get_status_response[\"Status\"])\n",
"except:\n",
" pass"
]
@@ -91,17 +90,21 @@
"metadata": {},
"outputs": [],
"source": [
- "if describe_domain_response['Status'] == 'InService' and get_status_response['Status'] == 'Enabled' and 'datascience' in name:\n",
- " setup_instance_check_passed=True\n",
- " print('[OK] Checks passed! Great Job!! Please Continue.')\n",
+ "if (\n",
+ " describe_domain_response[\"Status\"] == \"InService\"\n",
+ " and get_status_response[\"Status\"] == \"Enabled\"\n",
+ " and \"datascience\" in name\n",
+ "):\n",
+ " setup_instance_check_passed = True\n",
+ " print(\"[OK] Checks passed! Great Job!! Please Continue.\")\n",
"else:\n",
- " setup_instance_check_passed=False\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR]: WE HAVE IDENTIFIED A MISCONFIGURATION.')\n",
- " print(describe_domain_response['Status'])\n",
- " print(get_status_response['Status'])\n",
+ " setup_instance_check_passed = False\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR]: WE HAVE IDENTIFIED A MISCONFIGURATION.\")\n",
+ " print(describe_domain_response[\"Status\"])\n",
+ " print(get_status_response[\"Status\"])\n",
" print(name)\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -146,7 +149,7 @@
"metadata": {},
"outputs": [],
"source": [
- "%store "
+ "%store"
]
},
{
diff --git a/01_setup/03_Create_S3_Bucket.ipynb b/01_setup/03_Create_S3_Bucket.ipynb
index 30ee5836..a7072e17 100644
--- a/01_setup/03_Create_S3_Bucket.ipynb
+++ b/01_setup/03_Create_S3_Bucket.ipynb
@@ -21,7 +21,7 @@
"sagemaker_session = sagemaker.Session()\n",
"bucket = sagemaker_session.default_bucket()\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)"
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)"
]
},
{
@@ -30,7 +30,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_s3_bucket_passed=False"
+ "setup_s3_bucket_passed = False"
]
},
{
@@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Default bucket: {}'.format(bucket))"
+ "print(\"Default bucket: {}\".format(bucket))"
]
},
{
@@ -73,9 +73,9 @@
"try:\n",
" response = s3.head_bucket(Bucket=bucket)\n",
" print(response)\n",
- " setup_s3_bucket_passed=True\n",
+ " setup_s3_bucket_passed = True\n",
"except ClientError as e:\n",
- " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(bucket, response, e))"
+ " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(bucket, response, e))"
]
},
{
diff --git a/01_setup/04_Update_IAM_Roles_And_Policies.ipynb b/01_setup/04_Update_IAM_Roles_And_Policies.ipynb
index fef8781a..587ade3e 100644
--- a/01_setup/04_Update_IAM_Roles_And_Policies.ipynb
+++ b/01_setup/04_Update_IAM_Roles_And_Policies.ipynb
@@ -25,14 +25,9 @@
"\n",
"from botocore.config import Config\n",
"\n",
- "config = Config(\n",
- " retries = {\n",
- " 'max_attempts': 10,\n",
- " 'mode': 'adaptive'\n",
- " }\n",
- ")\n",
- "\n",
- "iam = boto3.client('iam', config=config)"
+ "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n",
+ "\n",
+ "iam = boto3.client(\"iam\", config=config)"
]
},
{
@@ -48,9 +43,9 @@
"metadata": {},
"outputs": [],
"source": [
- "role_name = role.split('/')[-1]\n",
+ "role_name = role.split(\"/\")[-1]\n",
"\n",
- "print('Role name: {}'.format(role_name))"
+ "print(\"Role name: {}\".format(role_name))"
]
},
{
@@ -59,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_iam_roles_passed=False"
+ "setup_iam_roles_passed = False"
]
},
{
@@ -76,14 +71,14 @@
"outputs": [],
"source": [
"admin = False\n",
- "post_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n",
+ "post_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n",
"for post_policy in post_policies:\n",
- " if post_policy['PolicyName'] == 'AdministratorAccess':\n",
+ " if post_policy[\"PolicyName\"] == \"AdministratorAccess\":\n",
" admin = True\n",
" break\n",
"\n",
- "setup_iam_roles_passed=True\n",
- "print('[OK] You are all set up to continue with this workshop!')"
+ "setup_iam_roles_passed = True\n",
+ "print(\"[OK] You are all set up to continue with this workshop!\")"
]
},
{
@@ -93,27 +88,29 @@
"outputs": [],
"source": [
"if not admin:\n",
- " pre_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n",
+ " pre_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n",
+ "\n",
+ " required_policies = [\"IAMFullAccess\"]\n",
"\n",
- " required_policies = ['IAMFullAccess']\n",
- " \n",
" for pre_policy in pre_policies:\n",
" for role_req in required_policies:\n",
- " if pre_policy['PolicyName'] == role_req:\n",
- " print('Attached: {}'.format(pre_policy['PolicyName']))\n",
+ " if pre_policy[\"PolicyName\"] == role_req:\n",
+ " print(\"Attached: {}\".format(pre_policy[\"PolicyName\"]))\n",
" try:\n",
- " required_policies.remove(pre_policy['PolicyName'])\n",
+ " required_policies.remove(pre_policy[\"PolicyName\"])\n",
" except:\n",
" pass\n",
"\n",
" if len(required_policies) > 0:\n",
- " print('*************** [ERROR] You need to attach the following policies in order to continue with this workshop *****************\\n')\n",
+ " print(\n",
+ " \"*************** [ERROR] You need to attach the following policies in order to continue with this workshop *****************\\n\"\n",
+ " )\n",
" for required_policy in required_policies:\n",
- " print('Not Attached: {}'.format(required_policy))\n",
+ " print(\"Not Attached: {}\".format(required_policy))\n",
" else:\n",
- " print('[OK] You are all set to continue with this notebook!')\n",
+ " print(\"[OK] You are all set to continue with this notebook!\")\n",
"else:\n",
- " print('[OK] You are all set to continue with this notebook!')"
+ " print(\"[OK] You are all set to continue with this notebook!\")"
]
},
{
@@ -132,19 +129,16 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AdministratorAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AdministratorAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
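+    "    # The same attach/except pattern repeats below once per policy; a loop\n",
+    "    # over the policy names would de-duplicate these cells.\n",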
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
"\n",
"time.sleep(5)"
]
@@ -158,19 +152,16 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonSageMakerFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonSageMakerFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
"\n",
"time.sleep(5)"
]
@@ -184,19 +175,16 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='IAMFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"IAMFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
"\n",
"time.sleep(5)"
]
@@ -210,20 +198,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonS3FullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonS3FullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -236,20 +221,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='ComprehendFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"ComprehendFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -262,20 +244,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonAthenaFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonAthenaFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -288,20 +267,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='SecretsManagerReadWrite'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"SecretsManagerReadWrite\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -314,20 +290,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonRedshiftFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonRedshiftFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -340,20 +313,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonEC2ContainerRegistryFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonEC2ContainerRegistryFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -366,20 +336,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AWSStepFunctionsFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AWSStepFunctionsFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -392,20 +359,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonKinesisFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonKinesisFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -418,20 +382,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonKinesisFirehoseFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonKinesisFirehoseFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -444,20 +405,17 @@
"from botocore.exceptions import ClientError\n",
"\n",
"try:\n",
- " policy='AmazonKinesisAnalyticsFullAccess'\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n",
- " RoleName=role_name\n",
- " )\n",
- " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n",
+ " policy = \"AmazonKinesisAnalyticsFullAccess\"\n",
+ " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n",
+    "    print(\"Policy {} has been successfully attached to role: {}\".format(policy, role_name))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
- " print('[OK] Policy is already attached.')\n",
- " elif e.response['Error']['Code'] == 'LimitExceeded':\n",
- " print('[OK]')\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
+ " print(\"[OK] Policy is already attached.\")\n",
+ " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n",
+ " print(\"[OK]\")\n",
" else:\n",
- " print('*************** [ERROR] {} *****************'.format(e))\n",
- " \n",
+ " print(\"*************** [ERROR] {} *****************\".format(e))\n",
+ "\n",
"time.sleep(5)"
]
},
@@ -474,47 +432,47 @@
"metadata": {},
"outputs": [],
"source": [
- "#role = iam.get_role(RoleName=role_name)\n",
- "post_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n",
+ "# role = iam.get_role(RoleName=role_name)\n",
+ "post_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n",
"\n",
"required_policies = [\n",
- " 'AdministratorAccess',\n",
- " 'SecretsManagerReadWrite', \n",
- " 'IAMFullAccess', \n",
- " 'AmazonS3FullAccess', \n",
- " 'AmazonAthenaFullAccess', \n",
- " 'ComprehendFullAccess',\n",
- " 'AmazonEC2ContainerRegistryFullAccess',\n",
- " 'AmazonRedshiftFullAccess',\n",
- " 'AWSStepFunctionsFullAccess',\n",
- " 'AmazonSageMakerFullAccess',\n",
- " 'AmazonKinesisFullAccess',\n",
- " 'AmazonKinesisFirehoseFullAccess',\n",
- " 'AmazonKinesisAnalyticsFullAccess'\n",
- " ]\n",
+ " \"AdministratorAccess\",\n",
+ " \"SecretsManagerReadWrite\",\n",
+ " \"IAMFullAccess\",\n",
+ " \"AmazonS3FullAccess\",\n",
+ " \"AmazonAthenaFullAccess\",\n",
+ " \"ComprehendFullAccess\",\n",
+ " \"AmazonEC2ContainerRegistryFullAccess\",\n",
+ " \"AmazonRedshiftFullAccess\",\n",
+ " \"AWSStepFunctionsFullAccess\",\n",
+ " \"AmazonSageMakerFullAccess\",\n",
+ " \"AmazonKinesisFullAccess\",\n",
+ " \"AmazonKinesisFirehoseFullAccess\",\n",
+ " \"AmazonKinesisAnalyticsFullAccess\",\n",
+ "]\n",
"\n",
"admin = False\n",
"\n",
"for post_policy in post_policies:\n",
- " if post_policy['PolicyName'] == 'AdministratorAccess':\n",
+ " if post_policy[\"PolicyName\"] == \"AdministratorAccess\":\n",
" admin = True\n",
" try:\n",
- " required_policies.remove(post_policy['PolicyName'])\n",
+ " required_policies.remove(post_policy[\"PolicyName\"])\n",
" except:\n",
" break\n",
- " else: \n",
+ " else:\n",
" try:\n",
- " required_policies.remove(post_policy['PolicyName'])\n",
+ " required_policies.remove(post_policy[\"PolicyName\"])\n",
" except:\n",
" pass\n",
"\n",
"if not admin and len(required_policies) > 0:\n",
- " print('*************** [ERROR] RE-RUN THIS NOTEBOOK *****************')\n",
+ " print(\"*************** [ERROR] RE-RUN THIS NOTEBOOK *****************\")\n",
" for required_policy in required_policies:\n",
- " print('Not Attached: {}'.format(required_policy))\n",
+ " print(\"Not Attached: {}\".format(required_policy))\n",
"else:\n",
- " setup_iam_roles_passed=True\n",
- " print('[OK] You are all set up to continue with this workshop!')"
+ " setup_iam_roles_passed = True\n",
+ " print(\"[OK] You are all set up to continue with this workshop!\")"
]
},
{
diff --git a/02_usecases/01_Setup.ipynb b/02_usecases/01_Setup.ipynb
index b7703836..ee112f14 100644
--- a/02_usecases/01_Setup.ipynb
+++ b/02_usecases/01_Setup.ipynb
@@ -21,7 +21,7 @@
"bucket = sagemaker_session.default_bucket()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
diff --git a/02_usecases/03_Celebrity_Recognition.ipynb b/02_usecases/03_Celebrity_Recognition.ipynb
index 82a20d00..4e38bbe0 100644
--- a/02_usecases/03_Celebrity_Recognition.ipynb
+++ b/02_usecases/03_Celebrity_Recognition.ipynb
@@ -61,8 +61,8 @@
"metadata": {},
"outputs": [],
"source": [
- "rekognition = boto3.client('rekognition')\n",
- "s3 = boto3.client('s3')"
+ "rekognition = boto3.client(\"rekognition\")\n",
+ "s3 = boto3.client(\"s3\")"
]
},
{
@@ -72,7 +72,7 @@
"outputs": [],
"source": [
"!mkdir -p ./tmp\n",
- "temp_folder = 'tmp/'"
+ "temp_folder = \"tmp/\""
]
},
{
@@ -88,7 +88,7 @@
"metadata": {},
"outputs": [],
"source": [
- "imageName = 'content-moderation/media/GrandTourjc.png'"
+ "imageName = \"content-moderation/media/GrandTourjc.png\""
]
},
{
@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))"
+ "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))"
]
},
{
@@ -116,9 +116,9 @@
"source": [
"recognizeCelebritiesResponse = rekognition.recognize_celebrities(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': imageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": imageName,\n",
" }\n",
" }\n",
")"
@@ -156,10 +156,10 @@
"metadata": {},
"outputs": [],
"source": [
- "def drawBoundingBoxes (sourceImage, boxes):\n",
+ "def drawBoundingBoxes(sourceImage, boxes):\n",
" # blue, green, red, grey\n",
- " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n",
- " \n",
+ " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n",
+ "\n",
" # Download image locally\n",
" imageLocation = temp_folder + os.path.basename(sourceImage)\n",
" s3.download_file(bucket, sourceImage, imageLocation)\n",
@@ -170,24 +170,24 @@
" width, height = bbImage.size\n",
" col = 0\n",
" maxcol = len(colors)\n",
- " line= 3\n",
+ " line = 3\n",
" for box in boxes:\n",
- " x1 = int(box[1]['Left'] * width)\n",
- " y1 = int(box[1]['Top'] * height)\n",
- " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n",
- " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n",
- " \n",
- " draw.text((x1,y1),box[0],colors[col])\n",
+ " x1 = int(box[1][\"Left\"] * width)\n",
+ " y1 = int(box[1][\"Top\"] * height)\n",
+ " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n",
+ " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * height)\n",
+ "\n",
+ " draw.text((x1, y1), box[0], colors[col])\n",
" for l in range(line):\n",
- " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n",
- " col = (col+1)%maxcol\n",
- " \n",
+ " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n",
+ " col = (col + 1) % maxcol\n",
+ "\n",
" imageFormat = \"PNG\"\n",
" ext = sourceImage.lower()\n",
- " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n",
- " imageFormat = 'JPEG'\n",
+ " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n",
+ " imageFormat = \"JPEG\"\n",
"\n",
- " bbImage.save(imageLocation,format=imageFormat)\n",
+ " bbImage.save(imageLocation, format=imageFormat)\n",
"\n",
" display(bbImage)"
]
@@ -199,10 +199,10 @@
"outputs": [],
"source": [
"boxes = []\n",
- "celebrities = recognizeCelebritiesResponse['CelebrityFaces']\n",
+ "celebrities = recognizeCelebritiesResponse[\"CelebrityFaces\"]\n",
"for celebrity in celebrities:\n",
- " boxes.append ((celebrity['Name'], celebrity['Face']['BoundingBox']))\n",
- " \n",
+ " boxes.append((celebrity[\"Name\"], celebrity[\"Face\"][\"BoundingBox\"]))\n",
+ "\n",
"drawBoundingBoxes(imageName, boxes)"
]
},
@@ -224,9 +224,9 @@
"metadata": {},
"outputs": [],
"source": [
- "videoName = 'content-moderation/media/GrandTour720.mp4'\n",
- "strDetail = 'Celebrites detected in video
=======================================
'\n",
- "strOverall = 'Celebrities in the overall video:
=======================================
'"
+ "videoName = \"content-moderation/media/GrandTour720.mp4\"\n",
+ "strDetail = \"Celebrites detected in video
=======================================
\"\n",
+ "strOverall = \"Celebrities in the overall video:
=======================================
\""
]
},
{
@@ -236,14 +236,18 @@
"outputs": [],
"source": [
"s3FilePrefix = \"https://s3.amazonaws.com\"\n",
- "if(not region == 'us-east-1'):\n",
+ "if not region == \"us-east-1\":\n",
" s3FilePrefix = \"https://s3-{}.amazonaws.com\".format(region)\n",
"\n",
"s3VideoUrl = \"{0}/{1}/{2}\".format(s3FilePrefix, bucket, videoName)\n",
"\n",
- "videoTag = \"\".format(s3VideoUrl)\n",
+ "videoTag = \"\".format(\n",
+ " s3VideoUrl\n",
+ ")\n",
"\n",
- "videoui = \"
\".format(videoTag, strDetail)\n",
+ "videoui = \"\".format(\n",
+ " videoTag, strDetail\n",
+ ")\n",
"\n",
"display(HTML(videoui))"
]
@@ -263,14 +267,14 @@
"source": [
"startCelebrityRekognition = rekognition.start_celebrity_recognition(\n",
" Video={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': videoName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": videoName,\n",
" }\n",
" },\n",
")\n",
"\n",
- "celebrityJobId = startCelebrityRekognition['JobId']\n",
+ "celebrityJobId = startCelebrityRekognition[\"JobId\"]\n",
"display(\"Job Id: {0}\".format(celebrityJobId))"
]
},
@@ -290,20 +294,15 @@
"source": [
"%%time\n",
"\n",
- "getCelebrityRecognition = rekognition.get_celebrity_recognition(\n",
- " JobId=celebrityJobId,\n",
- " SortBy='TIMESTAMP'\n",
- ")\n",
+ "getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n",
"\n",
- "while(getCelebrityRecognition['JobStatus'] == 'IN_PROGRESS'):\n",
+ "while getCelebrityRecognition[\"JobStatus\"] == \"IN_PROGRESS\":\n",
" time.sleep(5)\n",
- " print('.', end='')\n",
- " \n",
- " getCelebrityRecognition = rekognition.get_celebrity_recognition(\n",
- " JobId=celebrityJobId,\n",
- " SortBy='TIMESTAMP')\n",
- " \n",
- "display(getCelebrityRecognition['JobStatus'])"
+ " print(\".\", end=\"\")\n",
+ "\n",
+ " getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n",
+ "\n",
+ "display(getCelebrityRecognition[\"JobStatus\"])"
]
},
{
@@ -343,16 +342,16 @@
"theCelebs = {}\n",
"\n",
"# Celebrities detected in each frame\n",
- "for celebrity in getCelebrityRecognition['Celebrities']:\n",
- " if 'Celebrity' in celebrity :\n",
+ "for celebrity in getCelebrityRecognition[\"Celebrities\"]:\n",
+ " if \"Celebrity\" in celebrity:\n",
" cconfidence = celebrity[\"Celebrity\"][\"Confidence\"]\n",
- " if(cconfidence > 95):\n",
- " ts = celebrity [\"Timestamp\"]\n",
+ " if cconfidence > 95:\n",
+ " ts = celebrity[\"Timestamp\"]\n",
" cname = celebrity[\"Celebrity\"][\"Name\"]\n",
- " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence,2))\n",
+ " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence, 2))\n",
" if not cname in theCelebs:\n",
" theCelebs[cname] = cname\n",
- " \n",
+ "\n",
"\n",
"# Unique faces detected in video\n",
"for theCeleb in theCelebs:\n",
@@ -376,7 +375,7 @@
"metadata": {},
"outputs": [],
"source": [
- "customCelebrityImageName = 'content-moderation/media/chris-antje.png'"
+ "customCelebrityImageName = \"content-moderation/media/chris-antje.png\""
]
},
{
@@ -385,7 +384,9 @@
"metadata": {},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': customCelebrityImageName})))"
+ "display(\n",
+ " IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": customCelebrityImageName}))\n",
+ ")"
]
},
{
@@ -398,9 +399,9 @@
"\n",
"customCelebrityResponse = rekognition.recognize_celebrities(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': customCelebrityImageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": customCelebrityImageName,\n",
" }\n",
" }\n",
")"
@@ -441,10 +442,10 @@
"outputs": [],
"source": [
"cboxes = []\n",
- "faces = customCelebrityResponse['UnrecognizedFaces']\n",
+ "faces = customCelebrityResponse[\"UnrecognizedFaces\"]\n",
"for face in faces:\n",
- " cboxes.append (('Unrecognized Face', face['BoundingBox']))\n",
- " \n",
+ " cboxes.append((\"Unrecognized Face\", face[\"BoundingBox\"]))\n",
+ "\n",
"drawBoundingBoxes(customCelebrityImageName, cboxes)"
]
},
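The celebrity notebook above polls `get_celebrity_recognition` in a loop, and the same wait pattern recurs for every asynchronous Rekognition video API touched by this patch. A minimal sketch of a shared poller, assuming a `rekognition` client (`wait_for_rekognition_job` is a hypothetical helper, and `NextToken` pagination is ignored):

import time

def wait_for_rekognition_job(get_results, job_id, delay_seconds=5):
    # Re-fetch until the asynchronous job leaves the IN_PROGRESS state.
    response = get_results(JobId=job_id)
    while response["JobStatus"] == "IN_PROGRESS":
        time.sleep(delay_seconds)
        print(".", end="")
        response = get_results(JobId=job_id)
    return response

# Usage, e.g.:
# getCelebrityRecognition = wait_for_rekognition_job(rekognition.get_celebrity_recognition, celebrityJobId)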
diff --git a/02_usecases/04_Content_Moderation.ipynb b/02_usecases/04_Content_Moderation.ipynb
index fd0a8152..a46282f2 100644
--- a/02_usecases/04_Content_Moderation.ipynb
+++ b/02_usecases/04_Content_Moderation.ipynb
@@ -49,8 +49,8 @@
"metadata": {},
"outputs": [],
"source": [
- "rekognition = boto3.client('rekognition')\n",
- "s3 = boto3.client('s3')"
+ "rekognition = boto3.client(\"rekognition\")\n",
+ "s3 = boto3.client(\"s3\")"
]
},
{
@@ -66,7 +66,7 @@
"metadata": {},
"outputs": [],
"source": [
- "imageName = 'content-moderation/media/weapon.png'"
+ "imageName = \"content-moderation/media/weapon.png\""
]
},
{
@@ -84,12 +84,12 @@
"outputs": [],
"source": [
"detectModerationLabelsResponse = rekognition.detect_moderation_labels(\n",
- " Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': imageName,\n",
- " }\n",
- " }\n",
+ " Image={\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": imageName,\n",
+ " }\n",
+ " }\n",
")"
]
},
@@ -99,7 +99,7 @@
"metadata": {},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))"
+ "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))"
]
},
{
@@ -157,10 +157,10 @@
"metadata": {},
"outputs": [],
"source": [
- "videoName = 'content-moderation/media/weapon.mp4'\n",
+ "videoName = \"content-moderation/media/weapon.mp4\"\n",
"\n",
- "strDetail = 'Moderation labels in video
=======================================
'\n",
- "strOverall = 'Moderation labels in the overall video:
=======================================
'"
+ "strDetail = \"Moderation labels in video
=======================================
\"\n",
+ "strOverall = \"Moderation labels in the overall video:
=======================================
\""
]
},
{
@@ -171,9 +171,11 @@
},
"outputs": [],
"source": [
- "s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': videoName})\n",
+ "s3VideoUrl = s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": videoName})\n",
"\n",
- "videoTag = \"\".format(s3VideoUrl)\n",
+ "videoTag = \"\".format(\n",
+ " s3VideoUrl\n",
+ ")\n",
"\n",
"videoui = \"\".format(videoTag)\n",
"\n",
@@ -210,14 +212,14 @@
"# Start content moderation job\n",
"startModerationLabelDetection = rekognition.start_content_moderation(\n",
" Video={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': videoName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": videoName,\n",
" }\n",
" },\n",
")\n",
"\n",
- "moderationJobId = startModerationLabelDetection['JobId']\n",
+ "moderationJobId = startModerationLabelDetection[\"JobId\"]\n",
"display(\"Job Id: {0}\".format(moderationJobId))"
]
},
@@ -237,20 +239,15 @@
"source": [
"%%time\n",
"\n",
- "getContentModeration = rekognition.get_content_moderation(\n",
- " JobId=moderationJobId,\n",
- " SortBy='TIMESTAMP'\n",
- ")\n",
+ "getContentModeration = rekognition.get_content_moderation(JobId=moderationJobId, SortBy=\"TIMESTAMP\")\n",
"\n",
- "while(getContentModeration['JobStatus'] == 'IN_PROGRESS'):\n",
+ "while getContentModeration[\"JobStatus\"] == \"IN_PROGRESS\":\n",
" time.sleep(5)\n",
- " print('.', end='')\n",
- " \n",
- " getContentModeration = rekognition.get_content_moderation(\n",
- " JobId=moderationJobId,\n",
- " SortBy='TIMESTAMP')\n",
- " \n",
- "display(getContentModeration['JobStatus'])"
+ " print(\".\", end=\"\")\n",
+ "\n",
+ " getContentModeration = rekognition.get_content_moderation(JobId=moderationJobId, SortBy=\"TIMESTAMP\")\n",
+ "\n",
+ "display(getContentModeration[\"JobStatus\"])"
]
},
{
@@ -289,16 +286,16 @@
"theObjects = {}\n",
"\n",
"# Potentially unsafe detected in each frame\n",
- "for obj in getContentModeration['ModerationLabels']:\n",
- " ts = obj [\"Timestamp\"]\n",
- " cconfidence = obj['ModerationLabel'][\"Confidence\"]\n",
- " oname = obj['ModerationLabel'][\"Name\"]\n",
- " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence,2))\n",
+ "for obj in getContentModeration[\"ModerationLabels\"]:\n",
+ " ts = obj[\"Timestamp\"]\n",
+ " cconfidence = obj[\"ModerationLabel\"][\"Confidence\"]\n",
+ " oname = obj[\"ModerationLabel\"][\"Name\"]\n",
+ " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence, 2))\n",
" if oname in theObjects:\n",
" cojb = theObjects[oname]\n",
- " theObjects[oname] = {\"Name\" : oname, \"Count\": 1+cojb[\"Count\"]}\n",
+ " theObjects[oname] = {\"Name\": oname, \"Count\": 1 + cojb[\"Count\"]}\n",
" else:\n",
- " theObjects[oname] = {\"Name\" : oname, \"Count\": 1}\n",
+ " theObjects[oname] = {\"Name\": oname, \"Count\": 1}\n",
"\n",
"# Unique objects detected in video\n",
"for theObject in theObjects:\n",
diff --git a/02_usecases/05_Inappropriate_Text_Detection.ipynb b/02_usecases/05_Inappropriate_Text_Detection.ipynb
index 6b6d2cae..8273cf6c 100644
--- a/02_usecases/05_Inappropriate_Text_Detection.ipynb
+++ b/02_usecases/05_Inappropriate_Text_Detection.ipynb
@@ -48,8 +48,8 @@
"metadata": {},
"outputs": [],
"source": [
- "rekognition = boto3.client('rekognition')\n",
- "s3 = boto3.client('s3')"
+ "rekognition = boto3.client(\"rekognition\")\n",
+ "s3 = boto3.client(\"s3\")"
]
},
{
@@ -59,7 +59,7 @@
"outputs": [],
"source": [
"!mkdir -p ./tmp\n",
- "temp_folder = 'tmp/'"
+ "temp_folder = \"tmp/\""
]
},
{
@@ -75,7 +75,7 @@
"metadata": {},
"outputs": [],
"source": [
- "imageName = 'content-moderation/media/coffee.jpg'"
+ "imageName = \"content-moderation/media/coffee.jpg\""
]
},
{
@@ -86,7 +86,7 @@
},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))"
+ "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))"
]
},
{
@@ -105,16 +105,12 @@
"source": [
"detectTextResponse = rekognition.detect_text(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': imageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": imageName,\n",
" }\n",
- " },\n",
- " Filters={\n",
- " 'WordFilter': {\n",
- " 'MinConfidence': 90\n",
- " }\n",
- " }\n",
+ " },\n",
+ " Filters={\"WordFilter\": {\"MinConfidence\": 90}},\n",
")"
]
},
@@ -149,11 +145,12 @@
"outputs": [],
"source": [
"import string\n",
+ "\n",
"unsafeWords = [\"crap\", \"darn\", \"damm\"]\n",
"for textDetection in detectTextResponse[\"TextDetections\"]:\n",
" # strip punctuation before checking match\n",
- " text = textDetection[\"DetectedText\"].translate(str.maketrans('', '', string.punctuation))\n",
- " if(textDetection[\"Type\"] == \"WORD\" and text in unsafeWords):\n",
+ " text = textDetection[\"DetectedText\"].translate(str.maketrans(\"\", \"\", string.punctuation))\n",
+ " if textDetection[\"Type\"] == \"WORD\" and text in unsafeWords:\n",
" print(\"Detected unsafe word: {}\".format(textDetection[\"DetectedText\"]))"
]
},
@@ -170,10 +167,10 @@
"metadata": {},
"outputs": [],
"source": [
- "def drawBoundingBoxes (sourceImage, boxes):\n",
+ "def drawBoundingBoxes(sourceImage, boxes):\n",
" # blue, green, red, grey\n",
- " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n",
- " \n",
+ " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n",
+ "\n",
" # Download image locally\n",
" imageLocation = temp_folder + os.path.basename(sourceImage)\n",
" s3.download_file(bucket, sourceImage, imageLocation)\n",
@@ -184,24 +181,24 @@
" width, height = bbImage.size\n",
" col = 0\n",
" maxcol = len(colors)\n",
- " line= 3\n",
+ " line = 3\n",
" for box in boxes:\n",
- " x1 = int(box[1]['Left'] * width)\n",
- " y1 = int(box[1]['Top'] * height)\n",
- " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n",
- " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n",
- " \n",
- " draw.text((x1,y1),box[0],colors[col])\n",
+ " x1 = int(box[1][\"Left\"] * width)\n",
+ " y1 = int(box[1][\"Top\"] * height)\n",
+ " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n",
+ " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * height)\n",
+ "\n",
+ " draw.text((x1, y1), box[0], colors[col])\n",
" for l in range(line):\n",
- " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n",
- " col = (col+1)%maxcol\n",
- " \n",
+ " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n",
+ " col = (col + 1) % maxcol\n",
+ "\n",
" imageFormat = \"PNG\"\n",
" ext = sourceImage.lower()\n",
- " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n",
- " imageFormat = 'JPEG'\n",
+ " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n",
+ " imageFormat = \"JPEG\"\n",
"\n",
- " bbImage.save(imageLocation,format=imageFormat)\n",
+ " bbImage.save(imageLocation, format=imageFormat)\n",
"\n",
" display(bbImage)"
]
@@ -215,10 +212,10 @@
"outputs": [],
"source": [
"boxes = []\n",
- "textDetections = detectTextResponse['TextDetections']\n",
+ "textDetections = detectTextResponse[\"TextDetections\"]\n",
"for textDetection in textDetections:\n",
- " boxes.append ((textDetection['Type'], textDetection[\"Geometry\"]['BoundingBox']))\n",
- " \n",
+ " boxes.append((textDetection[\"Type\"], textDetection[\"Geometry\"][\"BoundingBox\"]))\n",
+ "\n",
"drawBoundingBoxes(imageName, boxes)"
]
},
@@ -235,7 +232,7 @@
"metadata": {},
"outputs": [],
"source": [
- "imageName = 'content-moderation/media/coffee.jpg'"
+ "imageName = \"content-moderation/media/coffee.jpg\""
]
},
{
@@ -244,7 +241,7 @@
"metadata": {},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))"
+ "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))"
]
},
{
@@ -258,28 +255,17 @@
"\n",
"detectTextResponse = rekognition.detect_text(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': imageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": imageName,\n",
" }\n",
" },\n",
" Filters={\n",
- " 'WordFilter': {\n",
- " 'MinConfidence': 90,\n",
- " 'MinBoundingBoxHeight': 0.05,\n",
- " 'MinBoundingBoxWidth': 0.02\n",
- " },\n",
- " 'RegionsOfInterest': [\n",
- " {\n",
- " 'BoundingBox': {\n",
- " 'Width': 0.1,\n",
- " 'Height': 0.05,\n",
- " 'Left': 0.01,\n",
- " 'Top': 0.01\n",
- " }\n",
- " },\n",
- " ]\n",
- " }\n",
+ " \"WordFilter\": {\"MinConfidence\": 90, \"MinBoundingBoxHeight\": 0.05, \"MinBoundingBoxWidth\": 0.02},\n",
+ " \"RegionsOfInterest\": [\n",
+ " {\"BoundingBox\": {\"Width\": 0.1, \"Height\": 0.05, \"Left\": 0.01, \"Top\": 0.01}},\n",
+ " ],\n",
+ " },\n",
")"
]
},
@@ -308,7 +294,7 @@
"source": [
"for textDetection in detectTextResponse[\"TextDetections\"]:\n",
" text = textDetection[\"DetectedText\"]\n",
- " if(textDetection[\"Type\"] == \"WORD\"):\n",
+ " if textDetection[\"Type\"] == \"WORD\":\n",
" print(\"Word: {}\".format(textDetection[\"DetectedText\"]))"
]
},
@@ -331,10 +317,10 @@
"metadata": {},
"outputs": [],
"source": [
- "videoName = 'content-moderation/media/serverless-bytes.mov'\n",
+ "videoName = \"content-moderation/media/serverless-bytes.mov\"\n",
"\n",
- "strDetail = 'Text detected in video
=======================================
'\n",
- "strOverall = 'Text in the overall video:
=======================================
'"
+ "strDetail = \"Text detected in video
=======================================
\"\n",
+ "strOverall = \"Text in the overall video:
=======================================
\""
]
},
{
@@ -343,9 +329,11 @@
"metadata": {},
"outputs": [],
"source": [
- "s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': videoName})\n",
+ "s3VideoUrl = s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": videoName})\n",
"\n",
- "videoTag = \"\".format(s3VideoUrl)\n",
+ "videoTag = \"\".format(\n",
+ " s3VideoUrl\n",
+ ")\n",
"\n",
"videoui = \"\".format(videoTag)\n",
"\n",
@@ -367,14 +355,14 @@
"source": [
"startTextDetection = rekognition.start_text_detection(\n",
" Video={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucket,\n",
- " 'Name': videoName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucket,\n",
+ " \"Name\": videoName,\n",
" }\n",
" },\n",
")\n",
"\n",
- "textJobId = startTextDetection['JobId']\n",
+ "textJobId = startTextDetection[\"JobId\"]\n",
"display(\"Job Id: {0}\".format(textJobId))"
]
},
@@ -392,19 +380,15 @@
"metadata": {},
"outputs": [],
"source": [
- "getTextDetection = rekognition.get_text_detection(\n",
- " JobId=textJobId\n",
- ")\n",
+ "getTextDetection = rekognition.get_text_detection(JobId=textJobId)\n",
"\n",
- "while(getTextDetection['JobStatus'] == 'IN_PROGRESS'):\n",
+ "while getTextDetection[\"JobStatus\"] == \"IN_PROGRESS\":\n",
" time.sleep(5)\n",
- " print('.', end='')\n",
- " \n",
- " getTextDetection = rekognition.get_text_detection(\n",
- " JobId=textJobId\n",
- " )\n",
- " \n",
- "display(getTextDetection['JobStatus'])"
+ " print(\".\", end=\"\")\n",
+ "\n",
+ " getTextDetection = rekognition.get_text_detection(JobId=textJobId)\n",
+ "\n",
+ "display(getTextDetection[\"JobStatus\"])"
]
},
{
@@ -444,21 +428,21 @@
"theLines = {}\n",
"\n",
"# Objects detected in each frame\n",
- "for obj in getTextDetection['TextDetections']:\n",
- " if(obj['TextDetection']['Type'] == 'WORD'):\n",
- " ts = obj [\"Timestamp\"]\n",
- " cconfidence = obj['TextDetection'][\"Confidence\"]\n",
- " oname = obj['TextDetection'][\"DetectedText\"]\n",
+ "for obj in getTextDetection[\"TextDetections\"]:\n",
+ " if obj[\"TextDetection\"][\"Type\"] == \"WORD\":\n",
+ " ts = obj[\"Timestamp\"]\n",
+ " cconfidence = obj[\"TextDetection\"][\"Confidence\"]\n",
+ " oname = obj[\"TextDetection\"][\"DetectedText\"]\n",
"\n",
- " if(oname in flaggedTextInVideo):\n",
- " print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence,2)))\n",
+ " if oname in flaggedTextInVideo:\n",
+ " print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence, 2)))\n",
"\n",
- " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence,2))\n",
+ " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence, 2))\n",
" if oname in theLines:\n",
" cojb = theLines[oname]\n",
- " theLines[oname] = {\"Text\" : oname, \"Count\": 1+cojb[\"Count\"]}\n",
+ " theLines[oname] = {\"Text\": oname, \"Count\": 1 + cojb[\"Count\"]}\n",
" else:\n",
- " theLines[oname] = {\"Text\" : oname, \"Count\": 1}\n",
+ " theLines[oname] = {\"Text\": oname, \"Count\": 1}\n",
"\n",
"# Unique objects detected in video\n",
"for theLine in theLines:\n",
diff --git a/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb b/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb
index 997206f8..4aaa6d8f 100644
--- a/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb
+++ b/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb
@@ -44,7 +44,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -76,10 +76,12 @@
"source": [
"import csv\n",
"\n",
- "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df.shape"
]
},
@@ -99,12 +101,13 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(kind=\"bar\", title=\"Breakdown by Star Rating\")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -122,43 +125,26 @@
"source": [
"from sklearn.utils import resample\n",
"\n",
- "five_star_df = df.query('star_rating == 5')\n",
- "four_star_df = df.query('star_rating == 4')\n",
- "three_star_df = df.query('star_rating == 3')\n",
- "two_star_df = df.query('star_rating == 2')\n",
- "one_star_df = df.query('star_rating == 1')\n",
+ "five_star_df = df.query(\"star_rating == 5\")\n",
+ "four_star_df = df.query(\"star_rating == 4\")\n",
+ "three_star_df = df.query(\"star_rating == 3\")\n",
+ "two_star_df = df.query(\"star_rating == 2\")\n",
+ "one_star_df = df.query(\"star_rating == 1\")\n",
"\n",
"# Check which sentiment has the least number of samples\n",
- "minority_count = min(five_star_df.shape[0], \n",
- " four_star_df.shape[0], \n",
- " three_star_df.shape[0], \n",
- " two_star_df.shape[0], \n",
- " one_star_df.shape[0]) \n",
+ "minority_count = min(\n",
+ " five_star_df.shape[0], four_star_df.shape[0], three_star_df.shape[0], two_star_df.shape[0], one_star_df.shape[0]\n",
+ ")\n",
"\n",
- "five_star_df = resample(five_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "four_star_df = resample(four_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "three_star_df = resample(three_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "two_star_df = resample(two_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "one_star_df = resample(one_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
"df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])\n",
"df_balanced = df_balanced.reset_index(drop=True)\n",
@@ -172,9 +158,11 @@
"metadata": {},
"outputs": [],
"source": [
- "df_balanced[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_balanced[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -202,14 +190,10 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split all data into 90% train and 10% holdout\n",
- "df_train, df_holdout = train_test_split(df_balanced, \n",
- " test_size=0.10,\n",
- " stratify=df_balanced['star_rating'])\n",
+ "df_train, df_holdout = train_test_split(df_balanced, test_size=0.10, stratify=df_balanced[\"star_rating\"])\n",
"\n",
"# Split holdout data into 50% validation and 50% test\n",
- "df_validation, df_test = train_test_split(df_holdout,\n",
- " test_size=0.50, \n",
- " stratify=df_holdout['star_rating'])\n"
+ "df_validation, df_test = train_test_split(df_holdout, test_size=0.50, stratify=df_holdout[\"star_rating\"])"
]
},
{
@@ -219,16 +203,16 @@
"outputs": [],
"source": [
"# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n",
- "labels = ['Train', 'Validation', 'Test']\n",
+ "labels = [\"Train\", \"Validation\", \"Test\"]\n",
"sizes = [len(df_train.index), len(df_validation.index), len(df_test.index)]\n",
- "explode = (0.1, 0, 0) \n",
+ "explode = (0.1, 0, 0)\n",
"\n",
"fig1, ax1 = plt.subplots()\n",
"\n",
- "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)\n",
+ "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90)\n",
"\n",
"# Equal aspect ratio ensures that pie is drawn as a circle.\n",
- "ax1.axis('equal') \n",
+ "ax1.axis(\"equal\")\n",
"\n",
"plt.show()"
]
@@ -255,7 +239,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_train[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='90% Train Breakdown by Star Rating')"
+ "df_train[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"90% Train Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -280,7 +266,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_validation[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Validation Breakdown by Star Rating')"
+ "df_validation[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"5% Validation Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -305,7 +293,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_test[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Test Breakdown by Star Rating')"
+ "df_test[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"5% Test Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -321,7 +311,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_train = df_train[['star_rating', 'review_body']]\n",
+ "df_train = df_train[[\"star_rating\", \"review_body\"]]\n",
"df_train.shape"
]
},
@@ -347,7 +337,7 @@
"metadata": {},
"outputs": [],
"source": [
- "comprehend_train_path = './amazon_reviews_us_Digital_Software_v1_00_comprehend.csv'\n",
+ "comprehend_train_path = \"./amazon_reviews_us_Digital_Software_v1_00_comprehend.csv\"\n",
"df_train.to_csv(comprehend_train_path, index=False, header=False)"
]
},
@@ -364,7 +354,7 @@
"metadata": {},
"outputs": [],
"source": [
- "train_s3_prefix = 'data'\n",
+ "train_s3_prefix = \"data\"\n",
"comprehend_train_s3_uri = sess.upload_data(path=comprehend_train_path, key_prefix=train_s3_prefix)\n",
"comprehend_train_s3_uri"
]
diff --git a/02_usecases/07_Text_Classification_Train_Model.ipynb b/02_usecases/07_Text_Classification_Train_Model.ipynb
index 904fb775..045f2703 100644
--- a/02_usecases/07_Text_Classification_Train_Model.ipynb
+++ b/02_usecases/07_Text_Classification_Train_Model.ipynb
@@ -38,22 +38,17 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
"from botocore.config import Config\n",
"\n",
- "config = Config(\n",
- " retries = {\n",
- " 'max_attempts': 10,\n",
- " 'mode': 'adaptive'\n",
- " }\n",
- ")\n",
+ "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n",
"\n",
- "iam = boto3.client('iam', config=config)\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "iam = boto3.client(\"iam\", config=config)\n",
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -69,15 +64,28 @@
"metadata": {},
"outputs": [],
"source": [
- "if region in ['ap-south-1', 'eu-west-2', 'eu-west-1', 'ap-northeast-2', 'ap-northeast-1', 'ca-central-1', 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'us-east-1', 'us-east-2', 'us-west-2']:\n",
- " print(' [OK] COMPREHEND IS SUPPORTED IN {}'.format(region))\n",
- " print(' [OK] Please proceed with this notebook.' )\n",
+ "if region in [\n",
+ " \"ap-south-1\",\n",
+ " \"eu-west-2\",\n",
+ " \"eu-west-1\",\n",
+ " \"ap-northeast-2\",\n",
+ " \"ap-northeast-1\",\n",
+ " \"ca-central-1\",\n",
+ " \"ap-southeast-1\",\n",
+ " \"ap-southeast-2\",\n",
+ " \"eu-central-1\",\n",
+ " \"us-east-1\",\n",
+ " \"us-east-2\",\n",
+ " \"us-west-2\",\n",
+ "]:\n",
+ " print(\" [OK] COMPREHEND IS SUPPORTED IN {}\".format(region))\n",
+ " print(\" [OK] Please proceed with this notebook.\")\n",
"else:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print(' [ERROR] COMPREHEND IS NOT YET SUPPORTED IN {}.'.format(region))\n",
- " print(' [INFO] This is OK. Skip this notebook and continue with the next use case.' )\n",
- " print(' [INFO] This notebook is not required for the rest of this workshop.' )\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\" [ERROR] COMPREHEND IS NOT YET SUPPORTED IN {}.\".format(region))\n",
+ " print(\" [INFO] This is OK. Skip this notebook and continue with the next use case.\")\n",
+ " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -86,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
- "comprehend = boto3.client('comprehend')"
+ "comprehend = boto3.client(\"comprehend\")"
]
},
{
@@ -112,10 +120,10 @@
"outputs": [],
"source": [
"if not comprehend_train_s3_uri:\n",
- " print('****************************************************************************************')\n",
- " print('**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************')\n",
- " print('**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************')\n",
- " print('****************************************************************************************')"
+ " print(\"****************************************************************************************\")\n",
+ " print(\"**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************\")\n",
+ " print(\"**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************\")\n",
+ " print(\"****************************************************************************************\")"
]
},
{
@@ -160,7 +168,7 @@
"source": [
"import csv\n",
"\n",
- "df = pd.read_csv('./tmp/amazon_reviews_us_Digital_Software_v1_00_comprehend.csv', header=None)\n",
+ "df = pd.read_csv(\"./tmp/amazon_reviews_us_Digital_Software_v1_00_comprehend.csv\", header=None)\n",
"df.head()"
]
},
@@ -185,17 +193,11 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"comprehend.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"comprehend.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n",
+ " ],\n",
+ "}"
]
},
{
@@ -211,7 +213,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_comprehend_role_name = 'DSOAWS_Comprehend'"
+ "iam_comprehend_role_name = \"DSOAWS_Comprehend\""
]
},
{
@@ -229,15 +231,15 @@
" iam_role_comprehend = iam.create_role(\n",
" RoleName=iam_comprehend_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Comprehend Role'\n",
+ " Description=\"DSOAWS Comprehend Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)\n",
" print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)\n",
- " \n",
+ "\n",
"time.sleep(30)"
]
},
@@ -250,34 +252,10 @@
"comprehend_s3_policy_doc = {\n",
" \"Version\": \"2012-10-17\",\n",
" \"Statement\": [\n",
- " {\n",
- " \"Action\": [\n",
- " \"s3:GetObject\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:s3:::{}/*\".format(bucket)\n",
- " ],\n",
- " \"Effect\": \"Allow\"\n",
- " },\n",
- " {\n",
- " \"Action\": [\n",
- " \"s3:ListBucket\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:s3:::{}\".format(bucket)\n",
- " ],\n",
- " \"Effect\": \"Allow\"\n",
- " },\n",
- " {\n",
- " \"Action\": [\n",
- " \"s3:PutObject\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:s3:::{}/*\".format(bucket)\n",
- " ],\n",
- " \"Effect\": \"Allow\"\n",
- " }\n",
- " ]\n",
+ " {\"Action\": [\"s3:GetObject\"], \"Resource\": [\"arn:aws:s3:::{}/*\".format(bucket)], \"Effect\": \"Allow\"},\n",
+ " {\"Action\": [\"s3:ListBucket\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)], \"Effect\": \"Allow\"},\n",
+ " {\"Action\": [\"s3:PutObject\"], \"Resource\": [\"arn:aws:s3:::{}/*\".format(bucket)], \"Effect\": \"Allow\"},\n",
+ " ],\n",
"}\n",
"\n",
"print(comprehend_s3_policy_doc)"
@@ -300,8 +278,8 @@
"\n",
"response = iam.put_role_policy(\n",
" RoleName=iam_comprehend_role_name,\n",
- " PolicyName='DSOAWS_ComprehendPolicyToS3',\n",
- " PolicyDocument=json.dumps(comprehend_s3_policy_doc)\n",
+ " PolicyName=\"DSOAWS_ComprehendPolicyToS3\",\n",
+ " PolicyDocument=json.dumps(comprehend_s3_policy_doc),\n",
")\n",
"\n",
"print(response)\n",
@@ -322,9 +300,9 @@
"metadata": {},
"outputs": [],
"source": [
- "prefix = 'models'\n",
+ "prefix = \"models\"\n",
"\n",
- "s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'comprehend/output')\n",
+ "s3_output_job = \"s3://{}/{}/{}\".format(bucket, prefix, \"comprehend/output\")\n",
"print(s3_output_job)"
]
},
@@ -334,7 +312,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_comprehend_arn = iam_role_comprehend['Role']['Arn']"
+ "iam_role_comprehend_arn = iam_role_comprehend[\"Role\"][\"Arn\"]"
]
},
{
@@ -348,7 +326,7 @@
"\n",
"timestamp = str(datetime.datetime.now().strftime(\"%s\"))\n",
"\n",
- "comprehend_training_job_name = 'Amazon-Customer-Reviews-Classifier-{}'.format(timestamp) \n",
+ "comprehend_training_job_name = \"Amazon-Customer-Reviews-Classifier-{}\".format(timestamp)\n",
"\n",
"print(comprehend_training_job_name)"
]
@@ -362,13 +340,9 @@
"training_job = comprehend.create_document_classifier(\n",
" DocumentClassifierName=comprehend_training_job_name,\n",
" DataAccessRoleArn=iam_role_comprehend_arn,\n",
- " InputDataConfig={\n",
- " 'S3Uri': comprehend_train_s3_uri\n",
- " },\n",
- " OutputDataConfig={\n",
- " 'S3Uri': s3_output_job\n",
- " },\n",
- " LanguageCode='en'\n",
+ " InputDataConfig={\"S3Uri\": comprehend_train_s3_uri},\n",
+ " OutputDataConfig={\"S3Uri\": s3_output_job},\n",
+ " LanguageCode=\"en\",\n",
")\n",
"\n",
"time.sleep(30)"
@@ -380,7 +354,7 @@
"metadata": {},
"outputs": [],
"source": [
- "comprehend_training_job_arn = training_job['DocumentClassifierArn']\n",
+ "comprehend_training_job_arn = training_job[\"DocumentClassifierArn\"]\n",
"\n",
"print(comprehend_training_job_arn)"
]
@@ -393,7 +367,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Comprehend Training Job'.format(region, comprehend_training_job_arn)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Comprehend Training Job'.format(\n",
+ " region, comprehend_training_job_arn\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -414,21 +394,21 @@
"source": [
"import time\n",
"\n",
- "max_time = time.time() + 3 * 60 * 60 # 3 hours\n",
+ "max_time = time.time() + 3 * 60 * 60 # 3 hours\n",
"while time.time() < max_time:\n",
" describe_custom_classifier = comprehend.describe_document_classifier(\n",
- " DocumentClassifierArn = comprehend_training_job_arn\n",
+ " DocumentClassifierArn=comprehend_training_job_arn\n",
" )\n",
" status = describe_custom_classifier[\"DocumentClassifierProperties\"][\"Status\"]\n",
" print(\"Custom classifier: {}\".format(status))\n",
- " \n",
+ "\n",
" if status == \"TRAINED\" or status == \"IN_ERROR\":\n",
- " print('')\n",
- " print('Status {}'.format(status))\n",
- " print('')\n",
+ " print(\"\")\n",
+ " print(\"Status {}\".format(status))\n",
+ " print(\"\")\n",
" print(describe_custom_classifier[\"DocumentClassifierProperties\"])\n",
" break\n",
- " \n",
+ "\n",
" time.sleep(10)"
]
},
@@ -479,11 +459,12 @@
"outputs": [],
"source": [
"import os\n",
- "#Retrieve the S3URI from the model output and create jobkey variable.\n",
+ "\n",
+ "# Retrieve the S3URI from the model output and create jobkey variable.\n",
"job_output = describe_custom_classifier[\"DocumentClassifierProperties\"][\"OutputDataConfig\"][\"S3Uri\"]\n",
"print(job_output)\n",
"\n",
- "path_prefix = 's3://{}/'.format(bucket)\n",
+ "path_prefix = \"s3://{}/\".format(bucket)\n",
"\n",
"job_key = os.path.relpath(job_output, path_prefix)\n",
"\n",
@@ -503,9 +484,9 @@
"metadata": {},
"outputs": [],
"source": [
- "s3 = boto3.resource('s3')\n",
+ "s3 = boto3.resource(\"s3\")\n",
"\n",
- "s3.Bucket(bucket).download_file(job_key, './output.tar.gz')"
+ "s3.Bucket(bucket).download_file(job_key, \"./output.tar.gz\")"
]
},
{
@@ -514,7 +495,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#Unpack the gzip file\n",
+ "# Unpack the gzip file\n",
"!tar xvzf ./output.tar.gz"
]
},
@@ -526,7 +507,7 @@
"source": [
"import json\n",
"\n",
- "with open('./output/confusion_matrix.json') as json_file:\n",
+ "with open(\"./output/confusion_matrix.json\") as json_file:\n",
" data = json.load(json_file)\n",
"print(json.dumps(data, indent=2, default=str))"
]
@@ -548,14 +529,52 @@
"source": [
"from IPython.display import HTML, display\n",
"import tabulate\n",
- "table = [['', '1', '2', '3', '4', '5', '(Predicted)'],\n",
- " ['1', data['confusion_matrix'][0][0], data['confusion_matrix'][0][1], data['confusion_matrix'][0][2], data['confusion_matrix'][0][3], data['confusion_matrix'][0][4]],\n",
- " ['2', data['confusion_matrix'][1][0], data['confusion_matrix'][1][1], data['confusion_matrix'][1][2], data['confusion_matrix'][1][3], data['confusion_matrix'][1][4]],\n",
- " ['3', data['confusion_matrix'][2][0], data['confusion_matrix'][2][1], data['confusion_matrix'][2][2], data['confusion_matrix'][2][3], data['confusion_matrix'][2][4]],\n",
- " ['4', data['confusion_matrix'][3][0], data['confusion_matrix'][3][1], data['confusion_matrix'][3][2], data['confusion_matrix'][3][3], data['confusion_matrix'][3][4]],\n",
- " ['5', data['confusion_matrix'][4][0], data['confusion_matrix'][4][1], data['confusion_matrix'][4][2], data['confusion_matrix'][4][3], data['confusion_matrix'][4][4]],\n",
- " ['(Actual)']]\n",
- "display(HTML(tabulate.tabulate(table, tablefmt='html')))"
+ "\n",
+ "table = [\n",
+ " [\"\", \"1\", \"2\", \"3\", \"4\", \"5\", \"(Predicted)\"],\n",
+ " [\n",
+ " \"1\",\n",
+ " data[\"confusion_matrix\"][0][0],\n",
+ " data[\"confusion_matrix\"][0][1],\n",
+ " data[\"confusion_matrix\"][0][2],\n",
+ " data[\"confusion_matrix\"][0][3],\n",
+ " data[\"confusion_matrix\"][0][4],\n",
+ " ],\n",
+ " [\n",
+ " \"2\",\n",
+ " data[\"confusion_matrix\"][1][0],\n",
+ " data[\"confusion_matrix\"][1][1],\n",
+ " data[\"confusion_matrix\"][1][2],\n",
+ " data[\"confusion_matrix\"][1][3],\n",
+ " data[\"confusion_matrix\"][1][4],\n",
+ " ],\n",
+ " [\n",
+ " \"3\",\n",
+ " data[\"confusion_matrix\"][2][0],\n",
+ " data[\"confusion_matrix\"][2][1],\n",
+ " data[\"confusion_matrix\"][2][2],\n",
+ " data[\"confusion_matrix\"][2][3],\n",
+ " data[\"confusion_matrix\"][2][4],\n",
+ " ],\n",
+ " [\n",
+ " \"4\",\n",
+ " data[\"confusion_matrix\"][3][0],\n",
+ " data[\"confusion_matrix\"][3][1],\n",
+ " data[\"confusion_matrix\"][3][2],\n",
+ " data[\"confusion_matrix\"][3][3],\n",
+ " data[\"confusion_matrix\"][3][4],\n",
+ " ],\n",
+ " [\n",
+ " \"5\",\n",
+ " data[\"confusion_matrix\"][4][0],\n",
+ " data[\"confusion_matrix\"][4][1],\n",
+ " data[\"confusion_matrix\"][4][2],\n",
+ " data[\"confusion_matrix\"][4][3],\n",
+ " data[\"confusion_matrix\"][4][4],\n",
+ " ],\n",
+ " [\"(Actual)\"],\n",
+ "]\n",
+ "display(HTML(tabulate.tabulate(table, tablefmt=\"html\")))"
]
},
{
@@ -572,14 +591,13 @@
"outputs": [],
"source": [
"from time import gmtime, strftime, sleep\n",
- "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
"\n",
- "comprehend_endpoint_name = 'comprehend-inference-ep-' + timestamp_suffix\n",
+ "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
+ "\n",
+ "comprehend_endpoint_name = \"comprehend-inference-ep-\" + timestamp_suffix\n",
"\n",
"inference_endpoint_response = comprehend.create_endpoint(\n",
- " EndpointName=comprehend_endpoint_name,\n",
- " ModelArn=model_arn,\n",
- " DesiredInferenceUnits=1\n",
+ " EndpointName=comprehend_endpoint_name, ModelArn=model_arn, DesiredInferenceUnits=1\n",
")"
]
},
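The confusion-matrix cell above spells out all 25 cells by hand; loading the same `confusion_matrix.json` payload into a labeled `pandas.DataFrame` renders essentially the same table. A minimal sketch, assuming `data` parsed as in the notebook, five star-rating classes, and `display` imported from `IPython.display`:

import pandas as pd

labels = ["1", "2", "3", "4", "5"]
cm = pd.DataFrame(data["confusion_matrix"], index=labels, columns=labels)
cm.index.name = "(Actual)"
cm.columns.name = "(Predicted)"
display(cm)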
diff --git a/02_usecases/08_Text_Classification_Predict.ipynb b/02_usecases/08_Text_Classification_Predict.ipynb
index 0e3c0cd3..52ef9466 100644
--- a/02_usecases/08_Text_Classification_Predict.ipynb
+++ b/02_usecases/08_Text_Classification_Predict.ipynb
@@ -38,21 +38,16 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
"from botocore.config import Config\n",
"\n",
- "config = Config(\n",
- " retries = {\n",
- " 'max_attempts': 10,\n",
- " 'mode': 'adaptive'\n",
- " }\n",
- ")\n",
+ "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n",
"\n",
- "comprehend = boto3.Session().client(service_name='comprehend', region_name=region)"
+ "comprehend = boto3.Session().client(service_name=\"comprehend\", region_name=region)"
]
},
{
@@ -73,10 +68,10 @@
"try:\n",
" comprehend_training_job_arn\n",
"except NameError:\n",
- " print('***************************************************************************')\n",
- " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n",
- " print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n",
- " print('***************************************************************************')"
+ " print(\"***************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n",
+ " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n",
+ " print(\"***************************************************************************\")"
]
},
{
@@ -108,10 +103,10 @@
"try:\n",
" comprehend_endpoint_arn\n",
"except NameError:\n",
- " print('***************************************************************************')\n",
- " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n",
- " print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n",
- " print('***************************************************************************')"
+ " print(\"***************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n",
+ " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n",
+ " print(\"***************************************************************************\")"
]
},
{
@@ -140,9 +135,7 @@
},
"outputs": [],
"source": [
- "describe_response = comprehend.describe_endpoint(\n",
- " EndpointArn = comprehend_endpoint_arn\n",
- ")\n",
+ "describe_response = comprehend.describe_endpoint(EndpointArn=comprehend_endpoint_arn)\n",
"print(describe_response)"
]
},
@@ -161,7 +154,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Comprehend Model Endpoint'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Comprehend Model Endpoint'.format(\n",
+ " region, comprehend_training_job_arn, comprehend_endpoint_arn\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -172,17 +171,15 @@
"source": [
"import time\n",
"\n",
- "max_time = time.time() + 3*60*60 # 3 hours\n",
+ "max_time = time.time() + 3 * 60 * 60 # 3 hours\n",
"while time.time() < max_time:\n",
- " describe_response = comprehend.describe_endpoint(\n",
- " EndpointArn = comprehend_endpoint_arn\n",
- " )\n",
+ " describe_response = comprehend.describe_endpoint(EndpointArn=comprehend_endpoint_arn)\n",
" status = describe_response[\"EndpointProperties\"][\"Status\"]\n",
" print(\"Endpoint: {}\".format(status))\n",
- " \n",
+ "\n",
" if status == \"IN_SERVICE\" or status == \"IN_ERROR\":\n",
" break\n",
- " \n",
+ "\n",
" time.sleep(5)"
]
},
@@ -208,12 +205,10 @@
"source": [
"txt = \"\"\"I loved it! I will recommend this to everyone.\"\"\"\n",
"\n",
- "response = comprehend.classify_document(\n",
- " Text= txt,\n",
- " EndpointArn = comprehend_endpoint_arn\n",
- ")\n",
+ "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n",
"\n",
"import json\n",
+ "\n",
"print(json.dumps(response, indent=2, default=str))"
]
},
@@ -225,12 +220,10 @@
"source": [
"txt = \"\"\"It's OK.\"\"\"\n",
"\n",
- "response = comprehend.classify_document(\n",
- " Text= txt,\n",
- " EndpointArn = comprehend_endpoint_arn\n",
- ")\n",
+ "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n",
"\n",
"import json\n",
+ "\n",
"print(json.dumps(response, indent=2, default=str))"
]
},
@@ -244,12 +237,10 @@
"source": [
"txt = \"\"\"Really bad. I hope they don't make this anymore.\"\"\"\n",
"\n",
- "response = comprehend.classify_document(\n",
- " Text= txt,\n",
- " EndpointArn = comprehend_endpoint_arn\n",
- ")\n",
+ "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n",
"\n",
"import json\n",
+ "\n",
"print(json.dumps(response, indent=2, default=str))"
]
},
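Each prediction cell above dumps the raw `classify_document` response; pulling out the highest-scoring class makes the result easier to read. A minimal sketch, assuming the notebook's `comprehend` client and `comprehend_endpoint_arn`, and the documented `Classes` list of `Name`/`Score` entries in the response:

response = comprehend.classify_document(Text="I loved it!", EndpointArn=comprehend_endpoint_arn)
top_class = max(response["Classes"], key=lambda c: c["Score"])
print("Predicted star_rating: {} (score: {:.3f})".format(top_class["Name"], top_class["Score"]))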
diff --git a/02_usecases/archive/05_Celebrity_Detection.ipynb b/02_usecases/archive/05_Celebrity_Detection.ipynb
index f542e985..1f9e1b63 100644
--- a/02_usecases/archive/05_Celebrity_Detection.ipynb
+++ b/02_usecases/archive/05_Celebrity_Detection.ipynb
@@ -68,8 +68,8 @@
"outputs": [],
"source": [
"# Init clients\n",
- "rekognition = boto3.client('rekognition')\n",
- "s3 = boto3.client('s3')"
+ "rekognition = boto3.client(\"rekognition\")\n",
+ "s3 = boto3.client(\"s3\")"
]
},
{
@@ -98,7 +98,7 @@
"# around recognized celebrities to show them here in the notebook.\n",
"\n",
"!mkdir -p m1tmp\n",
- "tempFolder = 'm1tmp/'"
+ "tempFolder = \"m1tmp/\""
]
},
{
@@ -137,7 +137,7 @@
}
],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))"
+ "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucketName, \"Key\": imageName})))"
]
},
{
@@ -172,9 +172,9 @@
"\n",
"recognizeCelebritiesResponse = rekognition.recognize_celebrities(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucketName,\n",
- " 'Name': imageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucketName,\n",
+ " \"Name\": imageName,\n",
" }\n",
" }\n",
")"
@@ -195,7 +195,7 @@
"source": [
"# Show JSON response returned by Rekognition Celebrity Recognition API\n",
"# In the JSON response below, you will see CelebrityFaces which contains information about recognized celebrities.\n",
- "# For each recognized celebrity, you will see information like Name, Id, Urls and additional information about \n",
+ "# For each recognized celebrity, you will see information like Name, Id, Urls and additional information about\n",
"# their facial attributes.\n",
"\n",
"display(recognizeCelebritiesResponse)"
@@ -216,13 +216,14 @@
"source": [
"# Define a function that will display image with bounded boxes around recognized celebrites\n",
"# We will call this function in next step\n",
- " \n",
- "def drawBoundingBoxes (sourceImage, boxes):\n",
+ "\n",
+ "\n",
+ "def drawBoundingBoxes(sourceImage, boxes):\n",
" # blue, green, red, grey\n",
- " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n",
- " \n",
+ " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n",
+ "\n",
" # Download image locally\n",
- " imageLocation = tempFolder+os.path.basename(sourceImage)\n",
+ " imageLocation = tempFolder + os.path.basename(sourceImage)\n",
" s3.download_file(bucketName, sourceImage, imageLocation)\n",
"\n",
" # Draws BB on Image\n",
@@ -231,24 +232,24 @@
" width, height = bbImage.size\n",
" col = 0\n",
" maxcol = len(colors)\n",
- " line= 3\n",
+ " line = 3\n",
" for box in boxes:\n",
- " x1 = int(box[1]['Left'] * width)\n",
- " y1 = int(box[1]['Top'] * height)\n",
- " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n",
- " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n",
- " \n",
- " draw.text((x1,y1),box[0],colors[col])\n",
+ " x1 = int(box[1][\"Left\"] * width)\n",
+ " y1 = int(box[1][\"Top\"] * height)\n",
+ " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n",
+ " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * height)\n",
+ "\n",
+ " draw.text((x1, y1), box[0], colors[col])\n",
" for l in range(line):\n",
- " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n",
- " col = (col+1)%maxcol\n",
- " \n",
+ " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n",
+ " col = (col + 1) % maxcol\n",
+ "\n",
" imageFormat = \"PNG\"\n",
" ext = sourceImage.lower()\n",
- " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n",
- " imageFormat = 'JPEG'\n",
+ " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n",
+ " imageFormat = \"JPEG\"\n",
"\n",
- " bbImage.save(imageLocation,format=imageFormat)\n",
+ " bbImage.save(imageLocation, format=imageFormat)\n",
"\n",
" display(bbImage)"
]
@@ -274,10 +275,10 @@
"# Extract bounding box information from JSON response above and display image with bounding boxes around celebrites.\n",
"\n",
"boxes = []\n",
- "celebrities = recognizeCelebritiesResponse['CelebrityFaces']\n",
+ "celebrities = recognizeCelebritiesResponse[\"CelebrityFaces\"]\n",
"for celebrity in celebrities:\n",
- " boxes.append ((celebrity['Name'], celebrity['Face']['BoundingBox']))\n",
- " \n",
+ " boxes.append((celebrity[\"Name\"], celebrity[\"Face\"][\"BoundingBox\"]))\n",
+ "\n",
"drawBoundingBoxes(imageName, boxes)"
]
},
@@ -319,14 +320,14 @@
"# Start celebrity recognition job\n",
"startCelebrityRekognition = rekognition.start_celebrity_recognition(\n",
" Video={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucketName,\n",
- " 'Name': videoName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucketName,\n",
+ " \"Name\": videoName,\n",
" }\n",
" },\n",
")\n",
"\n",
- "celebrityJobId = startCelebrityRekognition['JobId']\n",
+ "celebrityJobId = startCelebrityRekognition[\"JobId\"]\n",
"display(\"Job Id: {0}\".format(celebrityJobId))"
]
},
@@ -347,20 +348,15 @@
"\n",
"# Wait for celebrity recognition job to complete\n",
"# In production use cases, you would usually use StepFucntion or SNS topic to get notified when job is complete.\n",
- "getCelebrityRecognition = rekognition.get_celebrity_recognition(\n",
- " JobId=celebrityJobId,\n",
- " SortBy='TIMESTAMP'\n",
- ")\n",
+ "getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n",
"\n",
- "while(getCelebrityRecognition['JobStatus'] == 'IN_PROGRESS'):\n",
+ "while getCelebrityRecognition[\"JobStatus\"] == \"IN_PROGRESS\":\n",
" time.sleep(5)\n",
- " print('.', end='')\n",
- " \n",
- " getCelebrityRecognition = rekognition.get_celebrity_recognition(\n",
- " JobId=celebrityJobId,\n",
- " SortBy='TIMESTAMP')\n",
- " \n",
- "display(getCelebrityRecognition['JobStatus'])"
+ " print(\".\", end=\"\")\n",
+ "\n",
+ " getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n",
+ "\n",
+ "display(getCelebrityRecognition[\"JobStatus\"])"
]
},
{
@@ -404,16 +400,16 @@
"strOverall = \"Celebrities in the overall video:
=======================================
\"\n",
"\n",
"# Celebrities detected in each frame\n",
- "for celebrity in getCelebrityRecognition['Celebrities']:\n",
- " if 'Celebrity' in celebrity :\n",
+ "for celebrity in getCelebrityRecognition[\"Celebrities\"]:\n",
+ " if \"Celebrity\" in celebrity:\n",
" cconfidence = celebrity[\"Celebrity\"][\"Confidence\"]\n",
- " if(cconfidence > 95):\n",
- " ts = celebrity [\"Timestamp\"]\n",
+ " if cconfidence > 95:\n",
+ " ts = celebrity[\"Timestamp\"]\n",
" cname = celebrity[\"Celebrity\"][\"Name\"]\n",
- " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence,2))\n",
+ " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence, 2))\n",
" if not cname in theCelebs:\n",
" theCelebs[cname] = cname\n",
- " \n",
+ "\n",
"\n",
"# Unique faces detected in video\n",
"for theCeleb in theCelebs:\n",
@@ -421,7 +417,7 @@
"\n",
"# Display results\n",
"display(HTML(strOverall))\n",
- "#display(HTML(strDetail))"
+ "# display(HTML(strDetail))"
]
},
{
@@ -442,12 +438,14 @@
"# Show video in a player\n",
"\n",
"s3FilePrefix = \"https://s3.amazonaws.com\"\n",
- "if(not awsRegion == 'us-east-1'):\n",
+ "if not awsRegion == \"us-east-1\":\n",
" s3FilePrefix = \"https://s3-{}.amazonaws.com\".format(awsRegion)\n",
"\n",
"s3VideoUrl = \"{0}/{1}/{2}\".format(s3FilePrefix, bucketName, videoName)\n",
"\n",
- "videoTag = \"\".format(s3VideoUrl)\n",
+ "videoTag = \"\".format(\n",
+ " s3VideoUrl\n",
+ ")\n",
"\n",
"videoui = \"\".format(videoTag, strDetail)\n",
"\n",
@@ -479,7 +477,9 @@
"metadata": {},
"outputs": [],
"source": [
- "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': customCelebrityImageName})))"
+ "display(\n",
+ " IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucketName, \"Key\": customCelebrityImageName}))\n",
+ ")"
]
},
{
@@ -492,9 +492,9 @@
"\n",
"customCelebrityResponse = rekognition.recognize_celebrities(\n",
" Image={\n",
- " 'S3Object': {\n",
- " 'Bucket': bucketName,\n",
- " 'Name': customCelebrityImageName,\n",
+ " \"S3Object\": {\n",
+ " \"Bucket\": bucketName,\n",
+ " \"Name\": customCelebrityImageName,\n",
" }\n",
" }\n",
")"
@@ -507,7 +507,7 @@
"outputs": [],
"source": [
"# Display Rekognition response\n",
- "# You will see Rekognition return an empty list for CelebrityFaces and \n",
+ "# You will see Rekognition return an empty list for CelebrityFaces and\n",
"# UnrecognizedFaces list with unrecognized faces that were detected in the image.\n",
"# In the next module you will learn how to get custom-celebrity faces recognized.\n",
"\n",
@@ -520,14 +520,14 @@
"metadata": {},
"outputs": [],
"source": [
- "#Show image and bounded boxes around detected faces\n",
+ "# Show image and bounded boxes around detected faces\n",
"\n",
"# Extract BB info from response\n",
"cboxes = []\n",
- "faces = customCelebrityResponse['UnrecognizedFaces']\n",
+ "faces = customCelebrityResponse[\"UnrecognizedFaces\"]\n",
"for face in faces:\n",
- " cboxes.append (('Unrecognized Face', face['BoundingBox']))\n",
- " \n",
+ " cboxes.append((\"Unrecognized Face\", face[\"BoundingBox\"]))\n",
+ "\n",
"drawBoundingBoxes(customCelebrityImageName, cboxes)"
]
},
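One note for readers tracing the drawBoundingBoxes changes above: Rekognition reports each BoundingBox in relative coordinates, as fractions of the image's width and height, which is why the code multiplies Left/Top/Width/Height by the pixel dimensions before drawing. A minimal self-contained sketch of that conversion, assuming Pillow is installed; the file name and box values are hypothetical:

    # Sketch: scale one Rekognition-style relative bounding box to pixels and
    # draw it with Pillow. "photo.jpg" and the box fractions are made up.
    from PIL import Image, ImageDraw

    image = Image.open("photo.jpg")
    draw = ImageDraw.Draw(image)
    width, height = image.size

    box = {"Left": 0.25, "Top": 0.10, "Width": 0.30, "Height": 0.45}
    x1 = int(box["Left"] * width)
    y1 = int(box["Top"] * height)
    x2 = int((box["Left"] + box["Width"]) * width)
    y2 = int((box["Top"] + box["Height"]) * height)

    draw.rectangle((x1, y1, x2, y2), outline="red")
    image.save("photo_boxed.png", format="PNG")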
diff --git a/03_automl/01_Prepare_Dataset_Autopilot.ipynb b/03_automl/01_Prepare_Dataset_Autopilot.ipynb
index 8a72f04b..fd1db9f2 100644
--- a/03_automl/01_Prepare_Dataset_Autopilot.ipynb
+++ b/03_automl/01_Prepare_Dataset_Autopilot.ipynb
@@ -59,9 +59,9 @@
"try:\n",
" setup_instance_check_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -91,9 +91,9 @@
"try:\n",
" setup_dependencies_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -121,11 +121,11 @@
"outputs": [],
"source": [
"try:\n",
- " setup_s3_bucket_passed \n",
+ " setup_s3_bucket_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -153,11 +153,11 @@
"outputs": [],
"source": [
"try:\n",
- " setup_iam_roles_passed \n",
+ " setup_iam_roles_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -183,21 +183,21 @@
"outputs": [],
"source": [
"if not setup_instance_check_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_dependencies_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_s3_bucket_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_iam_roles_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -210,7 +210,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -242,10 +242,12 @@
"source": [
"import csv\n",
"\n",
- "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df.shape"
]
},
@@ -265,12 +267,13 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(kind=\"bar\", title=\"Breakdown by Star Rating\")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -288,43 +291,26 @@
"source": [
"from sklearn.utils import resample\n",
"\n",
- "five_star_df = df.query('star_rating == 5')\n",
- "four_star_df = df.query('star_rating == 4')\n",
- "three_star_df = df.query('star_rating == 3')\n",
- "two_star_df = df.query('star_rating == 2')\n",
- "one_star_df = df.query('star_rating == 1')\n",
+ "five_star_df = df.query(\"star_rating == 5\")\n",
+ "four_star_df = df.query(\"star_rating == 4\")\n",
+ "three_star_df = df.query(\"star_rating == 3\")\n",
+ "two_star_df = df.query(\"star_rating == 2\")\n",
+ "one_star_df = df.query(\"star_rating == 1\")\n",
"\n",
"# Check which sentiment has the least number of samples\n",
- "minority_count = min(five_star_df.shape[0], \n",
- " four_star_df.shape[0], \n",
- " three_star_df.shape[0], \n",
- " two_star_df.shape[0], \n",
- " one_star_df.shape[0]) \n",
+ "minority_count = min(\n",
+ " five_star_df.shape[0], four_star_df.shape[0], three_star_df.shape[0], two_star_df.shape[0], one_star_df.shape[0]\n",
+ ")\n",
"\n",
- "five_star_df = resample(five_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "four_star_df = resample(four_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "three_star_df = resample(three_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "two_star_df = resample(two_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
- "one_star_df = resample(one_star_df,\n",
- " replace = False,\n",
- " n_samples = minority_count,\n",
- " random_state = 27)\n",
+ "one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)\n",
"\n",
"df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])\n",
"df_balanced = df_balanced.reset_index(drop=True)\n",
@@ -338,9 +324,11 @@
"metadata": {},
"outputs": [],
"source": [
- "df_balanced[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_balanced[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -368,14 +356,10 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split all data into 90% train and 10% holdout\n",
- "df_train, df_holdout = train_test_split(df_balanced, \n",
- " test_size=0.10,\n",
- " stratify=df_balanced['star_rating'])\n",
+ "df_train, df_holdout = train_test_split(df_balanced, test_size=0.10, stratify=df_balanced[\"star_rating\"])\n",
"\n",
"# Split holdout data into 50% validation and 50% test\n",
- "df_validation, df_test = train_test_split(df_holdout,\n",
- " test_size=0.50, \n",
- " stratify=df_holdout['star_rating'])\n"
+ "df_validation, df_test = train_test_split(df_holdout, test_size=0.50, stratify=df_holdout[\"star_rating\"])"
]
},
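The two-stage split above first holds out 10% of the balanced data and then halves that holdout, giving the 90/5/5 proportions the chart titles below refer to (0.10 * 0.50 = 0.05). A quick sanity check, assuming the dataframes from the cell above are still in scope:

    # Sanity-check the 90/5/5 proportions (assumes df_balanced, df_train,
    # df_validation, and df_test from the cells above).
    total = len(df_balanced)
    for name, part in [("train", df_train), ("validation", df_validation), ("test", df_test)]:
        print("{}: {:.1%}".format(name, len(part) / total))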
{
@@ -385,16 +369,16 @@
"outputs": [],
"source": [
"# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n",
- "labels = ['Train', 'Validation', 'Test']\n",
+ "labels = [\"Train\", \"Validation\", \"Test\"]\n",
"sizes = [len(df_train.index), len(df_validation.index), len(df_test.index)]\n",
- "explode = (0.1, 0, 0) \n",
+ "explode = (0.1, 0, 0)\n",
"\n",
"fig1, ax1 = plt.subplots()\n",
"\n",
- "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)\n",
+ "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90)\n",
"\n",
"# Equal aspect ratio ensures that pie is drawn as a circle.\n",
- "ax1.axis('equal') \n",
+ "ax1.axis(\"equal\")\n",
"\n",
"plt.show()"
]
@@ -421,7 +405,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_train[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='90% Train Breakdown by Star Rating')"
+ "df_train[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"90% Train Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -446,7 +432,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_validation[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Validation Breakdown by Star Rating')"
+ "df_validation[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"5% Validation Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -471,7 +459,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_test[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Test Breakdown by Star Rating')"
+ "df_test[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"5% Test Breakdown by Star Rating\"\n",
+ ")"
]
},
{
@@ -487,7 +477,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_train = df_train[['star_rating', 'review_body']]\n",
+ "df_train = df_train[[\"star_rating\", \"review_body\"]]\n",
"df_train.shape"
]
},
@@ -513,7 +503,7 @@
"metadata": {},
"outputs": [],
"source": [
- "autopilot_train_path = './amazon_reviews_us_Digital_Software_v1_00_autopilot.csv'\n",
+ "autopilot_train_path = \"./amazon_reviews_us_Digital_Software_v1_00_autopilot.csv\"\n",
"df_train.to_csv(autopilot_train_path, index=False, header=True)"
]
},
@@ -530,7 +520,7 @@
"metadata": {},
"outputs": [],
"source": [
- "train_s3_prefix = 'data'\n",
+ "train_s3_prefix = \"data\"\n",
"autopilot_train_s3_uri = sess.upload_data(path=autopilot_train_path, key_prefix=train_s3_prefix)\n",
"autopilot_train_s3_uri"
]
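A side note on the balancing cell reformatted above: its five resample(...) calls each downsample one star-rating class to the size of the rarest class. On pandas 1.1 or newer the same balancing fits in a single grouped sample; a sketch under that version assumption, not a drop-in replacement for the notebook cell:

    # Sketch: balance star ratings with one grouped sample instead of five
    # resample() calls. Assumes pandas >= 1.1 (GroupBy.sample) and the df
    # loaded earlier with its star_rating column.
    minority_count = df["star_rating"].value_counts().min()
    df_balanced = (
        df.groupby("star_rating", group_keys=False)
        .sample(n=minority_count, random_state=27)
        .reset_index(drop=True)
    )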
diff --git a/03_automl/02_Train_Reviews_Autopilot.ipynb b/03_automl/02_Train_Reviews_Autopilot.ipynb
index d2d7f65c..f07d5525 100644
--- a/03_automl/02_Train_Reviews_Autopilot.ipynb
+++ b/03_automl/02_Train_Reviews_Autopilot.ipynb
@@ -89,11 +89,11 @@
"source": [
"try:\n",
" autopilot_train_s3_uri\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -112,11 +112,11 @@
"outputs": [],
"source": [
"if not autopilot_train_s3_uri:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -130,12 +130,12 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -187,7 +187,7 @@
"source": [
"import csv\n",
"\n",
- "df = pd.read_csv('./tmp/amazon_reviews_us_Digital_Software_v1_00_autopilot.csv')\n",
+ "df = pd.read_csv(\"./tmp/amazon_reviews_us_Digital_Software_v1_00_autopilot.csv\")\n",
"df.head()"
]
},
@@ -205,9 +205,9 @@
"metadata": {},
"outputs": [],
"source": [
- "prefix_model_output = 'models/autopilot'\n",
+ "prefix_model_output = \"models/autopilot\"\n",
"\n",
- "model_output_s3_uri = 's3://{}/{}'.format(bucket, prefix_model_output)\n",
+ "model_output_s3_uri = \"s3://{}/{}\".format(bucket, prefix_model_output)\n",
"\n",
"print(model_output_s3_uri)"
]
@@ -221,27 +221,21 @@
"max_candidates = 3\n",
"\n",
"job_config = {\n",
- " 'CompletionCriteria': {\n",
- " 'MaxRuntimePerTrainingJobInSeconds': 900,\n",
- " 'MaxCandidates': max_candidates,\n",
- " 'MaxAutoMLJobRuntimeInSeconds': 5400\n",
+ " \"CompletionCriteria\": {\n",
+ " \"MaxRuntimePerTrainingJobInSeconds\": 900,\n",
+ " \"MaxCandidates\": max_candidates,\n",
+ " \"MaxAutoMLJobRuntimeInSeconds\": 5400,\n",
" },\n",
"}\n",
"\n",
- "input_data_config = [{\n",
- " 'DataSource': {\n",
- " 'S3DataSource': {\n",
- " 'S3DataType': 'S3Prefix',\n",
- " 'S3Uri': '{}'.format(autopilot_train_s3_uri)\n",
- " }\n",
- " },\n",
- " 'TargetAttributeName': 'star_rating'\n",
+ "input_data_config = [\n",
+ " {\n",
+ " \"DataSource\": {\"S3DataSource\": {\"S3DataType\": \"S3Prefix\", \"S3Uri\": \"{}\".format(autopilot_train_s3_uri)}},\n",
+ " \"TargetAttributeName\": \"star_rating\",\n",
" }\n",
"]\n",
"\n",
- "output_data_config = {\n",
- " 'S3OutputPath': '{}'.format(model_output_s3_uri)\n",
- "}"
+ "output_data_config = {\"S3OutputPath\": \"{}\".format(model_output_s3_uri)}"
]
},
{
@@ -266,20 +260,20 @@
"metadata": {},
"outputs": [],
"source": [
- "num_existing_jobs = 0 \n",
+ "num_existing_jobs = 0\n",
"running_jobs = 0\n",
"\n",
- "if 'AutoMLJobSummaries' in existing_jobs_response.keys():\n",
- " job_list = existing_jobs_response['AutoMLJobSummaries']\n",
+ "if \"AutoMLJobSummaries\" in existing_jobs_response.keys():\n",
+ " job_list = existing_jobs_response[\"AutoMLJobSummaries\"]\n",
" num_existing_jobs = len(job_list)\n",
" # print('[INFO] You already created {} Autopilot job(s) in this account.'.format(num_existing_jobs))\n",
" for j in job_list:\n",
- " if 'AutoMLJobStatus' in j.keys(): \n",
- " if j['AutoMLJobStatus'] == 'InProgress':\n",
+ " if \"AutoMLJobStatus\" in j.keys():\n",
+ " if j[\"AutoMLJobStatus\"] == \"InProgress\":\n",
" running_jobs = running_jobs + 1\n",
- " print('[INFO] You have {} Autopilot job(s) currently running << Should be 0 jobs.'.format(running_jobs))\n",
+ " print(\"[INFO] You have {} Autopilot job(s) currently running << Should be 0 jobs.\".format(running_jobs))\n",
"else:\n",
- " print('[OK] Please continue.')"
+ " print(\"[OK] Please continue.\")"
]
},
{
@@ -308,10 +302,10 @@
"\n",
"try:\n",
" auto_ml_job_name\n",
- "except NameError: \n",
- " timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
- " auto_ml_job_name = 'automl-dm-' + timestamp_suffix\n",
- " print('Created AutoMLJobName: ' + auto_ml_job_name)"
+ "except NameError:\n",
+ " timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
+ " auto_ml_job_name = \"automl-dm-\" + timestamp_suffix\n",
+ " print(\"Created AutoMLJobName: \" + auto_ml_job_name)"
]
},
{
@@ -340,19 +334,29 @@
"source": [
"max_running_jobs = 1\n",
"\n",
- "if running_jobs < max_running_jobs: # Limiting to max. 1 Jobs\n",
+ "if running_jobs < max_running_jobs: # Limiting to max. 1 Jobs\n",
" try:\n",
- " sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,\n",
- " InputDataConfig=input_data_config,\n",
- " OutputDataConfig=output_data_config,\n",
- " AutoMLJobConfig=job_config,\n",
- " RoleArn=role)\n",
- " print('[OK] Autopilot Job {} created.'.format(auto_ml_job_name))\n",
+ " sm.create_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name,\n",
+ " InputDataConfig=input_data_config,\n",
+ " OutputDataConfig=output_data_config,\n",
+ " AutoMLJobConfig=job_config,\n",
+ " RoleArn=role,\n",
+ " )\n",
+ " print(\"[OK] Autopilot Job {} created.\".format(auto_ml_job_name))\n",
" running_jobs = running_jobs + 1\n",
" except:\n",
- " print('[INFO] You have already launched an Autopilot job. Please continue see the output of this job.'.format(running_jobs))\n",
+ " print(\n",
+ " \"[INFO] You have already launched an Autopilot job. Please continue see the output of this job.\".format(\n",
+ " running_jobs\n",
+ " )\n",
+ " )\n",
"else:\n",
- " print('[INFO] You have already launched {} Autopilot running job(s). Please continue see the output of the running job.'.format(running_jobs))"
+ " print(\n",
+ " \"[INFO] You have already launched {} Autopilot running job(s). Please continue see the output of the running job.\".format(\n",
+ " running_jobs\n",
+ " )\n",
+ " )"
]
},
{
@@ -392,14 +396,17 @@
"source": [
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
"\n",
- "while 'AutoMLJobStatus' not in job_description_response.keys() and 'AutoMLJobSecondaryStatus' not in job_description_response.keys():\n",
+ "while (\n",
+ " \"AutoMLJobStatus\" not in job_description_response.keys()\n",
+ " and \"AutoMLJobSecondaryStatus\" not in job_description_response.keys()\n",
+ "):\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " print('[INFO] Autopilot Job has not yet started. Please wait. ')\n",
+ " print(\"[INFO] Autopilot Job has not yet started. Please wait. \")\n",
" print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n",
- " print('[INFO] Waiting for Autopilot Job to start...')\n",
+ " print(\"[INFO] Waiting for Autopilot Job to start...\")\n",
" sleep(15)\n",
"\n",
- "print('[OK] AutoMLJob started.')"
+ "print(\"[OK] AutoMLJob started.\")"
]
},
{
@@ -419,7 +426,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Processing Jobs'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Jobs'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -441,18 +454,18 @@
"source": [
"%%time\n",
"\n",
- "job_status = job_description_response['AutoMLJobStatus']\n",
- "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ "job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
"\n",
- "if job_status not in ('Stopped', 'Failed'):\n",
- " while job_status in ('InProgress') and job_sec_status in ('Starting', 'AnalyzingData'):\n",
+ "if job_status not in (\"Stopped\", \"Failed\"):\n",
+ " while job_status in (\"InProgress\") and job_sec_status in (\"Starting\", \"AnalyzingData\"):\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " job_status = job_description_response['AutoMLJobStatus']\n",
- " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ " job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
" print(job_status, job_sec_status)\n",
" sleep(15)\n",
- " print('[OK] Data analysis phase completed.\\n')\n",
- " \n",
+ " print(\"[OK] Data analysis phase completed.\\n\")\n",
+ "\n",
"print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))"
]
},
@@ -481,14 +494,14 @@
"source": [
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
"\n",
- "while 'AutoMLJobArtifacts' not in job_description_response.keys():\n",
+ "while \"AutoMLJobArtifacts\" not in job_description_response.keys():\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " print('[INFO] Autopilot Job has not yet generated the artifacts. Please wait. ')\n",
+ " print(\"[INFO] Autopilot Job has not yet generated the artifacts. Please wait. \")\n",
" print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n",
- " print('[INFO] Waiting for AutoMLJobArtifacts...')\n",
+ " print(\"[INFO] Waiting for AutoMLJobArtifacts...\")\n",
" sleep(15)\n",
"\n",
- "print('[OK] AutoMLJobArtifacts generated.')"
+ "print(\"[OK] AutoMLJobArtifacts generated.\")"
]
},
{
@@ -499,14 +512,14 @@
"source": [
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
"\n",
- "while 'DataExplorationNotebookLocation' not in job_description_response['AutoMLJobArtifacts'].keys():\n",
+ "while \"DataExplorationNotebookLocation\" not in job_description_response[\"AutoMLJobArtifacts\"].keys():\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " print('[INFO] Autopilot Job has not yet generated the notebooks. Please wait. ')\n",
+ " print(\"[INFO] Autopilot Job has not yet generated the notebooks. Please wait. \")\n",
" print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n",
- " print('[INFO] Waiting for DataExplorationNotebookLocation...')\n",
+ " print(\"[INFO] Waiting for DataExplorationNotebookLocation...\")\n",
" sleep(15)\n",
"\n",
- "print('[OK] DataExplorationNotebookLocation found.') "
+ "print(\"[OK] DataExplorationNotebookLocation found.\")"
]
},
{
@@ -515,9 +528,9 @@
"metadata": {},
"outputs": [],
"source": [
- "generated_resources = job_description_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation']\n",
- "download_path = generated_resources.rsplit('/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb')[0]\n",
- "job_id = download_path.rsplit('/', 1)[-1]"
+ "generated_resources = job_description_response[\"AutoMLJobArtifacts\"][\"DataExplorationNotebookLocation\"]\n",
+ "download_path = generated_resources.rsplit(\"/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb\")[0]\n",
+ "job_id = download_path.rsplit(\"/\", 1)[-1]"
]
},
{
@@ -528,10 +541,16 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "if not job_id: \n",
- " print('No AutoMLJobArtifacts found.')\n",
- "else: \n",
- " display(HTML('Review S3 Generated Resources'.format(bucket, prefix_model_output, auto_ml_job_name, job_id)))"
+ "if not job_id:\n",
+ " print(\"No AutoMLJobArtifacts found.\")\n",
+ "else:\n",
+ " display(\n",
+ " HTML(\n",
+ " 'Review S3 Generated Resources'.format(\n",
+ " bucket, prefix_model_output, auto_ml_job_name, job_id\n",
+ " )\n",
+ " )\n",
+ " )"
]
},
{
@@ -627,7 +646,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Jobs'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Jobs'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -638,7 +663,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Batch Transform Jobs'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Batch Transform Jobs'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -661,19 +692,19 @@
"%%time\n",
"\n",
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- "job_status = job_description_response['AutoMLJobStatus']\n",
- "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ "job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
"print(job_status)\n",
"print(job_sec_status)\n",
- "if job_status not in ('Stopped', 'Failed'):\n",
- " while job_status in ('InProgress') and job_sec_status in ('FeatureEngineering'):\n",
+ "if job_status not in (\"Stopped\", \"Failed\"):\n",
+ " while job_status in (\"InProgress\") and job_sec_status in (\"FeatureEngineering\"):\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " job_status = job_description_response['AutoMLJobStatus']\n",
- " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ " job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
" print(job_status, job_sec_status)\n",
" sleep(15)\n",
- " print('[OK] Feature engineering phase completed.\\n')\n",
- " \n",
+ " print(\"[OK] Feature engineering phase completed.\\n\")\n",
+ "\n",
"print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))"
]
},
@@ -719,7 +750,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Hyperparameter Tuning Jobs'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Hyperparameter Tuning Jobs'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -730,7 +767,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Jobs'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Jobs'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -753,19 +796,19 @@
"%%time\n",
"\n",
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- "job_status = job_description_response['AutoMLJobStatus']\n",
- "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ "job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
"print(job_status)\n",
"print(job_sec_status)\n",
- "if job_status not in ('Stopped', 'Failed'):\n",
- " while job_status in ('InProgress') and job_sec_status in ('ModelTuning'):\n",
+ "if job_status not in (\"Stopped\", \"Failed\"):\n",
+ " while job_status in (\"InProgress\") and job_sec_status in (\"ModelTuning\"):\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " job_status = job_description_response['AutoMLJobStatus']\n",
- " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n",
+ " job_status = job_description_response[\"AutoMLJobStatus\"]\n",
+ " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n",
" print(job_status, job_sec_status)\n",
" sleep(15)\n",
- " print('[OK] Model tuning phase completed.\\n')\n",
- " \n",
+ " print(\"[OK] Model tuning phase completed.\\n\")\n",
+ "\n",
"print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))"
]
},
@@ -794,17 +837,17 @@
"%%time\n",
"\n",
"job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- "job_status = job_description_response['AutoMLJobStatus']\n",
+ "job_status = job_description_response[\"AutoMLJobStatus\"]\n",
"print(job_status)\n",
- "if job_status not in ('Stopped', 'Failed'):\n",
- " while job_status not in ('Completed'):\n",
+ "if job_status not in (\"Stopped\", \"Failed\"):\n",
+ " while job_status not in (\"Completed\"):\n",
" job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " job_status = job_description_response['AutoMLJobStatus']\n",
+ " job_status = job_description_response[\"AutoMLJobStatus\"]\n",
" print(job_status)\n",
" sleep(10)\n",
- " print('[OK] Autopilot Job completed.\\n')\n",
+ " print(\"[OK] Autopilot Job completed.\\n\")\n",
"else:\n",
- " print(job_status)\n"
+ " print(job_status)"
]
},
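The data-analysis, feature-engineering, and model-tuning cells reformatted above all repeat one describe-check-sleep pattern. Factored out, it looks like the sketch below; wait_for_phase is an illustrative name, not part of the notebook, and the sketch assumes sm is the boto3 SageMaker client and sleep is imported from time, as in earlier cells:

    # Sketch: one reusable wait loop for an Autopilot secondary phase.
    def wait_for_phase(sm, job_name, phase, poll_seconds=15):
        """Block until the AutoML job leaves the given secondary phase."""
        while True:
            desc = sm.describe_auto_ml_job(AutoMLJobName=job_name)
            status = desc["AutoMLJobStatus"]
            sec_status = desc["AutoMLJobSecondaryStatus"]
            print(status, sec_status)
            if status != "InProgress" or sec_status != phase:
                return desc
            sleep(poll_seconds)

    # Usage: wait_for_phase(sm, auto_ml_job_name, "AnalyzingData")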
{
@@ -821,8 +864,9 @@
"metadata": {},
"outputs": [],
"source": [
- "candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n",
- " SortBy='FinalObjectiveMetricValue')"
+ "candidates_response = sm.list_candidates_for_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n",
+ ")"
]
},
{
@@ -847,15 +891,16 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'Candidates' not in candidates_response.keys():\n",
- " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n",
- " SortBy='FinalObjectiveMetricValue')\n",
- " print('[INFO] Autopilot Job is generating the Candidates. Please wait.')\n",
+ "while \"Candidates\" not in candidates_response.keys():\n",
+ " candidates_response = sm.list_candidates_for_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n",
+ " )\n",
+ " print(\"[INFO] Autopilot Job is generating the Candidates. Please wait.\")\n",
" print(json.dumps(candidates_response, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "candidates = candidates_response['Candidates']\n",
- "print('[OK] Candidates generated.') "
+ "candidates = candidates_response[\"Candidates\"]\n",
+ "print(\"[OK] Candidates generated.\")"
]
},
{
@@ -873,15 +918,16 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'CandidateName' not in candidates[0]:\n",
- " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n",
- " SortBy='FinalObjectiveMetricValue')\n",
- " candidates = candidates_response['Candidates']\n",
- " print('[INFO] Autopilot Job is generating CandidateName. Please wait. ')\n",
+ "while \"CandidateName\" not in candidates[0]:\n",
+ " candidates_response = sm.list_candidates_for_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n",
+ " )\n",
+ " candidates = candidates_response[\"Candidates\"]\n",
+ " print(\"[INFO] Autopilot Job is generating CandidateName. Please wait. \")\n",
" print(json.dumps(candidates, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "print('[OK] CandidateName generated.')"
+ "print(\"[OK] CandidateName generated.\")"
]
},
{
@@ -890,15 +936,16 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:\n",
- " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n",
- " SortBy='FinalObjectiveMetricValue')\n",
- " candidates = candidates_response['Candidates']\n",
- " print('[INFO] Autopilot Job is generating FinalAutoMLJobObjectiveMetric. Please wait. ')\n",
+ "while \"FinalAutoMLJobObjectiveMetric\" not in candidates[0]:\n",
+ " candidates_response = sm.list_candidates_for_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n",
+ " )\n",
+ " candidates = candidates_response[\"Candidates\"]\n",
+ " print(\"[INFO] Autopilot Job is generating FinalAutoMLJobObjectiveMetric. Please wait. \")\n",
" print(json.dumps(candidates, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "print('[OK] FinalAutoMLJobObjectiveMetric generated.')"
+ "print(\"[OK] FinalAutoMLJobObjectiveMetric generated.\")"
]
},
{
@@ -919,9 +966,13 @@
"outputs": [],
"source": [
"for index, candidate in enumerate(candidates):\n",
- " print(str(index) + \" \" \n",
- " + candidate['CandidateName'] + \" \" \n",
- " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))"
+ " print(\n",
+ " str(index)\n",
+ " + \" \"\n",
+ " + candidate[\"CandidateName\"]\n",
+ " + \" \"\n",
+ " + str(candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"])\n",
+ " )"
]
},
{
@@ -942,8 +993,8 @@
"from sagemaker.analytics import ExperimentAnalytics, TrainingJobAnalytics\n",
"\n",
"exp = ExperimentAnalytics(\n",
- " sagemaker_session=sess, \n",
- " experiment_name=auto_ml_job_name + '-aws-auto-ml-job',\n",
+ " sagemaker_session=sess,\n",
+ " experiment_name=auto_ml_job_name + \"-aws-auto-ml-job\",\n",
")\n",
"\n",
"df = exp.dataframe()\n",
@@ -989,14 +1040,14 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'BestCandidate' not in best_candidate_response:\n",
+ "while \"BestCandidate\" not in best_candidate_response:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " print('[INFO] Autopilot Job is generating BestCandidate. Please wait. ')\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate. Please wait. \")\n",
" print(json.dumps(best_candidate_response, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "best_candidate = best_candidate_response['BestCandidate']\n",
- "print('[OK] BestCandidate generated.') "
+ "best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ "print(\"[OK] BestCandidate generated.\")"
]
},
{
@@ -1025,14 +1076,14 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'CandidateName' not in best_candidate:\n",
+ "while \"CandidateName\" not in best_candidate:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "print('[OK] BestCandidate CandidateName generated.') "
+ "print(\"[OK] BestCandidate CandidateName generated.\")"
]
},
{
@@ -1041,14 +1092,14 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'FinalAutoMLJobObjectiveMetric' not in best_candidate:\n",
+ "while \"FinalAutoMLJobObjectiveMetric\" not in best_candidate:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "print('[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.') "
+ "print(\"[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.\")"
]
},
{
@@ -1057,10 +1108,10 @@
"metadata": {},
"outputs": [],
"source": [
- "best_candidate_identifier = best_candidate['CandidateName']\n",
+ "best_candidate_identifier = best_candidate[\"CandidateName\"]\n",
"print(\"Candidate name: \" + best_candidate_identifier)\n",
- "print(\"Metric name: \" + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])\n",
- "print(\"Metric value: \" + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))"
+ "print(\"Metric name: \" + best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"MetricName\"])\n",
+ "print(\"Metric value: \" + str(best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"]))"
]
},
{
@@ -1087,15 +1138,15 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'CandidateSteps' not in best_candidate:\n",
+ "while \"CandidateSteps\" not in best_candidate:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "best_candidate = best_candidate_response['BestCandidate']\n",
- "print('[OK] BestCandidate CandidateSteps generated.')"
+ "best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ "print(\"[OK] BestCandidate CandidateSteps generated.\")"
]
},
{
@@ -1106,15 +1157,15 @@
},
"outputs": [],
"source": [
- "while 'CandidateStepType' not in best_candidate['CandidateSteps'][0]:\n",
+ "while \"CandidateStepType\" not in best_candidate[\"CandidateSteps\"][0]:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepType. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepType. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "best_candidate = best_candidate_response['BestCandidate']\n",
- "print('[OK] BestCandidate CandidateSteps CandidateStepType generated.')"
+ "best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ "print(\"[OK] BestCandidate CandidateSteps CandidateStepType generated.\")"
]
},
{
@@ -1123,15 +1174,15 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'CandidateStepName' not in best_candidate['CandidateSteps'][0]:\n",
+ "while \"CandidateStepName\" not in best_candidate[\"CandidateSteps\"][0]:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepName. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepName. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "best_candidate = best_candidate_response['BestCandidate']\n",
- "print('[OK] BestCandidate CandidateSteps CandidateStepName generated.')"
+ "best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ "print(\"[OK] BestCandidate CandidateSteps CandidateStepName generated.\")"
]
},
{
@@ -1141,10 +1192,10 @@
"outputs": [],
"source": [
"steps = []\n",
- "for step in best_candidate['CandidateSteps']:\n",
- " print('Candidate Step Type: {}'.format(step['CandidateStepType']))\n",
- " print('Candidate Step Name: {}'.format(step['CandidateStepName']))\n",
- " steps.append(step['CandidateStepName'])"
+ "for step in best_candidate[\"CandidateSteps\"]:\n",
+ " print(\"Candidate Step Type: {}\".format(step[\"CandidateStepType\"]))\n",
+ " print(\"Candidate Step Name: {}\".format(step[\"CandidateStepName\"]))\n",
+ " steps.append(step[\"CandidateStepName\"])"
]
},
{
@@ -1155,7 +1206,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Best Candidate Processing Job'.format(region, steps[0])))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Best Candidate Processing Job'.format(\n",
+ " region, steps[0]\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1166,7 +1223,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Best Candidate Training Job'.format(region, steps[1])))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Best Candidate Training Job'.format(\n",
+ " region, steps[1]\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1177,7 +1240,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Best Candidate Transform Job'.format(region, steps[2])))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Best Candidate Transform Job'.format(\n",
+ " region, steps[2]\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1188,7 +1257,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Best Candidate Training Job (Tuning)'.format(region, steps[3])))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Best Candidate Training Job (Tuning)'.format(\n",
+ " region, steps[3]\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1204,14 +1279,14 @@
"metadata": {},
"outputs": [],
"source": [
- "while 'InferenceContainers' not in best_candidate:\n",
+ "while \"InferenceContainers\" not in best_candidate:\n",
" best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
- " best_candidate = best_candidate_response['BestCandidate']\n",
- " print('[INFO] Autopilot Job is generating BestCandidate InferenceContainers. Please wait. ')\n",
+ " best_candidate = best_candidate_response[\"BestCandidate\"]\n",
+ " print(\"[INFO] Autopilot Job is generating BestCandidate InferenceContainers. Please wait. \")\n",
" print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n",
" sleep(10)\n",
"\n",
- "print('[OK] BestCandidate InferenceContainers generated.') "
+ "print(\"[OK] BestCandidate InferenceContainers generated.\")"
]
},
{
@@ -1220,7 +1295,7 @@
"metadata": {},
"outputs": [],
"source": [
- "best_candidate_containers = best_candidate['InferenceContainers']"
+ "best_candidate_containers = best_candidate[\"InferenceContainers\"]"
]
},
{
@@ -1230,9 +1305,9 @@
"outputs": [],
"source": [
"for container in best_candidate_containers:\n",
- " print(container['Image'])\n",
- " print(container['ModelDataUrl'])\n",
- " print('======================')"
+ " print(container[\"Image\"])\n",
+ " print(container[\"ModelDataUrl\"])\n",
+ " print(\"======================\")"
]
},
{
@@ -1249,8 +1324,8 @@
"outputs": [],
"source": [
"for container in best_candidate_containers:\n",
- " print(container['Environment'])\n",
- " print('======================')"
+ " print(container[\"Environment\"])\n",
+ " print(\"======================\")"
]
},
{
@@ -1259,9 +1334,9 @@
"metadata": {},
"outputs": [],
"source": [
- "best_candidate_containers[1]['Environment'].update({'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label, probability'})\n",
- "best_candidate_containers[2]['Environment'].update({'SAGEMAKER_INFERENCE_INPUT': 'predicted_label, probability'})\n",
- "best_candidate_containers[2]['Environment'].update({'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label, probability'})"
+ "best_candidate_containers[1][\"Environment\"].update({\"SAGEMAKER_INFERENCE_OUTPUT\": \"predicted_label, probability\"})\n",
+ "best_candidate_containers[2][\"Environment\"].update({\"SAGEMAKER_INFERENCE_INPUT\": \"predicted_label, probability\"})\n",
+ "best_candidate_containers[2][\"Environment\"].update({\"SAGEMAKER_INFERENCE_OUTPUT\": \"predicted_label, probability\"})"
]
},
{
@@ -1271,8 +1346,8 @@
"outputs": [],
"source": [
"for container in best_candidate_containers:\n",
- " print(container['Environment'])\n",
- " print('======================')"
+ " print(container[\"Environment\"])\n",
+ " print(\"======================\")"
]
},
{
@@ -1298,7 +1373,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(best_candidate['InferenceContainers'])"
+ "print(best_candidate[\"InferenceContainers\"])"
]
},
{
@@ -1318,10 +1393,10 @@
"source": [
"try:\n",
" autopilot_model_name\n",
- "except NameError: \n",
- " timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
- " autopilot_model_name = 'automl-dm-model-' + timestamp_suffix\n",
- " print('[OK] Created Autopilot Model Name: ' + autopilot_model_name)"
+ "except NameError:\n",
+ " timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
+ " autopilot_model_name = \"automl-dm-model-\" + timestamp_suffix\n",
+ " print(\"[OK] Created Autopilot Model Name: \" + autopilot_model_name)"
]
},
{
@@ -1351,11 +1426,11 @@
"try:\n",
" autopilot_model_arn\n",
"except NameError:\n",
- " create_model_response = sm.create_model(Containers=best_candidate['InferenceContainers'],\n",
- " ModelName=autopilot_model_name,\n",
- " ExecutionRoleArn=role)\n",
- " autopilot_model_arn = create_model_response['ModelArn']\n",
- " print('[OK] Created Autopilot Model: {}'.format(autopilot_model_arn))"
+ " create_model_response = sm.create_model(\n",
+ " Containers=best_candidate[\"InferenceContainers\"], ModelName=autopilot_model_name, ExecutionRoleArn=role\n",
+ " )\n",
+ " autopilot_model_arn = create_model_response[\"ModelArn\"]\n",
+ " print(\"[OK] Created Autopilot Model: {}\".format(autopilot_model_arn))"
]
},
{
@@ -1380,8 +1455,8 @@
"metadata": {},
"outputs": [],
"source": [
- "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
- "epc_name = 'automl-dm-epc-' + timestamp_suffix\n",
+ "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
+ "epc_name = \"automl-dm-epc-\" + timestamp_suffix\n",
"\n",
"print(epc_name)"
]
@@ -1408,13 +1483,13 @@
"metadata": {},
"outputs": [],
"source": [
- "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
+ "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
"\n",
"try:\n",
" autopilot_endpoint_name\n",
- "except NameError: \n",
- " autopilot_endpoint_name = 'automl-dm-ep-' + timestamp_suffix\n",
- " print('[OK] Created Autopilot Endpoint Name {}: '.format(autopilot_endpoint_name)) "
+ "except NameError:\n",
+ " autopilot_endpoint_name = \"automl-dm-ep-\" + timestamp_suffix\n",
+ " print(\"[OK] Created Autopilot Endpoint Name {}: \".format(autopilot_endpoint_name))"
]
},
{
@@ -1423,8 +1498,8 @@
"metadata": {},
"outputs": [],
"source": [
- "variant_name = 'automl-dm-variant-' + timestamp_suffix\n",
- "print('[OK] Created Endpoint Variant Name {}: '.format(variant_name))"
+ "variant_name = \"automl-dm-variant-\" + timestamp_suffix\n",
+ "print(\"[OK] Created Endpoint Variant Name {}: \".format(variant_name))"
]
},
{
@@ -1442,11 +1517,17 @@
"metadata": {},
"outputs": [],
"source": [
- "ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,\n",
- " ProductionVariants=[{'InstanceType':'ml.m5.large',\n",
- " 'InitialInstanceCount': 1,\n",
- " 'ModelName': autopilot_model_name,\n",
- " 'VariantName': variant_name}])"
+ "ep_config = sm.create_endpoint_config(\n",
+ " EndpointConfigName=epc_name,\n",
+ " ProductionVariants=[\n",
+ " {\n",
+ " \"InstanceType\": \"ml.m5.large\",\n",
+ " \"InitialInstanceCount\": 1,\n",
+ " \"ModelName\": autopilot_model_name,\n",
+ " \"VariantName\": variant_name,\n",
+ " }\n",
+ " ],\n",
+ ")"
]
},
{
@@ -1466,10 +1547,9 @@
"source": [
"try:\n",
" autopilot_endpoint_arn\n",
- "except NameError: \n",
- " create_endpoint_response = sm.create_endpoint(EndpointName=autopilot_endpoint_name,\n",
- " EndpointConfigName=epc_name) \n",
- " autopilot_endpoint_arn = create_endpoint_response['EndpointArn']\n",
+ "except NameError:\n",
+ " create_endpoint_response = sm.create_endpoint(EndpointName=autopilot_endpoint_name, EndpointConfigName=epc_name)\n",
+ " autopilot_endpoint_arn = create_endpoint_response[\"EndpointArn\"]\n",
" print(autopilot_endpoint_arn)"
]
},
@@ -1490,7 +1570,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, autopilot_endpoint_name)))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, autopilot_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
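This notebook drives Autopilot entirely through the low-level boto3 client (create_auto_ml_job, describe_auto_ml_job, create_model, create_endpoint_config, create_endpoint). For orientation, the SageMaker Python SDK wraps the same flow in its AutoML class; a rough sketch, assuming sagemaker >= 2.x is installed and reusing role, sess, and autopilot_train_s3_uri from the earlier cells:

    # Sketch: the same Autopilot flow via the high-level SDK instead of raw boto3.
    from sagemaker.automl.automl import AutoML

    automl = AutoML(
        role=role,
        target_attribute_name="star_rating",
        max_candidates=3,
        sagemaker_session=sess,
    )
    automl.fit(inputs=autopilot_train_s3_uri, wait=False, logs=False)  # async, like the notebook

Once the job completes, automl.best_candidate() and automl.deploy(...) cover the model- and endpoint-creation steps performed manually above.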
diff --git a/03_automl/03_Predict_Reviews_Autopilot.ipynb b/03_automl/03_Predict_Reviews_Autopilot.ipynb
index aa9db286..926e0c24 100644
--- a/03_automl/03_Predict_Reviews_Autopilot.ipynb
+++ b/03_automl/03_Predict_Reviews_Autopilot.ipynb
@@ -27,12 +27,12 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -52,12 +52,12 @@
"source": [
"try:\n",
" autopilot_endpoint_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('***************************************************************************')\n",
- " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n",
- " print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n",
- " print('***************************************************************************')"
+ " print(\"***************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n",
+ " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n",
+ " print(\"***************************************************************************\")"
]
},
{
@@ -83,7 +83,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sm.get_waiter('endpoint_in_service').wait(EndpointName=autopilot_endpoint_name)"
+ "sm.get_waiter(\"endpoint_in_service\").wait(EndpointName=autopilot_endpoint_name)"
]
},
{
@@ -93,9 +93,9 @@
"outputs": [],
"source": [
"resp = sm.describe_endpoint(EndpointName=autopilot_endpoint_name)\n",
- "status = resp['EndpointStatus']\n",
+ "status = resp[\"EndpointStatus\"]\n",
"\n",
- "print(\"Arn: \" + resp['EndpointArn'])\n",
+ "print(\"Arn: \" + resp[\"EndpointArn\"])\n",
"print(\"Status: \" + status)"
]
},
@@ -113,7 +113,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sm_runtime = boto3.client('sagemaker-runtime')"
+ "sm_runtime = boto3.client(\"sagemaker-runtime\")"
]
},
{
@@ -124,12 +124,14 @@
"source": [
"csv_line_predict_positive = \"\"\"I loved it!\"\"\"\n",
"\n",
- "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_positive)\n",
+ "response = sm_runtime.invoke_endpoint(\n",
+ " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_positive\n",
+ ")\n",
"\n",
- "response_body = response['Body'].read().decode('utf-8').strip()\n",
+ "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n",
"\n",
- "r = response_body.split(',')\n",
- "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))"
+ "r = response_body.split(\",\")\n",
+ "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))"
]
},
{
@@ -140,12 +142,14 @@
"source": [
"csv_line_predict_meh = \"\"\"It's OK.\"\"\"\n",
"\n",
- "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_meh)\n",
+ "response = sm_runtime.invoke_endpoint(\n",
+ " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_meh\n",
+ ")\n",
"\n",
- "response_body = response['Body'].read().decode('utf-8').strip()\n",
+ "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n",
"\n",
- "r = response_body.split(',')\n",
- "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))"
+ "r = response_body.split(\",\")\n",
+ "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))"
]
},
{
@@ -158,12 +162,14 @@
"source": [
"csv_line_predict_negative = \"\"\"It's pretty good.\"\"\"\n",
"\n",
- "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_negative)\n",
+ "response = sm_runtime.invoke_endpoint(\n",
+ " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_negative\n",
+ ")\n",
"\n",
- "response_body = response['Body'].read().decode('utf-8').strip()\n",
+ "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n",
"\n",
- "r = response_body.split(',')\n",
- "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))"
+ "r = response_body.split(\",\")\n",
+ "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))"
]
},
{
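
The three prediction cells above repeat the same invoke-and-parse sequence. A minimal sketch of a helper that consolidates it, assuming boto3 and an in-service endpoint; `predict_star_rating` is a name introduced here, not part of the notebook:

    import boto3

    sm_runtime = boto3.client("sagemaker-runtime")

    def predict_star_rating(endpoint_name, review_text):
        """Send one CSV line to the endpoint and parse the 'label,probability' reply."""
        response = sm_runtime.invoke_endpoint(
            EndpointName=endpoint_name, ContentType="text/csv", Accept="text/csv", Body=review_text
        )
        label, probability = response["Body"].read().decode("utf-8").strip().split(",")
        return label, float(probability)

    # label, prob = predict_star_rating(autopilot_endpoint_name, "I loved it!")
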
diff --git a/03_automl/generated_module/candidate_data_processors/dpp0.py b/03_automl/generated_module/candidate_data_processors/dpp0.py
index c5d57903..acd57b70 100644
--- a/03_automl/generated_module/candidate_data_processors/dpp0.py
+++ b/03_automl/generated_module/candidate_data_processors/dpp0.py
@@ -7,10 +7,7 @@
# Given a list of column names and target column name, Header can return the index
# for given column name
-HEADER = Header(
- column_names=['star_rating', 'review_body'],
- target_column_name='star_rating'
-)
+HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating")
def build_feature_transform():
@@ -18,35 +15,25 @@ def build_feature_transform():
# These features can be parsed as natural language.
- text = HEADER.as_feature_indices(['review_body'])
+ text = HEADER.as_feature_indices(["review_body"])
text_processors = Pipeline(
steps=[
(
- 'multicolumntfidfvectorizer',
- MultiColumnTfidfVectorizer(
- max_df=0.9941,
- min_df=0.0007,
- analyzer='word',
- max_features=10000
- )
+ "multicolumntfidfvectorizer",
+ MultiColumnTfidfVectorizer(max_df=0.9941, min_df=0.0007, analyzer="word", max_features=10000),
)
]
)
- column_transformer = ColumnTransformer(
- transformers=[('text_processing', text_processors, text)]
- )
+ column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)])
return Pipeline(
- steps=[
- ('column_transformer', column_transformer
- ), ('robuststandardscaler', RobustStandardScaler())
- ]
+ steps=[("column_transformer", column_transformer), ("robuststandardscaler", RobustStandardScaler())]
)
def build_label_transform():
"""Returns the model definition representing feature processing."""
- return RobustLabelEncoder(labels=['1', '2', '3', '4', '5'])
+ return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"])
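
dpp0 relies on SageMaker-internal transformers (MultiColumnTfidfVectorizer, RobustStandardScaler, RobustLabelEncoder). A rough analogue of the same pipeline shape with stock scikit-learn, for orientation only; it is a sketch, not the container's actual implementation:

    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    def build_feature_transform():
        # TfidfVectorizer works on a single text column; column index 1 = review_body.
        text_processor = TfidfVectorizer(max_df=0.9941, min_df=0.0007, analyzer="word", max_features=10000)
        column_transformer = ColumnTransformer(transformers=[("text_processing", text_processor, 1)])
        # with_mean=False keeps the scaler compatible with the sparse TF-IDF output.
        return Pipeline(
            steps=[("column_transformer", column_transformer), ("scaler", StandardScaler(with_mean=False))]
        )

    def build_label_transform():
        return LabelEncoder()
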
diff --git a/03_automl/generated_module/candidate_data_processors/dpp1.py b/03_automl/generated_module/candidate_data_processors/dpp1.py
index d54b46ab..8a1186cd 100644
--- a/03_automl/generated_module/candidate_data_processors/dpp1.py
+++ b/03_automl/generated_module/candidate_data_processors/dpp1.py
@@ -8,10 +8,7 @@
# Given a list of column names and target column name, Header can return the index
# for given column name
-HEADER = Header(
- column_names=['star_rating', 'review_body'],
- target_column_name='star_rating'
-)
+HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating")
def build_feature_transform():
@@ -19,31 +16,24 @@ def build_feature_transform():
# These features can be parsed as natural language.
- text = HEADER.as_feature_indices(['review_body'])
+ text = HEADER.as_feature_indices(["review_body"])
text_processors = Pipeline(
steps=[
(
- 'multicolumntfidfvectorizer',
- MultiColumnTfidfVectorizer(
- max_df=0.99,
- min_df=0.0021,
- analyzer='char_wb',
- max_features=10000
- )
+ "multicolumntfidfvectorizer",
+ MultiColumnTfidfVectorizer(max_df=0.99, min_df=0.0021, analyzer="char_wb", max_features=10000),
)
]
)
- column_transformer = ColumnTransformer(
- transformers=[('text_processing', text_processors, text)]
- )
+ column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)])
return Pipeline(
steps=[
- ('column_transformer',
- column_transformer), ('robustpca', RobustPCA(n_components=5)),
- ('robuststandardscaler', RobustStandardScaler())
+ ("column_transformer", column_transformer),
+ ("robustpca", RobustPCA(n_components=5)),
+ ("robuststandardscaler", RobustStandardScaler()),
]
)
@@ -51,4 +41,4 @@ def build_feature_transform():
def build_label_transform():
"""Returns the model definition representing feature processing."""
- return RobustLabelEncoder(labels=['1', '2', '3', '4', '5'])
+ return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"])
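
dpp1 differs from dpp0 in two ways: the `char_wb` analyzer (character n-grams within word boundaries) and a 5-component RobustPCA step. RobustPCA is SageMaker-internal; in stock scikit-learn, TruncatedSVD is the usual sparse-friendly stand-in, since sklearn's PCA does not accept sparse input. A sketch of those distinguishing steps:

    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline(
        steps=[
            ("tfidf", TfidfVectorizer(max_df=0.99, min_df=0.0021, analyzer="char_wb", max_features=10000)),
            ("svd", TruncatedSVD(n_components=5)),
        ]
    )
    # features = pipeline.fit_transform(["I loved it!", "It's OK.", "Broken on arrival."])
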
diff --git a/03_automl/generated_module/candidate_data_processors/dpp2.py b/03_automl/generated_module/candidate_data_processors/dpp2.py
index 2a52da4a..ef6e3dfd 100644
--- a/03_automl/generated_module/candidate_data_processors/dpp2.py
+++ b/03_automl/generated_module/candidate_data_processors/dpp2.py
@@ -7,10 +7,7 @@
# Given a list of column names and target column name, Header can return the index
# for given column name
-HEADER = Header(
- column_names=['star_rating', 'review_body'],
- target_column_name='star_rating'
-)
+HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating")
def build_feature_transform():
@@ -18,35 +15,25 @@ def build_feature_transform():
# These features can be parsed as natural language.
- text = HEADER.as_feature_indices(['review_body'])
+ text = HEADER.as_feature_indices(["review_body"])
text_processors = Pipeline(
steps=[
(
- 'multicolumntfidfvectorizer',
- MultiColumnTfidfVectorizer(
- max_df=0.9983,
- min_df=0.0005,
- analyzer='word',
- max_features=10000
- )
+ "multicolumntfidfvectorizer",
+ MultiColumnTfidfVectorizer(max_df=0.9983, min_df=0.0005, analyzer="word", max_features=10000),
)
]
)
- column_transformer = ColumnTransformer(
- transformers=[('text_processing', text_processors, text)]
- )
+ column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)])
return Pipeline(
- steps=[
- ('column_transformer', column_transformer
- ), ('robuststandardscaler', RobustStandardScaler())
- ]
+ steps=[("column_transformer", column_transformer), ("robuststandardscaler", RobustStandardScaler())]
)
def build_label_transform():
"""Returns the model definition representing feature processing."""
- return RobustLabelEncoder(labels=['1', '2', '3', '4', '5'])
+ return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"])
diff --git a/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py b/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py
index ae882934..a304708f 100644
--- a/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py
+++ b/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py
@@ -16,35 +16,34 @@
def _is_inverse_label_transform():
"""Returns True if if it's running in inverse label transform."""
- return os.getenv('AUTOML_TRANSFORM_MODE') == 'inverse-label-transform'
+ return os.getenv("AUTOML_TRANSFORM_MODE") == "inverse-label-transform"
def _is_feature_transform():
"""Returns True if it's running in feature transform mode."""
- return os.getenv('AUTOML_TRANSFORM_MODE') == 'feature-transform'
+ return os.getenv("AUTOML_TRANSFORM_MODE") == "feature-transform"
def _get_selected_input_keys():
"""Returns a list of ordered content keys for container's input."""
- return [key.strip().lower() for key in os.environ['SAGEMAKER_INFERENCE_INPUT'].split(',')]
+ return [key.strip().lower() for key in os.environ["SAGEMAKER_INFERENCE_INPUT"].split(",")]
def _get_selected_output_keys():
"""Returns a list of ordered content keys for container's output."""
- return [key.strip().lower() for key in os.environ['SAGEMAKER_INFERENCE_OUTPUT'].split(',')]
+ return [key.strip().lower() for key in os.environ["SAGEMAKER_INFERENCE_OUTPUT"].split(",")]
def _sparsify_if_needed(x):
"""Returns a sparse matrix if the needed for encoding to sparse recordio protobuf."""
- if os.getenv('AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF') == '1' \
- and not sparse.issparse(x):
+ if os.getenv("AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF") == "1" and not sparse.issparse(x):
return sparse.csr_matrix(x)
return x
def _split_features_target(x):
"""Returns the features and target by splitting the input array."""
- if os.getenv('AUTOML_TRANSFORM_MODE') == 'feature-transform':
+ if os.getenv("AUTOML_TRANSFORM_MODE") == "feature-transform":
return _sparsify_if_needed(x), None
if sparse.issparse(x):
@@ -68,7 +67,7 @@ def model_fn(model_dir):
deserialized model object that can be used for model serving
"""
- return load(filename=os.path.join(model_dir, 'model.joblib'))
+ return load(filename=os.path.join(model_dir, "model.joblib"))
def predict_fn(input_object, model):
@@ -101,10 +100,7 @@ def predict_fn(input_object, model):
try:
return model.transform(input_object)
except ValueError as e:
- return worker.Response(
- response='{}'.format(str(e) or 'Unknown error.'),
- status=http_client.BAD_REQUEST
- )
+ return worker.Response(response="{}".format(str(e) or "Unknown error."), status=http_client.BAD_REQUEST)
def _generate_post_processed_response(array, model):
@@ -137,8 +133,9 @@ def _generate_post_processed_response(array, model):
for output_key_idx, output_key in enumerate(output_keys):
if output_key == "predicted_label" and output_key in input_keys:
input_key_idx = input_keys.index(output_key)
- output_array[:, output_key_idx] = model.inverse_label_transform(array[:, input_key_idx]
- .ravel().astype(np.float).astype(np.int))
+ output_array[:, output_key_idx] = model.inverse_label_transform(
+                array[:, input_key_idx].ravel().astype(float).astype(int)
+ )
elif output_key == "labels":
output_array[:, output_key_idx][:] = str(list(model.target_transformer.get_classes()))
elif output_key in input_keys:
@@ -168,11 +165,10 @@ def input_fn(request_body, request_content_type):
decoded data as 2D numpy array
"""
- content_type = request_content_type.lower(
- ) if request_content_type else "text/csv"
+ content_type = request_content_type.lower() if request_content_type else "text/csv"
content_type = content_type.split(";")[0].strip()
- if content_type == 'text/csv':
+ if content_type == "text/csv":
if isinstance(request_body, str):
byte_buffer = request_body.encode()
else:
@@ -182,8 +178,7 @@ def input_fn(request_body, request_content_type):
return val
return worker.Response(
- response=f"'{request_content_type}' is an unsupported content type.",
- status=http_client.UNSUPPORTED_MEDIA_TYPE
+ response=f"'{request_content_type}' is an unsupported content type.", status=http_client.UNSUPPORTED_MEDIA_TYPE
)
@@ -217,20 +212,17 @@ def output_fn(prediction, accept_type):
return worker.Response(
response=encoder_factory[accept_type](prediction, output_keys),
status=http_client.OK,
- mimetype=accept_type
+ mimetype=accept_type,
)
except KeyError:
# Selectable inference is not turned on
- if accept_type == 'text/csv':
+ if accept_type == "text/csv":
return worker.Response(
- response=encoders.encode(prediction, accept_type),
- status=http_client.OK,
- mimetype=accept_type
+ response=encoders.encode(prediction, accept_type), status=http_client.OK, mimetype=accept_type
)
return worker.Response(
- response=f"Accept type '{accept_type}' is not supported "
- f"during inverse label transformation.",
- status=http_client.NOT_ACCEPTABLE
+ response=f"Accept type '{accept_type}' is not supported " f"during inverse label transformation.",
+ status=http_client.NOT_ACCEPTABLE,
)
if isinstance(prediction, tuple):
@@ -238,30 +230,22 @@ def output_fn(prediction, accept_type):
else:
X, y = _split_features_target(prediction)
- if accept_type == 'application/x-recordio-protobuf':
+ if accept_type == "application/x-recordio-protobuf":
return worker.Response(
response=encoders.array_to_recordio_protobuf(
- _sparsify_if_needed(X).astype('float32'),
- y.astype('float32') if y is not None else y
+ _sparsify_if_needed(X).astype("float32"), y.astype("float32") if y is not None else y
),
status=http_client.OK,
- mimetype=accept_type
+ mimetype=accept_type,
)
- if accept_type == 'text/csv':
+ if accept_type == "text/csv":
if y is not None:
- X = np.column_stack(
- (np.ravel(y), X.todense() if sparse.issparse(X) else X)
- )
+ X = np.column_stack((np.ravel(y), X.todense() if sparse.issparse(X) else X))
- return worker.Response(
- response=encoders.encode(X, accept_type),
- status=http_client.OK,
- mimetype=accept_type
- )
+ return worker.Response(response=encoders.encode(X, accept_type), status=http_client.OK, mimetype=accept_type)
return worker.Response(
- response=f"Accept type '{accept_type}' is not supported.",
- status=http_client.NOT_ACCEPTABLE
+ response=f"Accept type '{accept_type}' is not supported.", status=http_client.NOT_ACCEPTABLE
)
@@ -273,16 +257,8 @@ def execution_parameters_fn():
used during inference and defaults to 6MB otherwise.
"""
if _is_feature_transform():
- return worker.Response(
- response='{"MaxPayloadInMB":1}',
- status=http_client.OK,
- mimetype="application/json"
- )
- return worker.Response(
- response='{"MaxPayloadInMB":6}',
- status=http_client.OK,
- mimetype="application/json"
- )
+ return worker.Response(response='{"MaxPayloadInMB":1}', status=http_client.OK, mimetype="application/json")
+ return worker.Response(response='{"MaxPayloadInMB":6}', status=http_client.OK, mimetype="application/json")
def numpy_array_to_csv(array, output_keys):
@@ -358,7 +334,7 @@ def numpy_array_to_jsonlines(array, output_keys):
encoder_factory = {
- 'text/csv': numpy_array_to_csv,
- 'application/json': numpy_array_to_json,
- 'application/jsonlines': numpy_array_to_jsonlines
+ "text/csv": numpy_array_to_csv,
+ "application/json": numpy_array_to_json,
+ "application/jsonlines": numpy_array_to_jsonlines,
}
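
`output_fn` negotiates the response format by looking the Accept type up in `encoder_factory` and falling back on a KeyError. A self-contained sketch of that dict-dispatch pattern with simplified stand-in encoders; every name here is hypothetical, not the container's real code:

    import io
    import json

    import numpy as np

    def to_csv(array):
        buf = io.StringIO()
        np.savetxt(buf, array, delimiter=",", fmt="%s")
        return buf.getvalue()

    def to_json(array):
        return json.dumps({"predictions": array.tolist()})

    def to_jsonlines(array):
        return "\n".join(json.dumps({"row": row}) for row in array.tolist())

    ENCODERS = {"text/csv": to_csv, "application/json": to_json, "application/jsonlines": to_jsonlines}

    def encode(array, accept_type):
        try:
            return ENCODERS[accept_type](array)
        except KeyError:
            raise ValueError(f"Accept type '{accept_type}' is not supported.")

    # encode(np.array([[5, 0.97]]), "application/json")
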
diff --git a/03_automl/generated_module/setup.py b/03_automl/generated_module/setup.py
index c437bef2..215813f9 100644
--- a/03_automl/generated_module/setup.py
+++ b/03_automl/generated_module/setup.py
@@ -1,13 +1,13 @@
from setuptools import setup
setup(
- packages=['candidate_data_processors/'],
- name='candidate_data_processors',
- version='1.0.0',
- description='This module is auto-generated by SageMaker AutoML. '
- 'It contains candidate data processing code and the '
- 'scaffolding to run them in SageMaker.',
- author='Amazon Web Services',
- license='Apache License 2.0',
+ packages=["candidate_data_processors/"],
+ name="candidate_data_processors",
+ version="1.0.0",
+ description="This module is auto-generated by SageMaker AutoML. "
+ "It contains candidate data processing code and the "
+ "scaffolding to run them in SageMaker.",
+ author="Amazon Web Services",
+ license="Apache License 2.0",
include_package_data=True,
)
diff --git a/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb b/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
index 202a3edb..10606107 100644
--- a/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
+++ b/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
@@ -92,6 +92,7 @@
"!aws s3 sync s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/sagemaker-automl-candidates/pr-1-f08b7007254e43bb8d5d10af988f5e93db4f7a6b7f194a32b39f2c0733/notebooks/sagemaker_automl automl-dm-16-23-03-55-artifacts/sagemaker_automl --only-show-errors\n",
"\n",
"import sys\n",
+ "\n",
"sys.path.append(\"automl-dm-16-23-03-55-artifacts\")"
]
},
@@ -114,30 +115,33 @@
"from sagemaker_automl import uid, AutoMLLocalRunConfig\n",
"\n",
"# Where the preprocessed data from the existing AutoML job is stored\n",
- "BASE_AUTOML_JOB_NAME = 'automl-dm-16-23-03-55'\n",
+ "BASE_AUTOML_JOB_NAME = \"automl-dm-16-23-03-55\"\n",
"BASE_AUTOML_JOB_CONFIG = {\n",
- " 'automl_job_name': BASE_AUTOML_JOB_NAME,\n",
- " 'automl_output_s3_base_path': 's3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55',\n",
- " 'data_transformer_image_repo_version': '0.2-1-cpu-py3',\n",
- " 'algo_image_repo_versions': {'xgboost': '1.0-1-cpu-py3'},\n",
- " 'algo_inference_image_repo_versions': {'xgboost': '1.0-1-cpu-py3'}\n",
+ " \"automl_job_name\": BASE_AUTOML_JOB_NAME,\n",
+ " \"automl_output_s3_base_path\": \"s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55\",\n",
+ " \"data_transformer_image_repo_version\": \"0.2-1-cpu-py3\",\n",
+ " \"algo_image_repo_versions\": {\"xgboost\": \"1.0-1-cpu-py3\"},\n",
+ " \"algo_inference_image_repo_versions\": {\"xgboost\": \"1.0-1-cpu-py3\"},\n",
"}\n",
"\n",
"# Path conventions of the output data storage path from the local AutoML job run of this notebook\n",
- "LOCAL_AUTOML_JOB_NAME = 'automl-dm--notebook-run-{}'.format(uid())\n",
+ "LOCAL_AUTOML_JOB_NAME = \"automl-dm--notebook-run-{}\".format(uid())\n",
"LOCAL_AUTOML_JOB_CONFIG = {\n",
- " 'local_automl_job_name': LOCAL_AUTOML_JOB_NAME,\n",
- " 'local_automl_job_output_s3_base_path': 's3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/{}'.format(LOCAL_AUTOML_JOB_NAME),\n",
- " 'data_processing_model_dir': 'data-processor-models',\n",
- " 'data_processing_transformed_output_dir': 'transformed-data',\n",
- " 'multi_algo_tuning_output_dir': 'multi-algo-tuning'\n",
+ " \"local_automl_job_name\": LOCAL_AUTOML_JOB_NAME,\n",
+ " \"local_automl_job_output_s3_base_path\": \"s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/{}\".format(\n",
+ " LOCAL_AUTOML_JOB_NAME\n",
+ " ),\n",
+ " \"data_processing_model_dir\": \"data-processor-models\",\n",
+ " \"data_processing_transformed_output_dir\": \"transformed-data\",\n",
+ " \"multi_algo_tuning_output_dir\": \"multi-algo-tuning\",\n",
"}\n",
"\n",
"AUTOML_LOCAL_RUN_CONFIG = AutoMLLocalRunConfig(\n",
- " role='arn:aws:iam::405759480474:role/mod-caf61d640fbd4ba7-SageMakerExecutionRole-1U3FI8J98QOSN',\n",
+ " role=\"arn:aws:iam::405759480474:role/mod-caf61d640fbd4ba7-SageMakerExecutionRole-1U3FI8J98QOSN\",\n",
" base_automl_job_config=BASE_AUTOML_JOB_CONFIG,\n",
" local_automl_job_config=LOCAL_AUTOML_JOB_CONFIG,\n",
- " security_config={'EnableInterContainerTrafficEncryption': False, 'VpcConfig': {}})\n",
+ " security_config={\"EnableInterContainerTrafficEncryption\": False, \"VpcConfig\": {}},\n",
+ ")\n",
"\n",
"AUTOML_LOCAL_RUN_CONFIG.display()"
]
@@ -194,30 +198,32 @@
"metadata": {},
"outputs": [],
"source": [
- "automl_interactive_runner.select_candidate({\n",
- " \"data_transformer\": {\n",
- " \"name\": \"dpp0\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
- " \"volume_size_in_gb\": 50\n",
- " },\n",
- " \"transform_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
+ "automl_interactive_runner.select_candidate(\n",
+ " {\n",
+ " \"data_transformer\": {\n",
+ " \"name\": \"dpp0\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " \"volume_size_in_gb\": 50,\n",
+ " },\n",
+ " \"transform_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
+ " \"transforms_label\": True,\n",
+ " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n",
+ " \"sparse_encoding\": True,\n",
" },\n",
- " \"transforms_label\": True,\n",
- " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n",
- " \"sparse_encoding\": True\n",
- " },\n",
- " \"algorithm\": {\n",
- " \"name\": \"xgboost\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
+ " \"algorithm\": {\n",
+ " \"name\": \"xgboost\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
" },\n",
" }\n",
- "})"
+ ")"
]
},
{
@@ -234,30 +240,32 @@
"metadata": {},
"outputs": [],
"source": [
- "automl_interactive_runner.select_candidate({\n",
- " \"data_transformer\": {\n",
- " \"name\": \"dpp1\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
- " \"volume_size_in_gb\": 50\n",
+ "automl_interactive_runner.select_candidate(\n",
+ " {\n",
+ " \"data_transformer\": {\n",
+ " \"name\": \"dpp1\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " \"volume_size_in_gb\": 50,\n",
+ " },\n",
+ " \"transform_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
+ " \"transforms_label\": True,\n",
+ " \"transformed_data_format\": \"text/csv\",\n",
+ " \"sparse_encoding\": False,\n",
" },\n",
- " \"transform_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
- " },\n",
- " \"transforms_label\": True,\n",
- " \"transformed_data_format\": \"text/csv\",\n",
- " \"sparse_encoding\": False\n",
- " },\n",
- " \"algorithm\": {\n",
- " \"name\": \"xgboost\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
+ " \"algorithm\": {\n",
+ " \"name\": \"xgboost\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
" },\n",
" }\n",
- "})"
+ ")"
]
},
{
@@ -274,30 +282,32 @@
"metadata": {},
"outputs": [],
"source": [
- "automl_interactive_runner.select_candidate({\n",
- " \"data_transformer\": {\n",
- " \"name\": \"dpp2\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
- " \"volume_size_in_gb\": 50\n",
+ "automl_interactive_runner.select_candidate(\n",
+ " {\n",
+ " \"data_transformer\": {\n",
+ " \"name\": \"dpp2\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " \"volume_size_in_gb\": 50,\n",
+ " },\n",
+ " \"transform_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
+ " \"transforms_label\": True,\n",
+ " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n",
+ " \"sparse_encoding\": True,\n",
" },\n",
- " \"transform_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
- " },\n",
- " \"transforms_label\": True,\n",
- " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n",
- " \"sparse_encoding\": True\n",
- " },\n",
- " \"algorithm\": {\n",
- " \"name\": \"xgboost\",\n",
- " \"training_resource_config\": {\n",
- " \"instance_type\": \"ml.m5.4xlarge\",\n",
- " \"instance_count\": 1,\n",
+ " \"algorithm\": {\n",
+ " \"name\": \"xgboost\",\n",
+ " \"training_resource_config\": {\n",
+ " \"instance_type\": \"ml.m5.4xlarge\",\n",
+ " \"instance_count\": 1,\n",
+ " },\n",
" },\n",
" }\n",
- "})"
+ ")"
]
},
{
@@ -399,14 +409,14 @@
"outputs": [],
"source": [
"ALGORITHM_OBJECTIVE_METRICS = {\n",
- " 'xgboost': 'validation:accuracy',\n",
+ " \"xgboost\": \"validation:accuracy\",\n",
"}\n",
"\n",
"STATIC_HYPERPARAMETERS = {\n",
- " 'xgboost': {\n",
- " 'objective': 'multi:softprob',\n",
- " 'save_model_on_termination': 'true',\n",
- " 'num_class': 5,\n",
+ " \"xgboost\": {\n",
+ " \"objective\": \"multi:softprob\",\n",
+ " \"save_model_on_termination\": \"true\",\n",
+ " \"num_class\": 5,\n",
" },\n",
"}"
]
@@ -427,16 +437,16 @@
"from sagemaker.parameter import CategoricalParameter, ContinuousParameter, IntegerParameter\n",
"\n",
"ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES = {\n",
- " 'xgboost': {\n",
- " 'num_round': IntegerParameter(2, 1024, scaling_type='Logarithmic'),\n",
- " 'max_depth': IntegerParameter(2, 8, scaling_type='Logarithmic'),\n",
- " 'eta': ContinuousParameter(1e-3, 1.0, scaling_type='Logarithmic'),\n",
- " 'gamma': ContinuousParameter(1e-6, 64.0, scaling_type='Logarithmic'),\n",
- " 'min_child_weight': ContinuousParameter(1e-6, 32.0, scaling_type='Logarithmic'),\n",
- " 'subsample': ContinuousParameter(0.5, 1.0, scaling_type='Linear'),\n",
- " 'colsample_bytree': ContinuousParameter(0.3, 1.0, scaling_type='Linear'),\n",
- " 'lambda': ContinuousParameter(1e-6, 2.0, scaling_type='Logarithmic'),\n",
- " 'alpha': ContinuousParameter(1e-6, 2.0, scaling_type='Logarithmic'),\n",
+ " \"xgboost\": {\n",
+ " \"num_round\": IntegerParameter(2, 1024, scaling_type=\"Logarithmic\"),\n",
+ " \"max_depth\": IntegerParameter(2, 8, scaling_type=\"Logarithmic\"),\n",
+ " \"eta\": ContinuousParameter(1e-3, 1.0, scaling_type=\"Logarithmic\"),\n",
+ " \"gamma\": ContinuousParameter(1e-6, 64.0, scaling_type=\"Logarithmic\"),\n",
+ " \"min_child_weight\": ContinuousParameter(1e-6, 32.0, scaling_type=\"Logarithmic\"),\n",
+ " \"subsample\": ContinuousParameter(0.5, 1.0, scaling_type=\"Linear\"),\n",
+ " \"colsample_bytree\": ContinuousParameter(0.3, 1.0, scaling_type=\"Linear\"),\n",
+ " \"lambda\": ContinuousParameter(1e-6, 2.0, scaling_type=\"Logarithmic\"),\n",
+ " \"alpha\": ContinuousParameter(1e-6, 2.0, scaling_type=\"Logarithmic\"),\n",
" },\n",
"}"
]
@@ -463,7 +473,8 @@
"multi_algo_tuning_parameters = automl_interactive_runner.prepare_multi_algo_parameters(\n",
" objective_metrics=ALGORITHM_OBJECTIVE_METRICS,\n",
" static_hyperparameters=STATIC_HYPERPARAMETERS,\n",
- " hyperparameters_search_ranges=ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES)"
+ " hyperparameters_search_ranges=ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES,\n",
+ ")"
]
},
{
@@ -515,8 +526,8 @@
"\n",
"tuner = HyperparameterTuner.create(\n",
" base_tuning_job_name=base_tuning_job_name,\n",
- " strategy='Bayesian',\n",
- " objective_type='Maximize',\n",
+ " strategy=\"Bayesian\",\n",
+ " objective_type=\"Maximize\",\n",
" max_parallel_jobs=7,\n",
" max_jobs=250,\n",
" **multi_algo_tuning_parameters,\n",
@@ -546,7 +557,10 @@
"tuning_job_name = tuner.latest_tuning_job.name\n",
"\n",
"display(\n",
- " Markdown(f\"Tuning Job {tuning_job_name} started, please track the progress from [here](https://{AUTOML_LOCAL_RUN_CONFIG.region}.console.aws.amazon.com/sagemaker/home?region={AUTOML_LOCAL_RUN_CONFIG.region}#/hyper-tuning-jobs/{tuning_job_name})\"))\n",
+ " Markdown(\n",
+ " f\"Tuning Job {tuning_job_name} started, please track the progress from [here](https://{AUTOML_LOCAL_RUN_CONFIG.region}.console.aws.amazon.com/sagemaker/home?region={AUTOML_LOCAL_RUN_CONFIG.region}#/hyper-tuning-jobs/{tuning_job_name})\"\n",
+ " )\n",
+ ")\n",
"\n",
"# Wait for tuning job to finish\n",
"tuner.wait()"
@@ -588,16 +602,14 @@
"SAGEMAKER_SESSION = AUTOML_LOCAL_RUN_CONFIG.sagemaker_session\n",
"SAGEMAKER_ROLE = AUTOML_LOCAL_RUN_CONFIG.role\n",
"\n",
- "tuner_analytics = HyperparameterTuningJobAnalytics(\n",
- " tuner.latest_tuning_job.name, sagemaker_session=SAGEMAKER_SESSION)\n",
+ "tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=SAGEMAKER_SESSION)\n",
"\n",
"df_tuning_job_analytics = tuner_analytics.dataframe()\n",
"\n",
"# Sort the tuning job analytics by the final metrics value\n",
"df_tuning_job_analytics.sort_values(\n",
- " by=['FinalObjectiveValue'],\n",
- " inplace=True,\n",
- " ascending=False if tuner.objective_type == \"Maximize\" else True)\n",
+ " by=[\"FinalObjectiveValue\"], inplace=True, ascending=False if tuner.objective_type == \"Maximize\" else True\n",
+ ")\n",
"\n",
"# Show detailed analytics for the top 20 models\n",
"df_tuning_job_analytics.head(20)"
@@ -661,13 +673,15 @@
"\n",
"# Get a data transformation model from chosen candidate\n",
"best_candidate = automl_interactive_runner.choose_candidate(df_tuning_job_analytics, best_training_job)\n",
- "best_data_transformer_model = best_candidate.get_data_transformer_model(role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION)\n",
+ "best_data_transformer_model = best_candidate.get_data_transformer_model(\n",
+ " role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION\n",
+ ")\n",
"\n",
"# Our first data transformation container will always return recordio-protobuf format\n",
- "best_data_transformer_model.env[\"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT\"] = 'application/x-recordio-protobuf'\n",
+ "best_data_transformer_model.env[\"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT\"] = \"application/x-recordio-protobuf\"\n",
"# Add environment variable for sparse encoding\n",
"if best_candidate.data_transformer_step.sparse_encoding:\n",
- " best_data_transformer_model.env[\"AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF\"] = '1'\n",
+ " best_data_transformer_model.env[\"AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF\"] = \"1\"\n",
"\n",
"# Get a algo model from chosen training job of the candidate\n",
"algo_estimator = Estimator.attach(best_training_job)\n",
@@ -677,22 +691,26 @@
"# inverse label transform model if we need to transform the intermediates back to non-numerical value\n",
"model_containers = [best_data_transformer_model, best_algo_model]\n",
"if best_candidate.transforms_label:\n",
- " model_containers.append(best_candidate.get_data_transformer_model(\n",
- " transform_mode=\"inverse-label-transform\",\n",
- " role=SAGEMAKER_ROLE,\n",
- " sagemaker_session=SAGEMAKER_SESSION))\n",
+ " model_containers.append(\n",
+ " best_candidate.get_data_transformer_model(\n",
+ " transform_mode=\"inverse-label-transform\", role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION\n",
+ " )\n",
+ " )\n",
"\n",
"# This model can emit response ['predicted_label', 'probability', 'labels', 'probabilities']. To enable the model to emit one or more\n",
"# of the response content, pass the keys to `output_key` keyword argument in the select_inference_output method.\n",
"\n",
- "model_containers = select_inference_output(\"MulticlassClassification\", model_containers, output_keys=['predicted_label'])\n",
+ "model_containers = select_inference_output(\n",
+ " \"MulticlassClassification\", model_containers, output_keys=[\"predicted_label\"]\n",
+ ")\n",
"\n",
"\n",
"pipeline_model = PipelineModel(\n",
" name=\"AutoML-{}\".format(AUTOML_LOCAL_RUN_CONFIG.local_automl_job_name),\n",
" role=SAGEMAKER_ROLE,\n",
" models=model_containers,\n",
- " vpc_config=AUTOML_LOCAL_RUN_CONFIG.vpc_config)"
+ " vpc_config=AUTOML_LOCAL_RUN_CONFIG.vpc_config,\n",
+ ")"
]
},
{
@@ -717,10 +735,9 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_model.deploy(initial_instance_count=1,\n",
- " instance_type='ml.m5.2xlarge',\n",
- " endpoint_name=pipeline_model.name,\n",
- " wait=True)"
+ "pipeline_model.deploy(\n",
+ " initial_instance_count=1, instance_type=\"ml.m5.2xlarge\", endpoint_name=pipeline_model.name, wait=True\n",
+ ")"
]
},
{
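
The notebook identifies the winning model through the tuner analytics dataframe; the completed tuning job exposes the same winner directly. A sketch assuming the job has finished and `tuning_job_name` is set as in the cell above:

    import boto3

    sm = boto3.client("sagemaker")

    desc = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
    best = desc["BestTrainingJob"]
    print(best["TrainingJobName"], best["FinalHyperParameterTuningJobObjectiveMetric"]["Value"])
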
diff --git a/03_automl/notebooks/sagemaker_automl/common.py b/03_automl/notebooks/sagemaker_automl/common.py
index ecc21808..f53f81ad 100644
--- a/03_automl/notebooks/sagemaker_automl/common.py
+++ b/03_automl/notebooks/sagemaker_automl/common.py
@@ -20,8 +20,7 @@ def uid():
class AutoMLLocalCandidateStep:
- """Helper class to execute a callable which is decorated with some metadata like name action.
- """
+ """Helper class to execute a callable which is decorated with some metadata like name action."""
def __init__(self, name, action, description=""):
self.name = name
@@ -56,22 +55,15 @@ def execute_steps(execution_name, steps, context, start_jitter_seconds=5):
for step in steps:
sleep(start_jitter_seconds)
thread_name = threading.current_thread().name
- logging.info(
- "[{}:{}]Executing step: {}".format(thread_name, execution_name, step.name)
- )
+ logging.info("[{}:{}]Executing step: {}".format(thread_name, execution_name, step.name))
while True:
try:
step.run(context)
break
except ClientError as e:
- if (
- e.response["Error"]["Code"] == "ThrottlingException"
- and wait_seconds < max_wait_seconds
- ):
- logging.info(
- "We are getting throttled, retrying in {}s".format(wait_seconds)
- )
+ if e.response["Error"]["Code"] == "ThrottlingException" and wait_seconds < max_wait_seconds:
+ logging.info("We are getting throttled, retrying in {}s".format(wait_seconds))
sleep(wait_seconds)
wait_seconds = wait_seconds * 2
continue
@@ -101,22 +93,22 @@ def select_inference_output(problem_type, model_containers, output_keys):
Returns: List of model_containers updated to emit the response
"""
ALLOWED_INVERSE_TRANSFORM_KEYS = {
- 'BinaryClassification': ['predicted_label', 'probability', 'probabilities', 'labels'],
- 'MulticlassClassification': ['predicted_label', 'probability', 'probabilities', 'labels']
+ "BinaryClassification": ["predicted_label", "probability", "probabilities", "labels"],
+ "MulticlassClassification": ["predicted_label", "probability", "probabilities", "labels"],
}
ALLOWED_ALGO_KEYS = {
- 'BinaryClassification': ['predicted_label', 'probability', 'probabilities'],
- 'MulticlassClassification': ['predicted_label', 'probability', 'probabilities']
+ "BinaryClassification": ["predicted_label", "probability", "probabilities"],
+ "MulticlassClassification": ["predicted_label", "probability", "probabilities"],
}
try:
ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]
except KeyError:
- raise ValueError(f'{problem_type} does not support selective inference output.')
+ raise ValueError(f"{problem_type} does not support selective inference output.")
# Either multiclass or binary classification, so the default should be 'predicted_label'
- output_keys = output_keys or ['predicted_label']
+ output_keys = output_keys or ["predicted_label"]
bad_keys = []
algo_keys = []
@@ -130,32 +122,37 @@ def select_inference_output(problem_type, model_containers, output_keys):
algo_keys.append(key.strip())
if len(bad_keys):
- raise ValueError('Requested inference output keys [{}] are unsupported. '
- 'The supported inference keys are [{}]'.format(
- ', '.join(bad_keys), ', '.format(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type])))
-
- model_containers[1].env.update({
- 'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv',
- 'SAGEMAKER_INFERENCE_OUTPUT': ','.join(algo_keys),
- 'SAGEMAKER_INFERENCE_SUPPORTED': ','.join(ALLOWED_ALGO_KEYS[problem_type])
- })
- model_containers[2].env.update({
- 'SAGEMAKER_INFERENCE_OUTPUT': ','.join(transform_keys),
- 'SAGEMAKER_INFERENCE_INPUT': ','.join(algo_keys),
- 'SAGEMAKER_INFERENCE_SUPPORTED': ','.join(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type])
- })
+ raise ValueError(
+ "Requested inference output keys [{}] are unsupported. "
+ "The supported inference keys are [{}]".format(
+ ", ".join(bad_keys), ", ".format(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type])
+ )
+ )
+
+ model_containers[1].env.update(
+ {
+ "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv",
+ "SAGEMAKER_INFERENCE_OUTPUT": ",".join(algo_keys),
+ "SAGEMAKER_INFERENCE_SUPPORTED": ",".join(ALLOWED_ALGO_KEYS[problem_type]),
+ }
+ )
+ model_containers[2].env.update(
+ {
+ "SAGEMAKER_INFERENCE_OUTPUT": ",".join(transform_keys),
+ "SAGEMAKER_INFERENCE_INPUT": ",".join(algo_keys),
+ "SAGEMAKER_INFERENCE_SUPPORTED": ",".join(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]),
+ }
+ )
return model_containers
def get_algo_image_uri(algo_name, region, repo_version):
if algo_name == "xgboost":
- return image_uris.retrieve(algo_name, region=region, version='1.0-1')
+ return image_uris.retrieve(algo_name, region=region, version="1.0-1")
elif algo_name == "mlp":
mlp_image_uri = image_uris.retrieve("linear-learner", region=region, version=repo_version)
- last_slash_index = mlp_image_uri.rfind('/')
- return "{}/{}:{}".format(
- mlp_image_uri[:last_slash_index], "mxnet-algorithms", repo_version
- )
+ last_slash_index = mlp_image_uri.rfind("/")
+ return "{}/{}:{}".format(mlp_image_uri[:last_slash_index], "mxnet-algorithms", repo_version)
else:
return image_uris.retrieve(algo_name, region=region, version=repo_version)
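
`execute_steps` above retries throttled SageMaker calls with a doubling wait. The same pattern extracted as a standalone sketch; `retry_on_throttle` and its defaults are illustrative, not part of the module:

    from time import sleep

    from botocore.exceptions import ClientError

    def retry_on_throttle(fn, wait_seconds=2, max_wait_seconds=64):
        """Call fn(), retrying with capped exponential backoff on ThrottlingException."""
        while True:
            try:
                return fn()
            except ClientError as e:
                if e.response["Error"]["Code"] == "ThrottlingException" and wait_seconds < max_wait_seconds:
                    sleep(wait_seconds)
                    wait_seconds *= 2
                    continue
                raise
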
diff --git a/03_automl/notebooks/sagemaker_automl/config.py b/03_automl/notebooks/sagemaker_automl/config.py
index 20796511..3f7f97d6 100644
--- a/03_automl/notebooks/sagemaker_automl/config.py
+++ b/03_automl/notebooks/sagemaker_automl/config.py
@@ -56,14 +56,10 @@ def __init__(
self.automl_job_name = base_automl_job_config["automl_job_name"]
# the base s3 path where the managed AutoML job stores the intermediates (e.g. data transformation pipeline
# candidate)
- self.automl_output_s3_base_path = base_automl_job_config[
- "automl_output_s3_base_path"
- ]
+ self.automl_output_s3_base_path = base_automl_job_config["automl_output_s3_base_path"]
# Auto ML output job path convention
- self.automl_job_processed_data_path = join(
- self.automl_output_s3_base_path, self.PRE_PROCESSED_DATA_ROOT
- )
+ self.automl_job_processed_data_path = join(self.automl_output_s3_base_path, self.PRE_PROCESSED_DATA_ROOT)
self.automl_job_processed_training_data_path = join(
self.automl_job_processed_data_path, self.PRE_PROCESSED_TRAINING_DATA_PATH
)
@@ -73,17 +69,11 @@ def __init__(
# Auto ML local job config
self.local_automl_job_name = local_automl_job_config["local_automl_job_name"]
- self.local_automl_job_output_s3_base_path = local_automl_job_config[
- "local_automl_job_output_s3_base_path"
- ]
+ self.local_automl_job_output_s3_base_path = local_automl_job_config["local_automl_job_output_s3_base_path"]
# data transformer docker image repo version
- self.data_transformer_image_repo_version = base_automl_job_config[
- "data_transformer_image_repo_version"
- ]
- self.algo_image_repo_versions = base_automl_job_config[
- "algo_image_repo_versions"
- ]
+ self.data_transformer_image_repo_version = base_automl_job_config["data_transformer_image_repo_version"]
+ self.algo_image_repo_versions = base_automl_job_config["algo_image_repo_versions"]
self.algo_inference_image_repo_versions = base_automl_job_config["algo_inference_image_repo_versions"]
@@ -110,19 +100,11 @@ def vpc_config(self):
@property
def subnets(self):
- return (
- self.vpc_config.get("Subnets", None)
- if self.vpc_config is not None
- else None
- )
+ return self.vpc_config.get("Subnets", None) if self.vpc_config is not None else None
@property
def security_group_ids(self):
- return (
- self.vpc_config.get("SecurityGroupIds", None)
- if self.vpc_config is not None
- else None
- )
+ return self.vpc_config.get("SecurityGroupIds", None) if self.vpc_config is not None else None
@property
def encrypt_inter_container_traffic(self):
@@ -187,9 +169,4 @@ def to_html_table(self):
def display(self):
from IPython.display import display, Markdown
- display(
- Markdown(
- "This notebook is initialized to use the following configuration: "
- + self.to_html_table()
- )
- )
+ display(Markdown("This notebook is initialized to use the following configuration: " + self.to_html_table()))
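
The config derives its S3 layout by joining path segments onto the job's base path. If that `join` resolves to `os.path.join`, it would produce backslashes on Windows; `posixpath.join` is a portable way to build S3 URIs. A small sketch (the segment name is illustrative):

    from posixpath import join

    base = "s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55"
    processed = join(base, "preprocessed-data")  # segment name is illustrative
    print(processed)  # s3://.../automl-dm-16-23-03-55/preprocessed-data
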
diff --git a/03_automl/notebooks/sagemaker_automl/interactive_runner.py b/03_automl/notebooks/sagemaker_automl/interactive_runner.py
index d8221603..857f3376 100644
--- a/03_automl/notebooks/sagemaker_automl/interactive_runner.py
+++ b/03_automl/notebooks/sagemaker_automl/interactive_runner.py
@@ -22,9 +22,9 @@
class AutoMLInteractiveRunner:
"""AutoMLInteractiveRunner is an orchestrator that manages the AutoML local run. This includes the following:
- 1. Manages the state of local candidates selection
- 2. Orchestrate multi-algo tuning operations that requires inputs from all candidates.
- 3. Model selection and export of trained estimator to deployable model
+    1. Manages the state of local candidate selection
+    2. Orchestrates multi-algo tuning operations that require inputs from all candidates.
+    3. Selects the best model and exports the trained estimator as a deployable model
"""
def __init__(self, local_run_config, candidates=None):
@@ -74,9 +74,7 @@ def select_candidate(self, candidate_definition):
if candidate_pipeline_name in self.candidates:
logging.info(
- "Warning: pipeline candidate {} has already been selected, replacing".format(
- candidate_pipeline_name
- )
+ "Warning: pipeline candidate {} has already been selected, replacing".format(candidate_pipeline_name)
)
# create candidate
@@ -96,9 +94,7 @@ def fit_data_transformers(self, parallel_jobs=2, start_jitter_seconds=10):
execution_future = {}
- with ThreadPoolExecutor(
- max_workers=parallel_jobs, thread_name_prefix="Worker"
- ) as executor:
+ with ThreadPoolExecutor(max_workers=parallel_jobs, thread_name_prefix="Worker") as executor:
for candidate_pipeline_name, candidate in self.candidates.items():
candidate.prepare_data_transformers_for_training()
@@ -125,27 +121,19 @@ def fit_data_transformers(self, parallel_jobs=2, start_jitter_seconds=10):
while True:
future = next(iterator)
candidate_pipeline_name = execution_future[future]
- success = self._process_data_transformer_future(
- candidate_pipeline_name, future
- )
+ success = self._process_data_transformer_future(candidate_pipeline_name, future)
if success:
success_count += 1
except StopIteration:
- logging.info(
- "Successfully fit {} data transformers".format(success_count)
- )
+ logging.info("Successfully fit {} data transformers".format(success_count))
def _process_data_transformer_future(self, candidate_pipeline_name, future):
try:
future.result()
- logging.info(
- "Successfully fit data transformer for {}".format(
- candidate_pipeline_name
- )
- )
+ logging.info("Successfully fit data transformer for {}".format(candidate_pipeline_name))
self.candidates[candidate_pipeline_name].set_transformer_trained()
return True
except Exception:
@@ -178,14 +166,10 @@ def prepare_multi_algo_parameters(
"""
# Create Estimators
- estimator_kwargs[
- "encrypt_inter_container_traffic"
- ] = self.local_run_config.encrypt_inter_container_traffic
+ estimator_kwargs["encrypt_inter_container_traffic"] = self.local_run_config.encrypt_inter_container_traffic
estimator_kwargs["subnets"] = self.local_run_config.subnets
- estimator_kwargs[
- "security_group_ids"
- ] = self.local_run_config.security_group_ids
+ estimator_kwargs["security_group_ids"] = self.local_run_config.security_group_ids
estimator_kwargs["output_kms_key"] = self.local_run_config.output_kms_key
estimator_kwargs["enable_network_isolation"] = True
@@ -253,15 +237,9 @@ def choose_candidate(self, tuner_analytics_dataframe, multi_algo_training_job_na
tuner_analytics_dataframe["TrainingJobName"] == multi_algo_training_job_name
]
# The TrainingJobDefinitionName is mapped to candidate name
- best_data_processing_pipeline_name = training_job_analytics.iloc[0][
- "TrainingJobDefinitionName"
- ]
+ best_data_processing_pipeline_name = training_job_analytics.iloc[0]["TrainingJobDefinitionName"]
- logging.info(
- "Chosen Data Processing pipeline candidate name is {}".format(
- best_data_processing_pipeline_name
- )
- )
+ logging.info("Chosen Data Processing pipeline candidate name is {}".format(best_data_processing_pipeline_name))
best_candidate = self.candidates[best_data_processing_pipeline_name]
return best_candidate
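
`fit_data_transformers` submits one training per candidate to a ThreadPoolExecutor and then drains the futures. The same submit-then-drain pattern reduced to a runnable sketch; `fit_one` and the candidate names are placeholders:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fit_one(name):
        return f"{name}: trained"

    candidates = ["dpp0-xgboost", "dpp1-xgboost", "dpp2-xgboost"]

    with ThreadPoolExecutor(max_workers=2, thread_name_prefix="Worker") as executor:
        # Map each future back to its candidate name, as the runner does.
        futures = {executor.submit(fit_one, name): name for name in candidates}
        for future in as_completed(futures):
            print(futures[future], "->", future.result())
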
diff --git a/03_automl/notebooks/sagemaker_automl/local_candidate.py b/03_automl/notebooks/sagemaker_automl/local_candidate.py
index ff86cd16..9ff4d754 100644
--- a/03_automl/notebooks/sagemaker_automl/local_candidate.py
+++ b/03_automl/notebooks/sagemaker_automl/local_candidate.py
@@ -17,12 +17,9 @@
class AutoMLLocalCandidate:
- """AutoMLLocalCandidate models an AutoML pipeline consist of data transformer and algo steps
- """
+ """AutoMLLocalCandidate models an AutoML pipeline consist of data transformer and algo steps"""
- def __init__(
- self, candidate_name, data_transformer_step, algo_step, local_run_config
- ):
+ def __init__(self, candidate_name, data_transformer_step, algo_step, local_run_config):
"""
Args:
candidate_name (str): name of the candidate, e.g. `dpp0-xgboost`
@@ -83,7 +80,8 @@ def create(cls, candidate_name, candidate_definition, local_run_config):
repo_version=local_run_config.data_transformer_image_repo_version,
source_module_path=os.path.join(
f"{local_run_config.automl_job_name}-artifacts",
- AutoMLCandidateDataTransformerStep.DEFAULT_SOURCE_MODULE)
+ AutoMLCandidateDataTransformerStep.DEFAULT_SOURCE_MODULE,
+ ),
)
algo_name = candidate_definition["algorithm"]["name"]
@@ -91,12 +89,10 @@ def create(cls, candidate_name, candidate_definition, local_run_config):
**candidate_definition["algorithm"],
region=local_run_config.region,
repo_version=local_run_config.algo_image_repo_versions[algo_name],
- inference_repo_version=local_run_config.algo_inference_image_repo_versions[algo_name]
+ inference_repo_version=local_run_config.algo_inference_image_repo_versions[algo_name],
)
- return AutoMLLocalCandidate(
- candidate_name, data_transformer_step, algo_step, local_run_config
- )
+ return AutoMLLocalCandidate(candidate_name, data_transformer_step, algo_step, local_run_config)
@property
def content_type(self):
@@ -111,9 +107,7 @@ def data_transformer_transformed_data_path(self):
self._check_data_transformer_prepared()
return self._state["data_transformer"]["transform_output_path"]
- def prepare_data_transformers_for_training(
- self, training_job_name=None, transform_job_name=None, **kwargs
- ):
+ def prepare_data_transformers_for_training(self, training_job_name=None, transform_job_name=None, **kwargs):
"""This prepare the data transformers for training:
1. create SKlearn trainer
2. create steps to be executed by runner
@@ -127,9 +121,7 @@ def prepare_data_transformers_for_training(
"""
# add network & security features
- kwargs[
- "encrypt_inter_container_traffic"
- ] = self.local_run_config.encrypt_inter_container_traffic
+ kwargs["encrypt_inter_container_traffic"] = self.local_run_config.encrypt_inter_container_traffic
kwargs["subnets"] = self.local_run_config.subnets
kwargs["security_group_ids"] = self.local_run_config.security_group_ids
@@ -140,25 +132,19 @@ def prepare_data_transformers_for_training(
output_path=self.local_run_config.data_processing_model_s3_root,
role=self.local_run_config.role,
sagemaker_session=self.local_run_config.sagemaker_session,
- **kwargs
+ **kwargs,
)
- training_job_name = (
- training_job_name
- or "{prefix}-{dpp_name}-train-{suffix}".format(
- prefix=self.local_run_config.local_automl_job_name,
- dpp_name=self.data_transformer_step.name,
- suffix=uid(),
- )
+ training_job_name = training_job_name or "{prefix}-{dpp_name}-train-{suffix}".format(
+ prefix=self.local_run_config.local_automl_job_name,
+ dpp_name=self.data_transformer_step.name,
+ suffix=uid(),
)
- transform_job_name = (
- transform_job_name
- or "{prefix}-{dpp_name}-transform-{suffix}".format(
- prefix=self.local_run_config.local_automl_job_name,
- dpp_name=self.data_transformer_step.name,
- suffix=uid(),
- )
+ transform_job_name = transform_job_name or "{prefix}-{dpp_name}-transform-{suffix}".format(
+ prefix=self.local_run_config.local_automl_job_name,
+ dpp_name=self.data_transformer_step.name,
+ suffix=uid(),
)
transform_output_path = "{prefix}/{dpp_name}/{transformed_data_format}".format(
@@ -207,14 +193,9 @@ def set_transformer_trained(self):
self._state["data_transformer"]["trained"] = True
def data_transformer_is_trained(self):
- return (
- "data_transformer" in self._state
- and self._state["data_transformer"]["trained"]
- )
+ return "data_transformer" in self._state and self._state["data_transformer"]["trained"]
- def get_data_transformer_model(
- self, role, sagemaker_session, transform_mode=None, **kwargs
- ):
+ def get_data_transformer_model(self, role, sagemaker_session, transform_mode=None, **kwargs):
"""
Args:
@@ -230,25 +211,18 @@ def get_data_transformer_model(
self._check_data_transformer_prepared()
if not self.data_transformer_is_trained:
- raise AutoMLLocalCandidateNotTrained(
- "AutoML Candidate data transformers has not been trained yet"
- )
+ raise AutoMLLocalCandidateNotTrained("AutoML Candidate data transformers has not been trained yet")
data_transformer_state = self._state["data_transformer"]
trainer = data_transformer_state["trainer"]
training_job_name = data_transformer_state["training_job_name"]
- data_transformer_estimator = trainer.attach(
- training_job_name, sagemaker_session=sagemaker_session
- )
+ data_transformer_estimator = trainer.attach(training_job_name, sagemaker_session=sagemaker_session)
security_config = self.local_run_config.security_config
- if (
- self.local_run_config.security_config is not None
- and "VpcConfig" not in kwargs
- ):
+ if self.local_run_config.security_config is not None and "VpcConfig" not in kwargs:
kwargs.update({"vpc_config": security_config["VpcConfig"]})
return self.data_transformer_step.create_model(
@@ -256,27 +230,21 @@ def get_data_transformer_model(
role=role,
sagemaker_session=sagemaker_session,
transform_mode=transform_mode,
- **kwargs
+ **kwargs,
)
def to_dict(self):
base_dict = {
"pipeline_name": self.candidate_name,
- "data_transformer": {
- "data_processing_module_name": self.data_transformer_step.name
- },
+ "data_transformer": {"data_processing_module_name": self.data_transformer_step.name},
"algorithm": {"algo_name": self.algo_step.algo_name},
}
if "data_transformer" in self._state:
base_dict["data_transformer"].update(
{
- "training_job_name": self._state["data_transformer"][
- "training_job_name"
- ],
- "transform_job_name": self._state["data_transformer"][
- "transform_job_name"
- ],
+ "training_job_name": self._state["data_transformer"]["training_job_name"],
+ "transform_job_name": self._state["data_transformer"]["transform_job_name"],
}
)
diff --git a/03_automl/notebooks/sagemaker_automl/steps.py b/03_automl/notebooks/sagemaker_automl/steps.py
index ec243529..3128fce1 100644
--- a/03_automl/notebooks/sagemaker_automl/steps.py
+++ b/03_automl/notebooks/sagemaker_automl/steps.py
@@ -15,21 +15,27 @@ class AutoMLCandidateAlgoStep:
and `mlp`.
"""
- def __init__(self, name, training_resource_config, region, repo_version, inference_repo_version,
- candidate_specific_static_hyperparameters=None):
+ def __init__(
+ self,
+ name,
+ training_resource_config,
+ region,
+ repo_version,
+ inference_repo_version,
+ candidate_specific_static_hyperparameters=None,
+ ):
self.algo_name = name
self.training_resource_config = training_resource_config
- self.candidate_specific_static_hps = candidate_specific_static_hyperparameters \
- if candidate_specific_static_hyperparameters else {}
+ self.candidate_specific_static_hps = (
+ candidate_specific_static_hyperparameters if candidate_specific_static_hyperparameters else {}
+ )
self.region = region
self.repo_version = repo_version
self.algo_image_uri = get_algo_image_uri(self.algo_name, region, repo_version)
self.algo_inference_image_uri = get_algo_image_uri(self.algo_name, region, inference_repo_version)
- def create_estimator(
- self, role, output_path, hyperparameters, sagemaker_session, **kwargs
- ):
+ def create_estimator(self, role, output_path, hyperparameters, sagemaker_session, **kwargs):
estimator = Estimator(
self.algo_image_uri,
@@ -47,13 +53,11 @@ def create_estimator(
def get_inference_container_config(self):
config = {
- 'env': {
- 'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv'
- },
- 'image_uri': self.algo_inference_image_uri
+ "env": {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"},
+ "image_uri": self.algo_inference_image_uri,
}
- if self.algo_name == 'mlp':
- config['env']['ML_APPLICATION'] = 'mlp'
+ if self.algo_name == "mlp":
+ config["env"]["ML_APPLICATION"] = "mlp"
return config
@@ -107,7 +111,7 @@ def __init__(
# We share registry account id with all framework container
xgb_image_uri = image_uris.retrieve("xgboost", region=region, version="1.0-1")
- last_slash_index = xgb_image_uri.rfind('/')
+ last_slash_index = xgb_image_uri.rfind("/")
self.transformer_image_uri = "{}/{}:{}".format(
xgb_image_uri[:last_slash_index], "sagemaker-sklearn-automl", repo_version
)
@@ -198,13 +202,9 @@ def create_steps(
def _train_transform(context):
_trainer = context.get("trainer")
- training_data_input_path = (
- local_run_config.automl_job_processed_training_data_path
- )
+ training_data_input_path = local_run_config.automl_job_processed_training_data_path
return _trainer.fit(
- {
- AutoMLCandidateDataTransformerStep.TRAIN_CHANNEL_NAME: training_data_input_path
- },
+ {AutoMLCandidateDataTransformerStep.TRAIN_CHANNEL_NAME: training_data_input_path},
job_name=training_job_name,
wait=True,
logs=False,
@@ -227,7 +227,7 @@ def _create_transformer(context):
accept=self.content_type,
env=transform_env,
volume_kms_key=local_run_config.volume_kms_key,
- output_kms_key=local_run_config.output_kms_key
+ output_kms_key=local_run_config.output_kms_key,
)
context["transformer"] = transformer
@@ -262,9 +262,7 @@ def _transform_data(context):
),
]
- def create_model(
- self, estimator, role, sagemaker_session, transform_mode, **kwargs
- ):
+ def create_model(self, estimator, role, sagemaker_session, transform_mode, **kwargs):
"""Create a deployable data transformer model
Args:
estimator: an estimator attached from trainer
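
Both common.py and steps.py derive container URIs from `sagemaker.image_uris.retrieve` and then swap the repository name after the last slash. A sketch of that derivation, assuming the SageMaker Python SDK v2; the printed URI is only an example, and the registry account varies by region:

    from sagemaker import image_uris

    xgb_image_uri = image_uris.retrieve("xgboost", region="us-east-1", version="1.0-1")
    print(xgb_image_uri)  # e.g. <account>.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3
    repo_root = xgb_image_uri[: xgb_image_uri.rfind("/")]
    print("{}/{}:{}".format(repo_root, "sagemaker-sklearn-automl", "0.2-1-cpu-py3"))
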
diff --git a/04_ingest/01_Copy_TSV_To_S3.ipynb b/04_ingest/01_Copy_TSV_To_S3.ipynb
index b7b05a62..ae42c5e4 100644
--- a/04_ingest/01_Copy_TSV_To_S3.ipynb
+++ b/04_ingest/01_Copy_TSV_To_S3.ipynb
@@ -87,9 +87,9 @@
"try:\n",
" setup_instance_check_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -119,9 +119,9 @@
"try:\n",
" setup_dependencies_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -149,11 +149,11 @@
"outputs": [],
"source": [
"try:\n",
- " setup_s3_bucket_passed \n",
+ " setup_s3_bucket_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -181,11 +181,11 @@
"outputs": [],
"source": [
"try:\n",
- " setup_iam_roles_passed \n",
+ " setup_iam_roles_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -204,21 +204,21 @@
"outputs": [],
"source": [
"if not setup_instance_check_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_dependencies_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_s3_bucket_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"if not setup_iam_roles_passed:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -231,13 +231,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -253,7 +253,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'"
+ "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\""
]
},
{
@@ -278,7 +278,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n",
+ "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n",
"print(s3_private_path_tsv)"
]
},
@@ -350,7 +350,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Bucket'.format(region, account_id, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Bucket'.format(\n",
+ " region, account_id, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -427,7 +433,7 @@
"# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Digital_Software_v1_00.tsv.gz\"\n",
"# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\"\n",
"# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Gift_Card_v1_00.tsv.gz\"\n",
- "# !aws s3 ls --recursive s3://dsoaws/$step_prefix/\n"
+ "# !aws s3 ls --recursive s3://dsoaws/$step_prefix/"
]
}
],
diff --git a/04_ingest/02_Create_Athena_Database.ipynb b/04_ingest/02_Create_Athena_Database.ipynb
index 1d60b09e..08948b79 100644
--- a/04_ingest/02_Create_Athena_Database.ipynb
+++ b/04_ingest/02_Create_Athena_Database.ipynb
@@ -29,7 +29,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -62,10 +62,10 @@
"try:\n",
" s3_public_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -95,10 +95,10 @@
"try:\n",
" s3_private_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -141,7 +141,7 @@
"metadata": {},
"outputs": [],
"source": [
- "database_name = 'dsoaws'"
+ "database_name = \"dsoaws\""
]
},
{
@@ -160,7 +160,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -178,7 +178,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'CREATE DATABASE IF NOT EXISTS {}'.format(database_name)\n",
+ "statement = \"CREATE DATABASE IF NOT EXISTS {}\".format(database_name)\n",
"print(statement)"
]
},
@@ -189,6 +189,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"pd.read_sql(statement, conn)"
]
},
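[Editor's note] Neither hunk shows where `conn` comes from; in these notebooks it is typically a PyAthena connection built from the staging directory set above. A minimal sketch, assuming the `pyathena` package and the `region` / `s3_staging_dir` variables defined earlier (the actual cell is outside this diff):

    from pyathena import connect

    # Hypothetical reconstruction of the elided cell: a DB-API connection
    # that pd.read_sql() can use; results are staged under s3_staging_dir.
    conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)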
@@ -205,7 +206,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW DATABASES'\n",
+ "statement = \"SHOW DATABASES\"\n",
"\n",
"df_show = pd.read_sql(statement, conn)\n",
"df_show.head(5)"
diff --git a/04_ingest/03_Register_S3_TSV_With_Athena.ipynb b/04_ingest/03_Register_S3_TSV_With_Athena.ipynb
index 55b04ebc..6d3eab6f 100644
--- a/04_ingest/03_Register_S3_TSV_With_Athena.ipynb
+++ b/04_ingest/03_Register_S3_TSV_With_Athena.ipynb
@@ -31,7 +31,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -64,9 +64,9 @@
"try:\n",
" ingest_create_athena_db_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -85,11 +85,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_db_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]') "
+ " print(\"[OK]\")"
]
},
{
@@ -110,10 +110,10 @@
"try:\n",
" s3_private_path_tsv\n",
"except NameError:\n",
- " print('*****************************************************************************')\n",
- " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n",
- " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n",
- " print('*****************************************************************************')"
+ " print(\"*****************************************************************************\")\n",
+ " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n",
+ " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n",
+ " print(\"*****************************************************************************\")"
]
},
{
@@ -179,7 +179,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -189,8 +189,8 @@
"outputs": [],
"source": [
"# Set Athena parameters\n",
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'"
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\""
]
},
{
@@ -226,7 +226,9 @@
" review_body string,\n",
" review_date string\n",
") ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\\\t' LINES TERMINATED BY '\\\\n' LOCATION '{}'\n",
- "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(database_name, table_name_tsv, s3_private_path_tsv)\n",
+ "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(\n",
+ " database_name, table_name_tsv, s3_private_path_tsv\n",
+ ")\n",
"\n",
"print(statement)"
]
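[Editor's note] The DDL above only registers metadata; Athena reads the gzipped TSVs in place, skipping each file's header row per the TBLPROPERTIES. One way to confirm the registration outside Athena is to query the Glue Data Catalog directly — a sketch, assuming only the variables defined above:

    import boto3

    # Athena tables live in the Glue Data Catalog, so get_table() should
    # return the schema and S3 location once the DDL has run.
    glue = boto3.client("glue")
    table = glue.get_table(DatabaseName=database_name, Name=table_name_tsv)
    print(table["Table"]["StorageDescriptor"]["Location"])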
@@ -238,6 +240,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"pd.read_sql(statement, conn)"
]
},
@@ -254,7 +257,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW TABLES in {}'.format(database_name)\n",
+ "statement = \"SHOW TABLES in {}\".format(database_name)\n",
"\n",
"df_show = pd.read_sql(statement, conn)\n",
"df_show.head(5)"
@@ -292,10 +295,12 @@
"metadata": {},
"outputs": [],
"source": [
- "product_category = 'Digital_Software'\n",
+ "product_category = \"Digital_Software\"\n",
"\n",
"statement = \"\"\"SELECT * FROM {}.{}\n",
- " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_tsv, product_category)\n",
+ " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n",
+ " database_name, table_name_tsv, product_category\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -317,11 +322,11 @@
"outputs": [],
"source": [
"if not df.empty:\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"else:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -339,7 +344,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review AWS Glue Catalog'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review AWS Glue Catalog'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb b/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
index edb39de0..799d3123 100644
--- a/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
+++ b/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb
@@ -30,7 +30,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -63,9 +63,9 @@
"try:\n",
" ingest_create_athena_table_tsv_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -84,11 +84,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_tsv_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -123,12 +123,12 @@
"outputs": [],
"source": [
"# Set S3 path to Parquet data\n",
- "s3_path_parquet = 's3://{}/amazon-reviews-pds/parquet'.format(bucket)\n",
+ "s3_path_parquet = \"s3://{}/amazon-reviews-pds/parquet\".format(bucket)\n",
"\n",
"# Set Athena parameters\n",
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'\n",
- "table_name_parquet = 'amazon_reviews_parquet'"
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\"\n",
+ "table_name_parquet = \"amazon_reviews_parquet\""
]
},
{
@@ -138,7 +138,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -185,7 +185,9 @@
" CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,\n",
" DATE(review_date) AS review_date,\n",
" product_category\n",
- "FROM {}.{}\"\"\".format(database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv)\n",
+ "FROM {}.{}\"\"\".format(\n",
+ " database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -221,7 +223,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'MSCK REPAIR TABLE {}.{}'.format(database_name, table_name_parquet)\n",
+ "statement = \"MSCK REPAIR TABLE {}.{}\".format(database_name, table_name_parquet)\n",
"\n",
"print(statement)"
]
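[Editor's note] MSCK REPAIR TABLE scans the table's S3 location and adds any partition directories (here, product_category=...) to the catalog. A quick way to verify the partitions landed, sketched against the Glue catalog:

    import boto3

    # After MSCK REPAIR, each product_category directory should appear
    # as a partition on the Parquet table.
    glue = boto3.client("glue")
    response = glue.get_partitions(DatabaseName=database_name, TableName=table_name_parquet)
    print([p["Values"] for p in response["Partitions"]])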
@@ -233,6 +235,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"df = pd.read_sql(statement, conn)\n",
"df.head(5)"
]
@@ -250,7 +253,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW PARTITIONS {}.{}'.format(database_name, table_name_parquet)\n",
+ "statement = \"SHOW PARTITIONS {}.{}\".format(database_name, table_name_parquet)\n",
"\n",
"print(statement)"
]
@@ -278,7 +281,7 @@
"metadata": {},
"outputs": [],
"source": [
- "statement = 'SHOW TABLES in {}'.format(database_name)"
+ "statement = \"SHOW TABLES in {}\".format(database_name)"
]
},
{
@@ -323,10 +326,12 @@
"metadata": {},
"outputs": [],
"source": [
- "product_category = 'Digital_Software'\n",
+ "product_category = \"Digital_Software\"\n",
"\n",
"statement = \"\"\"SELECT * FROM {}.{}\n",
- " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_parquet, product_category)\n",
+ " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n",
+ " database_name, table_name_parquet, product_category\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -348,11 +353,11 @@
"outputs": [],
"source": [
"if not df.empty:\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"else:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -370,7 +375,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review AWS Glue Catalog'.format(region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review AWS Glue Catalog'.format(\n",
+ " region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb b/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb
index 960053f1..f7d2794a 100644
--- a/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb
+++ b/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb
@@ -46,9 +46,9 @@
"try:\n",
" ingest_create_athena_table_tsv_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -67,11 +67,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_tsv_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -92,9 +92,9 @@
"try:\n",
" ingest_create_athena_table_parquet_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -113,11 +113,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_parquet_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -136,12 +136,12 @@
"import sagemaker\n",
"import boto3\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -181,11 +181,10 @@
"metadata": {},
"outputs": [],
"source": [
- "path = 's3://{}/amazon-reviews-pds/parquet/'.format(bucket)\n",
- "df_parquet_results = wr.s3.read_parquet(path,\n",
- " columns=['star_rating', 'product_category', 'review_body'],\n",
- " partition_filter=p_filter,\n",
- " dataset=True)\n",
+ "path = \"s3://{}/amazon-reviews-pds/parquet/\".format(bucket)\n",
+ "df_parquet_results = wr.s3.read_parquet(\n",
+ " path, columns=[\"star_rating\", \"product_category\", \"review_body\"], partition_filter=p_filter, dataset=True\n",
+ ")\n",
"df_parquet_results.shape"
]
},
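[Editor's note] `p_filter` is defined in a cell outside this hunk; with awswrangler's `partition_filter` argument it is a predicate over a partition's column values. A plausible definition, shown purely as an illustration (the real cell is not part of this diff):

    # Hypothetical example of the elided p_filter: keep only the
    # Digital_Software partition. awswrangler passes each partition's
    # {column: value} dict to this callable.
    p_filter = lambda x: x["product_category"] == "Digital_Software"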
@@ -226,13 +225,15 @@
"metadata": {},
"outputs": [],
"source": [
- "path = 's3://{}/amazon-reviews-pds/parquet/'.format(bucket)\n",
- "chunk_iter = wr.s3.read_parquet(path,\n",
- " columns=['star_rating', 'product_category', 'review_body'],\n",
- " # filters=[(\"product_category\", \"=\", \"Digital_Software\")],\n",
- " partition_filter=p_filter,\n",
- " dataset=True,\n",
- " chunked=True)"
+ "path = \"s3://{}/amazon-reviews-pds/parquet/\".format(bucket)\n",
+ "chunk_iter = wr.s3.read_parquet(\n",
+ " path,\n",
+ " columns=[\"star_rating\", \"product_category\", \"review_body\"],\n",
+ " # filters=[(\"product_category\", \"=\", \"Digital_Software\")],\n",
+ " partition_filter=p_filter,\n",
+ " dataset=True,\n",
+ " chunked=True,\n",
+ ")"
]
},
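[Editor's note] With `chunked=True`, `wr.s3.read_parquet` returns an iterator of DataFrames rather than one frame, so the result is consumed lazily — for example:

    # Each element is a regular pandas DataFrame; memory stays bounded
    # because only one chunk is materialized at a time.
    for df_chunk in chunk_iter:
        print(df_chunk.shape)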
{
@@ -260,9 +261,9 @@
"metadata": {},
"outputs": [],
"source": [
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'\n",
- "table_name_parquet = 'amazon_reviews_parquet'"
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\"\n",
+ "table_name_parquet = \"amazon_reviews_parquet\""
]
},
{
@@ -272,7 +273,7 @@
"outputs": [],
"source": [
"for table in wr.catalog.get_tables(database=\"dsoaws\"):\n",
- " print(table['Name'])"
+ " print(table[\"Name\"])"
]
},
{
@@ -290,10 +291,7 @@
"outputs": [],
"source": [
"%%time\n",
- "df = wr.athena.read_sql_query(\n",
- " sql='SELECT * FROM {} LIMIT 5000'.format(table_name_parquet),\n",
- " database=database_name\n",
- ")"
+ "df = wr.athena.read_sql_query(sql=\"SELECT * FROM {} LIMIT 5000\".format(table_name_parquet), database=database_name)"
]
},
{
@@ -324,9 +322,9 @@
"%%time\n",
"\n",
"chunk_iter = wr.athena.read_sql_query(\n",
- " sql='SELECT * FROM {} LIMIT 5000'.format(table_name_parquet),\n",
- " database='{}'.format(database_name),\n",
- " chunksize=64_000 # 64 KB Chunks\n",
+ " sql=\"SELECT * FROM {} LIMIT 5000\".format(table_name_parquet),\n",
+ " database=\"{}\".format(database_name),\n",
+ " chunksize=64_000, # 64 KB Chunks\n",
")"
]
},
diff --git a/05_explore/01_Visualize_Reviews_Dataset.ipynb b/05_explore/01_Visualize_Reviews_Dataset.ipynb
index 868bc788..a2b93906 100644
--- a/05_explore/01_Visualize_Reviews_Dataset.ipynb
+++ b/05_explore/01_Visualize_Reviews_Dataset.ipynb
@@ -25,9 +25,9 @@
"try:\n",
" ingest_create_athena_table_parquet_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -46,11 +46,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_parquet_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -97,6 +97,7 @@
"import seaborn as sns\n",
"\n",
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'"
]
@@ -110,7 +111,7 @@
"import sagemaker\n",
"import boto3\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -122,9 +123,9 @@
"metadata": {},
"outputs": [],
"source": [
- "# Set Athena database & table \n",
- "database_name = 'dsoaws'\n",
- "table_name = 'amazon_reviews_parquet'"
+ "# Set Athena database & table\n",
+ "database_name = \"dsoaws\"\n",
+ "table_name = \"amazon_reviews_parquet\""
]
},
{
@@ -143,7 +144,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -168,23 +169,27 @@
"metadata": {},
"outputs": [],
"source": [
- "sns.set_style = 'seaborn-whitegrid'\n",
- "\n",
- "sns.set(rc={\"font.style\":\"normal\",\n",
- " \"axes.facecolor\":\"white\",\n",
- " 'grid.color': '.8',\n",
- " 'grid.linestyle': '-',\n",
- " \"figure.facecolor\":\"white\",\n",
- " \"figure.titlesize\":20,\n",
- " \"text.color\":\"black\",\n",
- " \"xtick.color\":\"black\",\n",
- " \"ytick.color\":\"black\",\n",
- " \"axes.labelcolor\":\"black\",\n",
- " \"axes.grid\":True,\n",
- " 'axes.labelsize':10,\n",
- " 'xtick.labelsize':10,\n",
- " 'font.size':10,\n",
- " 'ytick.labelsize':10})"
+ "sns.set_style = \"seaborn-whitegrid\"\n",
+ "\n",
+ "sns.set(\n",
+ " rc={\n",
+ " \"font.style\": \"normal\",\n",
+ " \"axes.facecolor\": \"white\",\n",
+ " \"grid.color\": \".8\",\n",
+ " \"grid.linestyle\": \"-\",\n",
+ " \"figure.facecolor\": \"white\",\n",
+ " \"figure.titlesize\": 20,\n",
+ " \"text.color\": \"black\",\n",
+ " \"xtick.color\": \"black\",\n",
+ " \"ytick.color\": \"black\",\n",
+ " \"axes.labelcolor\": \"black\",\n",
+ " \"axes.grid\": True,\n",
+ " \"axes.labelsize\": 10,\n",
+ " \"xtick.labelsize\": 10,\n",
+ " \"font.size\": 10,\n",
+ " \"ytick.labelsize\": 10,\n",
+ " }\n",
+ ")"
]
},
{
@@ -205,7 +210,7 @@
" for p in ax.patches:\n",
" _x = p.get_x() + p.get_width() + float(space)\n",
" _y = p.get_y() + p.get_height()\n",
- " value = round(float(p.get_width()),2)\n",
+ " value = round(float(p.get_width()), 2)\n",
" ax.text(_x, _y, value, ha=\"left\")\n",
"\n",
" if isinstance(axs, np.ndarray):\n",
@@ -234,7 +239,9 @@
"FROM {}.{} \n",
"GROUP BY product_category \n",
"ORDER BY avg_star_rating DESC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -281,16 +288,16 @@
"outputs": [],
"source": [
"# Create plot\n",
- "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df, saturation=1)\n",
"\n",
"if num_categories < 10:\n",
- " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
- " \n",
- "# Set title and x-axis ticks \n",
- "plt.title('Average Rating by Product Category')\n",
- "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n",
+ " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
"\n",
- "# Helper code to show actual values afters bars \n",
+ "# Set title and x-axis ticks\n",
+ "plt.title(\"Average Rating by Product Category\")\n",
+ "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
+ "\n",
+ "# Helper code to show actual values afters bars\n",
"show_values_barplot(barplot, 0.1)\n",
"\n",
"plt.xlabel(\"Average Rating\")\n",
@@ -333,7 +340,9 @@
"FROM {}.{}\n",
"GROUP BY product_category \n",
"ORDER BY count_star_rating DESC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -355,10 +364,10 @@
"outputs": [],
"source": [
"# Store counts\n",
- "count_ratings = df['count_star_rating']\n",
+ "count_ratings = df[\"count_star_rating\"]\n",
"\n",
"# Store max ratings\n",
- "max_ratings = df['count_star_rating'].max()\n",
+ "max_ratings = df[\"count_star_rating\"].max()\n",
"print(max_ratings)"
]
},
@@ -376,20 +385,20 @@
"outputs": [],
"source": [
"# Create Seaborn barplot\n",
- "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
"\n",
"if num_categories < 10:\n",
- " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
+ " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
"\n",
"# Set title\n",
"plt.title(\"Number of Ratings per Product Category for Subset of Product Categories\")\n",
"\n",
- "# Set x-axis ticks to match scale \n",
+ "# Set x-axis ticks to match scale\n",
"if max_ratings > 200000:\n",
- " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+ " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
" plt.xlim(0, 20000000)\n",
"elif max_ratings <= 200000:\n",
- " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '150K', '200K'])\n",
+ " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"150K\", \"200K\"])\n",
" plt.xlim(0, 200000)\n",
"\n",
"plt.xlabel(\"Number of Ratings\")\n",
@@ -427,13 +436,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category, MIN(review_date) AS first_review_date\n",
"FROM {}.{}\n",
"GROUP BY product_category\n",
"ORDER BY first_review_date \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -456,7 +467,8 @@
"source": [
"# Convert date strings (e.g. 2014-10-18) to datetime\n",
"import datetime as datetime\n",
- "dates = pd.to_datetime(df['first_review_date'])\n"
+ "\n",
+ "dates = pd.to_datetime(df[\"first_review_date\"])"
]
},
{
@@ -467,16 +479,18 @@
"source": [
"# See: https://stackoverflow.com/questions/60761410/how-to-graph-events-on-a-timeline\n",
"\n",
+ "\n",
"def modify_dataframe(df):\n",
" \"\"\" Modify dataframe to include new columns \"\"\"\n",
- " df['year'] = pd.to_datetime(df['first_review_date'], format='%Y-%m-%d').dt.year\n",
+ " df[\"year\"] = pd.to_datetime(df[\"first_review_date\"], format=\"%Y-%m-%d\").dt.year\n",
" return df\n",
"\n",
+ "\n",
"def get_x_y(df):\n",
" \"\"\" Get X and Y coordinates; return tuple \"\"\"\n",
- " series = df['year'].value_counts().sort_index()\n",
+ " series = df[\"year\"].value_counts().sort_index()\n",
" # new_series = series.reindex(range(1,21)).fillna(0).astype(int)\n",
- " return series.index, series.values\n"
+ " return series.index, series.values"
]
},
{
@@ -504,20 +518,20 @@
"metadata": {},
"outputs": [],
"source": [
- "fig = plt.figure(figsize=(12,5))\n",
+ "fig = plt.figure(figsize=(12, 5))\n",
"ax = plt.gca()\n",
"\n",
- "ax.set_title('Number Of First Product Category Reviews Per Year for Subset of Categories')\n",
- "ax.set_xlabel('Year')\n",
- "ax.set_ylabel('Count')\n",
+ "ax.set_title(\"Number Of First Product Category Reviews Per Year for Subset of Categories\")\n",
+ "ax.set_xlabel(\"Year\")\n",
+ "ax.set_ylabel(\"Count\")\n",
"\n",
"ax.plot(X, Y, color=\"black\", linewidth=2, marker=\"o\")\n",
- "ax.fill_between(X, [0]*len(X), Y, facecolor='lightblue')\n",
+ "ax.fill_between(X, [0] * len(X), Y, facecolor=\"lightblue\")\n",
"\n",
"ax.locator_params(integer=True)\n",
"\n",
"ax.set_xticks(range(1995, 2016, 1))\n",
- "ax.set_yticks(range(0, max(Y)+2, 1))\n",
+ "ax.set_yticks(range(0, max(Y) + 2, 1))\n",
"\n",
"plt.xticks(rotation=45)\n",
"\n",
@@ -548,7 +562,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category,\n",
" star_rating,\n",
@@ -556,7 +570,9 @@
"FROM {}.{}\n",
"GROUP BY product_category, star_rating\n",
"ORDER BY product_category ASC, star_rating DESC, count_reviews\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -585,14 +601,14 @@
"outputs": [],
"source": [
"# Create grouped DataFrames by category and by star rating\n",
- "grouped_category = df.groupby('product_category')\n",
- "grouped_star = df.groupby('star_rating')\n",
+ "grouped_category = df.groupby(\"product_category\")\n",
+ "grouped_star = df.groupby(\"star_rating\")\n",
"\n",
"# Create sum of ratings per star rating\n",
- "df_sum = df.groupby(['star_rating']).sum()\n",
+ "df_sum = df.groupby([\"star_rating\"]).sum()\n",
"\n",
"# Calculate total number of star ratings\n",
- "total = df_sum['count_reviews'].sum()\n",
+ "total = df_sum[\"count_reviews\"].sum()\n",
"print(total)"
]
},
@@ -605,17 +621,17 @@
"# Create dictionary of product categories and array of star rating distribution per category\n",
"distribution = {}\n",
"count_reviews_per_star = []\n",
- "i=0\n",
- " \n",
+ "i = 0\n",
+ "\n",
"for category, ratings in grouped_category:\n",
" count_reviews_per_star = []\n",
- " for star in ratings['star_rating']:\n",
- " count_reviews_per_star.append(ratings.at[i, 'count_reviews'])\n",
- " i=i+1;\n",
+ " for star in ratings[\"star_rating\"]:\n",
+ " count_reviews_per_star.append(ratings.at[i, \"count_reviews\"])\n",
+ " i = i + 1\n",
" distribution[category] = count_reviews_per_star\n",
"\n",
"# Check if distribution has been created succesfully\n",
- "print(distribution)\n"
+ "print(distribution)"
]
},
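[Editor's note] The running index `i` works only because the query sorted rows by category and star rating, so each group's rows are contiguous. A sturdier equivalent that avoids the global row index might read (a sketch, preserving the within-group star order):

    # Same dict, built per group; groupby preserves the frame's existing
    # row order inside each category.
    distribution = {
        category: ratings["count_reviews"].tolist()
        for category, ratings in df.groupby("product_category")
    }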
{
@@ -654,8 +670,8 @@
"# Sort distribution by highest average rating per category\n",
"sorted_distribution = {}\n",
"\n",
- "average_star_ratings.iloc[:,0]\n",
- "for index, value in average_star_ratings.iloc[:,0].items():\n",
+ "average_star_ratings.iloc[:, 0]\n",
+ "for index, value in average_star_ratings.iloc[:, 0].items():\n",
" sorted_distribution[value] = distribution[value]"
]
},
@@ -716,7 +732,7 @@
"proportion_star5 = np.true_divide(star5, total) * 100\n",
"\n",
"# Add colors\n",
- "colors = ['red', 'purple','blue','orange','green']\n",
+ "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n",
"\n",
"# The position of the bars on the x-axis\n",
"r = range(len(categories))\n",
@@ -724,21 +740,53 @@
"\n",
"# Plot bars\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
- "\n",
- "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n",
- "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n",
- "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n",
- "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n",
- "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
+ "\n",
+ "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n",
+ "ax4 = plt.barh(\n",
+ " r,\n",
+ " proportion_star4,\n",
+ " left=proportion_star5,\n",
+ " color=colors[3],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"4-Star Ratings\",\n",
+ ")\n",
+ "ax3 = plt.barh(\n",
+ " r,\n",
+ " proportion_star3,\n",
+ " left=proportion_star5 + proportion_star4,\n",
+ " color=colors[2],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"3-Star Ratings\",\n",
+ ")\n",
+ "ax2 = plt.barh(\n",
+ " r,\n",
+ " proportion_star2,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3,\n",
+ " color=colors[1],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"2-Star Ratings\",\n",
+ ")\n",
+ "ax1 = plt.barh(\n",
+ " r,\n",
+ " proportion_star1,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n",
+ " color=colors[0],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"1-Star Ratings\",\n",
+ ")\n",
"\n",
- "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n",
- "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n",
- "plt.yticks(r, categories, fontweight='regular')\n",
+ "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n",
+ "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n",
+ "plt.yticks(r, categories, fontweight=\"regular\")\n",
"\n",
- "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n",
+ "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n",
"plt.gca().invert_yaxis()\n",
"plt.tight_layout()\n",
"\n",
@@ -769,14 +817,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT star_rating,\n",
" COUNT(*) AS count_reviews\n",
"FROM dsoaws.amazon_reviews_parquet\n",
"GROUP BY star_rating\n",
"ORDER BY star_rating DESC, count_reviews \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -807,15 +857,12 @@
"metadata": {},
"outputs": [],
"source": [
- "chart = df.plot.bar(x='star_rating', \n",
- " y='count_reviews', \n",
- " rot='0',\n",
- " figsize=(10,5), \n",
- " title='Review Count by Star Ratings', \n",
- " legend=False)\n",
+ "chart = df.plot.bar(\n",
+ " x=\"star_rating\", y=\"count_reviews\", rot=\"0\", figsize=(10, 5), title=\"Review Count by Star Ratings\", legend=False\n",
+ ")\n",
"\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")\n",
"\n",
"plt.show(chart)"
]
@@ -852,13 +899,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT year, ROUND(AVG(star_rating),4) AS avg_rating\n",
"FROM {}.{}\n",
"GROUP BY year\n",
"ORDER BY year\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -879,7 +928,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year\n"
+ "df[\"year\"] = pd.to_datetime(df[\"year\"], format=\"%Y\").dt.year"
]
},
{
@@ -896,21 +945,21 @@
"outputs": [],
"source": [
"fig = plt.gcf()\n",
- "fig.set_size_inches(12,5)\n",
+ "fig.set_size_inches(12, 5)\n",
"\n",
- "fig.suptitle('Average Star Rating Over Time (Across Subset of Product Categories)')\n",
+ "fig.suptitle(\"Average Star Rating Over Time (Across Subset of Product Categories)\")\n",
"\n",
"ax = plt.gca()\n",
- "#ax = plt.gca().set_xticks(df['year'])\n",
+ "# ax = plt.gca().set_xticks(df['year'])\n",
"ax.locator_params(integer=True)\n",
- "ax.set_xticks(df['year'].unique())\n",
+ "ax.set_xticks(df[\"year\"].unique())\n",
"\n",
- "df.plot(kind='line',x='year',y='avg_rating', color='red', ax=ax)\n",
+ "df.plot(kind=\"line\", x=\"year\", y=\"avg_rating\", color=\"red\", ax=ax)\n",
"\n",
- "#plt.xticks(range(1995, 2016, 1))\n",
- "#plt.yticks(range(0,6,1))\n",
- "plt.xlabel('Years')\n",
- "plt.ylabel('Average Star Rating')\n",
+ "# plt.xticks(range(1995, 2016, 1))\n",
+ "# plt.yticks(range(0,6,1))\n",
+ "plt.xlabel(\"Years\")\n",
+ "plt.ylabel(\"Average Star Rating\")\n",
"plt.xticks(rotation=45)\n",
"\n",
"# fig.savefig('average-rating.png', dpi=300)\n",
@@ -940,13 +989,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_category, year, ROUND(AVG(star_rating), 4) AS avg_rating_category\n",
"FROM {}.{}\n",
"GROUP BY product_category, year\n",
"ORDER BY year \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -975,11 +1026,20 @@
"outputs": [],
"source": [
"def plot_categories(df):\n",
- " df_categories = df['product_category'].unique()\n",
+ " df_categories = df[\"product_category\"].unique()\n",
" for category in df_categories:\n",
" # print(category)\n",
- " df_plot = df.loc[df['product_category'] == category]\n",
- " df_plot.plot(kind='line',x='year',y='avg_rating_category', c=np.random.rand(3,), ax=ax, label=category)"
+ " df_plot = df.loc[df[\"product_category\"] == category]\n",
+ " df_plot.plot(\n",
+ " kind=\"line\",\n",
+ " x=\"year\",\n",
+ " y=\"avg_rating_category\",\n",
+ " c=np.random.rand(\n",
+ " 3,\n",
+ " ),\n",
+ " ax=ax,\n",
+ " label=category,\n",
+ " )"
]
},
{
@@ -989,19 +1049,19 @@
"outputs": [],
"source": [
"fig = plt.gcf()\n",
- "fig.set_size_inches(12,5)\n",
+ "fig.set_size_inches(12, 5)\n",
+ "\n",
+ "fig.suptitle(\"Average Star Rating Over Time Across Subset Of Categories\")\n",
"\n",
- "fig.suptitle('Average Star Rating Over Time Across Subset Of Categories')\n",
- " \n",
"ax = plt.gca()\n",
"\n",
"ax.locator_params(integer=True)\n",
- "ax.set_xticks(df['year'].unique())\n",
+ "ax.set_xticks(df[\"year\"].unique())\n",
"\n",
"plot_categories(df)\n",
"\n",
- "plt.xlabel('Year')\n",
- "plt.ylabel('Average Star Rating')\n",
+ "plt.xlabel(\"Year\")\n",
+ "plt.ylabel(\"Average Star Rating\")\n",
"plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n",
"\n",
"# fig.savefig('average_rating_category_all_data.png', dpi=300)\n",
@@ -1031,14 +1091,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT star_rating,\n",
" AVG(helpful_votes) AS avg_helpful_votes\n",
"FROM {}.{}\n",
"GROUP BY star_rating\n",
"ORDER BY star_rating ASC\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1076,10 +1138,12 @@
"metadata": {},
"outputs": [],
"source": [
- "chart = df.plot.bar(x='star_rating', y='avg_helpful_votes', rot='0', figsize=(10,5), title='Helpfulness Of Star Ratings', legend=False )\n",
+ "chart = df.plot.bar(\n",
+ " x=\"star_rating\", y=\"avg_helpful_votes\", rot=\"0\", figsize=(10, 5), title=\"Helpfulness Of Star Ratings\", legend=False\n",
+ ")\n",
"\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Average Helpful Votes')\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Average Helpful Votes\")\n",
"\n",
"# chart.get_figure().savefig('helpful-votes.png', dpi=300)\n",
"plt.show(chart)"
@@ -1108,7 +1172,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT product_title,\n",
" helpful_votes,\n",
@@ -1117,7 +1181,9 @@
" SUBSTR(review_body, 1, 100) AS review_body_substr\n",
"FROM {}.{}\n",
"ORDER BY helpful_votes DESC LIMIT 10 \n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1155,7 +1221,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT (CAST(positive_review_count AS DOUBLE) / CAST(negative_review_count AS DOUBLE)) AS positive_to_negative_sentiment_ratio\n",
"FROM (\n",
@@ -1167,7 +1233,9 @@
" FROM {}.{}\n",
" WHERE star_rating < 4\n",
")\n",
- "\"\"\".format(database_name, table_name, database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name, database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1205,7 +1273,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# SQL statement \n",
+ "# SQL statement\n",
"statement = \"\"\"\n",
"SELECT customer_id, product_category, product_title, \n",
"ROUND(AVG(star_rating),4) AS avg_star_rating, COUNT(*) AS review_count \n",
@@ -1214,7 +1282,9 @@
"HAVING COUNT(*) > 1 \n",
"ORDER BY review_count DESC\n",
"LIMIT 5\n",
- "\"\"\".format(database_name, table_name)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -1275,7 +1345,7 @@
"metadata": {},
"outputs": [],
"source": [
- "summary = df['num_words'].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n",
+ "summary = df[\"num_words\"].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n",
"summary"
]
},
@@ -1285,9 +1355,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df['num_words'].plot.hist(xticks=[0, 16, 32, 64, 128, 256], \n",
- " bins=100,\n",
- " range=[0, 256]).axvline(x=summary['80%'], c='red')"
+ "df[\"num_words\"].plot.hist(xticks=[0, 16, 32, 64, 128, 256], bins=100, range=[0, 256]).axvline(\n",
+ " x=summary[\"80%\"], c=\"red\"\n",
+ ")"
]
},
{
diff --git a/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb b/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb
index c1d4cf89..d206f883 100644
--- a/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb
+++ b/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb
@@ -44,7 +44,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -85,10 +85,12 @@
"source": [
"import csv\n",
"\n",
- "df_giftcards = pd.read_csv('./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_giftcards = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_giftcards.shape"
]
},
@@ -109,10 +111,12 @@
"source": [
"import csv\n",
"\n",
- "df_software = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_software = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_software.shape"
]
},
@@ -133,10 +137,12 @@
"source": [
"import csv\n",
"\n",
- "df_videogames = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_videogames = pd.read_csv(\n",
+ " \"./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df_videogames.shape"
]
},
@@ -163,12 +169,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_giftcards[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_giftcards[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -178,12 +187,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_software[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_software[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -193,12 +205,15 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "df_videogames[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n",
- "plt.xlabel('Star Rating')\n",
- "plt.ylabel('Review Count')"
+ "df_videogames[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n",
+ " kind=\"bar\", title=\"Breakdown by Star Rating\"\n",
+ ")\n",
+ "plt.xlabel(\"Star Rating\")\n",
+ "plt.ylabel(\"Review Count\")"
]
},
{
@@ -270,7 +285,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -286,7 +301,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n",
+ "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n",
"df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n",
"df_balanced.shape"
]
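[Editor's note] The one-liner balances the dataset by sampling every (product_category, star_rating) group down to the size of the smallest group. Spelled out, the same operation reads (a sketch with a fixed seed for reproducibility, which the original does not set):

    # Explicit version of the balancing step: find the smallest
    # (category, rating) cell, then downsample every cell to that size.
    n = df.groupby(["product_category", "star_rating"]).size().min()
    df_balanced = (
        df.groupby(["product_category", "star_rating"], group_keys=False)
        .apply(lambda g: g.sample(n, random_state=42))
        .reset_index(drop=True)
    )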
@@ -299,7 +314,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -331,7 +346,7 @@
"metadata": {},
"outputs": [],
"source": [
- "path = './data-clarify/amazon_reviews_us_giftcards_software_videogames.csv'\n",
+ "path = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\"\n",
"df.to_csv(path, index=False, header=True)"
]
},
@@ -357,7 +372,7 @@
"metadata": {},
"outputs": [],
"source": [
- "path_balanced = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv'\n",
+ "path_balanced = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv\"\n",
"df_balanced.to_csv(path_balanced, index=False, header=True)"
]
},
@@ -374,8 +389,8 @@
"metadata": {},
"outputs": [],
"source": [
- "path_jsonlines = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl'\n",
- "df_balanced.to_json(path_or_buf=path_jsonlines, orient='records', lines=True)"
+ "path_jsonlines = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl\"\n",
+ "df_balanced.to_json(path_or_buf=path_jsonlines, orient=\"records\", lines=True)"
]
},
{
@@ -392,9 +407,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path)\n",
+ "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path)\n",
"bias_data_s3_uri"
]
},
@@ -413,7 +429,9 @@
"metadata": {},
"outputs": [],
"source": [
- "balanced_bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_balanced)\n",
+ "balanced_bias_data_s3_uri = sess.upload_data(\n",
+ " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_balanced\n",
+ ")\n",
"balanced_bias_data_s3_uri"
]
},
@@ -432,7 +450,9 @@
"metadata": {},
"outputs": [],
"source": [
- "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_jsonlines)\n",
+ "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(\n",
+ " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_jsonlines\n",
+ ")\n",
"balanced_bias_data_jsonlines_s3_uri"
]
},
diff --git a/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb b/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb
index cb27a9c5..03d6d536 100644
--- a/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb
+++ b/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb
@@ -114,7 +114,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n",
+ "df = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n",
"df.shape"
]
},
@@ -132,7 +132,7 @@
},
"outputs": [],
"source": [
- "sns.countplot(data=df, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -166,11 +166,9 @@
},
"outputs": [],
"source": [
- "facet_column = report.FacetColumn(name='product_category')\n",
- "label_column = report.LabelColumn(name='star_rating', \n",
- " data=df['star_rating'], \n",
- " positive_label_values=[5, 4])\n",
- "group_variable = df['product_category']"
+ "facet_column = report.FacetColumn(name=\"product_category\")\n",
+ "label_column = report.LabelColumn(name=\"star_rating\", data=df[\"star_rating\"], positive_label_values=[5, 4])\n",
+ "group_variable = df[\"product_category\"]"
]
},
{
@@ -194,11 +192,9 @@
},
"outputs": [],
"source": [
- "report.bias_report(df, \n",
- " facet_column, \n",
- " label_column, \n",
- " stage_type=report.StageType.PRE_TRAINING, \n",
- " group_variable=group_variable)"
+ "report.bias_report(\n",
+ " df, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n",
+ ")"
]
},
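[Editor's note] Among the pre-training metrics this reports, class imbalance (CI) is easy to sanity-check by hand: for a facet value a, CI = (n_a - n_d) / (n_a + n_d). A sketch for the Gift Card facet, using the same frame:

    # Hand-computed class imbalance for the Gift Card facet, as a
    # cross-check of the smclarify report above.
    n_a = (df["product_category"] == "Gift Card").sum()
    n_d = (df["product_category"] != "Gift Card").sum()
    print((n_a - n_d) / (n_a + n_d))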
{
@@ -214,7 +210,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n",
+ "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n",
"df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n",
"df_balanced.shape"
]
@@ -227,7 +223,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')"
+ "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -255,12 +251,10 @@
"source": [
"from smclarify.bias import report\n",
"\n",
- "facet_column = report.FacetColumn(name='product_category')\n",
- "label_column = report.LabelColumn(name='star_rating',\n",
- " data=df_balanced['star_rating'],\n",
- " positive_label_values=[5, 4])\n",
+ "facet_column = report.FacetColumn(name=\"product_category\")\n",
+ "label_column = report.LabelColumn(name=\"star_rating\", data=df_balanced[\"star_rating\"], positive_label_values=[5, 4])\n",
"\n",
- "group_variable = df_balanced['product_category']"
+ "group_variable = df_balanced[\"product_category\"]"
]
},
{
@@ -276,11 +270,9 @@
"metadata": {},
"outputs": [],
"source": [
- "report.bias_report(df_balanced,\n",
- " facet_column,\n",
- " label_column,\n",
- " stage_type=report.StageType.PRE_TRAINING,\n",
- " group_variable=group_variable)"
+ "report.bias_report(\n",
+ " df_balanced, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n",
+ ")"
]
},
{
diff --git a/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb b/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb
index 17dd4b4a..bcd02945 100644
--- a/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb
+++ b/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb
@@ -20,12 +20,12 @@
"import pandas as pd\n",
"import numpy as np\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -72,7 +72,7 @@
"source": [
"import pandas as pd\n",
"\n",
- "data = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n",
+ "data = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n",
"data.head()"
]
},
@@ -101,7 +101,7 @@
"source": [
"import seaborn as sns\n",
"\n",
- "sns.countplot(data=data, x='star_rating', hue='product_category')"
+ "sns.countplot(data=data, x=\"star_rating\", hue=\"product_category\")"
]
},
{
@@ -121,10 +121,9 @@
"source": [
"from sagemaker import clarify\n",
"\n",
- "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n",
- " instance_count=1,\n",
- " instance_type='ml.c5.2xlarge',\n",
- " sagemaker_session=sess)"
+ "clarify_processor = clarify.SageMakerClarifyProcessor(\n",
+ " role=role, instance_count=1, instance_type=\"ml.c5.2xlarge\", sagemaker_session=sess\n",
+ ")"
]
},
{
@@ -151,13 +150,15 @@
"metadata": {},
"outputs": [],
"source": [
- "bias_report_output_path = 's3://{}/clarify'.format(bucket)\n",
+ "bias_report_output_path = \"s3://{}/clarify\".format(bucket)\n",
"\n",
- "bias_data_config = clarify.DataConfig(s3_data_input_path=bias_data_s3_uri,\n",
- " s3_output_path=bias_report_output_path,\n",
- " label='star_rating',\n",
- " headers=data.columns.to_list(),\n",
- " dataset_type='text/csv')"
+ "bias_data_config = clarify.DataConfig(\n",
+ " s3_data_input_path=bias_data_s3_uri,\n",
+ " s3_output_path=bias_report_output_path,\n",
+ " label=\"star_rating\",\n",
+ " headers=data.columns.to_list(),\n",
+ " dataset_type=\"text/csv\",\n",
+ ")"
]
},
{
@@ -177,10 +178,12 @@
"metadata": {},
"outputs": [],
"source": [
- "bias_config = clarify.BiasConfig(label_values_or_threshold=[5, 4],\n",
- " facet_name='product_category',\n",
- " facet_values_or_threshold=['Gift Card'],\n",
- " group_name='product_category')"
+ "bias_config = clarify.BiasConfig(\n",
+ " label_values_or_threshold=[5, 4],\n",
+ " facet_name=\"product_category\",\n",
+ " facet_values_or_threshold=[\"Gift Card\"],\n",
+ " group_name=\"product_category\",\n",
+ ")"
]
},
{
@@ -197,11 +200,8 @@
"outputs": [],
"source": [
"clarify_processor.run_pre_training_bias(\n",
- " data_config=bias_data_config,\n",
- " data_bias_config=bias_config,\n",
- " methods='all',\n",
- " wait=False,\n",
- " logs=False)"
+ " data_config=bias_data_config, data_bias_config=bias_config, methods=\"all\", wait=False, logs=False\n",
+ ")"
]
},
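[Editor's note] The cells below reference `run_pre_training_bias_processing_job_name`, whose assignment falls outside this hunk; with the SageMaker SDK it is presumably read off the processor, along these lines:

    # Presumed (elided) cell: the Clarify processor runs as a normal
    # Processing Job, so its name is available via latest_job.
    run_pre_training_bias_processing_job_name = clarify_processor.latest_job.job_name
    print(run_pre_training_bias_processing_job_name)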
{
@@ -222,7 +222,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Processing Job'.format(region, run_pre_training_bias_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Job'.format(\n",
+ " region, run_pre_training_bias_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -233,7 +239,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, run_pre_training_bias_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, run_pre_training_bias_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -244,7 +256,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, run_pre_training_bias_processing_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n",
+ " bucket, run_pre_training_bias_processing_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -253,8 +271,9 @@
"metadata": {},
"outputs": [],
"source": [
- "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=run_pre_training_bias_processing_job_name,\n",
- " sagemaker_session=sess)\n",
+ "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n",
+ " processing_job_name=run_pre_training_bias_processing_job_name, sagemaker_session=sess\n",
+ ")\n",
"\n",
"processing_job_description = running_processor.describe()\n",
"\n",
@@ -304,7 +323,7 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Bias Report'))\n"
+ "display(HTML('Review Bias Report'))"
]
},
{
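The hunks above are purely mechanical re-wraps; the Clarify workflow itself is unchanged. Collected in one place as a sketch (runs inside a SageMaker notebook; the S3 paths and header list are placeholders, while the API calls are exactly those shown in the diff):

import sagemaker
from sagemaker import clarify

sess = sagemaker.Session()
role = sagemaker.get_execution_role()  # requires a SageMaker execution environment

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, instance_count=1, instance_type="ml.c5.2xlarge", sagemaker_session=sess
)

bias_data_config = clarify.DataConfig(
    s3_data_input_path="s3://my-bucket/bias/input.csv",  # placeholder
    s3_output_path="s3://my-bucket/clarify",  # placeholder
    label="star_rating",
    headers=["star_rating", "product_category", "review_body"],  # illustrative
    dataset_type="text/csv",
)

bias_config = clarify.BiasConfig(
    label_values_or_threshold=[5, 4],
    facet_name="product_category",
    facet_values_or_threshold=["Gift Card"],
    group_name="product_category",
)

# Kick off the pre-training bias job asynchronously, as the notebook does.
clarify_processor.run_pre_training_bias(
    data_config=bias_data_config, data_bias_config=bias_config, methods="all", wait=False, logs=False
)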
diff --git a/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb b/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb
index 14506f1d..2ee7f780 100644
--- a/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb
+++ b/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb
@@ -76,9 +76,9 @@
"try:\n",
" ingest_create_athena_table_tsv_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -97,11 +97,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_tsv_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -121,7 +121,7 @@
"import sagemaker\n",
"import boto3\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -151,12 +151,14 @@
"source": [
"from sagemaker.spark.processing import PySparkProcessor\n",
"\n",
- "processor = PySparkProcessor(base_job_name='spark-amazon-reviews-analyzer',\n",
- " role=role,\n",
- " framework_version='2.4',\n",
- " instance_count=1,\n",
- " instance_type='ml.r5.2xlarge',\n",
- " max_runtime_in_seconds=300)"
+ "processor = PySparkProcessor(\n",
+ " base_job_name=\"spark-amazon-reviews-analyzer\",\n",
+ " role=role,\n",
+ " framework_version=\"2.4\",\n",
+ " instance_count=1,\n",
+ " instance_type=\"ml.r5.2xlarge\",\n",
+ " max_runtime_in_seconds=300,\n",
+ ")"
]
},
{
@@ -165,7 +167,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_input_data = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n",
+ "s3_input_data = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n",
"print(s3_input_data)"
]
},
@@ -192,12 +194,13 @@
"outputs": [],
"source": [
"from time import gmtime, strftime\n",
+ "\n",
"timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
"\n",
- "output_prefix = 'amazon-reviews-spark-analyzer-{}'.format(timestamp_prefix)\n",
- "processing_job_name = 'amazon-reviews-spark-analyzer-{}'.format(timestamp_prefix)\n",
+ "output_prefix = \"amazon-reviews-spark-analyzer-{}\".format(timestamp_prefix)\n",
+ "processing_job_name = \"amazon-reviews-spark-analyzer-{}\".format(timestamp_prefix)\n",
"\n",
- "print('Processing job name: {}'.format(processing_job_name))"
+ "print(\"Processing job name: {}\".format(processing_job_name))"
]
},
{
@@ -206,7 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_output_analyze_data = 's3://{}/{}/output'.format(bucket, output_prefix)\n",
+ "s3_output_analyze_data = \"s3://{}/{}/output\".format(bucket, output_prefix)\n",
"\n",
"print(s3_output_analyze_data)"
]
@@ -239,13 +242,17 @@
"source": [
"from sagemaker.processing import ProcessingOutput\n",
"\n",
- "processor.run(submit_app='preprocess-deequ-pyspark.py',\n",
- " submit_jars=['deequ-1.0.3-rc2.jar'],\n",
- " arguments=['s3_input_data', s3_input_data,\n",
- " 's3_output_analyze_data', s3_output_analyze_data,\n",
- " ],\n",
- " logs=True,\n",
- " wait=False\n",
+ "processor.run(\n",
+ " submit_app=\"preprocess-deequ-pyspark.py\",\n",
+ " submit_jars=[\"deequ-1.0.3-rc2.jar\"],\n",
+ " arguments=[\n",
+ " \"s3_input_data\",\n",
+ " s3_input_data,\n",
+ " \"s3_output_analyze_data\",\n",
+ " s3_output_analyze_data,\n",
+ " ],\n",
+ " logs=True,\n",
+ " wait=False,\n",
")"
]
},
@@ -257,9 +264,15 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n",
+ "processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n",
"\n",
- "display(HTML('Review Processing Job'.format(region, processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Job'.format(\n",
+ " region, processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -270,9 +283,15 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n",
+ "processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n",
"\n",
- "display(HTML('Review CloudWatch Logs After a Few Minutes'.format(region, processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After a Few Minutes'.format(\n",
+ " region, processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -285,7 +304,13 @@
"\n",
"s3_job_output_prefix = output_prefix\n",
"\n",
- "display(HTML('Review S3 Output Data After The Spark Job Has Completed'.format(bucket, s3_job_output_prefix, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Spark Job Has Completed'.format(\n",
+ " bucket, s3_job_output_prefix, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -303,8 +328,9 @@
},
"outputs": [],
"source": [
- "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=processing_job_name,\n",
- " sagemaker_session=sess)\n",
+ "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n",
+ " processing_job_name=processing_job_name, sagemaker_session=sess\n",
+ ")\n",
"\n",
"processing_job_description = running_processor.describe()\n",
"\n",
@@ -388,8 +414,11 @@
"import pandas as pd\n",
"import os\n",
"\n",
+ "\n",
"def load_dataset(path, sep, header):\n",
- " data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\n",
+ " data = pd.concat(\n",
+ " [pd.read_csv(f, sep=sep, header=header) for f in glob.glob(\"{}/*.csv\".format(path))], ignore_index=True\n",
+ " )\n",
"\n",
" return data"
]
@@ -402,8 +431,8 @@
},
"outputs": [],
"source": [
- "df_constraint_checks = load_dataset(path='./amazon-reviews-spark-analyzer/constraint-checks/', sep='\\t', header=0)\n",
- "df_constraint_checks[['check', 'constraint', 'constraint_status', 'constraint_message']]"
+ "df_constraint_checks = load_dataset(path=\"./amazon-reviews-spark-analyzer/constraint-checks/\", sep=\"\\t\", header=0)\n",
+ "df_constraint_checks[[\"check\", \"constraint\", \"constraint_status\", \"constraint_message\"]]"
]
},
{
@@ -419,7 +448,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_dataset_metrics = load_dataset(path='./amazon-reviews-spark-analyzer/dataset-metrics/', sep='\\t', header=0)\n",
+ "df_dataset_metrics = load_dataset(path=\"./amazon-reviews-spark-analyzer/dataset-metrics/\", sep=\"\\t\", header=0)\n",
"df_dataset_metrics"
]
},
@@ -438,7 +467,7 @@
},
"outputs": [],
"source": [
- "df_success_metrics = load_dataset(path='./amazon-reviews-spark-analyzer/success-metrics/', sep='\\t', header=0)\n",
+ "df_success_metrics = load_dataset(path=\"./amazon-reviews-spark-analyzer/success-metrics/\", sep=\"\\t\", header=0)\n",
"df_success_metrics"
]
},
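The notebook above follows a launch-and-poll pattern: fire the Spark job with `wait=False`, recover its name from the last job's description, then poll `describe()` until the status leaves `InProgress`. A condensed sketch with placeholder S3 URIs (the processor configuration mirrors the cells above):

import time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

processor = PySparkProcessor(
    base_job_name="spark-amazon-reviews-analyzer",
    role=role,
    framework_version="2.4",
    instance_count=1,
    instance_type="ml.r5.2xlarge",
    max_runtime_in_seconds=300,
)

processor.run(
    submit_app="preprocess-deequ-pyspark.py",
    submit_jars=["deequ-1.0.3-rc2.jar"],
    arguments=[
        "s3_input_data", "s3://my-bucket/amazon-reviews-pds/tsv/",  # placeholder
        "s3_output_analyze_data", "s3://my-bucket/analyzer/output",  # placeholder
    ],
    logs=False,
    wait=False,
)

processing_job_name = processor.jobs[-1].describe()["ProcessingJobName"]
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    processing_job_name=processing_job_name, sagemaker_session=sess
)

# Poll until the job reaches a terminal state, then report it.
while running_processor.describe()["ProcessingJobStatus"] == "InProgress":
    time.sleep(30)
print(running_processor.describe()["ProcessingJobStatus"])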
diff --git a/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb b/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb
index ffd4df51..a37ddf12 100644
--- a/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb
+++ b/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb
@@ -29,10 +29,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.20.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
@@ -159,6 +158,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -170,6 +170,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -187,6 +188,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -202,6 +204,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -218,29 +221,24 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
" return processing_inputs\n",
"\n",
+ "\n",
"def create_container_arguments(output_name, output_content_type):\n",
- " output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- " }\n",
+ " output_config = {output_name: {\"content_type\": output_content_type}}\n",
" return [f\"--output-config '{json.dumps(output_config)}'\"]\n",
"\n",
+ "\n",
"# Create Processing Job Arguments\n",
"processing_job_arguments = {\n",
" \"AppSpecification\": {\n",
@@ -256,7 +254,7 @@
" \"S3Uri\": output_path,\n",
" \"LocalPath\": os.path.join(processing_dir, \"output\"),\n",
" \"S3UploadMode\": \"EndOfJob\",\n",
- " }\n",
+ " },\n",
" },\n",
" ],\n",
" },\n",
@@ -357,14 +355,11 @@
"region = boto3.Session().region_name\n",
"container = sagemaker.image_uris.retrieve(\"xgboost\", region, \"1.2-1\")\n",
"hyperparameters = {\n",
- " \"max_depth\":\"5\",\n",
+ " \"max_depth\": \"5\",\n",
" \"objective\": \"reg:squarederror\",\n",
" \"num_round\": \"10\",\n",
"}\n",
- "train_content_type = (\n",
- " \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\"\n",
- " else \"text/csv\"\n",
- ")\n",
+ "train_content_type = \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\" else \"text/csv\"\n",
"train_input = sagemaker.inputs.TrainingInput(\n",
" s3_data=f\"s3://{bucket}/{training_path}\",\n",
" content_type=train_content_type,\n",
diff --git a/05_explore/99_GENERATED_Python_Code.py b/05_explore/99_GENERATED_Python_Code.py
index 377fbc5b..c91f9927 100644
--- a/05_explore/99_GENERATED_Python_Code.py
+++ b/05_explore/99_GENERATED_Python_Code.py
@@ -1,10 +1,12 @@
from pyspark.sql.session import SparkSession
from pyspark.sql.dataframe import DataFrame
+
# You may want to configure the Spark Context with the right credentials provider.
-spark = SparkSession.builder.master('local').getOrCreate()
+spark = SparkSession.builder.master("local").getOrCreate()
mode = None
+
def capture_stdout(func, *args, **kwargs):
"""Capture standard output to a string buffer"""
@@ -54,7 +56,7 @@ def default_spark_with_trained_parameters_and_state(df, trained_parameters, stat
def dispatch(key_name, args, kwargs, funcs):
"""
- Dispatches to another operator based on a key in the passed parameters.
+ Dispatches to another operator based on a key in the passed parameters.
This also slices out any parameters using the parameter_name passed in,
and will reassemble the trained_parameters correctly after invocation.
@@ -98,7 +100,9 @@ def dispatch(key_name, args, kwargs, funcs):
updated_trained_parameters = result["trained_parameters"]
if existing_trained_parameters is not None or updated_trained_parameters is not None:
- existing_trained_parameters = existing_trained_parameters if existing_trained_parameters is not None else {}
+ existing_trained_parameters = (
+ existing_trained_parameters if existing_trained_parameters is not None else {}
+ )
existing_trained_parameters[parameter_name] = result["trained_parameters"]
# Update the result trained_parameters so they are part of the original structure.
@@ -153,7 +157,9 @@ def process_numeric_standard_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
@@ -207,7 +213,9 @@ def process_numeric_robust_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
@@ -263,14 +271,21 @@ def process_numeric_min_max_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
temp_normalized_vector_col = temp_col_name(assembled)
trained_parameters = load_trained_parameters(
- trained_parameters, {"input_column": input_column, "min": min, "max": max,}
+ trained_parameters,
+ {
+ "input_column": input_column,
+ "min": min,
+ "max": max,
+ },
)
scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
@@ -308,13 +323,20 @@ def process_numeric_max_absolute_scaler(df, input_column=None, output_column=Non
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
temp_normalized_vector_col = temp_col_name(assembled)
- trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,})
+ trained_parameters = load_trained_parameters(
+ trained_parameters,
+ {
+ "input_column": input_column,
+ },
+ )
scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
trained_parameters, MinMaxScalerModel, "scaler_model"
@@ -411,7 +433,9 @@ def athena_start_query_execution_core(client, request):
try:
result = client.start_query_execution(**request)
except Exception as e:
- raise RuntimeError(f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}")
+ raise RuntimeError(
+ f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}"
+ )
return result
@@ -499,7 +523,10 @@ def athena_start_query_execution(dataset_definition, client):
query_request = {
"QueryString": ctas_query,
- "QueryExecutionContext": {"Database": database_name, "Catalog": catalog_name,},
+ "QueryExecutionContext": {
+ "Database": database_name,
+ "Catalog": catalog_name,
+ },
"ResultConfiguration": {"OutputLocation": metadata_s3_output_location},
}
logging.debug("Query request is: %s", query_request)
@@ -671,8 +698,13 @@ def cast_single_column_type(
# | 2|None| bar |
# | 3| 1 | |
# +---+----+------------------+
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
- df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
+ df = df.withColumn(
+ non_castable_column,
+ f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),
+ )
elif invalid_data_handling_method == NonCastableDataHandlingMethod.REPLACE_WITH_FIXED_VALUE:
# Replace non-castable data to a value in the same column
# Original dataframe
@@ -693,7 +725,9 @@ def cast_single_column_type(
# +---+----+
value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type)
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise(
f.to_date(f.lit(value), date_formatting)
@@ -726,8 +760,13 @@ def cast_single_column_type(
# +---+----+------------------+
value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type)
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
- df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
+ df = df.withColumn(
+ non_castable_column,
+ f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),
+ )
replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise(
f.to_date(f.lit(value), date_formatting)
@@ -779,8 +818,7 @@ class OperatorSparkOperatorCustomerError(Exception):
def temp_col_name(df, *illegal_names):
- """Generates a temporary column name that is unused.
- """
+ """Generates a temporary column name that is unused."""
name = "temp_col"
idx = 0
name_set = set(list(df.columns) + list(illegal_names))
@@ -792,8 +830,7 @@ def temp_col_name(df, *illegal_names):
def get_temp_col_if_not_set(df, col_name):
- """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name.
- """
+ """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name."""
if col_name:
return col_name, False
else:
@@ -803,7 +840,7 @@ def get_temp_col_if_not_set(df, col_name):
def replace_input_if_output_is_temp(df, input_column, output_column, output_is_temp):
"""Replaces the input column in the dataframe if the output was not set
- This is used with get_temp_col_if_not_set to enable the behavior where a
+ This is used with get_temp_col_if_not_set to enable the behavior where a
transformer will replace its input column if an output is not specified.
"""
if output_is_temp:
@@ -843,7 +880,9 @@ def expects_valid_column_name(value, key, nullable=False):
return
if value is None or len(str(value).strip()) == 0:
- raise OperatorSparkOperatorCustomerError(f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}")
+ raise OperatorSparkOperatorCustomerError(
+ f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}"
+ )
def expects_parameter(value, key, condition=None):
@@ -855,12 +894,16 @@ def expects_parameter(value, key, condition=None):
def expects_column(df, value, key):
if not value or value not in df.columns:
- raise OperatorSparkOperatorCustomerError(f"Expected column in dataframe for '{key}' however received '{value}'")
+ raise OperatorSparkOperatorCustomerError(
+ f"Expected column in dataframe for '{key}' however received '{value}'"
+ )
def expects_parameter_value_in_list(key, value, items):
if value not in items:
- raise OperatorSparkOperatorCustomerError(f"Illegal parameter value. {key} expected to be in {items}, but given {value}")
+ raise OperatorSparkOperatorCustomerError(
+ f"Illegal parameter value. {key} expected to be in {items}, but given {value}"
+ )
def encode_pyspark_model(model):
@@ -963,7 +1006,6 @@ def transform_using_trained_model(model, df, loaded):
)
-
def type_inference(df): # noqa: C901 # pylint: disable=R0912
"""Core type inference logic
@@ -1234,7 +1276,9 @@ def athena_source(spark, mode, dataset_definition, trained_parameters=None): #
trained_parameters["ctas_table_name"] = ""
try:
return default_spark_with_trained_parameters_and_state(
- df=spark.read.parquet(path), trained_parameters=trained_parameters, state=get_execution_state(state),
+ df=spark.read.parquet(path),
+ trained_parameters=trained_parameters,
+ state=get_execution_state(state),
)
except Exception as e:
raise RuntimeError(
@@ -1288,12 +1332,17 @@ def infer_and_cast_type(df, spark, inference_data_sample_size=1000, trained_para
def process_numeric(df, spark, **kwargs):
return dispatch(
- "operator", [df], kwargs, {"Scale values": (process_numeric_scale_values, "scale_values_parameters"),},
+ "operator",
+ [df],
+ kwargs,
+ {
+ "Scale values": (process_numeric_scale_values, "scale_values_parameters"),
+ },
)
def custom_pyspark(df, spark, code):
- """ Apply custom pyspark operation on the input dataframe
+ """Apply custom pyspark operation on the input dataframe
Example:
The custom code expects the user to provide an output df.
@@ -1326,14 +1375,50 @@ def custom_pyspark(df, spark, code):
return default_spark_with_stdout(output_df, stdout)
-op_1_output = athena_source(spark=spark, mode=mode, **{'dataset_definition': {'datasetSourceType': 'Athena', 'name': 'amazon_reviews', 'catalogName': 'AwsDataCatalog', 'databaseName': 'dsoaws', 'queryString': 'select * from amazon_reviews_parquet', 's3OutputLocation': 's3://sagemaker-us-east-1-835319576252/athena/', 'outputFormat': 'parquet'}})
-op_2_output = infer_and_cast_type(op_1_output['default'], spark=spark, **{})
-op_5_output = process_numeric(op_2_output['default'], spark=spark, **{'operator': 'Scale values', 'scale_values_parameters': {'scaler': 'Min-max scaler', 'min_max_scaler_parameters': {'min': -1, 'max': 1, 'input_column': 'star_rating', 'output_column': 'star_rating_min_max_scaled_builtin'}, 'standard_scaler_parameters': {}}})
-op_6_output = custom_pyspark(op_5_output['default'], spark=spark, **{'code': '# Table is available as variable `df`\nfrom pyspark.sql.functions import stddev, mean, col, floor\ndf = df.withColumn("sentiment", (floor(col("star_rating_min_max_scaled_builtin"))))'})
+op_1_output = athena_source(
+ spark=spark,
+ mode=mode,
+ **{
+ "dataset_definition": {
+ "datasetSourceType": "Athena",
+ "name": "amazon_reviews",
+ "catalogName": "AwsDataCatalog",
+ "databaseName": "dsoaws",
+ "queryString": "select * from amazon_reviews_parquet",
+ "s3OutputLocation": "s3://sagemaker-us-east-1-835319576252/athena/",
+ "outputFormat": "parquet",
+ }
+ },
+)
+op_2_output = infer_and_cast_type(op_1_output["default"], spark=spark, **{})
+op_5_output = process_numeric(
+ op_2_output["default"],
+ spark=spark,
+ **{
+ "operator": "Scale values",
+ "scale_values_parameters": {
+ "scaler": "Min-max scaler",
+ "min_max_scaler_parameters": {
+ "min": -1,
+ "max": 1,
+ "input_column": "star_rating",
+ "output_column": "star_rating_min_max_scaled_builtin",
+ },
+ "standard_scaler_parameters": {},
+ },
+ },
+)
+op_6_output = custom_pyspark(
+ op_5_output["default"],
+ spark=spark,
+ **{
+ "code": '# Table is available as variable `df`\nfrom pyspark.sql.functions import stddev, mean, col, floor\ndf = df.withColumn("sentiment", (floor(col("star_rating_min_max_scaled_builtin"))))'
+ },
+)
# Glossary: variable name to node_id
#
# op_1_output: 14039109-2da9-49b4-8eee-df39306c9c47
# op_2_output: 98b4c198-d379-42ab-af96-165dcd1a01d8
# op_5_output: 93919dab-601b-4aa2-93d3-d223b6f46e25
-# op_6_output: 019d2a9b-9601-4cca-8395-9a976db0ada5
\ No newline at end of file
+# op_6_output: 019d2a9b-9601-4cca-8395-9a976db0ada5
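The scaler operators in this generated module all share one idiom: wrap the single numeric column in a vector with `VectorAssembler`, fit the `pyspark.ml` scaler on that vector column, then unpack the result. A toy standalone sketch of the min-max variant configured in the flow above (scaling `star_rating` into [-1, 1]; column names illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame([(1,), (3,), (5,)], ["star_rating"])

# Assemble the scalar column into a one-element vector column, as the operator does.
assembled = VectorAssembler(
    inputCols=["star_rating"], outputCol="star_rating_vec", handleInvalid="keep"
).transform(df)

# Fit and apply the scaler with the flow's min/max parameters.
scaler = MinMaxScaler(min=-1.0, max=1.0, inputCol="star_rating_vec", outputCol="star_rating_min_max_scaled")
scaler.fit(assembled).transform(assembled).show(truncate=False)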
diff --git a/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb b/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb
index 39c08037..baff3016 100644
--- a/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb
+++ b/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb
@@ -50,10 +50,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.20.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
@@ -149,8 +148,8 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_group_name = f'FG-{flow_name}'\n",
- "print(f\"Feature Group Name: {feature_group_name}\")\n"
+ "feature_group_name = f\"FG-{flow_name}\"\n",
+ "print(f\"Feature Group Name: {feature_group_name}\")"
]
},
{
@@ -169,15 +168,12 @@
"metadata": {},
"outputs": [],
"source": [
- "datawrangler_FG_type_mapping = {\n",
- " 'float': 'Fractional',\n",
- " 'long': 'Integral'\n",
- "}\n",
+ "datawrangler_FG_type_mapping = {\"float\": \"Fractional\", \"long\": \"Integral\"}\n",
"\n",
"# Some schema types in Data Wrangler are not supported by Feature Store.\n",
"# Feature store supports String, Integral, and Fractional types.\n",
"# The following will create a default_FG_type set to String for these types.\n",
- "default_FG_type = \"String\"\n"
+ "default_FG_type = \"String\""
]
},
{
@@ -195,71 +191,23 @@
"outputs": [],
"source": [
"column_schema = [\n",
- " {\n",
- " \"name\": \"marketplace\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"customer_id\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_id\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_id\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_parent\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_title\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"star_rating\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"helpful_votes\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"total_votes\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"vine\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"verified_purchase\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_headline\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_body\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"year\",\n",
- " \"type\": \"object\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_date\",\n",
- " \"type\": \"date\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_category\",\n",
- " \"type\": \"string\"\n",
- " }\n",
- "]\n"
+ " {\"name\": \"marketplace\", \"type\": \"string\"},\n",
+ " {\"name\": \"customer_id\", \"type\": \"long\"},\n",
+ " {\"name\": \"review_id\", \"type\": \"string\"},\n",
+ " {\"name\": \"product_id\", \"type\": \"string\"},\n",
+ " {\"name\": \"product_parent\", \"type\": \"long\"},\n",
+ " {\"name\": \"product_title\", \"type\": \"string\"},\n",
+ " {\"name\": \"star_rating\", \"type\": \"long\"},\n",
+ " {\"name\": \"helpful_votes\", \"type\": \"long\"},\n",
+ " {\"name\": \"total_votes\", \"type\": \"long\"},\n",
+ " {\"name\": \"vine\", \"type\": \"string\"},\n",
+ " {\"name\": \"verified_purchase\", \"type\": \"string\"},\n",
+ " {\"name\": \"review_headline\", \"type\": \"string\"},\n",
+ " {\"name\": \"review_body\", \"type\": \"string\"},\n",
+ " {\"name\": \"year\", \"type\": \"object\"},\n",
+ " {\"name\": \"review_date\", \"type\": \"date\"},\n",
+ " {\"name\": \"product_category\", \"type\": \"string\"},\n",
+ "]"
]
},
{
@@ -283,23 +231,18 @@
"source": [
"record_identifier_name = None\n",
"if record_identifier_name is None:\n",
- " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n",
+ " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n",
"\n",
"event_time_feature_name = None\n",
"if event_time_feature_name is None:\n",
- " raise RuntimeError(\"Select a column name as the event time feature name.\")\n",
+ " raise RuntimeError(\"Select a column name as the event time feature name.\")\n",
"\n",
"# Below you map the schema detected from Data Wrangler to Feature Group Types.\n",
"feature_definitions = [\n",
- " {\n",
- " \"FeatureName\": schema['name'],\n",
- " \"FeatureType\": datawrangler_FG_type_mapping.get(\n",
- " schema['type'],\n",
- " default_FG_type\n",
- " )\n",
- " } for schema in column_schema\n",
+ " {\"FeatureName\": schema[\"name\"], \"FeatureType\": datawrangler_FG_type_mapping.get(schema[\"type\"], default_FG_type)}\n",
+ " for schema in column_schema\n",
"]\n",
- "print(feature_definitions)\n"
+ "print(feature_definitions)"
]
},
{
@@ -321,38 +264,33 @@
"sagemaker_client = boto3.client(\"sagemaker\", endpoint_url=sagemaker_endpoint_url)\n",
"\n",
"# Online Store Configuration\n",
- "online_store_config = {\n",
- " \"EnableOnlineStore\": True\n",
- "}\n",
+ "online_store_config = {\"EnableOnlineStore\": True}\n",
"\n",
"# Offline Store Configuration\n",
- "s3_uri = 's3://' + bucket # this is the default bucket defined in previous cells\n",
- "offline_store_config = {\n",
- " \"S3StorageConfig\": {\n",
- " \"S3Uri\": s3_uri\n",
- " }\n",
- "}\n",
+ "s3_uri = \"s3://\" + bucket # this is the default bucket defined in previous cells\n",
+ "offline_store_config = {\"S3StorageConfig\": {\"S3Uri\": s3_uri}}\n",
"\n",
"# Create Feature Group\n",
"create_fg_response = sagemaker_client.create_feature_group(\n",
- " FeatureGroupName = feature_group_name,\n",
- " EventTimeFeatureName = event_time_feature_name,\n",
- " RecordIdentifierFeatureName = record_identifier_name,\n",
- " FeatureDefinitions = feature_definitions,\n",
- " OnlineStoreConfig = online_store_config,\n",
- " OfflineStoreConfig = offline_store_config,\n",
- " RoleArn = iam_role)\n",
+ " FeatureGroupName=feature_group_name,\n",
+ " EventTimeFeatureName=event_time_feature_name,\n",
+ " RecordIdentifierFeatureName=record_identifier_name,\n",
+ " FeatureDefinitions=feature_definitions,\n",
+ " OnlineStoreConfig=online_store_config,\n",
+ " OfflineStoreConfig=offline_store_config,\n",
+ " RoleArn=iam_role,\n",
+ ")\n",
"\n",
"# Describe Feature Group\n",
"status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n",
- "while status['FeatureGroupStatus'] != 'Created':\n",
- " if status['FeatureGroupStatus'] == 'CreateFailed':\n",
+ "while status[\"FeatureGroupStatus\"] != \"Created\":\n",
+ " if status[\"FeatureGroupStatus\"] == \"CreateFailed\":\n",
" raise RuntimeError(f\"Feature Group Creation Failed: {status}\")\n",
" status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n",
- " print(\"Feature Group Status: \" + status['FeatureGroupStatus'])\n",
+ " print(\"Feature Group Status: \" + status[\"FeatureGroupStatus\"])\n",
" time.sleep(3)\n",
"\n",
- "print(status)\n"
+ "print(status)"
]
},
{
@@ -380,6 +318,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -391,6 +330,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -408,6 +348,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -423,6 +364,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -439,16 +381,13 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
@@ -471,48 +410,40 @@
"outputs": [],
"source": [
"# Processing job name\n",
- "print(f'Processing Job Name: {processing_job_name}')\n",
- "\n",
- "processingResources = {\n",
- " 'ClusterConfig': {\n",
- " 'InstanceCount': 1,\n",
- " 'InstanceType': 'ml.m5.4xlarge',\n",
- " 'VolumeSizeInGB': 30\n",
- " }\n",
- " }\n",
+ "print(f\"Processing Job Name: {processing_job_name}\")\n",
"\n",
- "appSpecification = {'ImageUri': container_uri}\n",
+ "processingResources = {\"ClusterConfig\": {\"InstanceCount\": 1, \"InstanceType\": \"ml.m5.4xlarge\", \"VolumeSizeInGB\": 30}}\n",
+ "\n",
+ "appSpecification = {\"ImageUri\": container_uri}\n",
"\n",
"sagemaker_client.create_processing_job(\n",
- " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n",
- " ProcessingOutputConfig={\n",
- " 'Outputs': [\n",
- " {\n",
- " 'OutputName': '14039109-2da9-49b4-8eee-df39306c9c47.default',\n",
- " 'FeatureStoreOutput': {\n",
- " 'FeatureGroupName': feature_group_name\n",
- " },\n",
- " 'AppManaged': True\n",
- " }\n",
- " ],\n",
- " },\n",
- " ProcessingJobName=processing_job_name,\n",
- " ProcessingResources=processingResources,\n",
- " AppSpecification=appSpecification,\n",
- " RoleArn=iam_role\n",
- " )\n",
+ " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n",
+ " ProcessingOutputConfig={\n",
+ " \"Outputs\": [\n",
+ " {\n",
+ " \"OutputName\": \"14039109-2da9-49b4-8eee-df39306c9c47.default\",\n",
+ " \"FeatureStoreOutput\": {\"FeatureGroupName\": feature_group_name},\n",
+ " \"AppManaged\": True,\n",
+ " }\n",
+ " ],\n",
+ " },\n",
+ " ProcessingJobName=processing_job_name,\n",
+ " ProcessingResources=processingResources,\n",
+ " AppSpecification=appSpecification,\n",
+ " RoleArn=iam_role,\n",
+ ")\n",
"\n",
"\n",
"status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n",
"\n",
- "while status['ProcessingJobStatus'] in ('InProgress', 'Failed'):\n",
- " if status['ProcessingJobStatus'] == 'Failed':\n",
+ "while status[\"ProcessingJobStatus\"] in (\"InProgress\", \"Failed\"):\n",
+ " if status[\"ProcessingJobStatus\"] == \"Failed\":\n",
" raise RuntimeError(f\"Processing Job failed: {status}\")\n",
" status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n",
- " print(status['ProcessingJobStatus'])\n",
+ " print(status[\"ProcessingJobStatus\"])\n",
" time.sleep(60)\n",
"\n",
- "print(status)\n"
+ "print(status)"
]
},
{
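The cells above reduce to a create-and-wait idiom against the low-level boto3 client: create the feature group, then poll `describe_feature_group` until it reports `Created` (bailing on `CreateFailed`). A sketch with placeholder group name, bucket, and role ARN; the feature definitions are illustrative instances of the String/Integral/Fractional types mapped above:

import time
import boto3

sagemaker_client = boto3.client("sagemaker")

feature_definitions = [
    {"FeatureName": "review_id", "FeatureType": "String"},
    {"FeatureName": "star_rating", "FeatureType": "Integral"},
    {"FeatureName": "event_time", "FeatureType": "Fractional"},
]

sagemaker_client.create_feature_group(
    FeatureGroupName="FG-example-flow",  # placeholder
    RecordIdentifierFeatureName="review_id",
    EventTimeFeatureName="event_time",
    FeatureDefinitions=feature_definitions,
    OnlineStoreConfig={"EnableOnlineStore": True},
    OfflineStoreConfig={"S3StorageConfig": {"S3Uri": "s3://my-bucket"}},  # placeholder
    RoleArn="arn:aws:iam::111122223333:role/my-sagemaker-role",  # placeholder
)

# Poll until the group is usable; creation typically takes a few seconds.
status = sagemaker_client.describe_feature_group(FeatureGroupName="FG-example-flow")
while status["FeatureGroupStatus"] != "Created":
    if status["FeatureGroupStatus"] == "CreateFailed":
        raise RuntimeError(f"Feature Group creation failed: {status}")
    time.sleep(3)
    status = sagemaker_client.describe_feature_group(FeatureGroupName="FG-example-flow")
print(status["FeatureGroupStatus"])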
diff --git a/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb b/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb
index 8879d1f1..3791bb2e 100644
--- a/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb
+++ b/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb
@@ -46,10 +46,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.20.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
@@ -184,6 +183,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -195,6 +195,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -212,6 +213,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -227,6 +229,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -243,29 +246,24 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
" return processing_inputs\n",
"\n",
+ "\n",
"def create_container_arguments(output_name, output_content_type):\n",
- " output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- " }\n",
+ " output_config = {output_name: {\"content_type\": output_content_type}}\n",
" return [f\"--output-config '{json.dumps(output_config)}'\"]\n",
"\n",
+ "\n",
"# Create Processing Job Arguments\n",
"processing_job_arguments = {\n",
" \"AppSpecification\": {\n",
@@ -281,7 +279,7 @@
" \"S3Uri\": output_path,\n",
" \"LocalPath\": os.path.join(processing_dir, \"output\"),\n",
" \"S3UploadMode\": \"EndOfJob\",\n",
- " }\n",
+ " },\n",
" },\n",
" ],\n",
" },\n",
@@ -317,14 +315,11 @@
"from sagemaker.workflow.steps import ProcessingStep, Step, StepTypeEnum\n",
"\n",
"processor = Processor(\n",
- " role=iam_role,\n",
- " image_uri=container_uri,\n",
- " instance_count=instance_count,\n",
- " instance_type=instance_type\n",
+ " role=iam_role, image_uri=container_uri, instance_count=instance_count, instance_type=instance_type\n",
")\n",
"\n",
- "class DataWranglerStep(ProcessingStep):\n",
"\n",
+ "class DataWranglerStep(ProcessingStep):\n",
" def __init__(self, name, processor, step_args):\n",
" super(NaiveStep, self).__init__(name, processor)\n",
" self.step_args = step_args\n",
@@ -333,10 +328,9 @@
" def arguments(self):\n",
" return self.step_args\n",
"\n",
+ "\n",
"step_process = DataWranglerStep(\n",
- " name=\"DataWranglerProcessingStep\",\n",
- " processor=processor,\n",
- " step_args=processing_job_arguments\n",
+ " name=\"DataWranglerProcessingStep\", processor=processor, step_args=processing_job_arguments\n",
")"
]
},
@@ -386,7 +380,7 @@
" name=pipeline_name,\n",
" parameters=[instance_type, instance_count],\n",
" steps=[step_process],\n",
- " sagemaker_session=sagemaker_session\n",
+ " sagemaker_session=sagemaker_session,\n",
")"
]
},
@@ -517,35 +511,36 @@
"source": [
"import botocore.waiter\n",
"\n",
+ "\n",
"def get_waiter(pipeline, delay=24, max_attempts=60):\n",
" waiter_id = \"PipelineExecutionComplete\"\n",
- " model = botocore.waiter.WaiterModel({\n",
- " \"version\": 2,\n",
- " \"waiters\": {\n",
- " waiter_id: {\n",
- " \"delay\": delay,\n",
- " \"maxAttempts\": max_attempts,\n",
- " \"operation\": 'DescribePipelineExecution',\n",
- " \"acceptors\": [\n",
- " {\n",
- " \"expected\": \"Succeeded\",\n",
- " \"matcher\": \"path\",\n",
- " \"state\": \"success\",\n",
- " \"argument\": \"PipelineExecutionStatus\"\n",
- " },\n",
- " {\n",
- " \"expected\": \"Failed\",\n",
- " \"matcher\": \"path\",\n",
- " \"state\": \"failure\",\n",
- " \"argument\": \"PipelineExecutionStatus\"\n",
- " },\n",
- " ]\n",
- " }\n",
+ " model = botocore.waiter.WaiterModel(\n",
+ " {\n",
+ " \"version\": 2,\n",
+ " \"waiters\": {\n",
+ " waiter_id: {\n",
+ " \"delay\": delay,\n",
+ " \"maxAttempts\": max_attempts,\n",
+ " \"operation\": \"DescribePipelineExecution\",\n",
+ " \"acceptors\": [\n",
+ " {\n",
+ " \"expected\": \"Succeeded\",\n",
+ " \"matcher\": \"path\",\n",
+ " \"state\": \"success\",\n",
+ " \"argument\": \"PipelineExecutionStatus\",\n",
+ " },\n",
+ " {\n",
+ " \"expected\": \"Failed\",\n",
+ " \"matcher\": \"path\",\n",
+ " \"state\": \"failure\",\n",
+ " \"argument\": \"PipelineExecutionStatus\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " },\n",
" }\n",
- " })\n",
- " return botocore.waiter.create_waiter_with_client(\n",
- " waiter_id, model, sagemaker_session.sagemaker_client\n",
- " )"
+ " )\n",
+ " return botocore.waiter.create_waiter_with_client(waiter_id, model, sagemaker_session.sagemaker_client)"
]
},
{
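The waiter reformatted above is plain botocore machinery: a two-acceptor WaiterModel over `DescribePipelineExecution`, succeeding on `Succeeded` and failing on `Failed`. Extracted as a standalone sketch (the execution ARN is a placeholder):

import boto3
import botocore.waiter

sagemaker_client = boto3.client("sagemaker")

waiter_id = "PipelineExecutionComplete"
model = botocore.waiter.WaiterModel(
    {
        "version": 2,
        "waiters": {
            waiter_id: {
                "delay": 24,
                "maxAttempts": 60,
                "operation": "DescribePipelineExecution",
                "acceptors": [
                    {"expected": "Succeeded", "matcher": "path", "state": "success", "argument": "PipelineExecutionStatus"},
                    {"expected": "Failed", "matcher": "path", "state": "failure", "argument": "PipelineExecutionStatus"},
                ],
            }
        },
    }
)
waiter = botocore.waiter.create_waiter_with_client(waiter_id, model, sagemaker_client)

# Blocks until the execution reaches a terminal state or attempts run out.
waiter.wait(PipelineExecutionArn="arn:aws:sagemaker:us-east-1:111122223333:pipeline/example/execution/abc123")  # placeholder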
diff --git a/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb b/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb
index 7539a1f2..e94646aa 100644
--- a/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb
+++ b/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb
@@ -62,10 +62,11 @@
"import seaborn as sns\n",
"\n",
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
- "# Get region \n",
+ "# Get region\n",
"session = boto3.session.Session()\n",
"region_name = session.region_name\n",
"\n",
@@ -73,9 +74,9 @@
"sagemaker_session = sagemaker.Session()\n",
"bucket = sagemaker_session.default_bucket()\n",
"\n",
- "# Set Athena database & table \n",
- "database_name = 'dsoaws'\n",
- "table_name = 'amazon_reviews_parquet'\n"
+ "# Set Athena database & table\n",
+ "database_name = \"dsoaws\"\n",
+ "table_name = \"amazon_reviews_parquet\""
]
},
{
@@ -104,7 +105,7 @@
"outputs": [],
"source": [
"# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
- "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
]
},
{
@@ -116,10 +117,13 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT DISTINCT product_category \\\n",
+ "cursor.execute(\n",
+ " \"SELECT DISTINCT product_category \\\n",
" FROM {0}.{1} \\\n",
- " ORDER BY product_category'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY product_category\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_categories = as_pandas(cursor)\n",
@@ -153,12 +157,15 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT product_category, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT product_category, \\\n",
" COUNT(star_rating) AS count_star_rating \\\n",
" FROM {0}.{1} \\\n",
" GROUP BY product_category \\\n",
- " ORDER BY count_star_rating DESC'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY count_star_rating DESC\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_star_ratings = as_pandas(cursor)\n",
@@ -172,7 +179,7 @@
"outputs": [],
"source": [
"# Store max ratings\n",
- "max_ratings = df_star_ratings['count_star_rating'].max()\n",
+ "max_ratings = df_star_ratings[\"count_star_rating\"].max()\n",
"print(max_ratings)"
]
},
@@ -184,24 +191,24 @@
"source": [
"# Set size and style to use\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
"\n",
- "plt.style.use('seaborn-whitegrid')\n",
+ "plt.style.use(\"seaborn-whitegrid\")\n",
"\n",
"# Create Seaborn barplot\n",
- "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df_star_ratings, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df_star_ratings, saturation=1)\n",
"\n",
"# Set title\n",
"plt.title(\"Number of Ratings per Product Category\")\n",
"\n",
- "# Set x-axis ticks to match scale \n",
+ "# Set x-axis ticks to match scale\n",
"if max_ratings > 200000:\n",
- " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+ " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
" plt.xlim(0, 20000000)\n",
"elif max_ratings <= 200000:\n",
- " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n",
+ " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n",
" plt.xlim(0, 200000)\n",
"\n",
"plt.xlabel(\"Number of Ratings\")\n",
@@ -232,12 +239,15 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT product_category, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT product_category, \\\n",
" AVG(star_rating) AS avg_star_rating \\\n",
" FROM {0}.{1} \\\n",
" GROUP BY product_category \\\n",
- " ORDER BY avg_star_rating DESC'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY avg_star_rating DESC\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_average_ratings = as_pandas(cursor)\n",
@@ -251,22 +261,26 @@
"outputs": [],
"source": [
"# Set some Seaborn parameters in advance\n",
- "sns.set_style = 'seaborn-whitegrid'\n",
- "\n",
- "sns.set(rc={\"font.style\":\"normal\",\n",
- "# \"axes.facecolor\":\"white\",\n",
- " \"figure.facecolor\":\"white\",\n",
- " \"figure.titlesize\":20,\n",
- " \"text.color\":\"black\",\n",
- " \"xtick.color\":\"black\",\n",
- " \"ytick.color\":\"black\",\n",
- " \"axes.labelcolor\":\"black\",\n",
- " \"axes.grid\":True,\n",
- " 'axes.labelsize':10,\n",
- "# 'figure.figsize':(10.0, 10.0),\n",
- " 'xtick.labelsize':10,\n",
- " 'font.size':10,\n",
- " 'ytick.labelsize':10})"
+ "sns.set_style = \"seaborn-whitegrid\"\n",
+ "\n",
+ "sns.set(\n",
+ " rc={\n",
+ " \"font.style\": \"normal\",\n",
+ " # \"axes.facecolor\":\"white\",\n",
+ " \"figure.facecolor\": \"white\",\n",
+ " \"figure.titlesize\": 20,\n",
+ " \"text.color\": \"black\",\n",
+ " \"xtick.color\": \"black\",\n",
+ " \"ytick.color\": \"black\",\n",
+ " \"axes.labelcolor\": \"black\",\n",
+ " \"axes.grid\": True,\n",
+ " \"axes.labelsize\": 10,\n",
+ " # 'figure.figsize':(10.0, 10.0),\n",
+ " \"xtick.labelsize\": 10,\n",
+ " \"font.size\": 10,\n",
+ " \"ytick.labelsize\": 10,\n",
+ " }\n",
+ ")"
]
},
{
@@ -277,12 +291,13 @@
"source": [
"# Helper code to display values on bars\n",
"\n",
+ "\n",
"def show_values_barplot(axs, space):\n",
" def _show_on_plot(ax):\n",
" for p in ax.patches:\n",
" _x = p.get_x() + p.get_width() + float(space)\n",
" _y = p.get_y() + p.get_height()\n",
- " value = round(float(p.get_width()),2)\n",
+ " value = round(float(p.get_width()), 2)\n",
" ax.text(_x, _y, value, ha=\"left\")\n",
"\n",
" if isinstance(axs, np.ndarray):\n",
@@ -301,13 +316,13 @@
"# Plot average ratings per category\n",
"\n",
"# Create plot\n",
- "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df_average_ratings, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df_average_ratings, saturation=1)\n",
"\n",
- "# Set title and x-axis ticks \n",
- "plt.title('Average Rating by Product Category')\n",
- "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n",
+ "# Set title and x-axis ticks\n",
+ "plt.title(\"Average Rating by Product Category\")\n",
+ "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
"\n",
- "# Helper code to show actual values afters bars \n",
+ "# Helper code to show actual values afters bars\n",
"show_values_barplot(barplot, 0.1)\n",
"\n",
"plt.xlabel(\"Average Rating\")\n",
@@ -344,14 +359,17 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT product_category, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT product_category, \\\n",
" AVG(star_rating) AS avg_star_rating, \\\n",
" STDDEV(star_rating) AS stddev_star_rating, \\\n",
" SQRT(COUNT(*)) AS sqrt_count \\\n",
" FROM {}.{} \\\n",
" GROUP BY product_category \\\n",
- " ORDER BY avg_star_rating DESC'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY avg_star_rating DESC\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_avg_stddev_sqrt = as_pandas(cursor)\n",
@@ -374,13 +392,16 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT product_category, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT product_category, \\\n",
" AVG(star_rating) AS avg_star_rating, \\\n",
" (STDDEV(star_rating) / SQRT(COUNT(*))) AS sd_mean \\\n",
" FROM {}.{} \\\n",
" GROUP BY product_category \\\n",
- " ORDER BY avg_star_rating DESC'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY avg_star_rating DESC\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_breakdown_category_avg = as_pandas(cursor)\n",
@@ -403,13 +424,16 @@
"# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT product_category, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT product_category, \\\n",
" star_rating, \\\n",
" COUNT(*) AS count_reviews \\\n",
" FROM {}.{} \\\n",
" GROUP BY product_category, star_rating \\\n",
- " ORDER BY product_category, star_rating ASC, count_reviews DESC'\n",
- " .format(database_name, table_name))\n",
+ " ORDER BY product_category, star_rating ASC, count_reviews DESC\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_breakdown_category = as_pandas(cursor)\n",
@@ -423,11 +447,11 @@
"outputs": [],
"source": [
"# Create grouped DataFrames by category and by star rating\n",
- "grouped_category = df_breakdown_category.groupby('product_category')\n",
- "grouped_star = df_breakdown_category.groupby('star_rating')\n",
+ "grouped_category = df_breakdown_category.groupby(\"product_category\")\n",
+ "grouped_star = df_breakdown_category.groupby(\"star_rating\")\n",
"\n",
"# Create sum of ratings per star rating\n",
- "df_sum = df_breakdown_category.groupby(['star_rating']).sum()\n",
+ "df_sum = df_breakdown_category.groupby([\"star_rating\"]).sum()\n",
"df_sum.head(10)"
]
},
@@ -438,7 +462,7 @@
"outputs": [],
"source": [
"# Calculate total number of star ratings\n",
- "total = df_sum['count_reviews'].sum()\n",
+ "total = df_sum[\"count_reviews\"].sum()\n",
"print(total)"
]
},
@@ -452,17 +476,17 @@
"\n",
"distribution = {}\n",
"count_reviews_per_star = []\n",
- "i=0\n",
- " \n",
+ "i = 0\n",
+ "\n",
"for category, ratings in grouped_category:\n",
" count_reviews_per_star = []\n",
- " for star in ratings['star_rating']:\n",
- " count_reviews_per_star.append(ratings.get_value(i, 'count_reviews'))\n",
- " i=i+1;\n",
+ " for star in ratings[\"star_rating\"]:\n",
+ " count_reviews_per_star.append(ratings.get_value(i, \"count_reviews\"))\n",
+ " i = i + 1\n",
" distribution[category] = count_reviews_per_star\n",
"\n",
"# Check if distribution has been created succesfully\n",
- "print(distribution)\n"
+ "print(distribution)"
]
},
{
@@ -501,8 +525,8 @@
"# Sort distribution by highest average rating per category\n",
"sorted_distribution = {}\n",
"\n",
- "df_average_ratings.iloc[:,0]\n",
- "for index, value in df_average_ratings.iloc[:,0].items():\n",
+ "df_average_ratings.iloc[:, 0]\n",
+ "for index, value in df_average_ratings.iloc[:, 0].items():\n",
" sorted_distribution[value] = distribution[value]"
]
},
@@ -525,8 +549,7 @@
" star2.append(stars[1])\n",
" star3.append(stars[2])\n",
" star4.append(stars[3])\n",
- " star5.append(stars[4])\n",
- " "
+ " star5.append(stars[4])"
]
},
{
@@ -548,7 +571,7 @@
"proportion_star5 = np.true_divide(star5, total) * 100\n",
"\n",
"# Add colors\n",
- "colors = ['red', 'purple','blue','orange','green']\n",
+ "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n",
"\n",
"# The position of the bars on the x-axis\n",
"r = range(len(categories))\n",
@@ -556,25 +579,57 @@
"\n",
"# Plot bars\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
- "\n",
- "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n",
- "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n",
- "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n",
- "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n",
- "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n",
- "\n",
- "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n",
- "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n",
- "plt.yticks(r, categories, fontweight='bold')\n",
- "\n",
- "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
+ "\n",
+ "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n",
+ "ax4 = plt.barh(\n",
+ " r,\n",
+ " proportion_star4,\n",
+ " left=proportion_star5,\n",
+ " color=colors[3],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"4-Star Ratings\",\n",
+ ")\n",
+ "ax3 = plt.barh(\n",
+ " r,\n",
+ " proportion_star3,\n",
+ " left=proportion_star5 + proportion_star4,\n",
+ " color=colors[2],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"3-Star Ratings\",\n",
+ ")\n",
+ "ax2 = plt.barh(\n",
+ " r,\n",
+ " proportion_star2,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3,\n",
+ " color=colors[1],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"2-Star Ratings\",\n",
+ ")\n",
+ "ax1 = plt.barh(\n",
+ " r,\n",
+ " proportion_star1,\n",
+ " left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n",
+ " color=colors[0],\n",
+ " edgecolor=\"white\",\n",
+ " height=barHeight,\n",
+ " label=\"1-Star Ratings\",\n",
+ ")\n",
+ "\n",
+ "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n",
+ "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n",
+ "plt.yticks(r, categories, fontweight=\"bold\")\n",
+ "\n",
+ "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n",
"plt.gca().invert_yaxis()\n",
"\n",
"plt.tight_layout()\n",
- "plt.show()\n"
+ "plt.show()"
]
},
{
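The five near-identical `plt.barh` calls above build the stacked chart by hand; the same figure can be drawn in a loop. A sketch, assuming `sorted_distribution` maps each category to its five star-rating counts and `total` is the overall review count:

    import numpy as np
    import matplotlib.pyplot as plt

    counts = np.array(list(sorted_distribution.values()), dtype=float)
    proportions = counts / total * 100  # share of all reviews, matching the cell above
    left = np.zeros(len(counts))
    for star in (4, 3, 2, 1, 0):  # draw the 5-star segment first, as above
        plt.barh(range(len(counts)), proportions[:, star], left=left, label=f"{star + 1}-Star Ratings")
        left += proportions[:, star]
    plt.yticks(range(len(counts)), list(sorted_distribution.keys()), fontweight="bold")
    plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
    plt.gca().invert_yaxis()
    plt.show()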
@@ -597,11 +652,12 @@
"metadata": {},
"outputs": [],
"source": [
- "# Execute query using connection cursor \n",
+ "# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
"# If rating > 3, sentiment = 1 (positive), else 0 (negative)\n",
- "cursor.execute('SELECT customer_id, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT customer_id, \\\n",
" product_id, \\\n",
" star_rating, \\\n",
" CASE \\\n",
@@ -611,8 +667,10 @@
" AS is_positive_sentiment \\\n",
" FROM {}.{} \\\n",
" ORDER BY review_id \\\n",
- " LIMIT 10000'\n",
- " .format(database_name, table_name))\n",
+ " LIMIT 10000\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"# Load query results into Pandas DataFrame and show results\n",
"df_sentiment = as_pandas(cursor)\n",
@@ -646,10 +704,11 @@
},
"outputs": [],
"source": [
- "# Execute query using connection cursor \n",
+ "# Execute query using connection cursor\n",
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
"\n",
- "cursor.execute('SELECT review_body, \\\n",
+ "cursor.execute(\n",
+ " \"SELECT review_body, \\\n",
" CASE \\\n",
" WHEN star_rating > 3 THEN 1 \\\n",
" ELSE 0 \\\n",
@@ -657,8 +716,10 @@
" AS is_positive_sentiment \\\n",
" FROM {}.{} \\\n",
" ORDER BY review_id \\\n",
- " LIMIT 10000'\n",
- " .format(database_name, table_name))\n",
+ " LIMIT 10000\".format(\n",
+ " database_name, table_name\n",
+ " )\n",
+ ")\n",
"\n",
"df_reviews = as_pandas(cursor)\n",
"df_reviews.head(10)"
@@ -678,7 +739,8 @@
"outputs": [],
"source": [
"import bs4\n",
- "df_reviews['review_body'] = df_reviews['review_body'].apply(lambda x: bs4.BeautifulSoup(x, 'lxml').get_text())\n",
+ "\n",
+ "df_reviews[\"review_body\"] = df_reviews[\"review_body\"].apply(lambda x: bs4.BeautifulSoup(x, \"lxml\").get_text())\n",
"df_reviews"
]
},
@@ -690,32 +752,41 @@
"source": [
"from wordcloud import WordCloud, STOPWORDS\n",
"\n",
- "def plot_wordcloud(text, mask=None, max_words=200, max_font_size=150, figure_size=(20.0,15.0), \n",
- " title = None, title_size=40, image_color=False):\n",
+ "\n",
+ "def plot_wordcloud(\n",
+ " text,\n",
+ " mask=None,\n",
+ " max_words=200,\n",
+ " max_font_size=150,\n",
+ " figure_size=(20.0, 15.0),\n",
+ " title=None,\n",
+ " title_size=40,\n",
+ " image_color=False,\n",
+ "):\n",
" stopwords = set(STOPWORDS)\n",
"\n",
- " wordcloud = WordCloud(background_color='gray',\n",
- " stopwords = stopwords,\n",
- " max_words = max_words,\n",
- " max_font_size = max_font_size, \n",
- " random_state = 50,\n",
- " width=800, \n",
- " height=400,\n",
- " mask = mask)\n",
+ " wordcloud = WordCloud(\n",
+ " background_color=\"gray\",\n",
+ " stopwords=stopwords,\n",
+ " max_words=max_words,\n",
+ " max_font_size=max_font_size,\n",
+ " random_state=50,\n",
+ " width=800,\n",
+ " height=400,\n",
+ " mask=mask,\n",
+ " )\n",
" wordcloud.generate(str(text))\n",
- " \n",
+ "\n",
" plt.figure(figsize=figure_size)\n",
" if image_color:\n",
- " image_colors = ImageColorGenerator(mask);\n",
- " plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation=\"bilinear\");\n",
- " plt.title(title, fontdict={'size': title_size, \n",
- " 'verticalalignment': 'bottom'})\n",
+ " image_colors = ImageColorGenerator(mask)\n",
+ " plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation=\"bilinear\")\n",
+ " plt.title(title, fontdict={\"size\": title_size, \"verticalalignment\": \"bottom\"})\n",
" else:\n",
- " plt.imshow(wordcloud);\n",
- " plt.title(title, fontdict={'size': title_size, 'color': 'black', \n",
- " 'verticalalignment': 'bottom'})\n",
- " plt.axis('off');\n",
- " plt.tight_layout() "
+ " plt.imshow(wordcloud)\n",
+ " plt.title(title, fontdict={\"size\": title_size, \"color\": \"black\", \"verticalalignment\": \"bottom\"})\n",
+ " plt.axis(\"off\")\n",
+ " plt.tight_layout()"
]
},
{
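One caveat in the cell above: the `image_color=True` branch of `plot_wordcloud` references `ImageColorGenerator`, which the import line does not bring in. If that branch is exercised, the import needs widening:

    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator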
@@ -724,7 +795,9 @@
"metadata": {},
"outputs": [],
"source": [
- "plot_wordcloud(df_reviews.query('is_positive_sentiment == 0')['review_body'], title=\"Word Cloud of Negative Amazon Reviews\")"
+ "plot_wordcloud(\n",
+ " df_reviews.query(\"is_positive_sentiment == 0\")[\"review_body\"], title=\"Word Cloud of Negative Amazon Reviews\"\n",
+ ")"
]
},
{
@@ -733,7 +806,9 @@
"metadata": {},
"outputs": [],
"source": [
- "plot_wordcloud(df_reviews.query('is_positive_sentiment == 1')['review_body'], title=\"Word Cloud of Positive Amazon Reviews\")"
+ "plot_wordcloud(\n",
+ " df_reviews.query(\"is_positive_sentiment == 1\")[\"review_body\"], title=\"Word Cloud of Positive Amazon Reviews\"\n",
+ ")"
]
},
{
@@ -757,17 +832,21 @@
"source": [
"import string\n",
"\n",
- "df_reviews['num_words'] = df_reviews['review_body'].apply(lambda x: len(str(x).split()))\n",
+ "df_reviews[\"num_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x).split()))\n",
"\n",
- "df_reviews['num_unique_words'] = df_reviews['review_body'].apply(lambda x: len(set(str(x).split())))\n",
+ "df_reviews[\"num_unique_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(set(str(x).split())))\n",
"\n",
- "df_reviews['num_chars'] = df_reviews['review_body'].apply(lambda x: len(str(x)))\n",
+ "df_reviews[\"num_chars\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x)))\n",
"\n",
- "df_reviews['num_stopwords'] = df_reviews['review_body'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))\n",
+ "df_reviews[\"num_stopwords\"] = df_reviews[\"review_body\"].apply(\n",
+ " lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])\n",
+ ")\n",
"\n",
- "df_reviews['num_punctuations'] = df_reviews['review_body'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )\n",
+ "df_reviews[\"num_punctuations\"] = df_reviews[\"review_body\"].apply(\n",
+ " lambda x: len([c for c in str(x) if c in string.punctuation])\n",
+ ")\n",
"\n",
- "df_reviews['mean_word_len'] = df_reviews['review_body'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))"
+ "df_reviews[\"mean_word_len\"] = df_reviews[\"review_body\"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))"
]
},
{
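Several of the counts above can also be computed with pandas' vectorized string accessor rather than `apply`, which is typically faster on large frames. A sketch (NaN handling differs slightly):

    df_reviews["num_words"] = df_reviews["review_body"].str.split().str.len()
    df_reviews["num_chars"] = df_reviews["review_body"].str.len()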
@@ -803,7 +882,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_reviews = df_reviews.query('num_words <= 500 and num_punctuations < 500')"
+ "df_reviews = df_reviews.query(\"num_words <= 500 and num_punctuations < 500\")"
]
},
{
@@ -819,21 +898,21 @@
"metadata": {},
"outputs": [],
"source": [
- "f, axes = plt.subplots(3, 1, figsize=(10,20))\n",
+ "f, axes = plt.subplots(3, 1, figsize=(10, 20))\n",
"\n",
- "sns.violinplot(x='is_positive_sentiment', y='num_words', data=df_reviews, ax=axes[0])\n",
- "axes[0].set_xlabel('Sentiment', fontsize=12)\n",
- "axes[0].set_ylabel('Number Of Words', fontsize=12)\n",
+ "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_words\", data=df_reviews, ax=axes[0])\n",
+ "axes[0].set_xlabel(\"Sentiment\", fontsize=12)\n",
+ "axes[0].set_ylabel(\"Number Of Words\", fontsize=12)\n",
"axes[0].set_title(\"Number Of Words In Each Class\", fontsize=15)\n",
"\n",
- "sns.violinplot(x='is_positive_sentiment', y='num_chars', data=df_reviews, ax=axes[1])\n",
- "axes[1].set_xlabel('Sentiment', fontsize=12)\n",
- "axes[1].set_ylabel('Number Of Characters', fontsize=12)\n",
+ "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_chars\", data=df_reviews, ax=axes[1])\n",
+ "axes[1].set_xlabel(\"Sentiment\", fontsize=12)\n",
+ "axes[1].set_ylabel(\"Number Of Characters\", fontsize=12)\n",
"axes[1].set_title(\"Number Of Characters In Each Class\", fontsize=15)\n",
"\n",
- "sns.violinplot(x='is_positive_sentiment', y='num_punctuations', data=df_reviews, ax=axes[2])\n",
- "axes[2].set_xlabel('Sentiment', fontsize=12)\n",
- "axes[2].set_ylabel('Number Of Punctutations', fontsize=12)\n",
+ "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_punctuations\", data=df_reviews, ax=axes[2])\n",
+ "axes[2].set_xlabel(\"Sentiment\", fontsize=12)\n",
+ "axes[2].set_ylabel(\"Number Of Punctutations\", fontsize=12)\n",
"axes[2].set_title(\"Number Of Punctuations In Each Class\", fontsize=15)\n",
"plt.show()"
]
@@ -852,10 +931,10 @@
"outputs": [],
"source": [
"# Count number of reviews per sentiment class\n",
- "print(df_reviews['is_positive_sentiment'].value_counts())\n",
+ "print(df_reviews[\"is_positive_sentiment\"].value_counts())\n",
"\n",
"# Create Plot\n",
- "plot = sns.countplot(x='is_positive_sentiment', data=df_reviews)\n",
+ "plot = sns.countplot(x=\"is_positive_sentiment\", data=df_reviews)\n",
"plt.xlabel(\"Sentiment\", fontsize=16)\n",
"plt.ylabel(\"Number Of Reviews\", fontsize=16)\n",
"plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n",
@@ -880,22 +959,21 @@
"source": [
"from sklearn.utils import resample\n",
"\n",
- "positive = df_reviews[df_reviews['is_positive_sentiment']==1]\n",
- "negative = df_reviews[df_reviews['is_positive_sentiment']==0]\n",
+ "positive = df_reviews[df_reviews[\"is_positive_sentiment\"] == 1]\n",
+ "negative = df_reviews[df_reviews[\"is_positive_sentiment\"] == 0]\n",
"\n",
- "positive_downsampled = resample(positive,\n",
- " replace = False, # sample without replacement\n",
- " n_samples = len(negative), # match minority n\n",
- " random_state = 27) # reproducible results\n",
+ "positive_downsampled = resample(\n",
+ " positive, replace=False, n_samples=len(negative), random_state=27 # sample without replacement # match minority n\n",
+ ") # reproducible results\n",
"\n",
"# combine minority and downsampled majority\n",
"downsampled = pd.concat([positive_downsampled, negative])\n",
"\n",
"# checking counts\n",
- "print(downsampled['is_positive_sentiment'].value_counts())\n",
+ "print(downsampled[\"is_positive_sentiment\"].value_counts())\n",
"\n",
"# Create Plot\n",
- "plot = sns.countplot(x='is_positive_sentiment', data=downsampled)\n",
+ "plot = sns.countplot(x=\"is_positive_sentiment\", data=downsampled)\n",
"plt.xlabel(\"Sentiment\", fontsize=16)\n",
"plt.ylabel(\"Number Of Reviews\", fontsize=16)\n",
"plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n",
@@ -923,9 +1001,9 @@
"train, test = train_test_split(downsampled, test_size=0.2, random_state=0)\n",
"test, validate = train_test_split(test, test_size=0.5, random_state=0)\n",
"\n",
- "print(f'Number of training examples: {len(train.index)}')\n",
- "print(f'Number of testing examples: {len(test.index)}')\n",
- "print(f'Number of validation examples: {len(validate.index)}')\n"
+ "print(f\"Number of training examples: {len(train.index)}\")\n",
+ "print(f\"Number of testing examples: {len(test.index)}\")\n",
+ "print(f\"Number of validation examples: {len(validate.index)}\")"
]
},
{
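The two `train_test_split` calls above yield an 80/10/10 split. If each split should also preserve the class balance exactly, both calls can take a `stratify` argument; a sketch:

    from sklearn.model_selection import train_test_split

    train, holdout = train_test_split(
        downsampled, test_size=0.2, random_state=0, stratify=downsampled["is_positive_sentiment"]
    )
    test, validate = train_test_split(
        holdout, test_size=0.5, random_state=0, stratify=holdout["is_positive_sentiment"]
    )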
@@ -943,17 +1021,17 @@
"source": [
"# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n",
"\n",
- "labels = ['Train', 'Validation', 'Test']\n",
+ "labels = [\"Train\", \"Validation\", \"Test\"]\n",
"sizes = [len(train.index), len(validate.index), len(test.index)]\n",
- "explode = (0.1, 0, 0) \n",
+ "explode = (0.1, 0, 0)\n",
"\n",
"fig1, ax1 = plt.subplots()\n",
"\n",
"ax1.set_title(\"Split Of Train, Validatin And Test Data\", fontsize=16)\n",
- "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 12})\n",
+ "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90, textprops={\"fontsize\": 12})\n",
"\n",
"# Equal aspect ratio ensures that pie is drawn as a circle.\n",
- "ax1.axis('equal') \n",
+ "ax1.axis(\"equal\")\n",
"plt.show()"
]
},
@@ -980,18 +1058,16 @@
"\n",
"\n",
"def query_athena(sql, region_name, s3_staging_dir):\n",
- " cursor = pyathena.connect(\n",
- " region_name=region_name,\n",
- " s3_staging_dir=\"{}\".format(s3_staging_dir)).cursor()\n",
+ " cursor = pyathena.connect(region_name=region_name, s3_staging_dir=\"{}\".format(s3_staging_dir)).cursor()\n",
" cursor.execute(sql)\n",
" return cursor\n",
"\n",
+ "\n",
"@magics_class\n",
"class AthenaMagics(Magics):\n",
" s3_staging_dir = None\n",
" region_name = None\n",
"\n",
- " \n",
" def parse_args(self, line):\n",
" args = magic_arguments.parse_argstring(self.athena, line)\n",
"\n",
@@ -1000,27 +1076,27 @@
" raise ValueError(\"s3_staging_dir for Athena should be set\")\n",
" if args.s3_staging_dir is not None:\n",
" self.s3_staging_dir = args.s3_staging_dir\n",
- " \n",
+ "\n",
" # region name\n",
" if args.region_name is None and self.region_name is None:\n",
" raise ValueError(\"region_name for Athena should be set\")\n",
" if args.region_name is not None:\n",
" self.region_name = args.region_name\n",
- " \n",
+ "\n",
" @cell_magic\n",
" @magic_arguments.magic_arguments()\n",
- " @magic_arguments.argument('--s3_staging_dir', '-s',\n",
- " help='s3 path required by athena for writing query results (e.g. s3://your/staging/dir)'\n",
- " )\n",
- " @magic_arguments.argument('--region_name', '-r',\n",
- " help='aws region name (e.g. us-west-2)'\n",
+ " @magic_arguments.argument(\n",
+ " \"--s3_staging_dir\",\n",
+ " \"-s\",\n",
+ " help=\"s3 path required by athena for writing query results (e.g. s3://your/staging/dir)\",\n",
" )\n",
- " def athena(self, line='', cell=None):\n",
+ " @magic_arguments.argument(\"--region_name\", \"-r\", help=\"aws region name (e.g. us-west-2)\")\n",
+ " def athena(self, line=\"\", cell=None):\n",
" self.parse_args(line)\n",
" cursor = query_athena(cell, self.region_name, self.s3_staging_dir)\n",
" return as_pandas(cursor)\n",
"\n",
- " \n",
+ "\n",
"ip = get_ipython()\n",
"ip.register_magics(AthenaMagics)"
]
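Once `AthenaMagics` is registered, a cell can issue a query directly; for example (the staging path below is a placeholder, and the table name assumes the database registered earlier in the workshop):

    %%athena -s s3://your-bucket/athena/staging -r us-west-2
    SELECT product_category, COUNT(*) AS count_reviews
    FROM dsoaws.amazon_reviews_tsv
    GROUP BY product_category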
diff --git a/05_explore/archive/02_Explore_Redshift_Data.ipynb b/05_explore/archive/02_Explore_Redshift_Data.ipynb
index b4433b08..ea44e7bc 100644
--- a/05_explore/archive/02_Explore_Redshift_Data.ipynb
+++ b/05_explore/archive/02_Explore_Redshift_Data.ipynb
@@ -30,13 +30,13 @@
"metadata": {},
"outputs": [],
"source": [
- "redshift_schema = 'redshift'\n",
- "redshift_cluster_identifier = 'dsoaws'\n",
- "redshift_host = 'dsoaws'\n",
- "redshift_database = 'dsoaws'\n",
- "redshift_port = '5439'\n",
- "redshift_table_2015 = 'amazon_reviews_tsv_2015'\n",
- "redshift_table_2014 = 'amazon_reviews_tsv_2014'"
+ "redshift_schema = \"redshift\"\n",
+ "redshift_cluster_identifier = \"dsoaws\"\n",
+ "redshift_host = \"dsoaws\"\n",
+ "redshift_database = \"dsoaws\"\n",
+ "redshift_port = \"5439\"\n",
+ "redshift_table_2015 = \"amazon_reviews_tsv_2015\"\n",
+ "redshift_table_2014 = \"amazon_reviews_tsv_2014\""
]
},
{
@@ -55,13 +55,13 @@
"import json\n",
"import boto3\n",
"\n",
- "secretsmanager = boto3.client('secretsmanager')\n",
+ "secretsmanager = boto3.client(\"secretsmanager\")\n",
"\n",
- "secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')\n",
- "cred = json.loads(secret['SecretString'])\n",
+ "secret = secretsmanager.get_secret_value(SecretId=\"dsoaws_redshift_login\")\n",
+ "cred = json.loads(secret[\"SecretString\"])\n",
"\n",
- "redshift_username = cred[0]['username']\n",
- "redshift_pw = cred[1]['password']"
+ "redshift_username = cred[0][\"username\"]\n",
+ "redshift_pw = cred[1][\"password\"]"
]
},
{
@@ -70,11 +70,11 @@
"metadata": {},
"outputs": [],
"source": [
- "redshift = boto3.client('redshift')\n",
+ "redshift = boto3.client(\"redshift\")\n",
"\n",
"response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)\n",
"\n",
- "redshift_endpoint_address = response['Clusters'][0]['Endpoint']['Address']\n",
+ "redshift_endpoint_address = response[\"Clusters\"][0][\"Endpoint\"][\"Address\"]\n",
"\n",
"print(redshift_endpoint_address)"
]
@@ -94,7 +94,11 @@
"source": [
"from sqlalchemy import create_engine\n",
"\n",
- "engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(redshift_username, redshift_pw, redshift_endpoint_address, redshift_port, redshift_database))"
+ "engine = create_engine(\n",
+ " \"postgresql://{}:{}@{}:{}/{}\".format(\n",
+ " redshift_username, redshift_pw, redshift_endpoint_address, redshift_port, redshift_database\n",
+ " )\n",
+ ")"
]
},
{
@@ -124,9 +128,14 @@
"outputs": [],
"source": [
"%%time\n",
- "df = pd.read_sql_query(\"\"\"SELECT approximate count(distinct customer_id)\n",
+ "df = pd.read_sql_query(\n",
+ " \"\"\"SELECT approximate count(distinct customer_id)\n",
" FROM {}.{}\n",
- " GROUP BY product_category\"\"\".format(redshift_schema, redshift_table_2015), engine)"
+ " GROUP BY product_category\"\"\".format(\n",
+ " redshift_schema, redshift_table_2015\n",
+ " ),\n",
+ " engine,\n",
+ ")"
]
},
{
@@ -136,9 +145,14 @@
"outputs": [],
"source": [
"%%time\n",
- "df = pd.read_sql_query(\"\"\"SELECT count(distinct customer_id)\n",
+ "df = pd.read_sql_query(\n",
+ " \"\"\"SELECT count(distinct customer_id)\n",
" FROM {}.{}\n",
- " GROUP BY product_category\"\"\".format(redshift_schema, redshift_table_2015), engine)"
+ " GROUP BY product_category\"\"\".format(\n",
+ " redshift_schema, redshift_table_2015\n",
+ " ),\n",
+ " engine,\n",
+ ")"
]
},
{
@@ -158,8 +172,9 @@
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
- "%config InlineBackend.figure_format='retina'\n"
+ "%config InlineBackend.figure_format='retina'"
]
},
{
@@ -174,7 +189,9 @@
"FROM {}.{}\n",
"GROUP BY product_category\n",
"ORDER BY count_star_rating DESC\n",
- "\"\"\".format(redshift_schema, redshift_table_2015)\n",
+ "\"\"\".format(\n",
+ " redshift_schema, redshift_table_2015\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -215,7 +232,7 @@
"outputs": [],
"source": [
"# Store max ratings\n",
- "max_ratings = df['count_star_rating'].max()\n",
+ "max_ratings = df[\"count_star_rating\"].max()\n",
"print(max_ratings)"
]
},
@@ -227,27 +244,30 @@
"source": [
"# Set size and style to use\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
- " \n",
- "plt.style.use('seaborn-whitegrid')\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
+ "\n",
+ "plt.style.use(\"seaborn-whitegrid\")\n",
"\n",
"# Create Seaborn barplot\n",
- "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
"\n",
"# Set title\n",
"plt.title(\"Number of Ratings per Product Category (Redshift)\")\n",
"\n",
"# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n",
"if max_ratings <= 8000:\n",
- " plt.xticks([10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000], ['10K', '20K', '30K', '40K', '50K', '60K','70K', '80K' ])\n",
+ " plt.xticks(\n",
+ " [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],\n",
+ " [\"10K\", \"20K\", \"30K\", \"40K\", \"50K\", \"60K\", \"70K\", \"80K\"],\n",
+ " )\n",
" plt.xlim(0, 80000)\n",
"elif max_ratings <= 200000:\n",
- " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n",
- " plt.xlim(0, 200000) \n",
+ " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n",
+ " plt.xlim(0, 200000)\n",
"elif max_ratings > 200000:\n",
- " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+ " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
" plt.xlim(0, 20000000)\n",
"\n",
"plt.xlabel(\"Number of Ratings\")\n",
@@ -275,8 +295,8 @@
"metadata": {},
"outputs": [],
"source": [
- "athena_schema = 'athena'\n",
- "athena_table_name = 'amazon_reviews_tsv'\n"
+ "athena_schema = \"athena\"\n",
+ "athena_table_name = \"amazon_reviews_tsv\""
]
},
{
@@ -290,7 +310,9 @@
"FROM {}.{}\n",
"GROUP BY product_category\n",
"ORDER BY count_star_rating DESC\n",
- "\"\"\".format(athena_schema, athena_table_name)\n",
+ "\"\"\".format(\n",
+ " athena_schema, athena_table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -313,14 +335,14 @@
"source": [
"# Set size and style to use\n",
"if num_categories > 10:\n",
- " plt.figure(figsize=(10,10))\n",
- "else: \n",
- " plt.figure(figsize=(10,5))\n",
+ " plt.figure(figsize=(10, 10))\n",
+ "else:\n",
+ " plt.figure(figsize=(10, 5))\n",
"\n",
- "plt.style.use('seaborn-whitegrid')\n",
+ "plt.style.use(\"seaborn-whitegrid\")\n",
"\n",
"# Create Seaborn barplot\n",
- "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
+ "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
"\n",
"# Set title\n",
"plt.title(\"Number of Ratings per Product Category (Athena via Redshift Spectrum)\")\n",
@@ -328,13 +350,16 @@
"# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n",
"# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n",
"if max_ratings <= 8000:\n",
- " plt.xticks([10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000], ['10K', '20K', '30K', '40K', '50K', '60K','70K', '80K' ])\n",
+ " plt.xticks(\n",
+ " [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],\n",
+ " [\"10K\", \"20K\", \"30K\", \"40K\", \"50K\", \"60K\", \"70K\", \"80K\"],\n",
+ " )\n",
" plt.xlim(0, 80000)\n",
"elif max_ratings <= 200000:\n",
- " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n",
- " plt.xlim(0, 200000) \n",
+ " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n",
+ " plt.xlim(0, 200000)\n",
"elif max_ratings > 200000:\n",
- " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+ " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
" plt.xlim(0, 20000000)\n",
"\n",
"plt.xlabel(\"Number of Ratings\")\n",
diff --git a/05_explore/preprocess-deequ-pyspark.py b/05_explore/preprocess-deequ-pyspark.py
index b345ad04..42eee609 100644
--- a/05_explore/preprocess-deequ-pyspark.py
+++ b/05_explore/preprocess-deequ-pyspark.py
@@ -7,8 +7,9 @@
import shutil
import csv
import subprocess
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-deps', 'pydeequ==0.1.5'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas==1.1.4'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "pydeequ==0.1.5"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas==1.1.4"])
import pyspark
from pyspark.sql import SparkSession
@@ -22,123 +23,106 @@
# PySpark Deequ GitHub Repo: https://github.com/awslabs/python-deequ
+
def main():
args_iter = iter(sys.argv[1:])
args = dict(zip(args_iter, args_iter))
# Retrieve the args and replace 's3://' with 's3a://' (used by Spark)
- s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
+ s3_input_data = args["s3_input_data"].replace("s3://", "s3a://")
print(s3_input_data)
- s3_output_analyze_data = args['s3_output_analyze_data'].replace('s3://', 's3a://')
+ s3_output_analyze_data = args["s3_output_analyze_data"].replace("s3://", "s3a://")
print(s3_output_analyze_data)
- spark = SparkSession \
- .builder \
- .appName("PySparkAmazonReviewsAnalyzer") \
- .getOrCreate()
-
- schema = StructType([
- StructField("marketplace", StringType(), True),
- StructField("customer_id", StringType(), True),
- StructField("review_id", StringType(), True),
- StructField("product_id", StringType(), True),
- StructField("product_parent", StringType(), True),
- StructField("product_title", StringType(), True),
- StructField("product_category", StringType(), True),
- StructField("star_rating", IntegerType(), True),
- StructField("helpful_votes", IntegerType(), True),
- StructField("total_votes", IntegerType(), True),
- StructField("vine", StringType(), True),
- StructField("verified_purchase", StringType(), True),
- StructField("review_headline", StringType(), True),
- StructField("review_body", StringType(), True),
- StructField("review_date", StringType(), True)
- ])
-
- dataset = spark.read.csv(s3_input_data,
- header=True,
- schema=schema,
- sep="\t",
- quote="")
+ spark = SparkSession.builder.appName("PySparkAmazonReviewsAnalyzer").getOrCreate()
+
+ schema = StructType(
+ [
+ StructField("marketplace", StringType(), True),
+ StructField("customer_id", StringType(), True),
+ StructField("review_id", StringType(), True),
+ StructField("product_id", StringType(), True),
+ StructField("product_parent", StringType(), True),
+ StructField("product_title", StringType(), True),
+ StructField("product_category", StringType(), True),
+ StructField("star_rating", IntegerType(), True),
+ StructField("helpful_votes", IntegerType(), True),
+ StructField("total_votes", IntegerType(), True),
+ StructField("vine", StringType(), True),
+ StructField("verified_purchase", StringType(), True),
+ StructField("review_headline", StringType(), True),
+ StructField("review_body", StringType(), True),
+ StructField("review_date", StringType(), True),
+ ]
+ )
+
+ dataset = spark.read.csv(s3_input_data, header=True, schema=schema, sep="\t", quote="")
# Calculate statistics on the dataset
- analysisResult = AnalysisRunner(spark) \
- .onData(dataset) \
- .addAnalyzer(Size()) \
- .addAnalyzer(Completeness("review_id")) \
- .addAnalyzer(ApproxCountDistinct("review_id")) \
- .addAnalyzer(Mean("star_rating")) \
- .addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0")) \
- .addAnalyzer(Correlation("total_votes", "star_rating")) \
- .addAnalyzer(Correlation("total_votes", "helpful_votes")) \
- .run()
+ analysisResult = (
+ AnalysisRunner(spark)
+ .onData(dataset)
+ .addAnalyzer(Size())
+ .addAnalyzer(Completeness("review_id"))
+ .addAnalyzer(ApproxCountDistinct("review_id"))
+ .addAnalyzer(Mean("star_rating"))
+ .addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0"))
+ .addAnalyzer(Correlation("total_votes", "star_rating"))
+ .addAnalyzer(Correlation("total_votes", "helpful_votes"))
+ .run()
+ )
metrics = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
metrics.show(truncate=False)
- metrics \
- .repartition(1) \
- .write.format('csv') \
- .mode('overwrite') \
- .option('header',True) \
- .option('sep','\t') \
- .save('{}/dataset-metrics'.format(s3_output_analyze_data))
+ metrics.repartition(1).write.format("csv").mode("overwrite").option("header", True).option("sep", "\t").save(
+ "{}/dataset-metrics".format(s3_output_analyze_data)
+ )
# Check data quality
- verificationResult = VerificationSuite(spark) \
- .onData(dataset) \
+ verificationResult = (
+ VerificationSuite(spark)
+ .onData(dataset)
.addCheck(
- Check(spark, CheckLevel.Error, "Review Check") \
- .hasSize(lambda x: x >= 200000) \
- .hasMin("star_rating", lambda x: x == 1.0) \
- .hasMax("star_rating", lambda x: x == 5.0) \
- .isComplete("review_id") \
- .isUnique("review_id") \
- .isComplete("marketplace") \
- .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"])) \
+ Check(spark, CheckLevel.Error, "Review Check")
+ .hasSize(lambda x: x >= 200000)
+ .hasMin("star_rating", lambda x: x == 1.0)
+ .hasMax("star_rating", lambda x: x == 5.0)
+ .isComplete("review_id")
+ .isUnique("review_id")
+ .isComplete("marketplace")
+ .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"])
+ )
.run()
+ )
print(f"Verification Run Status: {verificationResult.status}")
resultsDataFrame = VerificationResult.checkResultsAsDataFrame(spark, verificationResult)
- resultsDataFrame.show(truncate=False)
- resultsDataFrame \
- .repartition(1) \
- .write.format('csv') \
- .mode('overwrite') \
- .option('header', True) \
- .option('sep', '\t') \
- .save('{}/constraint-checks'.format(s3_output_analyze_data))
-
- verificationSuccessMetricsDataFrame = VerificationResult.successMetricsAsDataFrame(spark, verificationResult)
+ resultsDataFrame.show(truncate=False)
+ resultsDataFrame.repartition(1).write.format("csv").mode("overwrite").option("header", True).option(
+ "sep", "\t"
+ ).save("{}/constraint-checks".format(s3_output_analyze_data))
+
+ verificationSuccessMetricsDataFrame = VerificationResult.successMetricsAsDataFrame(spark, verificationResult)
verificationSuccessMetricsDataFrame.show(truncate=False)
- verificationSuccessMetricsDataFrame \
- .repartition(1) \
- .write.format('csv') \
- .mode('overwrite') \
- .option('header', True) \
- .option('sep', '\t') \
- .save('{}/success-metrics'.format(s3_output_analyze_data))
+ verificationSuccessMetricsDataFrame.repartition(1).write.format("csv").mode("overwrite").option(
+ "header", True
+ ).option("sep", "\t").save("{}/success-metrics".format(s3_output_analyze_data))
# Suggest new checks and constraints
- suggestionsResult = ConstraintSuggestionRunner(spark) \
- .onData(dataset) \
- .addConstraintRule(DEFAULT()) \
- .run()
+ suggestionsResult = ConstraintSuggestionRunner(spark).onData(dataset).addConstraintRule(DEFAULT()).run()
suggestions = suggestionsResult["constraint_suggestions"]
parallelizedSuggestions = spark.sparkContext.parallelize(suggestions)
-
+
suggestionsResultsDataFrame = spark.createDataFrame(parallelizedSuggestions)
suggestionsResultsDataFrame.show(truncate=False)
- suggestionsResultsDataFrame \
- .repartition(1) \
- .write.format('csv') \
- .mode('overwrite') \
- .option('header', True) \
- .option('sep', '\t') \
- .save('{}/constraint-suggestions'.format(s3_output_analyze_data))
-
+ suggestionsResultsDataFrame.repartition(1).write.format("csv").mode("overwrite").option("header", True).option(
+ "sep", "\t"
+ ).save("{}/constraint-suggestions".format(s3_output_analyze_data))
+
+
# spark.stop()
-
+
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
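The four `repartition(1)...save` chains in this script differ only in the DataFrame and the output suffix; a small helper (a sketch in the same style) would keep them readable:

    def save_tsv(df, path):
        """Write a Spark DataFrame as a single headered TSV under `path`."""
        df.repartition(1).write.format("csv").mode("overwrite").option("header", True).option("sep", "\t").save(path)


    save_tsv(metrics, "{}/dataset-metrics".format(s3_output_analyze_data))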
diff --git a/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb b/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb
index 8ebd63b1..0d64c8ed 100644
--- a/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb
+++ b/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb
@@ -26,13 +26,13 @@
"import sagemaker\n",
"import boto3\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)"
]
},
{
@@ -56,32 +56,25 @@
"import csv\n",
"from transformers import DistilBertTokenizer\n",
"\n",
- "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
+ "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
"\n",
- "REVIEW_BODY_COLUMN = 'review_body'\n",
- "REVIEW_ID_COLUMN = 'review_id'\n",
+ "REVIEW_BODY_COLUMN = \"review_body\"\n",
+ "REVIEW_ID_COLUMN = \"review_id\"\n",
"# DATE_COLUMN = 'date'\n",
"\n",
- "LABEL_COLUMN = 'star_rating'\n",
+ "LABEL_COLUMN = \"star_rating\"\n",
"LABEL_VALUES = [1, 2, 3, 4, 5]\n",
"\n",
"label_map = {}\n",
"for (i, label) in enumerate(LABEL_VALUES):\n",
" label_map[label] = i\n",
"\n",
- " \n",
+ "\n",
"class InputFeatures(object):\n",
- " \"\"\"BERT feature vectors.\"\"\"\n",
- "\n",
- " def __init__(self,\n",
- " input_ids,\n",
- " input_mask,\n",
- " segment_ids,\n",
- " label_id,\n",
- " review_id,\n",
- " date,\n",
- " label):\n",
- "# review_body):\n",
+ " \"\"\"BERT feature vectors.\"\"\"\n",
+ "\n",
+ " def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):\n",
+ " # review_body):\n",
" self.input_ids = input_ids\n",
" self.input_mask = input_mask\n",
" self.segment_ids = segment_ids\n",
@@ -89,48 +82,51 @@
" self.review_id = review_id\n",
" self.date = date\n",
" self.label = label\n",
+ "\n",
+ "\n",
"# self.review_body = review_body\n",
"\n",
- " \n",
+ "\n",
"class Input(object):\n",
- " \"\"\"A single training/test input for sequence classification.\"\"\"\n",
- "\n",
- " def __init__(self, text, review_id, date, label=None):\n",
- " \"\"\"Constructs an Input.\n",
- " Args:\n",
- " text: string. The untokenized text of the first sequence. For single\n",
- " sequence tasks, only this sequence must be specified.\n",
- " label: (Optional) string. The label of the example. This should be\n",
- " specified for train and dev examples, but not for test examples.\n",
- " \"\"\"\n",
- " self.text = text\n",
- " self.review_id = review_id\n",
- " self.date = date\n",
- " self.label = label\n",
- " \n",
+ " \"\"\"A single training/test input for sequence classification.\"\"\"\n",
+ "\n",
+ " def __init__(self, text, review_id, date, label=None):\n",
+ " \"\"\"Constructs an Input.\n",
+ " Args:\n",
+ " text: string. The untokenized text of the first sequence. For single\n",
+ " sequence tasks, only this sequence must be specified.\n",
+ " label: (Optional) string. The label of the example. This should be\n",
+ " specified for train and dev examples, but not for test examples.\n",
+ " \"\"\"\n",
+ " self.text = text\n",
+ " self.review_id = review_id\n",
+ " self.date = date\n",
+ " self.label = label\n",
+ "\n",
"\n",
"def convert_input(the_input, max_seq_length):\n",
" # First, we need to preprocess our data so that it matches the data BERT was trained on:\n",
" # 1. Lowercase our text (if we're using a BERT lowercase model)\n",
" # 2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
" # 3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
- " # \n",
+ " #\n",
" # Fortunately, the Transformers tokenizer does this for us!\n",
"\n",
" tokens = tokenizer.tokenize(the_input.text)\n",
- " print('**tokens**\\n{}\\n'.format(tokens))\n",
+ " print(\"**tokens**\\n{}\\n\".format(tokens))\n",
"\n",
- " encode_plus_tokens = tokenizer.encode_plus(the_input.text,\n",
- " pad_to_max_length=True,\n",
- " max_length=max_seq_length,\n",
- "# truncation=True\n",
- " )\n",
+ " encode_plus_tokens = tokenizer.encode_plus(\n",
+ " the_input.text,\n",
+ " pad_to_max_length=True,\n",
+ " max_length=max_seq_length,\n",
+ " # truncation=True\n",
+ " )\n",
"\n",
" # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n",
- " input_ids = encode_plus_tokens['input_ids']\n",
- " \n",
- " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. \n",
- " input_mask = encode_plus_tokens['attention_mask']\n",
+ " input_ids = encode_plus_tokens[\"input_ids\"]\n",
+ "\n",
+ " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.\n",
+ " input_mask = encode_plus_tokens[\"attention_mask\"]\n",
"\n",
" # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n",
" segment_ids = [0] * max_seq_length\n",
@@ -145,23 +141,24 @@
" label_id=label_id,\n",
" review_id=the_input.review_id,\n",
" date=the_input.date,\n",
- " label=the_input.label)\n",
- "# review_body=the_input.text)\n",
- "\n",
- " print('**input_ids**\\n{}\\n'.format(features.input_ids))\n",
- " print('**input_mask**\\n{}\\n'.format(features.input_mask))\n",
- " print('**segment_ids**\\n{}\\n'.format(features.segment_ids))\n",
- " print('**label_id**\\n{}\\n'.format(features.label_id))\n",
- " print('**review_id**\\n{}\\n'.format(features.review_id))\n",
- " print('**date**\\n{}\\n'.format(features.date))\n",
- " print('**label**\\n{}\\n'.format(features.label))\n",
- "# print('**review_body**\\n{}\\n'.format(features.review_body))\n",
+ " label=the_input.label,\n",
+ " )\n",
+ " # review_body=the_input.text)\n",
+ "\n",
+ " print(\"**input_ids**\\n{}\\n\".format(features.input_ids))\n",
+ " print(\"**input_mask**\\n{}\\n\".format(features.input_mask))\n",
+ " print(\"**segment_ids**\\n{}\\n\".format(features.segment_ids))\n",
+ " print(\"**label_id**\\n{}\\n\".format(features.label_id))\n",
+ " print(\"**review_id**\\n{}\\n\".format(features.review_id))\n",
+ " print(\"**date**\\n{}\\n\".format(features.date))\n",
+ " print(\"**label**\\n{}\\n\".format(features.label))\n",
+ " # print('**review_body**\\n{}\\n'.format(features.review_body))\n",
"\n",
" return features\n",
"\n",
"\n",
"# We'll need to transform our data into a format that BERT understands.\n",
- "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. \n",
+ "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe.\n",
"# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data\n",
"def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):\n",
" records = []\n",
@@ -169,33 +166,35 @@
"\n",
" for (input_idx, the_input) in enumerate(inputs):\n",
" if input_idx % 10000 == 0:\n",
- " print('Writing input {} of {}\\n'.format(input_idx, len(inputs)))\n",
+ " print(\"Writing input {} of {}\\n\".format(input_idx, len(inputs)))\n",
"\n",
" features = convert_input(the_input, max_seq_length)\n",
"\n",
" all_features = collections.OrderedDict()\n",
- " \n",
- " # Create TFRecord With input_ids, input_mask, segment_ids, and label_ids \n",
- " all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n",
- " all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n",
- " all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n",
- " all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n",
+ "\n",
+ " # Create TFRecord With input_ids, input_mask, segment_ids, and label_ids\n",
+ " all_features[\"input_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n",
+ " all_features[\"input_mask\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n",
+ " all_features[\"segment_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n",
+ " all_features[\"label_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n",
"\n",
" tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n",
" tf_record_writer.write(tf_record.SerializeToString())\n",
"\n",
" # Create Record For Feature Store With All Features\n",
- " records.append({#'tf_record': tf_record.SerializeToString(),\n",
- " 'input_ids': features.input_ids,\n",
- " 'input_mask': features.input_mask,\n",
- " 'segment_ids': features.segment_ids,\n",
- " 'label_id': features.label_id,\n",
- " 'review_id': the_input.review_id,\n",
- " 'date': the_input.date,\n",
- " 'label': features.label,\n",
- "# 'review_body': features.review_body\n",
- " })\n",
- " \n",
+ " records.append(\n",
+ " { #'tf_record': tf_record.SerializeToString(),\n",
+ " \"input_ids\": features.input_ids,\n",
+ " \"input_mask\": features.input_mask,\n",
+ " \"segment_ids\": features.segment_ids,\n",
+ " \"label_id\": features.label_id,\n",
+ " \"review_id\": the_input.review_id,\n",
+ " \"date\": the_input.date,\n",
+ " \"label\": features.label,\n",
+ " # 'review_body': features.review_body\n",
+ " }\n",
+ " )\n",
+ "\n",
" tf_record_writer.close()\n",
"\n",
" return records"
@@ -246,7 +245,7 @@
"from datetime import datetime\n",
"from time import strftime\n",
"\n",
- "#timestamp = datetime.now().replace(microsecond=0).isoformat()\n",
+ "# timestamp = datetime.now().replace(microsecond=0).isoformat()\n",
"timestamp = datetime.now().strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
"print(timestamp)"
]
@@ -262,21 +261,30 @@
"import pandas as pd\n",
"\n",
"data = [\n",
- " [5, 'ABCD12345', \"\"\"I needed an \"antivirus\" application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\"],\n",
- " [3, 'EFGH12345', \"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\"],\n",
- " [1, 'IJKL2345', \"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\"]\n",
- " ]\n",
- "\n",
- "df = pd.DataFrame(data, columns=['star_rating', 'review_id', 'review_body'])\n",
+ " [\n",
+ " 5,\n",
+ " \"ABCD12345\",\n",
+ " \"\"\"I needed an \"antivirus\" application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\",\n",
+ " ],\n",
+ " [\n",
+ " 3,\n",
+ " \"EFGH12345\",\n",
+ " \"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\",\n",
+ " ],\n",
+ " [\n",
+ " 1,\n",
+ " \"IJKL2345\",\n",
+ " \"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\",\n",
+ " ],\n",
+ "]\n",
+ "\n",
+ "df = pd.DataFrame(data, columns=[\"star_rating\", \"review_id\", \"review_body\"])\n",
"\n",
"# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
- "inputs = df.apply(lambda x: Input(\n",
- " label = x[LABEL_COLUMN],\n",
- " text = x[REVIEW_BODY_COLUMN],\n",
- " review_id = x[REVIEW_ID_COLUMN],\n",
- " date = timestamp\n",
- " ),\n",
- " axis = 1)"
+ "inputs = df.apply(\n",
+ " lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),\n",
+ " axis=1,\n",
+ ")"
]
},
{
@@ -307,7 +315,7 @@
"metadata": {},
"outputs": [],
"source": [
- "output_file='./data-tfrecord-featurestore/data.tfrecord'"
+ "output_file = \"./data-tfrecord-featurestore/data.tfrecord\""
]
},
{
@@ -334,7 +342,7 @@
"metadata": {},
"outputs": [],
"source": [
- "featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)"
+ "featurestore_runtime = boto3.Session().client(service_name=\"sagemaker-featurestore-runtime\", region_name=region)"
]
},
{
@@ -356,7 +364,7 @@
"source": [
"from time import gmtime, strftime, sleep\n",
"\n",
- "feature_group_name = 'reviews-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n",
+ "feature_group_name = \"reviews-feature-group-\" + strftime(\"%d-%H-%M-%S\", gmtime())\n",
"print(feature_group_name)"
]
},
@@ -371,16 +379,16 @@
" FeatureTypeEnum,\n",
")\n",
"\n",
- "feature_definitions= [\n",
- " FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),\n",
- " FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),\n",
- "# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) \n",
+ "feature_definitions = [\n",
+ " FeatureDefinition(feature_name=\"input_ids\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"input_mask\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"segment_ids\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"label_id\", feature_type=FeatureTypeEnum.INTEGRAL),\n",
+ " FeatureDefinition(feature_name=\"review_id\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"date\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"label\", feature_type=FeatureTypeEnum.INTEGRAL),\n",
+ " # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"split_type\", feature_type=FeatureTypeEnum.STRING),\n",
"]"
]
},
@@ -392,9 +400,7 @@
"source": [
"from sagemaker.feature_store.feature_group import FeatureGroup\n",
"\n",
- "feature_group = FeatureGroup(name=feature_group_name, \n",
- " feature_definitions=feature_definitions,\n",
- " sagemaker_session=sess)\n",
+ "feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)\n",
"print(feature_group)"
]
},
@@ -428,7 +434,7 @@
"metadata": {},
"outputs": [],
"source": [
- "prefix = 'reviews-feature-store-' + timestamp\n",
+ "prefix = \"reviews-feature-store-\" + timestamp\n",
"print(prefix)"
]
},
@@ -452,7 +458,7 @@
" record_identifier_name=record_identifier_feature_name,\n",
" event_time_feature_name=event_time_feature_name,\n",
" role_arn=role,\n",
- " enable_online_store=True\n",
+ " enable_online_store=True,\n",
")"
]
},
@@ -487,7 +493,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#sm.list_feature_groups()"
+ "# sm.list_feature_groups()"
]
},
{
@@ -507,6 +513,7 @@
"source": [
"import time\n",
"\n",
+ "\n",
"def wait_for_feature_group_creation_complete(feature_group):\n",
" status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
" while status == \"Creating\":\n",
@@ -515,7 +522,7 @@
" status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
" if status != \"Created\":\n",
" raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n",
- " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n"
+ " print(f\"FeatureGroup {feature_group.name} successfully created.\")"
]
},
{
@@ -524,7 +531,7 @@
"metadata": {},
"outputs": [],
"source": [
- "wait_for_feature_group_creation_complete(feature_group=feature_group)\n"
+ "wait_for_feature_group_creation_complete(feature_group=feature_group)"
]
},
{
@@ -568,8 +575,9 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"df_records = pd.DataFrame.from_dict(records)\n",
- "df_records['split_type']='train'\n",
+ "df_records[\"split_type\"] = \"train\"\n",
"df_records"
]
},
@@ -588,7 +596,7 @@
"source": [
"def cast_object_to_string(data_frame):\n",
" for label in data_frame.columns:\n",
- " if data_frame.dtypes[label] == 'object':\n",
+ " if data_frame.dtypes[label] == \"object\":\n",
" data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")"
]
},
@@ -616,9 +624,7 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_group.ingest(\n",
- " data_frame=df_records, max_workers=3, wait=True\n",
- ")"
+ "feature_group.ingest(data_frame=df_records, max_workers=3, wait=True)"
]
},
{
@@ -638,16 +644,15 @@
"source": [
"offline_store_contents = None\n",
"\n",
- "while (offline_store_contents is None):\n",
- " objects_in_bucket = s3.list_objects(Bucket=bucket,\n",
- " Prefix=prefix)\n",
- " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n",
- " offline_store_contents = objects_in_bucket['Contents']\n",
+ "while offline_store_contents is None:\n",
+ " objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=prefix)\n",
+ " if \"Contents\" in objects_in_bucket and len(objects_in_bucket[\"Contents\"]) > 1:\n",
+ " offline_store_contents = objects_in_bucket[\"Contents\"]\n",
" else:\n",
- " print('Waiting for data in offline store...\\n')\n",
+ " print(\"Waiting for data in offline store...\\n\")\n",
" sleep(60)\n",
- " \n",
- "print('Data available.')"
+ "\n",
+ "print(\"Data available.\")"
]
},
{
@@ -674,10 +679,11 @@
},
"outputs": [],
"source": [
- "record_identifier_value = 'IJKL2345'\n",
+ "record_identifier_value = \"IJKL2345\"\n",
"\n",
- "featurestore_runtime.get_record(FeatureGroupName=feature_group_name, \n",
- " RecordIdentifierValueAsString=record_identifier_value)"
+ "featurestore_runtime.get_record(\n",
+ " FeatureGroupName=feature_group_name, RecordIdentifierValueAsString=record_identifier_value\n",
+ ")"
]
},
{
@@ -751,9 +757,11 @@
"source": [
"query_string = \"\"\"\n",
"SELECT input_ids, input_mask, segment_ids, label_id, split_type FROM \"{}\" WHERE split_type='train' LIMIT 5\n",
- "\"\"\".format(feature_store_table)\n",
+ "\"\"\".format(\n",
+ " feature_store_table\n",
+ ")\n",
"\n",
- "print('Running ' + query_string)"
+ "print(\"Running \" + query_string)"
]
},
{
@@ -770,7 +778,7 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_store_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+prefix+'/query_results/')\n",
+ "feature_store_query.run(query_string=query_string, output_location=\"s3://\" + bucket + \"/\" + prefix + \"/query_results/\")\n",
"\n",
"feature_store_query.wait()"
]
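After `feature_store_query.wait()` returns, the Athena results can be pulled straight into pandas via the query object (a sketch, assuming `feature_store_query` came from `feature_group.athena_query()` as in the cells above):

    df_feature_store = feature_store_query.as_dataframe()
    df_feature_store.head()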
diff --git a/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb b/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb
index 03efc6c9..b77a0a84 100644
--- a/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb
+++ b/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb
@@ -72,8 +72,8 @@
"bucket = sess.default_bucket()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)"
]
},
{
@@ -103,9 +103,9 @@
"try:\n",
" s3_public_path_tsv\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the INGEST section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the INGEST section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -139,9 +139,9 @@
"try:\n",
" s3_private_path_tsv\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the INGEST section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the INGEST section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -181,7 +181,7 @@
},
"outputs": [],
"source": [
- "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n",
+ "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n",
"print(raw_input_data_s3_uri)"
]
},
@@ -278,12 +278,13 @@
"timestamp = int(time.time())\n",
"\n",
"experiment = Experiment.create(\n",
- " experiment_name='Amazon-Customer-Reviews-BERT-Experiment-{}'.format(timestamp),\n",
- " description='Amazon Customer Reviews BERT Experiment', \n",
- " sagemaker_boto_client=sm)\n",
+ " experiment_name=\"Amazon-Customer-Reviews-BERT-Experiment-{}\".format(timestamp),\n",
+ " description=\"Amazon Customer Reviews BERT Experiment\",\n",
+ " sagemaker_boto_client=sm,\n",
+ ")\n",
"\n",
"experiment_name = experiment.experiment_name\n",
- "print('Experiment name: {}'.format(experiment_name))"
+ "print(\"Experiment name: {}\".format(experiment_name))"
]
},
{
@@ -304,12 +305,12 @@
"\n",
"timestamp = int(time.time())\n",
"\n",
- "trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n",
- " experiment_name=experiment_name,\n",
- " sagemaker_boto_client=sm)\n",
+ "trial = Trial.create(\n",
+ " trial_name=\"trial-{}\".format(timestamp), experiment_name=experiment_name, sagemaker_boto_client=sm\n",
+ ")\n",
"\n",
"trial_name = trial.trial_name\n",
- "print('Trial name: {}'.format(trial_name))"
+ "print(\"Trial name: {}\".format(trial_name))"
]
},
{
@@ -326,9 +327,9 @@
"outputs": [],
"source": [
"experiment_config = {\n",
- " 'ExperimentName': experiment_name,\n",
- " 'TrialName': trial_name,\n",
- " 'TrialComponentDisplayName': 'prepare'\n",
+ " \"ExperimentName\": experiment_name,\n",
+ " \"TrialName\": trial_name,\n",
+ " \"TrialComponentDisplayName\": \"prepare\",\n",
"}"
]
},
@@ -381,7 +382,7 @@
"metadata": {},
"outputs": [],
"source": [
- "featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)"
+ "featurestore_runtime = boto3.Session().client(service_name=\"sagemaker-featurestore-runtime\", region_name=region)"
]
},
{
@@ -392,7 +393,7 @@
"source": [
"timestamp = int(time.time())\n",
"\n",
- "feature_store_offline_prefix = 'reviews-feature-store-' + str(timestamp)\n",
+ "feature_store_offline_prefix = \"reviews-feature-store-\" + str(timestamp)\n",
"\n",
"print(feature_store_offline_prefix)"
]
@@ -403,7 +404,7 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_group_name = 'reviews-feature-group-' + str(timestamp)\n",
+ "feature_group_name = \"reviews-feature-group-\" + str(timestamp)\n",
"\n",
"print(feature_group_name)"
]
@@ -419,15 +420,15 @@
" FeatureTypeEnum,\n",
")\n",
"\n",
- "feature_definitions= [\n",
- " FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),\n",
- " FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),\n",
- " FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),\n",
- "# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING)\n",
+ "feature_definitions = [\n",
+ " FeatureDefinition(feature_name=\"input_ids\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"input_mask\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"segment_ids\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"label_id\", feature_type=FeatureTypeEnum.INTEGRAL),\n",
+ " FeatureDefinition(feature_name=\"review_id\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"date\", feature_type=FeatureTypeEnum.STRING),\n",
+ " FeatureDefinition(feature_name=\"label\", feature_type=FeatureTypeEnum.INTEGRAL),\n",
+ " # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING)\n",
"]"
]
},
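The token-vector features above are typed STRING rather than INTEGRAL because Feature Store only supports String, Integral, and Fractional types (the Data Wrangler notebook later in this patch makes the same point), so each fixed-length integer list is serialized to a string before ingestion. A minimal sketch of what one ingested record looks like, using the real boto3 put_record API but purely illustrative values:

    # Sketch: token vectors are ingested as stringified lists.
    input_ids = [101, 2023, 3319, 102] + [0] * 60  # padded to max_seq_length = 64
    record = [
        {"FeatureName": "input_ids", "ValueAsString": str(input_ids)},
        {"FeatureName": "review_id", "ValueAsString": "R123EXAMPLE"},
        {"FeatureName": "date", "ValueAsString": "2021-02-11T00:00:00Z"},
        {"FeatureName": "label_id", "ValueAsString": "4"},
    ]
    featurestore_runtime.put_record(FeatureGroupName=feature_group_name, Record=record)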
@@ -439,10 +440,7 @@
"source": [
"from sagemaker.feature_store.feature_group import FeatureGroup\n",
"\n",
- "feature_group = FeatureGroup(\n",
- " name=feature_group_name, \n",
- " feature_definitions=feature_definitions,\n",
- " sagemaker_session=sess)\n",
+ "feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)\n",
"\n",
"print(feature_group)"
]
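Instantiating FeatureGroup is purely client-side; the server-side create happens inside the processing script shipped with this patch (preprocess-scikit-text-to-bert-feature-store.py, shown further down). Roughly, following the names used there:

    # Sketch mirroring create_or_load_feature_group() in the preprocessing script:
    feature_group.create(
        s3_uri=f"s3://{bucket}/{feature_store_offline_prefix}",
        record_identifier_name="review_id",
        event_time_feature_name="date",
        role_arn=role,
        enable_online_store=True,
    )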
@@ -462,13 +460,13 @@
},
"outputs": [],
"source": [
- "processing_instance_type='ml.c5.2xlarge'\n",
- "processing_instance_count=2\n",
- "train_split_percentage=0.90\n",
- "validation_split_percentage=0.05\n",
- "test_split_percentage=0.05\n",
- "balance_dataset=True\n",
- "max_seq_length=64"
+ "processing_instance_type = \"ml.c5.2xlarge\"\n",
+ "processing_instance_count = 2\n",
+ "train_split_percentage = 0.90\n",
+ "validation_split_percentage = 0.05\n",
+ "test_split_percentage = 0.05\n",
+ "balance_dataset = True\n",
+ "max_seq_length = 64"
]
},
{
@@ -512,12 +510,14 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " env={'AWS_DEFAULT_REGION': region},\n",
- " max_runtime_in_seconds=7200)"
+ "processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " env={\"AWS_DEFAULT_REGION\": region},\n",
+ " max_runtime_in_seconds=7200,\n",
+ ")"
]
},
{
@@ -528,35 +528,49 @@
"source": [
"from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
"\n",
- "processor.run(code='preprocess-scikit-text-to-bert-feature-store.py',\n",
- " inputs=[\n",
- " ProcessingInput(input_name='raw-input-data',\n",
- " source=raw_input_data_s3_uri,\n",
- " destination='/opt/ml/processing/input/data/',\n",
- " s3_data_distribution_type='ShardedByS3Key')\n",
- " ],\n",
- " outputs=[\n",
- " ProcessingOutput(output_name='bert-train',\n",
- " s3_upload_mode='EndOfJob', \n",
- " source='/opt/ml/processing/output/bert/train'),\n",
- " ProcessingOutput(output_name='bert-validation',\n",
- " s3_upload_mode='EndOfJob', \n",
- " source='/opt/ml/processing/output/bert/validation'),\n",
- " ProcessingOutput(output_name='bert-test',\n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/bert/test'),\n",
- " ],\n",
- " arguments=['--train-split-percentage', str(train_split_percentage),\n",
- " '--validation-split-percentage', str(validation_split_percentage),\n",
- " '--test-split-percentage', str(test_split_percentage),\n",
- " '--max-seq-length', str(max_seq_length),\n",
- " '--balance-dataset', str(balance_dataset),\n",
- " '--feature-store-offline-prefix', str(feature_store_offline_prefix),\n",
- " '--feature-group-name', str(feature_group_name)\n",
- " ],\n",
- " experiment_config=experiment_config,\n",
- " logs=True,\n",
- " wait=False)"
+ "processor.run(\n",
+ " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n",
+ " inputs=[\n",
+ " ProcessingInput(\n",
+ " input_name=\"raw-input-data\",\n",
+ " source=raw_input_data_s3_uri,\n",
+ " destination=\"/opt/ml/processing/input/data/\",\n",
+ " s3_data_distribution_type=\"ShardedByS3Key\",\n",
+ " )\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-train\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/bert/train\"\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-validation\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/validation\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-test\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/bert/test\"\n",
+ " ),\n",
+ " ],\n",
+ " arguments=[\n",
+ " \"--train-split-percentage\",\n",
+ " str(train_split_percentage),\n",
+ " \"--validation-split-percentage\",\n",
+ " str(validation_split_percentage),\n",
+ " \"--test-split-percentage\",\n",
+ " str(test_split_percentage),\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length),\n",
+ " \"--balance-dataset\",\n",
+ " str(balance_dataset),\n",
+ " \"--feature-store-offline-prefix\",\n",
+ " str(feature_store_offline_prefix),\n",
+ " \"--feature-group-name\",\n",
+ " str(feature_group_name),\n",
+ " ],\n",
+ " experiment_config=experiment_config,\n",
+ " logs=True,\n",
+ " wait=False,\n",
+ ")"
]
},
{
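The job is launched with wait=False, so the cell returns immediately and is re-attached below via from_processing_name. The flat arguments list is parsed back out by the script's argument parser; a hedged sketch of the matching argparse side (flag names come from the list above, types and defaults are assumptions):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-split-percentage", type=float, default=0.90)
    parser.add_argument("--validation-split-percentage", type=float, default=0.05)
    parser.add_argument("--test-split-percentage", type=float, default=0.05)
    parser.add_argument("--max-seq-length", type=int, default=64)
    parser.add_argument("--balance-dataset", type=eval, default=True)
    parser.add_argument("--feature-store-offline-prefix", type=str)
    parser.add_argument("--feature-group-name", type=str)
    args = parser.parse_args()  # exposed as args.train_split_percentage, etc.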
@@ -567,7 +581,7 @@
},
"outputs": [],
"source": [
- "scikit_processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n",
+ "scikit_processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n",
"print(scikit_processing_job_name)"
]
},
@@ -581,7 +595,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Processing Job'.format(region, scikit_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Job'.format(\n",
+ " region, scikit_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -594,7 +614,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, scikit_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, scikit_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -607,7 +633,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, scikit_processing_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n",
+ " bucket, scikit_processing_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -625,8 +657,9 @@
},
"outputs": [],
"source": [
- "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=scikit_processing_job_name,\n",
- " sagemaker_session=sess)\n",
+ "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n",
+ " processing_job_name=scikit_processing_job_name, sagemaker_session=sess\n",
+ ")\n",
"\n",
"processing_job_description = running_processor.describe()\n",
"\n",
@@ -668,15 +701,15 @@
"source": [
"processing_job_description = running_processor.describe()\n",
"\n",
- "output_config = processing_job_description['ProcessingOutputConfig']\n",
- "for output in output_config['Outputs']:\n",
- " if output['OutputName'] == 'bert-train':\n",
- " processed_train_data_s3_uri = output['S3Output']['S3Uri']\n",
- " if output['OutputName'] == 'bert-validation':\n",
- " processed_validation_data_s3_uri = output['S3Output']['S3Uri']\n",
- " if output['OutputName'] == 'bert-test':\n",
- " processed_test_data_s3_uri = output['S3Output']['S3Uri']\n",
- " \n",
+ "output_config = processing_job_description[\"ProcessingOutputConfig\"]\n",
+ "for output in output_config[\"Outputs\"]:\n",
+ " if output[\"OutputName\"] == \"bert-train\":\n",
+ " processed_train_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n",
+ " if output[\"OutputName\"] == \"bert-validation\":\n",
+ " processed_validation_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n",
+ " if output[\"OutputName\"] == \"bert-test\":\n",
+ " processed_test_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n",
+ "\n",
"print(processed_train_data_s3_uri)\n",
"print(processed_validation_data_s3_uri)\n",
"print(processed_test_data_s3_uri)"
@@ -879,9 +912,11 @@
"source": [
"query_string = \"\"\"\n",
"SELECT input_ids, input_mask, segment_ids, label_id, split_type FROM \"{}\" WHERE split_type='train' LIMIT 5\n",
- "\"\"\".format(feature_store_table)\n",
+ "\"\"\".format(\n",
+ " feature_store_table\n",
+ ")\n",
"\n",
- "print('Running ' + query_string)"
+ "print(\"Running \" + query_string)"
]
},
{
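feature_store_query and feature_store_table come from the SDK's Athena helper, created earlier in the notebook; roughly:

    # Sketch, assuming the standard Feature Store Athena helper:
    feature_store_query = feature_group.athena_query()
    feature_store_table = feature_store_query.table_name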
@@ -890,7 +925,10 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_store_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+feature_store_offline_prefix+'/query_results/')\n",
+ "feature_store_query.run(\n",
+ " query_string=query_string,\n",
+ " output_location=\"s3://\" + bucket + \"/\" + feature_store_offline_prefix + \"/query_results/\",\n",
+ ")\n",
"\n",
"feature_store_query.wait()"
]
@@ -901,10 +939,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#import pandas as pd\n",
- "#dataset = pd.DataFrame()\n",
- "#dataset = feature_store_query.as_dataframe()\n",
- "#dataset\n",
+ "# import pandas as pd\n",
+ "# dataset = pd.DataFrame()\n",
+ "# dataset = feature_store_query.as_dataframe()\n",
+ "# dataset\n",
"\n",
"feature_store_query.as_dataframe()"
]
@@ -925,14 +963,12 @@
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
- "#pd.set_option(\"max_rows\", 100)\n",
+ "# pd.set_option(\"max_rows\", 100)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
- " sagemaker_session=sess,\n",
- " experiment_name=experiment_name,\n",
- " sort_by=\"CreationTime\",\n",
- " sort_order=\"Descending\"\n",
+ " sagemaker_session=sess, experiment_name=experiment_name, sort_by=\"CreationTime\", sort_order=\"Descending\"\n",
")\n",
"\n",
"experiment_analytics_df = experiment_analytics.dataframe()\n",
@@ -945,7 +981,7 @@
"metadata": {},
"outputs": [],
"source": [
- "trial_component_name=experiment_analytics_df.TrialComponentName[0]\n",
+ "trial_component_name = experiment_analytics_df.TrialComponentName[0]\n",
"print(trial_component_name)"
]
},
@@ -955,7 +991,7 @@
"metadata": {},
"outputs": [],
"source": [
- "trial_component_description=sm.describe_trial_component(TrialComponentName=trial_component_name)\n",
+ "trial_component_description = sm.describe_trial_component(TrialComponentName=trial_component_name)\n",
"trial_component_description"
]
},
diff --git a/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb b/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb
index 33e1f4b5..13a63207 100644
--- a/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb
+++ b/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb
@@ -29,10 +29,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.17.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
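The importlib.reload matters here: pip-installing a different sagemaker release inside a running kernel does not change the module object that is already imported, so the reload re-executes the package against the newly installed files. Presumably the saved original_version lets the notebook restore the kernel's release afterwards, along the lines of:

    # Sketch (an assumption about the notebook's final cell, not shown in this hunk):
    subprocess.check_call([sys.executable, "-m", "pip", "install", f"sagemaker=={original_version}"])
    importlib.reload(sagemaker)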
@@ -159,6 +158,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -170,6 +170,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -187,6 +188,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -202,6 +204,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -218,29 +221,24 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
" return processing_inputs\n",
"\n",
+ "\n",
"def create_container_arguments(output_name, output_content_type):\n",
- " output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- " }\n",
+ " output_config = {output_name: {\"content_type\": output_content_type}}\n",
" return [f\"--output-config '{json.dumps(output_config)}'\"]\n",
"\n",
+ "\n",
"# Create Processing Job Arguments\n",
"processing_job_arguments = {\n",
" \"AppSpecification\": {\n",
@@ -256,7 +254,7 @@
" \"S3Uri\": output_path,\n",
" \"LocalPath\": os.path.join(processing_dir, \"output\"),\n",
" \"S3UploadMode\": \"EndOfJob\",\n",
- " }\n",
+ " },\n",
" },\n",
" ],\n",
" },\n",
@@ -357,14 +355,11 @@
"region = boto3.Session().region_name\n",
"container = sagemaker.image_uris.retrieve(\"xgboost\", region, \"1.2-1\")\n",
"hyperparameters = {\n",
- " \"max_depth\":\"5\",\n",
+ " \"max_depth\": \"5\",\n",
" \"objective\": \"reg:squarederror\",\n",
" \"num_round\": \"10\",\n",
"}\n",
- "train_content_type = (\n",
- " \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\"\n",
- " else \"text/csv\"\n",
- ")\n",
+ "train_content_type = \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\" else \"text/csv\"\n",
"train_input = sagemaker.inputs.TrainingInput(\n",
" s3_data=f\"s3://{bucket}/{training_path}\",\n",
" content_type=train_content_type,\n",
diff --git a/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb b/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb
index 101c34b6..ce009003 100644
--- a/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb
+++ b/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb
@@ -50,10 +50,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.17.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
@@ -165,8 +164,8 @@
}
],
"source": [
- "feature_group_name = f'FG-{flow_name}'\n",
- "print(f\"Feature Group Name: {feature_group_name}\")\n"
+ "feature_group_name = f\"FG-{flow_name}\"\n",
+ "print(f\"Feature Group Name: {feature_group_name}\")"
]
},
{
@@ -185,15 +184,12 @@
"metadata": {},
"outputs": [],
"source": [
- "datawrangler_FG_type_mapping = {\n",
- " 'float': 'Fractional',\n",
- " 'long': 'Integral'\n",
- "}\n",
+ "datawrangler_FG_type_mapping = {\"float\": \"Fractional\", \"long\": \"Integral\"}\n",
"\n",
"# Some schema types in Data Wrangler are not supported by Feature Store.\n",
"# Feature store supports String, Integral, and Fractional types.\n",
"# The following will create a default_FG_type set to String for these types.\n",
- "default_FG_type = \"String\"\n"
+ "default_FG_type = \"String\""
]
},
{
@@ -211,71 +207,23 @@
"outputs": [],
"source": [
"column_schema = [\n",
- " {\n",
- " \"name\": \"marketplace\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"customer_id\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_id\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_id\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_parent\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_title\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"product_category\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"vine\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"verified_purchase\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_headline\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_body\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"review_date\",\n",
- " \"type\": \"date\"\n",
- " },\n",
- " {\n",
- " \"name\": \"star_rating\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"helpful_votes\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"total_votes\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"star_rating_scaled\",\n",
- " \"type\": \"float\"\n",
- " }\n",
- "]\n"
+ " {\"name\": \"marketplace\", \"type\": \"string\"},\n",
+ " {\"name\": \"customer_id\", \"type\": \"long\"},\n",
+ " {\"name\": \"review_id\", \"type\": \"string\"},\n",
+ " {\"name\": \"product_id\", \"type\": \"string\"},\n",
+ " {\"name\": \"product_parent\", \"type\": \"long\"},\n",
+ " {\"name\": \"product_title\", \"type\": \"string\"},\n",
+ " {\"name\": \"product_category\", \"type\": \"string\"},\n",
+ " {\"name\": \"vine\", \"type\": \"string\"},\n",
+ " {\"name\": \"verified_purchase\", \"type\": \"string\"},\n",
+ " {\"name\": \"review_headline\", \"type\": \"string\"},\n",
+ " {\"name\": \"review_body\", \"type\": \"string\"},\n",
+ " {\"name\": \"review_date\", \"type\": \"date\"},\n",
+ " {\"name\": \"star_rating\", \"type\": \"long\"},\n",
+ " {\"name\": \"helpful_votes\", \"type\": \"long\"},\n",
+ " {\"name\": \"total_votes\", \"type\": \"long\"},\n",
+ " {\"name\": \"star_rating_scaled\", \"type\": \"float\"},\n",
+ "]"
]
},
{
@@ -305,25 +253,20 @@
}
],
"source": [
- "record_identifier_name = 'review_id'\n",
+ "record_identifier_name = \"review_id\"\n",
"if record_identifier_name is None:\n",
- " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n",
+ " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n",
"\n",
- "event_time_feature_name = 'review_date'\n",
+ "event_time_feature_name = \"review_date\"\n",
"if event_time_feature_name is None:\n",
- " raise RuntimeError(\"Select a column name as the event time feature name.\")\n",
+ " raise RuntimeError(\"Select a column name as the event time feature name.\")\n",
"\n",
"# Below you map the schema detected from Data Wrangler to Feature Group Types.\n",
"feature_definitions = [\n",
- " {\n",
- " \"FeatureName\": schema['name'],\n",
- " \"FeatureType\": datawrangler_FG_type_mapping.get(\n",
- " schema['type'],\n",
- " default_FG_type\n",
- " )\n",
- " } for schema in column_schema\n",
+ " {\"FeatureName\": schema[\"name\"], \"FeatureType\": datawrangler_FG_type_mapping.get(schema[\"type\"], default_FG_type)}\n",
+ " for schema in column_schema\n",
"]\n",
- "print(feature_definitions)\n"
+ "print(feature_definitions)"
]
},
{
@@ -358,38 +301,33 @@
"sagemaker_client = boto3.client(\"sagemaker\", endpoint_url=sagemaker_endpoint_url)\n",
"\n",
"# Online Store Configuration\n",
- "online_store_config = {\n",
- " \"EnableOnlineStore\": True\n",
- "}\n",
+ "online_store_config = {\"EnableOnlineStore\": True}\n",
"\n",
"# Offline Store Configuration\n",
- "s3_uri = 's3://' + bucket # this is the default bucket defined in previous cells\n",
- "offline_store_config = {\n",
- " \"S3StorageConfig\": {\n",
- " \"S3Uri\": s3_uri\n",
- " }\n",
- "}\n",
+ "s3_uri = \"s3://\" + bucket # this is the default bucket defined in previous cells\n",
+ "offline_store_config = {\"S3StorageConfig\": {\"S3Uri\": s3_uri}}\n",
"\n",
"# Create Feature Group\n",
"create_fg_response = sagemaker_client.create_feature_group(\n",
- " FeatureGroupName = feature_group_name,\n",
- " EventTimeFeatureName = event_time_feature_name,\n",
- " RecordIdentifierFeatureName = record_identifier_name,\n",
- " FeatureDefinitions = feature_definitions,\n",
- " OnlineStoreConfig = online_store_config,\n",
- " OfflineStoreConfig = offline_store_config,\n",
- " RoleArn = iam_role)\n",
+ " FeatureGroupName=feature_group_name,\n",
+ " EventTimeFeatureName=event_time_feature_name,\n",
+ " RecordIdentifierFeatureName=record_identifier_name,\n",
+ " FeatureDefinitions=feature_definitions,\n",
+ " OnlineStoreConfig=online_store_config,\n",
+ " OfflineStoreConfig=offline_store_config,\n",
+ " RoleArn=iam_role,\n",
+ ")\n",
"\n",
"# Describe Feature Group\n",
"status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n",
- "while status['FeatureGroupStatus'] != 'Created':\n",
- " if status['FeatureGroupStatus'] == 'CreateFailed':\n",
+ "while status[\"FeatureGroupStatus\"] != \"Created\":\n",
+ " if status[\"FeatureGroupStatus\"] == \"CreateFailed\":\n",
" raise RuntimeError(f\"Feature Group Creation Failed: {status}\")\n",
" status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n",
- " print(\"Feature Group Status: \" + status['FeatureGroupStatus'])\n",
+ " print(\"Feature Group Status: \" + status[\"FeatureGroupStatus\"])\n",
" time.sleep(3)\n",
"\n",
- "print(status)\n"
+ "print(status)"
]
},
{
@@ -417,6 +355,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -428,6 +367,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -445,6 +385,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -460,6 +401,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -476,16 +418,13 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
@@ -533,48 +472,40 @@
],
"source": [
"# Processing job name\n",
- "print(f'Processing Job Name: {processing_job_name}')\n",
- "\n",
- "processingResources = {\n",
- " 'ClusterConfig': {\n",
- " 'InstanceCount': 1,\n",
- " 'InstanceType': 'ml.m5.4xlarge',\n",
- " 'VolumeSizeInGB': 30\n",
- " }\n",
- " }\n",
+ "print(f\"Processing Job Name: {processing_job_name}\")\n",
"\n",
- "appSpecification = {'ImageUri': container_uri}\n",
+ "processingResources = {\"ClusterConfig\": {\"InstanceCount\": 1, \"InstanceType\": \"ml.m5.4xlarge\", \"VolumeSizeInGB\": 30}}\n",
+ "\n",
+ "appSpecification = {\"ImageUri\": container_uri}\n",
"\n",
"sagemaker_client.create_processing_job(\n",
- " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n",
- " ProcessingOutputConfig={\n",
- " 'Outputs': [\n",
- " {\n",
- " 'OutputName': 'e880c72f-910c-4554-9a28-a66ce9d3b35f.default',\n",
- " 'FeatureStoreOutput': {\n",
- " 'FeatureGroupName': feature_group_name\n",
- " },\n",
- " 'AppManaged': True\n",
- " }\n",
- " ],\n",
- " },\n",
- " ProcessingJobName=processing_job_name,\n",
- " ProcessingResources=processingResources,\n",
- " AppSpecification=appSpecification,\n",
- " RoleArn=iam_role\n",
- " )\n",
+ " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n",
+ " ProcessingOutputConfig={\n",
+ " \"Outputs\": [\n",
+ " {\n",
+ " \"OutputName\": \"e880c72f-910c-4554-9a28-a66ce9d3b35f.default\",\n",
+ " \"FeatureStoreOutput\": {\"FeatureGroupName\": feature_group_name},\n",
+ " \"AppManaged\": True,\n",
+ " }\n",
+ " ],\n",
+ " },\n",
+ " ProcessingJobName=processing_job_name,\n",
+ " ProcessingResources=processingResources,\n",
+ " AppSpecification=appSpecification,\n",
+ " RoleArn=iam_role,\n",
+ ")\n",
"\n",
"\n",
"status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n",
"\n",
- "while status['ProcessingJobStatus'] in ('InProgress', 'Failed'):\n",
- " if status['ProcessingJobStatus'] == 'Failed':\n",
+ "while status[\"ProcessingJobStatus\"] in (\"InProgress\", \"Failed\"):\n",
+ " if status[\"ProcessingJobStatus\"] == \"Failed\":\n",
" raise RuntimeError(f\"Processing Job failed: {status}\")\n",
" status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n",
- " print(status['ProcessingJobStatus'])\n",
+ " print(status[\"ProcessingJobStatus\"])\n",
" time.sleep(60)\n",
"\n",
- "print(status)\n"
+ "print(status)"
]
},
{
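The hard-coded OutputName above is not arbitrary: it is the Data Wrangler flow's node id for the scaled-ratings operator, the same id that appears as op_3_output in the glossary at the end of data_wrangler_antje.py below. That is how this processing job's Feature Store output is wired to a specific node in the flow.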
diff --git a/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb b/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb
index 9d5b8a76..d79a7219 100644
--- a/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb
+++ b/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb
@@ -46,10 +46,9 @@
"\n",
"original_version = sagemaker.__version__\n",
"if sagemaker.__version__ != \"2.17.0\":\n",
- " subprocess.check_call(\n",
- " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n",
- " )\n",
+ " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n",
" import importlib\n",
+ "\n",
" importlib.reload(sagemaker)"
]
},
@@ -184,6 +183,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_s3_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -195,6 +195,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_redshift_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -212,6 +213,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_athena_processing_input(base_dir, name, dataset_definition):\n",
" return {\n",
" \"InputName\": name,\n",
@@ -227,6 +229,7 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def create_processing_inputs(processing_dir, flow, flow_uri):\n",
" \"\"\"Helper function for creating processing inputs\n",
" :param flow: loaded data wrangler flow notebook\n",
@@ -243,29 +246,24 @@
" source_type = data_def[\"datasetSourceType\"]\n",
"\n",
" if source_type == \"S3\":\n",
- " s3_processing_input = create_s3_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(s3_processing_input)\n",
" elif source_type == \"Athena\":\n",
- " athena_processing_input = create_athena_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(athena_processing_input)\n",
" elif source_type == \"Redshift\":\n",
- " redshift_processing_input = create_redshift_processing_input(\n",
- " processing_dir, name, data_def)\n",
+ " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n",
" processing_inputs.append(redshift_processing_input)\n",
" else:\n",
" raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n",
" return processing_inputs\n",
"\n",
+ "\n",
"def create_container_arguments(output_name, output_content_type):\n",
- " output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- " }\n",
+ " output_config = {output_name: {\"content_type\": output_content_type}}\n",
" return [f\"--output-config '{json.dumps(output_config)}'\"]\n",
"\n",
+ "\n",
"# Create Processing Job Arguments\n",
"processing_job_arguments = {\n",
" \"AppSpecification\": {\n",
@@ -281,7 +279,7 @@
" \"S3Uri\": output_path,\n",
" \"LocalPath\": os.path.join(processing_dir, \"output\"),\n",
" \"S3UploadMode\": \"EndOfJob\",\n",
- " }\n",
+ " },\n",
" },\n",
" ],\n",
" },\n",
@@ -315,8 +313,8 @@
"source": [
"from sagemaker.workflow.steps import ProcessingStep, Step, StepTypeEnum\n",
"\n",
- "class NaiveStep(Step):\n",
"\n",
+ "class NaiveStep(Step):\n",
" def __init__(self, name, step_type: StepTypeEnum, step_args):\n",
" self.name = name\n",
" self.step_type = step_type\n",
@@ -329,18 +327,12 @@
" raise NotImplementedError()\n",
"\n",
" def to_request(self):\n",
- " return {\n",
- " 'Name': self.name,\n",
- " 'Type': self.step_type.value,\n",
- " 'Arguments': self.step_args\n",
- " }\n",
+ " return {\"Name\": self.name, \"Type\": self.step_type.value, \"Arguments\": self.step_args}\n",
"\n",
"\n",
"step_process = NaiveStep(\n",
- " name=\"DataWranglerProcessingStep\",\n",
- " step_type=StepTypeEnum.PROCESSING,\n",
- " step_args=processing_job_arguments\n",
- ")\n"
+ " name=\"DataWranglerProcessingStep\", step_type=StepTypeEnum.PROCESSING, step_args=processing_job_arguments\n",
+ ")"
]
},
{
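NaiveStep deliberately bypasses the SDK's typed step interfaces: to_request injects the raw processing_job_arguments dict straight into the pipeline definition, while arguments and properties raise NotImplementedError because no downstream step references this step's outputs.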
@@ -390,8 +382,8 @@
" name=pipeline_name,\n",
" parameters=[instance_type, instance_count],\n",
" steps=[step_process],\n",
- " sagemaker_session=sagemaker_session\n",
- ")\n"
+ " sagemaker_session=sagemaker_session,\n",
+ ")"
]
},
{
@@ -411,7 +403,7 @@
"\n",
"\n",
"definition = json.loads(pipeline.definition())\n",
- "definition\n"
+ "definition"
]
},
{
@@ -443,7 +435,7 @@
" raise\n",
"\n",
"pipeline_arn = response[\"PipelineArn\"]\n",
- "print(pipeline_arn)\n"
+ "print(pipeline_arn)"
]
},
{
@@ -467,7 +459,7 @@
"source": [
"start_response = pipeline.start()\n",
"pipeline_execution_arn = start_response.arn\n",
- "print(pipeline_execution_arn)\n"
+ "print(pipeline_execution_arn)"
]
},
{
@@ -503,7 +495,7 @@
")\n",
"execution_steps = execution_steps_response[\"PipelineExecutionSteps\"]\n",
"print(\"Execution steps:\")\n",
- "pprint(execution_steps)\n"
+ "pprint(execution_steps)"
]
},
{
@@ -525,33 +517,33 @@
"\n",
"def get_waiter(pipeline, delay=24, max_attempts=60):\n",
" waiter_id = \"PipelineExecutionComplete\"\n",
- " model = botocore.waiter.WaiterModel({\n",
- " \"version\": 2,\n",
- " \"waiters\": {\n",
- " waiter_id: {\n",
- " \"delay\": delay,\n",
- " \"maxAttempts\": max_attempts,\n",
- " \"operation\": 'DescribePipelineExecution',\n",
- " \"acceptors\": [\n",
- " {\n",
- " \"expected\": \"Succeeded\",\n",
- " \"matcher\": \"path\",\n",
- " \"state\": \"success\",\n",
- " \"argument\": \"PipelineExecutionStatus\"\n",
- " },\n",
- " {\n",
- " \"expected\": \"Failed\",\n",
- " \"matcher\": \"path\",\n",
- " \"state\": \"failure\",\n",
- " \"argument\": \"PipelineExecutionStatus\"\n",
- " },\n",
- " ]\n",
- " }\n",
+ " model = botocore.waiter.WaiterModel(\n",
+ " {\n",
+ " \"version\": 2,\n",
+ " \"waiters\": {\n",
+ " waiter_id: {\n",
+ " \"delay\": delay,\n",
+ " \"maxAttempts\": max_attempts,\n",
+ " \"operation\": \"DescribePipelineExecution\",\n",
+ " \"acceptors\": [\n",
+ " {\n",
+ " \"expected\": \"Succeeded\",\n",
+ " \"matcher\": \"path\",\n",
+ " \"state\": \"success\",\n",
+ " \"argument\": \"PipelineExecutionStatus\",\n",
+ " },\n",
+ " {\n",
+ " \"expected\": \"Failed\",\n",
+ " \"matcher\": \"path\",\n",
+ " \"state\": \"failure\",\n",
+ " \"argument\": \"PipelineExecutionStatus\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " },\n",
" }\n",
- " })\n",
- " return botocore.waiter.create_waiter_with_client(\n",
- " waiter_id, model, sagemaker_session.sagemaker_client\n",
- " )\n"
+ " )\n",
+ " return botocore.waiter.create_waiter_with_client(waiter_id, model, sagemaker_session.sagemaker_client)"
]
},
{
@@ -561,7 +553,7 @@
"outputs": [],
"source": [
"waiter = get_waiter(pipeline)\n",
- "waiter.wait(PipelineExecutionArn=pipeline_execution_arn)\n"
+ "waiter.wait(PipelineExecutionArn=pipeline_execution_arn)"
]
},
{
@@ -575,7 +567,7 @@
")\n",
"execution_steps = execution_steps_response[\"PipelineExecutionSteps\"]\n",
"print(\"Execution steps:\")\n",
- "pprint(execution_steps)\n"
+ "pprint(execution_steps)"
]
},
{
diff --git a/06_prepare/data-wrangler/data_wrangler_antje.py b/06_prepare/data-wrangler/data_wrangler_antje.py
index 81c56f8c..506f3cff 100644
--- a/06_prepare/data-wrangler/data_wrangler_antje.py
+++ b/06_prepare/data-wrangler/data_wrangler_antje.py
@@ -1,10 +1,12 @@
from pyspark.sql.session import SparkSession
from pyspark.sql.dataframe import DataFrame
+
# You may want to configure the Spark Context with the right credentials provider.
-spark = SparkSession.builder.master('local').getOrCreate()
+spark = SparkSession.builder.master("local").getOrCreate()
mode = None
+
def capture_stdout(func, *args, **kwargs):
"""Capture standard output to a string buffer"""
@@ -54,7 +56,7 @@ def default_spark_with_trained_parameters_and_state(df, trained_parameters, stat
def dispatch(key_name, args, kwargs, funcs):
"""
- Dispatches to another operator based on a key in the passed parameters.
+ Dispatches to another operator based on a key in the passed parameters.
This also slices out any parameters using the parameter_name passed in,
and will reassemble the trained_parameters correctly after invocation.
@@ -98,7 +100,9 @@ def dispatch(key_name, args, kwargs, funcs):
updated_trained_parameters = result["trained_parameters"]
if existing_trained_parameters is not None or updated_trained_parameters is not None:
- existing_trained_parameters = existing_trained_parameters if existing_trained_parameters is not None else {}
+ existing_trained_parameters = (
+ existing_trained_parameters if existing_trained_parameters is not None else {}
+ )
existing_trained_parameters[parameter_name] = result["trained_parameters"]
# Update the result trained_parameters so they are part of the original structure.
@@ -132,6 +136,7 @@ class OperatorCustomerError(Exception):
from sagemaker_dataprep.compute.operators.utils import (
dispatch,
default_spark_with_trained_parameters,
+)
from pyspark.ml.feature import (
VectorAssembler,
StandardScaler,
@@ -156,7 +161,9 @@ def process_numeric_standard_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
@@ -210,7 +217,9 @@ def process_numeric_robust_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
@@ -266,14 +275,21 @@ def process_numeric_min_max_scaler(
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
temp_normalized_vector_col = temp_col_name(assembled)
trained_parameters = load_trained_parameters(
- trained_parameters, {"input_column": input_column, "min": min, "max": max,}
+ trained_parameters,
+ {
+ "input_column": input_column,
+ "min": min,
+ "max": max,
+ },
)
scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
@@ -311,13 +327,20 @@ def process_numeric_max_absolute_scaler(df, input_column=None, output_column=Non
process_numeric_expects_numeric_column(df, input_column)
temp_vector_col = temp_col_name(df)
- assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df)
+ assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(
+ df
+ )
assembled_wo_nans = VectorAssembler(
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
).transform(df)
temp_normalized_vector_col = temp_col_name(assembled)
- trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,})
+ trained_parameters = load_trained_parameters(
+ trained_parameters,
+ {
+ "input_column": input_column,
+ },
+ )
scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
trained_parameters, MinMaxScalerModel, "scaler_model"
@@ -414,7 +437,9 @@ def athena_start_query_execution_core(client, request):
try:
result = client.start_query_execution(**request)
except Exception as e:
- raise RuntimeError(f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}")
+ raise RuntimeError(
+ f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}"
+ )
return result
@@ -502,7 +527,10 @@ def athena_start_query_execution(dataset_definition, client):
query_request = {
"QueryString": ctas_query,
- "QueryExecutionContext": {"Database": database_name, "Catalog": catalog_name,},
+ "QueryExecutionContext": {
+ "Database": database_name,
+ "Catalog": catalog_name,
+ },
"ResultConfiguration": {"OutputLocation": metadata_s3_output_location},
}
logging.debug("Query request is: %s", query_request)
@@ -674,8 +702,13 @@ def cast_single_column_type(
# | 2|None| bar |
# | 3| 1 | |
# +---+----+------------------+
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
- df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
+ df = df.withColumn(
+ non_castable_column,
+ f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),
+ )
elif invalid_data_handling_method == NonCastableDataHandlingMethod.REPLACE_WITH_FIXED_VALUE:
# Replace non-castable data to a value in the same column
# Original dataframe
@@ -696,7 +729,9 @@ def cast_single_column_type(
# +---+----+
value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type)
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise(
f.to_date(f.lit(value), date_formatting)
@@ -729,8 +764,13 @@ def cast_single_column_type(
# +---+----+------------------+
value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type)
- df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date)
- df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),)
+ df = df.withColumn(
+ temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date
+ )
+ df = df.withColumn(
+ non_castable_column,
+ f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),
+ )
replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise(
f.to_date(f.lit(value), date_formatting)
@@ -782,8 +822,7 @@ class OperatorSparkOperatorCustomerError(Exception):
def temp_col_name(df, *illegal_names):
- """Generates a temporary column name that is unused.
- """
+ """Generates a temporary column name that is unused."""
name = "temp_col"
idx = 0
name_set = set(list(df.columns) + list(illegal_names))
@@ -795,8 +834,7 @@ def temp_col_name(df, *illegal_names):
def get_temp_col_if_not_set(df, col_name):
- """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name.
- """
+ """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name."""
if col_name:
return col_name, False
else:
@@ -806,7 +844,7 @@ def get_temp_col_if_not_set(df, col_name):
def replace_input_if_output_is_temp(df, input_column, output_column, output_is_temp):
"""Replaces the input column in the dataframe if the output was not set
- This is used with get_temp_col_if_not_set to enable the behavior where a
+ This is used with get_temp_col_if_not_set to enable the behavior where a
transformer will replace its input column if an output is not specified.
"""
if output_is_temp:
@@ -846,7 +884,9 @@ def expects_valid_column_name(value, key, nullable=False):
return
if value is None or len(str(value).strip()) == 0:
- raise OperatorSparkOperatorCustomerError(f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}")
+ raise OperatorSparkOperatorCustomerError(
+ f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}"
+ )
def expects_parameter(value, key, condition=None):
@@ -858,12 +898,16 @@ def expects_parameter(value, key, condition=None):
def expects_column(df, value, key):
if not value or value not in df.columns:
- raise OperatorSparkOperatorCustomerError(f"Expected column in dataframe for '{key}' however received '{value}'")
+ raise OperatorSparkOperatorCustomerError(
+ f"Expected column in dataframe for '{key}' however received '{value}'"
+ )
def expects_parameter_value_in_list(key, value, items):
if value not in items:
- raise OperatorSparkOperatorCustomerError(f"Illegal parameter value. {key} expected to be in {items}, but given {value}")
+ raise OperatorSparkOperatorCustomerError(
+ f"Illegal parameter value. {key} expected to be in {items}, but given {value}"
+ )
def encode_pyspark_model(model):
@@ -966,7 +1010,6 @@ def transform_using_trained_model(model, df, loaded):
)
-
def type_inference(df): # noqa: C901 # pylint: disable=R0912
"""Core type inference logic
@@ -1237,7 +1280,9 @@ def athena_source(spark, mode, dataset_definition, trained_parameters=None): #
trained_parameters["ctas_table_name"] = ""
try:
return default_spark_with_trained_parameters_and_state(
- df=spark.read.parquet(path), trained_parameters=trained_parameters, state=get_execution_state(state),
+ df=spark.read.parquet(path),
+ trained_parameters=trained_parameters,
+ state=get_execution_state(state),
)
except Exception as e:
raise RuntimeError(
@@ -1291,7 +1336,12 @@ def infer_and_cast_type(df, spark, inference_data_sample_size=1000, trained_para
def process_numeric(df, spark, **kwargs):
return dispatch(
- "operator", [df], kwargs, {"Scale values": (process_numeric_scale_values, "scale_values_parameters"),},
+ "operator",
+ [df],
+ kwargs,
+ {
+ "Scale values": (process_numeric_scale_values, "scale_values_parameters"),
+ },
)
@@ -1303,14 +1353,48 @@ def custom_formula(df, spark, formula, output_column=None):
return default_spark(output_df)
-op_1_output = athena_source(spark=spark, mode=mode, **{'dataset_definition': {'datasetSourceType': 'Athena', 'name': 'amazon-reviews-pds-tsv', 'catalogName': 'AwsDataCatalog', 'databaseName': 'dsoaws', 'queryString': 'select * from amazon_reviews_tsv', 's3OutputLocation': 's3://sagemaker-us-east-1-806570384721/athena/', 'outputFormat': 'parquet'}})
-op_2_output = infer_and_cast_type(op_1_output['default'], spark=spark, **{})
-op_3_output = process_numeric(op_2_output['default'], spark=spark, **{'operator': 'Scale values', 'scale_values_parameters': {'scaler': 'Min-max scaler', 'min_max_scaler_parameters': {'min': -1, 'max': 1, 'input_column': 'star_rating', 'output_column': 'star_rating_scaled'}, 'standard_scaler_parameters': {}}})
-op_4_output = custom_formula(op_3_output['default'], spark=spark, **{'output_column': 'star_rating_scaled_floored', 'formula': 'floor(star_rating_scaled)'})
+op_1_output = athena_source(
+ spark=spark,
+ mode=mode,
+ **{
+ "dataset_definition": {
+ "datasetSourceType": "Athena",
+ "name": "amazon-reviews-pds-tsv",
+ "catalogName": "AwsDataCatalog",
+ "databaseName": "dsoaws",
+ "queryString": "select * from amazon_reviews_tsv",
+ "s3OutputLocation": "s3://sagemaker-us-east-1-806570384721/athena/",
+ "outputFormat": "parquet",
+ }
+ },
+)
+op_2_output = infer_and_cast_type(op_1_output["default"], spark=spark, **{})
+op_3_output = process_numeric(
+ op_2_output["default"],
+ spark=spark,
+ **{
+ "operator": "Scale values",
+ "scale_values_parameters": {
+ "scaler": "Min-max scaler",
+ "min_max_scaler_parameters": {
+ "min": -1,
+ "max": 1,
+ "input_column": "star_rating",
+ "output_column": "star_rating_scaled",
+ },
+ "standard_scaler_parameters": {},
+ },
+ },
+)
+op_4_output = custom_formula(
+ op_3_output["default"],
+ spark=spark,
+ **{"output_column": "star_rating_scaled_floored", "formula": "floor(star_rating_scaled)"},
+)
# Glossary: variable name to node_id
#
# op_1_output: d46ffe0e-f774-4ecc-bdbf-40a708832774
# op_2_output: b1cdf334-0f01-40e6-819b-5806e59d41e6
# op_3_output: e880c72f-910c-4554-9a28-a66ce9d3b35f
-# op_4_output: 969f0c55-dbfe-4658-88fc-15d4de6762e0
\ No newline at end of file
+# op_4_output: 969f0c55-dbfe-4658-88fc-15d4de6762e0
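Each generated operator returns a dict whose "default" key holds the resulting Spark DataFrame (see the default_spark helpers above), so the flow's final output can be inspected directly; a usage sketch with column names taken from the flow:

    df = op_4_output["default"]
    df.select("star_rating", "star_rating_scaled", "star_rating_scaled_floored").show(5)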
diff --git a/06_prepare/preprocess-scikit-text-to-bert-feature-store.py b/06_prepare/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/06_prepare/preprocess-scikit-text-to-bert-feature-store.py
+++ b/06_prepare/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess
## PIP INSTALLS ##
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to 
+# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
# use anaconda and anaconda only supports 2.3.0 at this time
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"])
import pandas as pd
import re
import sagemaker
@@ -40,51 +42,55 @@
FeatureTypeEnum,
)
-region = os.environ['AWS_DEFAULT_REGION']
-print('Region: {}'.format(region))
+region = os.environ["AWS_DEFAULT_REGION"]
+print("Region: {}".format(region))
#############################
## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc.
## Role and Bucket are malformed if we do this later.
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region)
+sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region)
caller_identity = sts.get_caller_identity()
-print('caller_identity: {}'.format(caller_identity))
+print("caller_identity: {}".format(caller_identity))
-assumed_role_arn = caller_identity['Arn']
-print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn))
+assumed_role_arn = caller_identity["Arn"]
+print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn))
-assumed_role_name = assumed_role_arn.split('/')[-2]
+assumed_role_name = assumed_role_arn.split("/")[-2]
-iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region)
-get_role_response = iam.get_role(RoleName=assumed_role_name)
-print('get_role_response {}'.format(get_role_response))
-role = get_role_response['Role']['Arn']
-print('role {}'.format(role))
+iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region)
+get_role_response = iam.get_role(RoleName=assumed_role_name)
+print("get_role_response {}".format(get_role_response))
+role = get_role_response["Role"]["Arn"]
+print("role {}".format(role))
bucket = sagemaker.Session().default_bucket()
-print('The DEFAULT BUCKET is {}'.format(bucket))
+print("The DEFAULT BUCKET is {}".format(bucket))
#############################
-sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region)
+sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region)
-featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region)
+featurestore_runtime = boto3.Session(region_name=region).client(
+ service_name="sagemaker-featurestore-runtime", region_name=region
+)
-s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region)
+s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region)
-sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region),
- sagemaker_client=sm,
- sagemaker_featurestore_runtime_client=featurestore_runtime)
+sagemaker_session = sagemaker.Session(
+ boto_session=boto3.Session(region_name=region),
+ sagemaker_client=sm,
+ sagemaker_featurestore_runtime_client=featurestore_runtime,
+)
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-REVIEW_BODY_COLUMN = 'review_body'
-REVIEW_ID_COLUMN = 'review_id'
+REVIEW_BODY_COLUMN = "review_body"
+REVIEW_ID_COLUMN = "review_id"
# DATE_COLUMN = 'date'
-LABEL_COLUMN = 'star_rating'
+LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]
-
+
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
label_map[label] = i
@@ -92,94 +98,88 @@
def cast_object_to_string(data_frame):
for label in data_frame.columns:
- if data_frame.dtypes[label] == 'object':
+ if data_frame.dtypes[label] == "object":
data_frame[label] = data_frame[label].astype("str").astype("string")
return data_frame
-
+
def wait_for_feature_group_creation_complete(feature_group):
try:
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
while status == "Creating":
print("Waiting for Feature Group Creation")
time.sleep(5)
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
if status != "Created":
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
raise RuntimeError(f"Failed to create feature group {feature_group.name}")
print(f"FeatureGroup {feature_group.name} successfully created.")
except:
- print('No feature group created yet.')
-
-
+ print("No feature group created yet.")
+
+
def create_or_load_feature_group(prefix, feature_group_name):
# Feature Definitions for our records
- feature_definitions= [
- FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),
- FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),
-# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING)
+ feature_definitions = [
+ FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
+ FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
+ # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]
-
+
feature_group = FeatureGroup(
- name=feature_group_name,
- feature_definitions=feature_definitions,
- sagemaker_session=sagemaker_session)
-
- print('Feature Group: {}'.format(feature_group))
-
- try:
- print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+ name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+ )
+
+ print("Feature Group: {}".format(feature_group))
+
+ try:
+ print(
+ "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+ )
wait_for_feature_group_creation_complete(feature_group)
except Exception as e:
- print('Before CREATE FG wait exeption: {}'.format(e))
-# pass
-
+ print("Before CREATE FG wait exeption: {}".format(e))
+ # pass
+
try:
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"
-
- print('Creating Feature Group with role {}...'.format(role))
+
+ print("Creating Feature Group with role {}...".format(role))
feature_group.create(
s3_uri=f"s3://{bucket}/{prefix}",
record_identifier_name=record_identifier_feature_name,
event_time_feature_name=event_time_feature_name,
role_arn=role,
- enable_online_store=True
+ enable_online_store=True,
)
- print('Creating Feature Group. Completed.')
-
- print('Waiting for new Feature Group to become available...')
+ print("Creating Feature Group. Completed.")
+
+ print("Waiting for new Feature Group to become available...")
wait_for_feature_group_creation_complete(feature_group)
- print('Feature Group available.')
+ print("Feature Group available.")
feature_group.describe()
-
+
except Exception as e:
- print('Exception: {}'.format(e))
-
+ print("Exception: {}".format(e))
+
return feature_group
-
+
class InputFeatures(object):
- """BERT feature vectors."""
-
- def __init__(self,
- input_ids,
- input_mask,
- segment_ids,
- label_id,
- review_id,
- date,
- label):
-# review_body):
+ """BERT feature vectors."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+ # review_body):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
self.review_id = review_id
self.date = date
self.label = label
+
+
# self.review_body = review_body
-
-
+
+
class Input(object):
- """A single training/test input for sequence classification."""
-
- def __init__(self, text, review_id, date, label=None):
- """Constructs an Input.
- Args:
- text: string. The untokenized text of the first sequence. For single
- sequence tasks, only this sequence must be specified.
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.text = text
- self.review_id = review_id
- self.date = date
- self.label = label
-
-
+ """A single training/test input for sequence classification."""
+
+ def __init__(self, text, review_id, date, label=None):
+ """Constructs an Input.
+ Args:
+ text: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.text = text
+ self.review_id = review_id
+ self.date = date
+ self.label = label
+
+
def convert_input(the_input, max_seq_length):
# First, we need to preprocess our data so that it matches the data BERT was trained on:
#
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- #
+ #
# Fortunately, the Transformers tokenizer does this for us!
#
- tokens = tokenizer.tokenize(the_input.text)
+ tokens = tokenizer.tokenize(the_input.text)
# Next, we need to do the following:
#
@@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length):
#
# Again, the Transformers tokenizer does this for us!
#
- encode_plus_tokens = tokenizer.encode_plus(the_input.text,
- pad_to_max_length=True,
- max_length=max_seq_length,
-# truncation=True
- )
+ encode_plus_tokens = tokenizer.encode_plus(
+ the_input.text,
+ pad_to_max_length=True,
+ max_length=max_seq_length,
+ # truncation=True
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
-
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ input_ids = encode_plus_tokens["input_ids"]
+
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
# Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
segment_ids = [0] * max_seq_length
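
For intuition, here is what `encode_plus` produces on a short review — a minimal standalone sketch (the model name matches the tokenizer used elsewhere in this script; note that `pad_to_max_length=True` above is the older spelling of what newer Transformers releases call `padding="max_length"`):

    from transformers import DistilBertTokenizer

    demo_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Pad/truncate to a fixed length of 10 (including the [CLS] and [SEP] special tokens).
    enc = demo_tokenizer.encode_plus("I love this!", padding="max_length", max_length=10, truncation=True)

    print(enc["input_ids"])       # e.g. [101, 1045, 2293, 2023, 999, 102, 0, 0, 0, 0]
    print(enc["attention_mask"])  # e.g. [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]  (0 marks padding)
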
@@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length):
label_id=label_id,
review_id=the_input.review_id,
date=the_input.date,
- label=the_input.label)
-# review_body=the_input.text)
-
-# print('**input_ids**\n{}\n'.format(features.input_ids))
-# print('**input_mask**\n{}\n'.format(features.input_mask))
-# print('**segment_ids**\n{}\n'.format(features.segment_ids))
-# print('**label_id**\n{}\n'.format(features.label_id))
-# print('**review_id**\n{}\n'.format(features.review_id))
-# print('**date**\n{}\n'.format(features.date))
-# print('**label**\n{}\n'.format(features.label))
-# print('**review_body**\n{}\n'.format(features.review_body))
+ label=the_input.label,
+ )
+ # review_body=the_input.text)
+
+ # print('**input_ids**\n{}\n'.format(features.input_ids))
+ # print('**input_mask**\n{}\n'.format(features.input_mask))
+ # print('**segment_ids**\n{}\n'.format(features.segment_ids))
+ # print('**label_id**\n{}\n'.format(features.label_id))
+ # print('**review_id**\n{}\n'.format(features.review_id))
+ # print('**date**\n{}\n'.format(features.date))
+ # print('**label**\n{}\n'.format(features.label))
+ # print('**review_body**\n{}\n'.format(features.review_body))
return features
-def transform_inputs_to_tfrecord(inputs,
- output_file,
- max_seq_length):
+def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
"""Convert a set of `Input`s to a TFRecord file."""
records = []
tf_record_writer = tf.io.TFRecordWriter(output_file)
-
+
for (input_idx, the_input) in enumerate(inputs):
if input_idx % 10000 == 0:
- print('Writing input {} of {}\n'.format(input_idx, len(inputs)))
+ print("Writing input {} of {}\n".format(input_idx, len(inputs)))
features = convert_input(the_input, max_seq_length)
all_features = collections.OrderedDict()
- all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
- all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
- all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
- all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
+ all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
+ all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
+ all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
+ all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
tf_record_writer.write(tf_record.SerializeToString())
- records.append({#'tf_record': tf_record.SerializeToString(),
- 'input_ids': features.input_ids,
- 'input_mask': features.input_mask,
- 'segment_ids': features.segment_ids,
- 'label_id': features.label_id,
- 'review_id': the_input.review_id,
- 'date': the_input.date,
- 'label': features.label,
-# 'review_body': features.review_body
- })
+ records.append(
+ { #'tf_record': tf_record.SerializeToString(),
+ "input_ids": features.input_ids,
+ "input_mask": features.input_mask,
+ "segment_ids": features.segment_ids,
+ "label_id": features.label_id,
+ "review_id": the_input.review_id,
+ "date": the_input.date,
+ "label": features.label,
+ # 'review_body': features.review_body
+ }
+ )
#####################################
####### TODO: REMOVE THIS BREAK #######
- #####################################
+ #####################################
# break
-
+
tf_record_writer.close()
-
+
return records
-
+
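
As a quick sanity check, the file written above can be read back with the same feature spec the writer used (a hedged sketch; `output_file` and `max_seq_length` stand in for whatever values were passed to the function):

    import tensorflow as tf

    feature_spec = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    # Parse one serialized Example and confirm the schema round-trips.
    for raw_record in tf.data.TFRecordDataset(output_file).take(1):
        example = tf.io.parse_single_example(raw_record, feature_spec)
        print(example["label_ids"])  # scalar star-rating label id
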
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--train-split-percentage', type=float,
+ parser.add_argument(
+ "--train-split-percentage",
+ type=float,
default=0.90,
)
- parser.add_argument('--validation-split-percentage', type=float,
- default=0.05,
- )
- parser.add_argument('--test-split-percentage', type=float,
+ parser.add_argument(
+ "--validation-split-percentage",
+ type=float,
default=0.05,
)
- parser.add_argument('--balance-dataset', type=eval,
- default=True
+ parser.add_argument(
+ "--test-split-percentage",
+ type=float,
+ default=0.05,
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument("--balance-dataset", type=eval, default=True)
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
- parser.add_argument('--feature-store-offline-prefix', type=str,
+ )
+ parser.add_argument(
+ "--feature-store-offline-prefix",
+ type=str,
default=None,
- )
- parser.add_argument('--feature-group-name', type=str,
+ )
+ parser.add_argument(
+ "--feature-group-name",
+ type=str,
default=None,
- )
-
+ )
+
return parser.parse_args()
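
One caveat in the parser above: `--balance-dataset` uses `type=eval`, which executes arbitrary command-line text. A safer drop-in, shown here as a sketch rather than as part of this patch, is an explicit boolean converter:

    def str2bool(raw_value):
        """argparse type for booleans; avoids eval() on user input."""
        value = str(raw_value).strip().lower()
        if value in ("true", "t", "1"):
            return True
        if value in ("false", "f", "0"):
            return False
        raise argparse.ArgumentTypeError("Expected a boolean, got {}".format(raw_value))

    # parser.add_argument("--balance-dataset", type=str2bool, default=True)
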
-
-def _transform_tsv_to_tfrecord(file,
- max_seq_length,
- balance_dataset,
- prefix,
- feature_group_name):
- print('file {}'.format(file))
- print('max_seq_length {}'.format(max_seq_length))
- print('balance_dataset {}'.format(balance_dataset))
- print('prefix {}'.format(prefix))
- print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+ print("file {}".format(file))
+ print("max_seq_length {}".format(max_seq_length))
+ print("balance_dataset {}".format(balance_dataset))
+ print("prefix {}".format(prefix))
+ print("feature_group_name {}".format(feature_group_name))
    # Re-load here: the feature_group object can't be passed through functools.partial into the multiprocessing workers (it does not pickle cleanly)
feature_group = create_or_load_feature_group(prefix, feature_group_name)
-
+
filename_without_extension = Path(Path(file).stem).stem
- df = pd.read_csv(file,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')
+ df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")
df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)
- print('Shape of dataframe {}'.format(df.shape))
+ print("Shape of dataframe {}".format(df.shape))
- if balance_dataset:
+ if balance_dataset:
# Balance the dataset down to the minority class
from sklearn.utils import resample
- five_star_df = df.query('star_rating == 5')
- four_star_df = df.query('star_rating == 4')
- three_star_df = df.query('star_rating == 3')
- two_star_df = df.query('star_rating == 2')
- one_star_df = df.query('star_rating == 1')
-
- minority_count = min(five_star_df.shape[0],
- four_star_df.shape[0],
- three_star_df.shape[0],
- two_star_df.shape[0],
- one_star_df.shape[0])
-
- five_star_df = resample(five_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- four_star_df = resample(four_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- three_star_df = resample(three_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- two_star_df = resample(two_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- one_star_df = resample(one_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
+ five_star_df = df.query("star_rating == 5")
+ four_star_df = df.query("star_rating == 4")
+ three_star_df = df.query("star_rating == 3")
+ two_star_df = df.query("star_rating == 2")
+ one_star_df = df.query("star_rating == 1")
+
+ minority_count = min(
+ five_star_df.shape[0],
+ four_star_df.shape[0],
+ three_star_df.shape[0],
+ two_star_df.shape[0],
+ one_star_df.shape[0],
+ )
+
+ five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)
df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])
- df_balanced = df_balanced.reset_index(drop=True)
- print('Shape of balanced dataframe {}'.format(df_balanced.shape))
- print(df_balanced['star_rating'].head(100))
+ df_balanced = df_balanced.reset_index(drop=True)
+ print("Shape of balanced dataframe {}".format(df_balanced.shape))
+ print(df_balanced["star_rating"].head(100))
df = df_balanced
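
The block above downsamples every star rating, without replacement, to the size of the rarest class. A quick hypothetical check (not part of this patch) that the result is actually balanced:

    # Every star rating should now appear exactly minority_count times.
    print(df_balanced["star_rating"].value_counts())
    assert df_balanced["star_rating"].value_counts().nunique() == 1
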
-
- print('Shape of dataframe before splitting {}'.format(df.shape))
-
- print('train split percentage {}'.format(args.train_split_percentage))
- print('validation split percentage {}'.format(args.validation_split_percentage))
- print('test split percentage {}'.format(args.test_split_percentage))
-
+
+ print("Shape of dataframe before splitting {}".format(df.shape))
+
+ print("train split percentage {}".format(args.train_split_percentage))
+ print("validation split percentage {}".format(args.validation_split_percentage))
+ print("test split percentage {}".format(args.test_split_percentage))
+
holdout_percentage = 1.00 - args.train_split_percentage
- print('holdout percentage {}'.format(holdout_percentage))
- df_train, df_holdout = train_test_split(df,
- test_size=holdout_percentage,
- stratify=df['star_rating'])
+ print("holdout percentage {}".format(holdout_percentage))
+ df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])
test_holdout_percentage = args.test_split_percentage / holdout_percentage
- print('test holdout percentage {}'.format(test_holdout_percentage))
- df_validation, df_test = train_test_split(df_holdout,
- test_size=test_holdout_percentage,
- stratify=df_holdout['star_rating'])
-
+ print("test holdout percentage {}".format(test_holdout_percentage))
+ df_validation, df_test = train_test_split(
+ df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
+ )
+
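
The two-stage split arithmetic is worth spelling out: with the defaults of 0.90/0.05/0.05, the first `train_test_split` holds out 1.00 - 0.90 = 0.10 of the rows, and the second split then takes 0.05 / 0.10 = 0.50 of that holdout as test, leaving the other half as validation. A standalone check:

    train_split, validation_split, test_split = 0.90, 0.05, 0.05

    holdout = 1.00 - train_split                  # 0.10 of all rows leave the train set
    test_share_of_holdout = test_split / holdout  # 0.50 of the holdout becomes test
    assert abs(holdout * test_share_of_holdout - test_split) < 1e-9
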
df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
- print('Shape of train dataframe {}'.format(df_train.shape))
- print('Shape of validation dataframe {}'.format(df_validation.shape))
- print('Shape of test dataframe {}'.format(df_test.shape))
+ print("Shape of train dataframe {}".format(df_train.shape))
+ print("Shape of validation dataframe {}".format(df_validation.shape))
+ print("Shape of test dataframe {}".format(df_test.shape))
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)
- train_inputs = df_train.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- validation_inputs = df_validation.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- test_inputs = df_test.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
+ train_inputs = df_train.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ validation_inputs = df_validation.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ test_inputs = df_test.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
# Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
- #
- #
+ #
+ #
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
- #
+ #
# We don't have to worry about these details. The Transformers tokenizer does this for us.
- #
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
+ #
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
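
Steps 4-6 above can be seen directly on a toy sentence — a hedged sketch reusing this script's module-level `tokenizer` (assumed to be the `distilbert-base-uncased` tokenizer that `convert_input` uses):

    print(tokenizer.tokenize("sally says hi"))
    # e.g. ['sally', 'says', 'hi']

    ids = tokenizer.encode("sally says hi")  # encode() adds the special tokens for us
    print(tokenizer.convert_ids_to_tokens(ids))
    # e.g. ['[CLS]', 'sally', 'says', 'hi', '[SEP]']
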
    # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that work with BERT and TensorFlow.
- train_records = transform_inputs_to_tfrecord(train_inputs,
- '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- validation_records = transform_inputs_to_tfrecord(validation_inputs,
- '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- test_records = transform_inputs_to_tfrecord(test_inputs,
- '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension),
- max_seq_length)
-
+ train_records = transform_inputs_to_tfrecord(
+ train_inputs,
+ "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ validation_records = transform_inputs_to_tfrecord(
+ validation_inputs,
+ "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ test_records = transform_inputs_to_tfrecord(
+ test_inputs,
+ "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
df_train_records = pd.DataFrame.from_dict(train_records)
- df_train_records['split_type'] = 'train'
- df_train_records.head()
-
+ df_train_records["split_type"] = "train"
+ df_train_records.head()
+
df_validation_records = pd.DataFrame.from_dict(validation_records)
- df_validation_records['split_type'] = 'validation'
- df_validation_records.head()
+ df_validation_records["split_type"] = "validation"
+ df_validation_records.head()
df_test_records = pd.DataFrame.from_dict(test_records)
- df_test_records['split_type'] = 'test'
- df_test_records.head()
-
- # Add record to feature store
+ df_test_records["split_type"] = "test"
+ df_test_records.head()
+
+ # Add record to feature store
df_fs_train_records = cast_object_to_string(df_train_records)
df_fs_validation_records = cast_object_to_string(df_validation_records)
df_fs_test_records = cast_object_to_string(df_test_records)
- print('Ingesting Features...')
- feature_group.ingest(
- data_frame=df_fs_train_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_validation_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_test_records, max_workers=3, wait=True
- )
- print('Feature ingest completed.')
+ print("Ingesting Features...")
+ feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True)
+ print("Feature ingest completed.")
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
+ print("Current host: {}".format(args.current_host))
+
+ feature_group = create_or_load_feature_group(
+ prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name
+ )
feature_group.describe()
-
+
print(feature_group.as_hive_ddl())
-
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
-
- transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord,
- max_seq_length=args.max_seq_length,
- balance_dataset=args.balance_dataset,
- prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
-
- input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data))
+
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
+
+ transform_tsv_to_tfrecord = functools.partial(
+ _transform_tsv_to_tfrecord,
+ max_seq_length=args.max_seq_length,
+ balance_dataset=args.balance_dataset,
+ prefix=args.feature_store_offline_prefix,
+ feature_group_name=args.feature_group_name,
+ )
+
+ input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
num_cpus = multiprocessing.cpu_count()
- print('num_cpus {}'.format(num_cpus))
+ print("num_cpus {}".format(num_cpus))
p = multiprocessing.Pool(num_cpus)
p.map(transform_tsv_to_tfrecord, input_files)
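
`Pool.map` hands each worker exactly one positional argument, which is why all the fixed parameters are bound with `functools.partial` above and only the filename varies. The same pattern in isolation, with a toy stand-in for `_transform_tsv_to_tfrecord`:

    import functools
    import multiprocessing


    def work(filename, max_seq_length):
        # Stand-in worker: only the filename varies per task.
        return (filename, max_seq_length)


    if __name__ == "__main__":  # guard required on spawn-based platforms
        bound = functools.partial(work, max_seq_length=64)  # bind the fixed keyword argument
        with multiprocessing.Pool(2) as pool:
            print(pool.map(bound, ["a.tsv.gz", "b.tsv.gz"]))  # [('a.tsv.gz', 64), ('b.tsv.gz', 64)]
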
- print('Listing contents of {}'.format(args.output_data))
+ print("Listing contents of {}".format(args.output_data))
dirs_output = os.listdir(args.output_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(train_data))
+ print("Listing contents of {}".format(train_data))
dirs_output = os.listdir(train_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(validation_data))
+ print("Listing contents of {}".format(validation_data))
dirs_output = os.listdir(validation_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(test_data))
+ print("Listing contents of {}".format(test_data))
dirs_output = os.listdir(test_data)
for file in dirs_output:
print(file)
-
+
offline_store_contents = None
- while (offline_store_contents is None):
- objects_in_bucket = s3.list_objects(Bucket=bucket,
- Prefix=args.feature_store_offline_prefix)
- if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
- offline_store_contents = objects_in_bucket['Contents']
+ while offline_store_contents is None:
+ objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix)
+ if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
+ offline_store_contents = objects_in_bucket["Contents"]
else:
- print('Waiting for data in offline store...\n')
+ print("Waiting for data in offline store...\n")
sleep(60)
- print('Data available.')
-
- print('Complete')
-
-
+ print("Data available.")
+
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
process(args)
diff --git a/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb b/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb
index e4f3233b..a97b640b 100644
--- a/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb
+++ b/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb
@@ -57,9 +57,9 @@
"try:\n",
" max_seq_length\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -79,11 +79,11 @@
"source": [
"def select_data_and_label_from_record(record):\n",
" x = {\n",
- " 'input_ids': record['input_ids'],\n",
- " 'input_mask': record['input_mask'],\n",
- "# 'segment_ids': record['segment_ids']\n",
+ " \"input_ids\": record[\"input_ids\"],\n",
+ " \"input_mask\": record[\"input_mask\"],\n",
+ " # 'segment_ids': record['segment_ids']\n",
" }\n",
- " y = record['label_ids']\n",
+ " y = record[\"label_ids\"]\n",
"\n",
" return (x, y)"
]
@@ -94,51 +94,47 @@
"metadata": {},
"outputs": [],
"source": [
- "def file_based_input_dataset_builder(channel,\n",
- " input_filenames,\n",
- " pipe_mode,\n",
- " is_training,\n",
- " drop_remainder):\n",
+ "def file_based_input_dataset_builder(channel, input_filenames, pipe_mode, is_training, drop_remainder):\n",
"\n",
" # For training, we want a lot of parallel reading and shuffling.\n",
" # For eval, we want no shuffling and parallel reading doesn't matter.\n",
"\n",
" if pipe_mode:\n",
- " print('***** Using pipe_mode with channel {}'.format(channel))\n",
+ " print(\"***** Using pipe_mode with channel {}\".format(channel))\n",
" from sagemaker_tensorflow import PipeModeDataset\n",
- " dataset = PipeModeDataset(channel=channel,\n",
- " record_format='TFRecord')\n",
+ "\n",
+ " dataset = PipeModeDataset(channel=channel, record_format=\"TFRecord\")\n",
" else:\n",
- " print('***** Using input_filenames {}'.format(input_filenames))\n",
+ " print(\"***** Using input_filenames {}\".format(input_filenames))\n",
" dataset = tf.data.TFRecordDataset(input_filenames)\n",
"\n",
" dataset = dataset.repeat(100)\n",
" dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)\n",
"\n",
" name_to_features = {\n",
- " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- "# \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n",
+ " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " # \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n",
" }\n",
"\n",
" def _decode_record(record, name_to_features):\n",
" \"\"\"Decodes a record to a TensorFlow example.\"\"\"\n",
" return tf.io.parse_single_example(record, name_to_features)\n",
- " \n",
+ "\n",
" dataset = dataset.apply(\n",
" tf.data.experimental.map_and_batch(\n",
- " lambda record: _decode_record(record, name_to_features),\n",
- " batch_size=8,\n",
- " drop_remainder=drop_remainder,\n",
- " num_parallel_calls=tf.data.experimental.AUTOTUNE))\n",
+ " lambda record: _decode_record(record, name_to_features),\n",
+ " batch_size=8,\n",
+ " drop_remainder=drop_remainder,\n",
+ " num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
+ " )\n",
+ " )\n",
"\n",
" dataset.cache()\n",
"\n",
" if is_training:\n",
- " dataset = dataset.shuffle(seed=42,\n",
- " buffer_size=10,\n",
- " reshuffle_each_iteration=True)\n",
+ " dataset = dataset.shuffle(seed=42, buffer_size=10, reshuffle_each_iteration=True)\n",
"\n",
" return dataset"
]
@@ -149,16 +145,13 @@
"metadata": {},
"outputs": [],
"source": [
- "train_data = './data-tfrecord/bert-train'\n",
- "train_data_filenames = glob('{}/*.tfrecord'.format(train_data))\n",
- "print('train_data_filenames {}'.format(train_data_filenames))\n",
+ "train_data = \"./data-tfrecord/bert-train\"\n",
+ "train_data_filenames = glob(\"{}/*.tfrecord\".format(train_data))\n",
+ "print(\"train_data_filenames {}\".format(train_data_filenames))\n",
"\n",
"train_dataset = file_based_input_dataset_builder(\n",
- " channel='train',\n",
- " input_filenames=train_data_filenames,\n",
- " pipe_mode=False,\n",
- " is_training=True,\n",
- " drop_remainder=False).map(select_data_and_label_from_record)"
+ " channel=\"train\", input_filenames=train_data_filenames, pipe_mode=False, is_training=True, drop_remainder=False\n",
+ ").map(select_data_and_label_from_record)"
]
},
{
@@ -167,16 +160,17 @@
"metadata": {},
"outputs": [],
"source": [
- "validation_data = './data-tfrecord/bert-validation'\n",
- "validation_data_filenames = glob('{}/*.tfrecord'.format(validation_data))\n",
- "print('validation_data_filenames {}'.format(validation_data_filenames))\n",
+ "validation_data = \"./data-tfrecord/bert-validation\"\n",
+ "validation_data_filenames = glob(\"{}/*.tfrecord\".format(validation_data))\n",
+ "print(\"validation_data_filenames {}\".format(validation_data_filenames))\n",
"\n",
"validation_dataset = file_based_input_dataset_builder(\n",
- " channel='validation',\n",
+ " channel=\"validation\",\n",
" input_filenames=validation_data_filenames,\n",
" pipe_mode=False,\n",
" is_training=False,\n",
- " drop_remainder=False).map(select_data_and_label_from_record)"
+ " drop_remainder=False,\n",
+ ").map(select_data_and_label_from_record)"
]
},
{
@@ -185,16 +179,13 @@
"metadata": {},
"outputs": [],
"source": [
- "test_data = './data-tfrecord/bert-test'\n",
- "test_data_filenames = glob('{}/*.tfrecord'.format(test_data))\n",
+ "test_data = \"./data-tfrecord/bert-test\"\n",
+ "test_data_filenames = glob(\"{}/*.tfrecord\".format(test_data))\n",
"print(test_data_filenames)\n",
"\n",
"test_dataset = file_based_input_dataset_builder(\n",
- " channel='test',\n",
- " input_filenames=test_data_filenames,\n",
- " pipe_mode=False,\n",
- " is_training=False,\n",
- " drop_remainder=False).map(select_data_and_label_from_record)"
+ " channel=\"test\", input_filenames=test_data_filenames, pipe_mode=False, is_training=False, drop_remainder=False\n",
+ ").map(select_data_and_label_from_record)"
]
},
{
@@ -210,13 +201,13 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "steps_per_epoch=50\n",
- "validation_steps=50\n",
- "test_steps=150\n",
- "freeze_bert_layer=True\n",
- "learning_rate=3e-5\n",
- "epsilon=1e-08"
+ "epochs = 1\n",
+ "steps_per_epoch = 50\n",
+ "validation_steps = 50\n",
+ "test_steps = 150\n",
+ "freeze_bert_layer = True\n",
+ "learning_rate = 3e-5\n",
+ "epsilon = 1e-08"
]
},
{
@@ -235,24 +226,14 @@
},
"outputs": [],
"source": [
- "CLASSES=[1, 2, 3, 4, 5]\n",
- "\n",
- "config = DistilBertConfig.from_pretrained('distilbert-base-uncased',\n",
- " num_labels=len(CLASSES),\n",
- " id2label={\n",
- " 0: 1,\n",
- " 1: 2,\n",
- " 2: 3,\n",
- " 3: 4,\n",
- " 4: 5\n",
- " },\n",
- " label2id={\n",
- " 1: 0,\n",
- " 2: 1,\n",
- " 3: 2,\n",
- " 4: 3,\n",
- " 5: 4\n",
- " })\n",
+ "CLASSES = [1, 2, 3, 4, 5]\n",
+ "\n",
+ "config = DistilBertConfig.from_pretrained(\n",
+ " \"distilbert-base-uncased\",\n",
+ " num_labels=len(CLASSES),\n",
+ " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n",
+ " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n",
+ ")\n",
"print(config)"
]
},
@@ -264,20 +245,21 @@
"source": [
"from transformers import TFDistilBertModel\n",
"\n",
- "transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', \n",
- " config=config)\n",
+ "transformer_model = TFDistilBertModel.from_pretrained(\"distilbert-base-uncased\", config=config)\n",
"\n",
- "input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')\n",
- "input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') \n",
+ "input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name=\"input_ids\", dtype=\"int32\")\n",
+ "input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name=\"input_mask\", dtype=\"int32\")\n",
"\n",
"embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]\n",
- "X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)\n",
+ "X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(\n",
+ " embedding_layer\n",
+ ")\n",
"X = tf.keras.layers.GlobalMaxPool1D()(X)\n",
- "X = tf.keras.layers.Dense(50, activation='relu')(X)\n",
+ "X = tf.keras.layers.Dense(50, activation=\"relu\")(X)\n",
"X = tf.keras.layers.Dropout(0.2)(X)\n",
- "X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)\n",
+ "X = tf.keras.layers.Dense(len(CLASSES), activation=\"sigmoid\")(X)\n",
"\n",
- "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)\n",
+ "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)\n",
"\n",
"for layer in model.layers[:3]:\n",
" layer.trainable = not freeze_bert_layer"
@@ -296,10 +278,10 @@
"metadata": {},
"outputs": [],
"source": [
- "loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
- "metric=tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
+ "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
+ "metric = tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")\n",
"\n",
- "optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n",
+ "optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n",
"\n",
"model.compile(optimizer=optimizer, loss=loss, metrics=[metric])\n",
"\n",
@@ -314,7 +296,7 @@
"source": [
"callbacks = []\n",
"\n",
- "log_dir = './tmp/tensorboard/'\n",
+ "log_dir = \"./tmp/tensorboard/\"\n",
"tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
"callbacks.append(tensorboard_callback)"
]
@@ -327,13 +309,15 @@
},
"outputs": [],
"source": [
- "history = model.fit(train_dataset,\n",
- " shuffle=True,\n",
- " epochs=epochs,\n",
- " steps_per_epoch=steps_per_epoch,\n",
- " validation_data=validation_dataset,\n",
- " validation_steps=validation_steps,\n",
- " callbacks=callbacks)"
+ "history = model.fit(\n",
+ " train_dataset,\n",
+ " shuffle=True,\n",
+ " epochs=epochs,\n",
+ " steps_per_epoch=steps_per_epoch,\n",
+ " validation_data=validation_dataset,\n",
+ " validation_steps=validation_steps,\n",
+ " callbacks=callbacks,\n",
+ ")"
]
},
{
@@ -342,7 +326,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Trained model {}'.format(model))"
+ "print(\"Trained model {}\".format(model))"
]
},
{
@@ -358,9 +342,7 @@
"metadata": {},
"outputs": [],
"source": [
- "test_history = model.evaluate(test_dataset,\n",
- " steps=test_steps, \n",
- " callbacks=callbacks)\n",
+ "test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)\n",
"print(test_history)"
]
},
@@ -377,7 +359,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tensorflow_model_dir = './tmp/tensorflow/'"
+ "tensorflow_model_dir = \"./tmp/tensorflow/\""
]
},
{
@@ -446,21 +428,19 @@
"\n",
"from transformers import DistilBertTokenizer\n",
"\n",
- "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
+ "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
"\n",
- "sample_review_body = 'This product is terrible.'\n",
+ "sample_review_body = \"This product is terrible.\"\n",
"\n",
- "encode_plus_tokens = tokenizer.encode_plus(sample_review_body,\n",
- " padding=True,\n",
- " max_length=max_seq_length,\n",
- " truncation=True,\n",
- " return_tensors='tf')\n",
+ "encode_plus_tokens = tokenizer.encode_plus(\n",
+ " sample_review_body, padding=True, max_length=max_seq_length, truncation=True, return_tensors=\"tf\"\n",
+ ")\n",
"\n",
"# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n",
- "input_ids = encode_plus_tokens['input_ids']\n",
+ "input_ids = encode_plus_tokens[\"input_ids\"]\n",
"\n",
- "# Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. \n",
- "input_mask = encode_plus_tokens['attention_mask']\n",
+ "# Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.\n",
+ "input_mask = encode_plus_tokens[\"attention_mask\"]\n",
"\n",
"outputs = model.predict(x=(input_ids, input_mask))\n",
"\n",
@@ -468,8 +448,7 @@
"\n",
"prediction = [{\"label\": config.id2label[item.argmax()], \"score\": item.max().item()} for item in scores]\n",
"\n",
- "print('Predicted star_rating \"{}\" for review_body \"{}\"'.format(prediction[0]['label'], sample_review_body))\n",
- " "
+ "print('Predicted star_rating \"{}\" for review_body \"{}\"'.format(prediction[0][\"label\"], sample_review_body))"
]
},
{
diff --git a/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb b/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb
index 908ed9ce..6d414288 100644
--- a/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb
+++ b/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb
@@ -34,12 +34,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -67,9 +67,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -99,9 +99,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -131,9 +131,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -163,9 +163,9 @@
"try:\n",
" max_seq_length\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -195,9 +195,9 @@
"try:\n",
" experiment_name\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -229,9 +229,9 @@
"try:\n",
" trial_name\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -303,12 +303,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -337,28 +334,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=1\n",
- "validation_steps=1\n",
- "test_steps=1\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=True\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 1\n",
+ "validation_steps = 1\n",
+ "test_steps = 1\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = True\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -397,10 +394,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
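
These regexes scrape the Keras progress lines that the training container writes to its logs. A standalone illustration (the log line itself is made up but follows the Keras format):

    import re

    sample_log = "50/50 - 12s - loss: 0.9134 - accuracy: 0.6250 - val_loss: 1.0211 - val_accuracy: 0.5800"

    print(re.search(r"loss: ([0-9\.]+)", sample_log).group(1))          # 0.9134 (leftmost match)
    print(re.search(r"val_accuracy: ([0-9\.]+)", sample_log).group(1))  # 0.5800
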
@@ -424,70 +421,79 @@
"from sagemaker.debugger import CollectionConfig\n",
"from sagemaker.debugger import DebuggerHookConfig\n",
"\n",
- "actions=rule_configs.ActionList(\n",
- "# rule_configs.StopTraining(),\n",
- "# rule_configs.Email(\"\")\n",
+ "actions = rule_configs.ActionList(\n",
+ " # rule_configs.StopTraining(),\n",
+ " # rule_configs.Email(\"\")\n",
")\n",
"\n",
- "rules=[\n",
- " Rule.sagemaker(\n",
- " base_config=rule_configs.loss_not_decreasing(),\n",
- " rule_parameters={\n",
- " 'collection_names': 'losses,metrics',\n",
- " 'use_losses_collection': 'true',\n",
- " 'num_steps': '10',\n",
- " 'diff_percent': '50'\n",
- " },\n",
- " collections_to_save=[\n",
- " CollectionConfig(name='losses',\n",
- " parameters={\n",
- " 'save_interval': '10',\n",
- " }),\n",
- " CollectionConfig(name='metrics',\n",
- " parameters={\n",
- " 'save_interval': '10',\n",
- " })\n",
- " ],\n",
- " actions=actions \n",
- " ),\n",
- " Rule.sagemaker(\n",
- " base_config=rule_configs.overtraining(),\n",
- " rule_parameters={\n",
- " 'collection_names': 'losses,metrics',\n",
- " 'patience_train': '10',\n",
- " 'patience_validation': '10',\n",
- " 'delta': '0.5'\n",
- " },\n",
- " collections_to_save=[\n",
- " CollectionConfig(name='losses',\n",
- " parameters={\n",
- " 'save_interval': '10',\n",
- " }),\n",
- " CollectionConfig(name='metrics',\n",
- " parameters={\n",
- " 'save_interval': '10',\n",
- " })\n",
- " ],\n",
- " actions=actions \n",
- " ),\n",
- " ProfilerRule.sagemaker(rule_configs.ProfilerReport()),\n",
- " ProfilerRule.sagemaker(rule_configs.BatchSize()),\n",
- " ProfilerRule.sagemaker(rule_configs.CPUBottleneck()),\n",
- " ProfilerRule.sagemaker(rule_configs.GPUMemoryIncrease()),\n",
- " ProfilerRule.sagemaker(rule_configs.IOBottleneck()),\n",
- " ProfilerRule.sagemaker(rule_configs.LoadBalancing()),\n",
- " ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),\n",
- " ProfilerRule.sagemaker(rule_configs.OverallSystemUsage()),\n",
- "# ProfilerRule.sagemaker(rule_configs.OverallFrameworkMetrics()),\n",
- " ProfilerRule.sagemaker(rule_configs.StepOutlier()) \n",
- " ]\n",
+ "rules = [\n",
+ " Rule.sagemaker(\n",
+ " base_config=rule_configs.loss_not_decreasing(),\n",
+ " rule_parameters={\n",
+ " \"collection_names\": \"losses,metrics\",\n",
+ " \"use_losses_collection\": \"true\",\n",
+ " \"num_steps\": \"10\",\n",
+ " \"diff_percent\": \"50\",\n",
+ " },\n",
+ " collections_to_save=[\n",
+ " CollectionConfig(\n",
+ " name=\"losses\",\n",
+ " parameters={\n",
+ " \"save_interval\": \"10\",\n",
+ " },\n",
+ " ),\n",
+ " CollectionConfig(\n",
+ " name=\"metrics\",\n",
+ " parameters={\n",
+ " \"save_interval\": \"10\",\n",
+ " },\n",
+ " ),\n",
+ " ],\n",
+ " actions=actions,\n",
+ " ),\n",
+ " Rule.sagemaker(\n",
+ " base_config=rule_configs.overtraining(),\n",
+ " rule_parameters={\n",
+ " \"collection_names\": \"losses,metrics\",\n",
+ " \"patience_train\": \"10\",\n",
+ " \"patience_validation\": \"10\",\n",
+ " \"delta\": \"0.5\",\n",
+ " },\n",
+ " collections_to_save=[\n",
+ " CollectionConfig(\n",
+ " name=\"losses\",\n",
+ " parameters={\n",
+ " \"save_interval\": \"10\",\n",
+ " },\n",
+ " ),\n",
+ " CollectionConfig(\n",
+ " name=\"metrics\",\n",
+ " parameters={\n",
+ " \"save_interval\": \"10\",\n",
+ " },\n",
+ " ),\n",
+ " ],\n",
+ " actions=actions,\n",
+ " ),\n",
+ " ProfilerRule.sagemaker(rule_configs.ProfilerReport()),\n",
+ " ProfilerRule.sagemaker(rule_configs.BatchSize()),\n",
+ " ProfilerRule.sagemaker(rule_configs.CPUBottleneck()),\n",
+ " ProfilerRule.sagemaker(rule_configs.GPUMemoryIncrease()),\n",
+ " ProfilerRule.sagemaker(rule_configs.IOBottleneck()),\n",
+ " ProfilerRule.sagemaker(rule_configs.LoadBalancing()),\n",
+ " ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),\n",
+ " ProfilerRule.sagemaker(rule_configs.OverallSystemUsage()),\n",
+ " # ProfilerRule.sagemaker(rule_configs.OverallFrameworkMetrics()),\n",
+ " ProfilerRule.sagemaker(rule_configs.StepOutlier()),\n",
+ "]\n",
"\n",
"hook_config = DebuggerHookConfig(\n",
" hook_parameters={\n",
- " 'save_interval': '10', # number of steps\n",
- " 'export_tensorboard': 'true',\n",
- " 'tensorboard_dir': 'hook_tensorboard/',\n",
- " })"
+ " \"save_interval\": \"10\", # number of steps\n",
+ " \"export_tensorboard\": \"true\",\n",
+ " \"tensorboard_dir\": \"hook_tensorboard/\",\n",
+ " }\n",
+ ")"
]
},
{
@@ -511,7 +517,7 @@
"\n",
"profiler_config = ProfilerConfig(\n",
" system_monitor_interval_millis=500,\n",
- " framework_profile_params=FrameworkProfile(local_path=\"/opt/ml/output/profiler/\", start_step=5, num_steps=10)\n",
+ " framework_profile_params=FrameworkProfile(local_path=\"/opt/ml/output/profiler/\", start_step=5, num_steps=10),\n",
")"
]
},
@@ -531,8 +537,8 @@
"source": [
"import uuid\n",
"\n",
- "checkpoint_s3_prefix = 'checkpoints/{}'.format(str(uuid.uuid4()))\n",
- "checkpoint_s3_uri = 's3://{}/{}/'.format(bucket, checkpoint_s3_prefix)\n",
+ "checkpoint_s3_prefix = \"checkpoints/{}\".format(str(uuid.uuid4()))\n",
+ "checkpoint_s3_uri = \"s3://{}/{}/\".format(bucket, checkpoint_s3_prefix)\n",
"\n",
"print(checkpoint_s3_uri)"
]
@@ -564,43 +570,46 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " checkpoint_s3_uri=checkpoint_s3_uri,\n",
- " py_version='py37',\n",
- " framework_version='2.3.1',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " metric_definitions=metrics_definitions,\n",
- " rules=rules,\n",
- " debugger_hook_config=hook_config,\n",
- " profiler_config=profiler_config,\n",
- "# max_run=7200, # number of seconds\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " checkpoint_s3_uri=checkpoint_s3_uri,\n",
+ " py_version=\"py37\",\n",
+ " framework_version=\"2.3.1\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " metric_definitions=metrics_definitions,\n",
+ " rules=rules,\n",
+ " debugger_hook_config=hook_config,\n",
+ " profiler_config=profiler_config,\n",
+ " # max_run=7200, # number of seconds\n",
+ ")"
]
},
{
@@ -616,11 +625,7 @@
"metadata": {},
"outputs": [],
"source": [
- "experiment_config = {\n",
- " 'ExperimentName': experiment_name,\n",
- " 'TrialName': trial_name,\n",
- " 'TrialComponentDisplayName': 'train'\n",
- "}"
+ "experiment_config = {\"ExperimentName\": experiment_name, \"TrialName\": trial_name, \"TrialComponentDisplayName\": \"train\"}"
]
},
{
@@ -636,12 +641,11 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " experiment_config=experiment_config, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " experiment_config=experiment_config,\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -651,7 +655,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -662,7 +666,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -673,7 +683,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -684,7 +700,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -695,7 +717,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Checkpoint Data After The Training Job Has Completed'.format(bucket, checkpoint_s3_prefix, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Checkpoint Data After The Training Job Has Completed'.format(\n",
+ " bucket, checkpoint_s3_prefix, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -824,12 +852,13 @@
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
" sagemaker_session=sess,\n",
" experiment_name=experiment_name,\n",
- " metric_names=['validation:accuracy'],\n",
+ " metric_names=[\"validation:accuracy\"],\n",
" sort_by=\"CreationTime\",\n",
" sort_order=\"Descending\",\n",
")\n",
@@ -953,7 +982,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#Internal - DO NOT RUN\n",
+ "# Internal - DO NOT RUN\n",
"\n",
"# step_prefix = '07_train'\n",
"# !aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz s3://dsoaws/$step_prefix/tensorflow/ --acl public-read-write --acl bucket-owner-full-control\n",
diff --git a/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb b/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb
index c97a4028..45ec7104 100644
--- a/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb
+++ b/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb
@@ -17,12 +17,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -43,9 +43,9 @@
"try:\n",
" training_job_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please wait for the Training notebook to finish.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please wait for the Training notebook to finish.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -54,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Previous training_job_name: {}'.format(training_job_name))"
+ "print(\"Previous training_job_name: {}\".format(training_job_name))"
]
},
{
@@ -63,7 +63,7 @@
"metadata": {},
"outputs": [],
"source": [
- "training_job_name = 'tensorflow-training-2021-01-06-21-36-03-293'"
+ "training_job_name = \"tensorflow-training-2021-01-06-21-36-03-293\""
]
},
{
@@ -79,7 +79,7 @@
"metadata": {},
"outputs": [],
"source": [
- "models_dir = './models'"
+ "models_dir = \"./models\""
]
},
{
@@ -101,11 +101,11 @@
"import pickle as pkl\n",
"\n",
"try:\n",
- " tar = tarfile.open('{}/model.tar.gz'.format(models_dir))\n",
+ " tar = tarfile.open(\"{}/model.tar.gz\".format(models_dir))\n",
" tar.extractall(path=models_dir)\n",
" tar.close()\n",
"except Exception as e:\n",
- " print('[ERROR] in tar operation: {}'.format(e))"
+ " print(\"[ERROR] in tar operation: {}\".format(e))"
]
},
{
@@ -123,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
- "transformer_model_dir = '{}/transformers/fine-tuned/'.format(models_dir)\n",
+ "transformer_model_dir = \"{}/transformers/fine-tuned/\".format(models_dir)\n",
"\n",
"!ls -al $transformer_model_dir"
]
@@ -152,27 +152,17 @@
"metadata": {},
"outputs": [],
"source": [
- "from transformers import DistilBertForSequenceClassification # PyTorch version\n",
+ "from transformers import DistilBertForSequenceClassification # PyTorch version\n",
"\n",
"try:\n",
- " loaded_pytorch_model = DistilBertForSequenceClassification.from_pretrained(transformer_model_dir,\n",
- " id2label={\n",
- " 0: 1,\n",
- " 1: 2,\n",
- " 2: 3,\n",
- " 3: 4,\n",
- " 4: 5\n",
- " },\n",
- " label2id={\n",
- " 1: 0,\n",
- " 2: 1,\n",
- " 3: 2,\n",
- " 4: 3,\n",
- " 5: 4\n",
- " },\n",
- " from_tf=True)\n",
+ " loaded_pytorch_model = DistilBertForSequenceClassification.from_pretrained(\n",
+ " transformer_model_dir,\n",
+ " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n",
+ " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n",
+ " from_tf=True,\n",
+ " )\n",
"except Exception as e:\n",
- " print('[ERROR] in loading model {}: '.format(e))"
+ " print(\"[ERROR] in loading model {}: \".format(e))"
]
},
{
@@ -201,7 +191,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pytorch_models_dir = './models/transformers/pytorch'"
+ "pytorch_models_dir = \"./models/transformers/pytorch\""
]
},
{
@@ -246,7 +236,7 @@
"source": [
"from transformers import DistilBertTokenizer\n",
"\n",
- "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')"
+ "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")"
]
},
{
@@ -277,12 +267,12 @@
"from transformers import DistilBertForSequenceClassification\n",
"from transformers import DistilBertConfig\n",
"\n",
- "config = DistilBertConfig.from_json_file('{}/config.json'.format(pytorch_models_dir))\n",
+ "config = DistilBertConfig.from_json_file(\"{}/config.json\".format(pytorch_models_dir))\n",
"\n",
- "model_path = '{}/{}'.format(pytorch_models_dir, 'model.pth') \n",
+ "model_path = \"{}/{}\".format(pytorch_models_dir, \"model.pth\")\n",
"model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)\n",
"\n",
- "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"model.to(device)"
]
},
@@ -295,24 +285,24 @@
"import json\n",
"\n",
"max_seq_length = 64\n",
- "classes=[1, 2, 3, 4, 5]\n",
+ "classes = [1, 2, 3, 4, 5]\n",
"\n",
"model.eval()\n",
"\n",
"input_data = '[{\"features\": [\"This is great!\"]}, \\\n",
" {\"features\": [\"This is bad.\"]}]'\n",
- "print('input_data: {}'.format(input_data))\n",
+ "print(\"input_data: {}\".format(input_data))\n",
"\n",
"data_json = json.loads(input_data)\n",
- "print('data_json: {}'.format(data_json))\n",
+ "print(\"data_json: {}\".format(data_json))\n",
"\n",
"predicted_classes = []\n",
"\n",
"for data_json_line in data_json:\n",
- " print('data_json_line: {}'.format(data_json_line))\n",
- " print('type(data_json_line): {}'.format(type(data_json_line)))\n",
+ " print(\"data_json_line: {}\".format(data_json_line))\n",
+ " print(\"type(data_json_line): {}\".format(type(data_json_line)))\n",
"\n",
- " review_body = data_json_line['features'][0]\n",
+ " review_body = data_json_line[\"features\"][0]\n",
" print(\"\"\"review_body: {}\"\"\".format(review_body))\n",
"\n",
" encode_plus_token = tokenizer.encode_plus(\n",
@@ -322,40 +312,41 @@
" return_token_type_ids=False,\n",
" pad_to_max_length=True,\n",
" return_attention_mask=True,\n",
- " return_tensors='pt',\n",
- " truncation=True)\n",
+ " return_tensors=\"pt\",\n",
+ " truncation=True,\n",
+ " )\n",
"\n",
- " input_ids = encode_plus_token['input_ids']\n",
- " attention_mask = encode_plus_token['attention_mask']\n",
+ " input_ids = encode_plus_token[\"input_ids\"]\n",
+ " attention_mask = encode_plus_token[\"attention_mask\"]\n",
"\n",
" output = model(input_ids, attention_mask)\n",
- " print('output: {}'.format(output))\n",
+ " print(\"output: {}\".format(output))\n",
"\n",
- " # output is a tuple: \n",
+ " # output is a tuple:\n",
" # output: (tensor([[-1.9840, -0.9870, 2.8947]], grad_fn=),\n",
- " # for torch.max() you need to pass in the tensor, output[0] \n",
+ " # for torch.max() you need to pass in the tensor, output[0]\n",
" _, prediction = torch.max(output[0], dim=1)\n",
"\n",
" predicted_class_idx = prediction.item()\n",
" predicted_class = classes[predicted_class_idx]\n",
- " print('predicted_class: {}'.format(predicted_class))\n",
+ " print(\"predicted_class: {}\".format(predicted_class))\n",
"\n",
" prediction_dict = {}\n",
- " prediction_dict['predicted_label'] = predicted_class\n",
+ " prediction_dict[\"predicted_label\"] = predicted_class\n",
"\n",
" jsonline = json.dumps(prediction_dict)\n",
- " print('jsonline: {}'.format(jsonline))\n",
+ " print(\"jsonline: {}\".format(jsonline))\n",
"\n",
" predicted_classes.append(jsonline)\n",
- " print('predicted_classes in the loop: {}'.format(predicted_classes))\n",
+ " print(\"predicted_classes in the loop: {}\".format(predicted_classes))\n",
"\n",
- "predicted_classes_jsonlines = '\\n'.join(predicted_classes) \n",
- "print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))\n",
- "print('type(predicted_classes_jsonlines): {}'.format(type(predicted_classes_jsonlines)))\n",
+ "predicted_classes_jsonlines = \"\\n\".join(predicted_classes)\n",
+ "print(\"predicted_classes_jsonlines: {}\".format(predicted_classes_jsonlines))\n",
+ "print(\"type(predicted_classes_jsonlines): {}\".format(type(predicted_classes_jsonlines)))\n",
"\n",
"predicted_classes_jsonlines_dump = json.dumps(predicted_classes_jsonlines)\n",
- "print('predicted_classes_jsonlines_dump: {}'.format(predicted_classes_jsonlines_dump))\n",
- "print('type(predicted_classes_jsonlines_dump): {}'.format(type(predicted_classes_jsonlines_dump)))"
+ "print(\"predicted_classes_jsonlines_dump: {}\".format(predicted_classes_jsonlines_dump))\n",
+ "print(\"type(predicted_classes_jsonlines_dump): {}\".format(type(predicted_classes_jsonlines_dump)))"
]
},
{
@@ -371,7 +362,7 @@
"metadata": {},
"outputs": [],
"source": [
- "transformer_pytorch_model_dir_s3_uri = 's3://{}/models/{}/transformer-pytorch/'.format(bucket, training_job_name)\n",
+ "transformer_pytorch_model_dir_s3_uri = \"s3://{}/models/{}/transformer-pytorch/\".format(bucket, training_job_name)\n",
"print(transformer_pytorch_model_dir_s3_uri)"
]
},
diff --git a/07_train/04_Evaluate_Model_Metrics.ipynb b/07_train/04_Evaluate_Model_Metrics.ipynb
index 41002425..001e7ee4 100644
--- a/07_train/04_Evaluate_Model_Metrics.ipynb
+++ b/07_train/04_Evaluate_Model_Metrics.ipynb
@@ -70,7 +70,7 @@
"bucket = sess.default_bucket()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -90,11 +90,11 @@
"source": [
"try:\n",
" training_job_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -103,7 +103,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#training_job_name='tensorflow-training-2021-01-02-06-07-04-440'"
+ "# training_job_name='tensorflow-training-2021-01-02-06-07-04-440'"
]
},
{
@@ -133,9 +133,9 @@
"try:\n",
" raw_input_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -164,11 +164,11 @@
"source": [
"try:\n",
" max_seq_length\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -197,11 +197,11 @@
"source": [
"try:\n",
" experiment_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -230,11 +230,11 @@
"source": [
"try:\n",
" trial_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -275,7 +275,7 @@
"metadata": {},
"outputs": [],
"source": [
- "model_dir_s3_uri = describe_training_job_response['ModelArtifacts']['S3ModelArtifacts'].replace('model.tar.gz', '')\n",
+ "model_dir_s3_uri = describe_training_job_response[\"ModelArtifacts\"][\"S3ModelArtifacts\"].replace(\"model.tar.gz\", \"\")\n",
"model_dir_s3_uri"
]
},
@@ -302,9 +302,9 @@
"outputs": [],
"source": [
"experiment_config = {\n",
- " 'ExperimentName': experiment_name,\n",
- " 'TrialName': trial_name,\n",
- " 'TrialComponentDisplayName': 'evaluate'\n",
+ " \"ExperimentName\": experiment_name,\n",
+ " \"TrialName\": trial_name,\n",
+ " \"TrialComponentDisplayName\": \"evaluate\",\n",
"}"
]
},
@@ -323,8 +323,8 @@
},
"outputs": [],
"source": [
- "processing_instance_type='ml.m5.xlarge'\n",
- "processing_instance_count=1"
+ "processing_instance_type = \"ml.m5.xlarge\"\n",
+ "processing_instance_count = 1"
]
},
{
@@ -368,11 +368,13 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " max_runtime_in_seconds=7200)"
+ "processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " max_runtime_in_seconds=7200,\n",
+ ")"
]
},
{
@@ -383,26 +385,26 @@
"source": [
"from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
"\n",
- "processor.run(code='evaluate_model_metrics.py',\n",
- " inputs=[\n",
- " ProcessingInput(input_name='model-tar-s3-uri',\n",
- " source=model_dir_s3_uri,\n",
- " destination='/opt/ml/processing/input/model/'),\n",
- " ProcessingInput(input_name='evaluation-data-s3-uri',\n",
- " source=raw_input_data_s3_uri,\n",
- " destination='/opt/ml/processing/input/data/')\n",
- " ],\n",
- " outputs=[\n",
- " ProcessingOutput(s3_upload_mode='EndOfJob',\n",
- " output_name='metrics',\n",
- " source='/opt/ml/processing/output/metrics'),\n",
- " ],\n",
- " arguments=[\n",
- " '--max-seq-length', str(max_seq_length)\n",
- " ],\n",
- " experiment_config=experiment_config,\n",
- " logs=True,\n",
- " wait=False)"
+ "processor.run(\n",
+ " code=\"evaluate_model_metrics.py\",\n",
+ " inputs=[\n",
+ " ProcessingInput(\n",
+ " input_name=\"model-tar-s3-uri\", source=model_dir_s3_uri, destination=\"/opt/ml/processing/input/model/\"\n",
+ " ),\n",
+ " ProcessingInput(\n",
+ " input_name=\"evaluation-data-s3-uri\",\n",
+ " source=raw_input_data_s3_uri,\n",
+ " destination=\"/opt/ml/processing/input/data/\",\n",
+ " ),\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(s3_upload_mode=\"EndOfJob\", output_name=\"metrics\", source=\"/opt/ml/processing/output/metrics\"),\n",
+ " ],\n",
+ " arguments=[\"--max-seq-length\", str(max_seq_length)],\n",
+ " experiment_config=experiment_config,\n",
+ " logs=True,\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -413,7 +415,7 @@
},
"outputs": [],
"source": [
- "scikit_processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n",
+ "scikit_processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n",
"print(scikit_processing_job_name)"
]
},
@@ -427,7 +429,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Processing Job'.format(region, scikit_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Processing Job'.format(\n",
+ " region, scikit_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -440,7 +448,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, scikit_processing_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, scikit_processing_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -453,7 +467,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, scikit_processing_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n",
+ " bucket, scikit_processing_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -469,8 +489,9 @@
"metadata": {},
"outputs": [],
"source": [
- "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=scikit_processing_job_name,\n",
- " sagemaker_session=sess)\n",
+ "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n",
+ " processing_job_name=scikit_processing_job_name, sagemaker_session=sess\n",
+ ")\n",
"\n",
"processing_job_description = running_processor.describe()\n",
"\n",
@@ -483,7 +504,7 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_evaluation_metrics_job_name = processing_job_description['ProcessingJobName']\n",
+ "processing_evaluation_metrics_job_name = processing_job_description[\"ProcessingJobName\"]\n",
"print(processing_evaluation_metrics_job_name)"
]
},
@@ -522,11 +543,11 @@
"source": [
"processing_job_description = running_processor.describe()\n",
"\n",
- "output_config = processing_job_description['ProcessingOutputConfig']\n",
- "for output in output_config['Outputs']:\n",
- " if output['OutputName'] == 'metrics':\n",
- " processed_metrics_s3_uri = output['S3Output']['S3Uri']\n",
- " \n",
+ "output_config = processing_job_description[\"ProcessingOutputConfig\"]\n",
+ "for output in output_config[\"Outputs\"]:\n",
+ " if output[\"OutputName\"] == \"metrics\":\n",
+ " processed_metrics_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n",
+ "\n",
"print(processed_metrics_s3_uri)"
]
},
@@ -593,14 +614,12 @@
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
- "#pd.set_option(\"max_rows\", 100)\n",
+ "# pd.set_option(\"max_rows\", 100)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
- " sagemaker_session=sess,\n",
- " experiment_name=experiment_name,\n",
- " sort_by=\"CreationTime\",\n",
- " sort_order=\"Descending\"\n",
+ " sagemaker_session=sess, experiment_name=experiment_name, sort_by=\"CreationTime\", sort_order=\"Descending\"\n",
")\n",
"\n",
"experiment_analytics_df = experiment_analytics.dataframe()\n",
@@ -613,7 +632,7 @@
"metadata": {},
"outputs": [],
"source": [
- "trial_component_name=experiment_analytics_df.TrialComponentName[0]\n",
+ "trial_component_name = experiment_analytics_df.TrialComponentName[0]\n",
"print(trial_component_name)"
]
},
@@ -623,7 +642,7 @@
"metadata": {},
"outputs": [],
"source": [
- "trial_component_description=sm.describe_trial_component(TrialComponentName=trial_component_name)\n",
+ "trial_component_description = sm.describe_trial_component(TrialComponentName=trial_component_name)\n",
"trial_component_description"
]
},
diff --git a/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb b/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb
index aadcfc90..e28c896e 100644
--- a/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb
+++ b/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb
@@ -83,68 +83,62 @@
"import csv\n",
"from transformers import DistilBertTokenizer\n",
"\n",
- "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
+ "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
"\n",
- "DATA_COLUMN = 'review_body'\n",
- "LABEL_COLUMN = 'star_rating'\n",
+ "DATA_COLUMN = \"review_body\"\n",
+ "LABEL_COLUMN = \"star_rating\"\n",
"LABEL_VALUES = [1, 2, 3, 4, 5]\n",
"\n",
"label_map = {}\n",
"for (i, label) in enumerate(LABEL_VALUES):\n",
" label_map[label] = i\n",
"\n",
- " \n",
+ "\n",
"class InputFeatures(object):\n",
- " \"\"\"BERT feature vectors.\"\"\"\n",
- "\n",
- " def __init__(self,\n",
- " input_ids,\n",
- " input_mask,\n",
- " segment_ids,\n",
- " label_id):\n",
- " self.input_ids = input_ids\n",
- " self.input_mask = input_mask\n",
- " self.segment_ids = segment_ids\n",
- " self.label_id = label_id\n",
- " \n",
- " \n",
+ " \"\"\"BERT feature vectors.\"\"\"\n",
+ "\n",
+ " def __init__(self, input_ids, input_mask, segment_ids, label_id):\n",
+ " self.input_ids = input_ids\n",
+ " self.input_mask = input_mask\n",
+ " self.segment_ids = segment_ids\n",
+ " self.label_id = label_id\n",
+ "\n",
+ "\n",
"class Input(object):\n",
- " \"\"\"A single training/test input for sequence classification.\"\"\"\n",
- "\n",
- " def __init__(self, text, label=None):\n",
- " \"\"\"Constructs an Input.\n",
- " Args:\n",
- " text: string. The untokenized text of the first sequence. For single\n",
- " sequence tasks, only this sequence must be specified.\n",
- " label: (Optional) string. The label of the example. This should be\n",
- " specified for train and dev examples, but not for test examples.\n",
- " \"\"\"\n",
- " self.text = text\n",
- " self.label = label\n",
- " \n",
+ " \"\"\"A single training/test input for sequence classification.\"\"\"\n",
+ "\n",
+ " def __init__(self, text, label=None):\n",
+ " \"\"\"Constructs an Input.\n",
+ " Args:\n",
+ " text: string. The untokenized text of the first sequence. For single\n",
+ " sequence tasks, only this sequence must be specified.\n",
+ " label: (Optional) string. The label of the example. This should be\n",
+ " specified for train and dev examples, but not for test examples.\n",
+ " \"\"\"\n",
+ " self.text = text\n",
+ " self.label = label\n",
+ "\n",
"\n",
"def convert_input(text_input, max_seq_length):\n",
" # First, we need to preprocess our data so that it matches the data BERT was trained on:\n",
" # 1. Lowercase our text (if we're using a BERT lowercase model)\n",
" # 2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
" # 3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
- " # \n",
+ " #\n",
" # Fortunately, the Transformers tokenizer does this for us!\n",
"\n",
" tokens = tokenizer.tokenize(text_input.text)\n",
- " print('**tokens**\\n{}\\n'.format(tokens))\n",
+ " print(\"**tokens**\\n{}\\n\".format(tokens))\n",
"\n",
- " encode_plus_tokens = tokenizer.encode_plus(text_input.text,\n",
- " pad_to_max_length=True,\n",
- " max_length=max_seq_length,\n",
- " truncation=True\n",
- " )\n",
+ " encode_plus_tokens = tokenizer.encode_plus(\n",
+ " text_input.text, pad_to_max_length=True, max_length=max_seq_length, truncation=True\n",
+ " )\n",
"\n",
" # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n",
- " input_ids = encode_plus_tokens['input_ids']\n",
- " \n",
- " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. \n",
- " input_mask = encode_plus_tokens['attention_mask']\n",
+ " input_ids = encode_plus_tokens[\"input_ids\"]\n",
+ "\n",
+ " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.\n",
+ " input_mask = encode_plus_tokens[\"attention_mask\"]\n",
"\n",
" # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n",
" segment_ids = [0] * max_seq_length\n",
@@ -152,41 +146,37 @@
" # Label for each training row (`star_rating` 1 through 5)\n",
" label_id = label_map[text_input.label]\n",
"\n",
- " features = InputFeatures(\n",
- " input_ids=input_ids,\n",
- " input_mask=input_mask,\n",
- " segment_ids=segment_ids,\n",
- " label_id=label_id)\n",
+ " features = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id)\n",
"\n",
- " print('**input_ids**\\n{}\\n'.format(features.input_ids))\n",
- " print('**input_mask**\\n{}\\n'.format(features.input_mask))\n",
- " print('**segment_ids**\\n{}\\n'.format(features.segment_ids))\n",
- " print('**label_id**\\n{}\\n'.format(features.label_id))\n",
+ " print(\"**input_ids**\\n{}\\n\".format(features.input_ids))\n",
+ " print(\"**input_mask**\\n{}\\n\".format(features.input_mask))\n",
+ " print(\"**segment_ids**\\n{}\\n\".format(features.segment_ids))\n",
+ " print(\"**label_id**\\n{}\\n\".format(features.label_id))\n",
"\n",
" return features\n",
"\n",
"\n",
"# We'll need to transform our data into a format that BERT understands.\n",
- "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. \n",
+ "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe.\n",
"# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data\n",
"def transform_inputs_to_tfrecord(inputs, max_seq_length):\n",
" tf_records = []\n",
" for (input_idx, text_input) in enumerate(inputs):\n",
- " if input_idx % 10000 == 0:\n",
- " print('Writing input {} of {}\\n'.format(input_idx, len(inputs)))\n",
+ " if input_idx % 10000 == 0:\n",
+ " print(\"Writing input {} of {}\\n\".format(input_idx, len(inputs)))\n",
+ "\n",
+ " features = convert_input(text_input, max_seq_length)\n",
"\n",
- " features = convert_input(text_input, max_seq_length)\n",
- " \n",
- " all_features = collections.OrderedDict()\n",
- " all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n",
- " all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n",
- " all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n",
- " all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n",
+ " all_features = collections.OrderedDict()\n",
+ " all_features[\"input_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n",
+ " all_features[\"input_mask\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n",
+ " all_features[\"segment_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n",
+ " all_features[\"label_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n",
"\n",
- " tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n",
- " tf_records.append(tf_record.SerializeToString())\n",
+ " tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n",
+ " tf_records.append(tf_record.SerializeToString())\n",
"\n",
- " return tf_records\n"
+ " return tf_records"
]
},
{
@@ -223,17 +213,24 @@
"import pandas as pd\n",
"\n",
"data = [\n",
- " [5,\"\"\"I needed an antivirus application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\"],\n",
- " [3,\"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\"],\n",
- " [1,\"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\"]\n",
- " ]\n",
- "\n",
- "df = pd.DataFrame(data, columns=['star_rating','review_body'])\n",
+ " [\n",
+ " 5,\n",
+ " \"\"\"I needed an antivirus application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\",\n",
+ " ],\n",
+ " [\n",
+ " 3,\n",
+ " \"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\",\n",
+ " ],\n",
+ " [\n",
+ " 1,\n",
+ " \"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\",\n",
+ " ],\n",
+ "]\n",
+ "\n",
+ "df = pd.DataFrame(data, columns=[\"star_rating\", \"review_body\"])\n",
"\n",
"# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
- "inputs = df.apply(lambda x: Input(text = x[DATA_COLUMN], \n",
- " label = x[LABEL_COLUMN]), \n",
- " axis = 1)\n",
+ "inputs = df.apply(lambda x: Input(text=x[DATA_COLUMN], label=x[LABEL_COLUMN]), axis=1)\n",
"\n",
"max_seq_length = 64\n",
"tf_records = transform_inputs_to_tfrecord(inputs, max_seq_length)"
@@ -255,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('**tf_records**')\n",
+ "print(\"**tf_records**\")\n",
"\n",
"for tf_record in tf_records:\n",
" print(tf_record)"
diff --git a/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb b/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb
index b57c2041..87f2ce7f 100644
--- a/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb
+++ b/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb
@@ -18,7 +18,8 @@
"outputs": [],
"source": [
"import sys\n",
- "print('Python Version %s' % sys.version)"
+ "\n",
+ "print(\"Python Version %s\" % sys.version)"
]
},
{
diff --git a/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb b/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb
index d94d081f..63dc0ec7 100644
--- a/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb
+++ b/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb
@@ -24,13 +24,13 @@
"import json\n",
"from botocore.exceptions import ClientError\n",
"\n",
- "iam = boto3.client('iam')\n",
- "sts = boto3.client('sts')\n",
- "cfn = boto3.client('cloudformation')\n",
- "eks = boto3.client('eks')\n",
+ "iam = boto3.client(\"iam\")\n",
+ "sts = boto3.client(\"sts\")\n",
+ "cfn = boto3.client(\"cloudformation\")\n",
+ "eks = boto3.client(\"eks\")\n",
"\n",
"region = boto3.Session().region_name\n",
- "cluster_name = 'workshop'"
+ "cluster_name = \"workshop\""
]
},
{
@@ -77,19 +77,16 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('fsx/fsx-csi-driver.json') as json_file:\n",
+ "with open(\"fsx/fsx-csi-driver.json\") as json_file:\n",
" data = json.load(json_file)\n",
" policy = json.dumps(data)\n",
"\n",
"try:\n",
- " response = iam.create_policy(\n",
- " PolicyName='Amazon_FSx_Lustre_CSI_Driver',\n",
- " PolicyDocument=policy\n",
- " )\n",
+ " response = iam.create_policy(PolicyName=\"Amazon_FSx_Lustre_CSI_Driver\", PolicyDocument=policy)\n",
" print(\"[OK] Policy created.\")\n",
"\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" print(\"[OK] Policy already exists.\")\n",
" else:\n",
" print(\"Error: %s\" % e)"
@@ -101,8 +98,8 @@
"metadata": {},
"outputs": [],
"source": [
- "account_id = sts.get_caller_identity()['Account']\n",
- "csi_policy_arn = 'arn:aws:iam::{}:policy/Amazon_FSx_Lustre_CSI_Driver'.format(account_id)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
+ "csi_policy_arn = \"arn:aws:iam::{}:policy/Amazon_FSx_Lustre_CSI_Driver\".format(account_id)\n",
"print(csi_policy_arn)"
]
},
@@ -145,7 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
- "cf_stack_name = 'eksctl-{}-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa'.format(cluster_name)\n",
+ "cf_stack_name = \"eksctl-{}-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa\".format(cluster_name)\n",
"print(cf_stack_name)"
]
},
@@ -155,9 +152,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = cfn.list_stack_resources(\n",
- " StackName=cf_stack_name\n",
- ")\n",
+ "response = cfn.list_stack_resources(StackName=cf_stack_name)\n",
"print(response)"
]
},
@@ -167,7 +162,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_name = response['StackResourceSummaries'][0]['PhysicalResourceId']\n",
+ "iam_role_name = response[\"StackResourceSummaries\"][0][\"PhysicalResourceId\"]\n",
"print(iam_role_name)"
]
},
@@ -177,7 +172,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_arn = iam.get_role(RoleName=iam_role_name)['Role']['Arn']\n",
+ "iam_role_arn = iam.get_role(RoleName=iam_role_name)[\"Role\"][\"Arn\"]\n",
"print(iam_role_arn)"
]
},
@@ -194,7 +189,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!kubectl apply -k \"github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=master\"\n"
+ "!kubectl apply -k \"github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=master\""
]
},
{
@@ -227,7 +222,7 @@
"metadata": {},
"outputs": [],
"source": [
- "bucket = 's3://fsx-container-demo'"
+ "bucket = \"s3://fsx-container-demo\""
]
},
{
@@ -270,8 +265,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!curl -o storageclass.yaml https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/examples/kubernetes/dynamic_provisioning_s3/specs/storageclass.yaml\n",
- " "
+ "!curl -o storageclass.yaml https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/examples/kubernetes/dynamic_provisioning_s3/specs/storageclass.yaml"
]
},
{
@@ -472,7 +466,7 @@
"metadata": {},
"outputs": [],
"source": [
- "fsx = boto3.client('fsx')"
+ "fsx = boto3.client(\"fsx\")"
]
},
{
@@ -482,7 +476,7 @@
"outputs": [],
"source": [
"response = fsx.describe_file_systems()\n",
- "fsx_id = response['FileSystems'][0]['FileSystemId']\n",
+ "fsx_id = response[\"FileSystems\"][0][\"FileSystemId\"]\n",
"print(fsx_id)"
]
},
@@ -492,12 +486,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = fsx.update_file_system(\n",
- " FileSystemId=fsx_id,\n",
- " LustreConfiguration={\n",
- " 'AutoImportPolicy': 'NEW_CHANGED'\n",
- " }\n",
- ")\n",
+ "response = fsx.update_file_system(FileSystemId=fsx_id, LustreConfiguration={\"AutoImportPolicy\": \"NEW_CHANGED\"})\n",
"print(response)"
]
}
diff --git a/07_train/container-demo/01_Develop_Code_Notebook.ipynb b/07_train/container-demo/01_Develop_Code_Notebook.ipynb
index 5641c4d7..181a5adf 100644
--- a/07_train/container-demo/01_Develop_Code_Notebook.ipynb
+++ b/07_train/container-demo/01_Develop_Code_Notebook.ipynb
@@ -56,12 +56,12 @@
"metadata": {},
"outputs": [],
"source": [
- "train_data='./input/data/train'\n",
- "validation_data='./input/data/validation'\n",
- "test_data='./input/data/test'\n",
- "local_model_dir='./model/'\n",
- "num_gpus=0\n",
- "input_data_config='File'"
+ "train_data = \"./input/data/train\"\n",
+ "validation_data = \"./input/data/validation\"\n",
+ "test_data = \"./input/data/test\"\n",
+ "local_model_dir = \"./model/\"\n",
+ "num_gpus = 0\n",
+ "input_data_config = \"File\""
]
},
{
@@ -70,22 +70,22 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=8\n",
- "validation_batch_size=8\n",
- "test_batch_size=8\n",
- "train_steps_per_epoch=1\n",
- "validation_steps=1\n",
- "test_steps=1\n",
- "use_xla=True\n",
- "use_amp=False\n",
- "max_seq_length=64\n",
- "freeze_bert_layer=True\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 8\n",
+ "validation_batch_size = 8\n",
+ "test_batch_size = 8\n",
+ "train_steps_per_epoch = 1\n",
+ "validation_steps = 1\n",
+ "test_steps = 1\n",
+ "use_xla = True\n",
+ "use_amp = False\n",
+ "max_seq_length = 64\n",
+ "freeze_bert_layer = True\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -116,66 +116,66 @@
"\n",
"CLASSES = [1, 2, 3, 4, 5]\n",
"\n",
+ "\n",
"def select_data_and_label_from_record(record):\n",
- " x = {\n",
- " 'input_ids': record['input_ids'],\n",
- " 'input_mask': record['input_mask'],\n",
- " 'segment_ids': record['segment_ids']\n",
- " }\n",
+ " x = {\"input_ids\": record[\"input_ids\"], \"input_mask\": record[\"input_mask\"], \"segment_ids\": record[\"segment_ids\"]}\n",
"\n",
- " y = record['label_ids']\n",
+ " y = record[\"label_ids\"]\n",
"\n",
" return (x, y)\n",
"\n",
"\n",
- "def file_based_input_dataset_builder(channel,\n",
- " input_filenames,\n",
- " pipe_mode,\n",
- " is_training,\n",
- " drop_remainder,\n",
- " batch_size,\n",
- " epochs,\n",
- " steps_per_epoch,\n",
- " max_seq_length):\n",
+ "def file_based_input_dataset_builder(\n",
+ " channel,\n",
+ " input_filenames,\n",
+ " pipe_mode,\n",
+ " is_training,\n",
+ " drop_remainder,\n",
+ " batch_size,\n",
+ " epochs,\n",
+ " steps_per_epoch,\n",
+ " max_seq_length,\n",
+ "):\n",
"\n",
" # For training, we want a lot of parallel reading and shuffling.\n",
" # For eval, we want no shuffling and parallel reading doesn't matter.\n",
"\n",
" if pipe_mode:\n",
- " print('***** Using pipe_mode with channel {}'.format(channel))\n",
- " from sagemaker_tensorflow import PipeModeDataset\n",
- " dataset = PipeModeDataset(channel=channel,\n",
- " record_format='TFRecord')\n",
+ " print(\"***** Using pipe_mode with channel {}\".format(channel))\n",
+ " from sagemaker_tensorflow import PipeModeDataset\n",
+ "\n",
+ " dataset = PipeModeDataset(channel=channel, record_format=\"TFRecord\")\n",
" else:\n",
- " print('***** Using input_filenames {}'.format(input_filenames))\n",
+ " print(\"***** Using input_filenames {}\".format(input_filenames))\n",
" dataset = tf.data.TFRecordDataset(input_filenames)\n",
- " \n",
+ "\n",
" dataset = dataset.repeat(epochs * steps_per_epoch * 100)\n",
"\n",
" name_to_features = {\n",
- " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- " \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
- " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n",
+ " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n",
+ " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n",
" }\n",
"\n",
" def _decode_record(record, name_to_features):\n",
" \"\"\"Decodes a record to a TensorFlow example.\"\"\"\n",
" record = tf.io.parse_single_example(record, name_to_features)\n",
" return record\n",
- " \n",
+ "\n",
" dataset = dataset.apply(\n",
" tf.data.experimental.map_and_batch(\n",
- " lambda record: _decode_record(record, name_to_features),\n",
- " batch_size=batch_size,\n",
- " drop_remainder=drop_remainder,\n",
- " num_parallel_calls=tf.data.experimental.AUTOTUNE))\n",
- "\n",
- " dataset = dataset.shuffle(buffer_size=1000,\n",
- " reshuffle_each_iteration=True)\n",
- " \n",
+ " lambda record: _decode_record(record, name_to_features),\n",
+ " batch_size=batch_size,\n",
+ " drop_remainder=drop_remainder,\n",
+ " num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)\n",
+ "\n",
" row_count = 0\n",
- " print('**************** {} *****************'.format(channel))\n",
+ " print(\"**************** {} *****************\".format(channel))\n",
" for row in dataset.as_numpy_iterator():\n",
" if row_count == 1:\n",
" break\n",
@@ -184,105 +184,106 @@
" return dataset\n",
"\n",
"\n",
- "if __name__ == '__main__':\n",
- "\n",
- " args=easydict.EasyDict({\n",
- " 'train_data': train_data,\n",
- " 'validation_data': validation_data,\n",
- " 'test_data': test_data,\n",
- " 'local_model_dir': local_model_dir,\n",
- " 'num_gpus': num_gpus,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp,\n",
- " 'max_seq_length': max_seq_length,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size,\n",
- " 'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions,\n",
- " 'input_data_config': input_data_config\n",
- " })\n",
- " \n",
- " \n",
- " env_var = os.environ \n",
- " print(\"Environment Variables:\") \n",
- " pprint.pprint(dict(env_var), width = 1) \n",
- " \n",
+ "if __name__ == \"__main__\":\n",
+ "\n",
+ " args = easydict.EasyDict(\n",
+ " {\n",
+ " \"train_data\": train_data,\n",
+ " \"validation_data\": validation_data,\n",
+ " \"test_data\": test_data,\n",
+ " \"local_model_dir\": local_model_dir,\n",
+ " \"num_gpus\": num_gpus,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " \"input_data_config\": input_data_config,\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " env_var = os.environ\n",
+ " print(\"Environment Variables:\")\n",
+ " pprint.pprint(dict(env_var), width=1)\n",
+ "\n",
" train_data = args.train_data\n",
- " print('train_data {}'.format(train_data))\n",
+ " print(\"train_data {}\".format(train_data))\n",
" validation_data = args.validation_data\n",
- " print('validation_data {}'.format(validation_data))\n",
+ " print(\"validation_data {}\".format(validation_data))\n",
" test_data = args.test_data\n",
- " print('test_data {}'.format(test_data)) \n",
+ " print(\"test_data {}\".format(test_data))\n",
" local_model_dir = args.local_model_dir\n",
- " print('local_model_dir {}'.format(local_model_dir)) \n",
+ " print(\"local_model_dir {}\".format(local_model_dir))\n",
" num_gpus = args.num_gpus\n",
- " print('num_gpus {}'.format(num_gpus)) \n",
+ " print(\"num_gpus {}\".format(num_gpus))\n",
" use_xla = args.use_xla\n",
- " print('use_xla {}'.format(use_xla)) \n",
+ " print(\"use_xla {}\".format(use_xla))\n",
" use_amp = args.use_amp\n",
- " print('use_amp {}'.format(use_amp)) \n",
+ " print(\"use_amp {}\".format(use_amp))\n",
" max_seq_length = args.max_seq_length\n",
- " print('max_seq_length {}'.format(max_seq_length)) \n",
+ " print(\"max_seq_length {}\".format(max_seq_length))\n",
" train_batch_size = args.train_batch_size\n",
- " print('train_batch_size {}'.format(train_batch_size)) \n",
+ " print(\"train_batch_size {}\".format(train_batch_size))\n",
" validation_batch_size = args.validation_batch_size\n",
- " print('validation_batch_size {}'.format(validation_batch_size)) \n",
+ " print(\"validation_batch_size {}\".format(validation_batch_size))\n",
" test_batch_size = args.test_batch_size\n",
- " print('test_batch_size {}'.format(test_batch_size)) \n",
+ " print(\"test_batch_size {}\".format(test_batch_size))\n",
" epochs = args.epochs\n",
- " print('epochs {}'.format(epochs)) \n",
+ " print(\"epochs {}\".format(epochs))\n",
" learning_rate = args.learning_rate\n",
- " print('learning_rate {}'.format(learning_rate)) \n",
+ " print(\"learning_rate {}\".format(learning_rate))\n",
" epsilon = args.epsilon\n",
- " print('epsilon {}'.format(epsilon)) \n",
+ " print(\"epsilon {}\".format(epsilon))\n",
" train_steps_per_epoch = args.train_steps_per_epoch\n",
- " print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) \n",
+ " print(\"train_steps_per_epoch {}\".format(train_steps_per_epoch))\n",
" validation_steps = args.validation_steps\n",
- " print('validation_steps {}'.format(validation_steps)) \n",
+ " print(\"validation_steps {}\".format(validation_steps))\n",
" test_steps = args.test_steps\n",
- " print('test_steps {}'.format(test_steps)) \n",
+ " print(\"test_steps {}\".format(test_steps))\n",
" freeze_bert_layer = args.freeze_bert_layer\n",
- " print('freeze_bert_layer {}'.format(freeze_bert_layer)) \n",
+ " print(\"freeze_bert_layer {}\".format(freeze_bert_layer))\n",
" run_validation = args.run_validation\n",
- " print('run_validation {}'.format(run_validation)) \n",
+ " print(\"run_validation {}\".format(run_validation))\n",
" run_test = args.run_test\n",
- " print('run_test {}'.format(run_test)) \n",
+ " print(\"run_test {}\".format(run_test))\n",
" run_sample_predictions = args.run_sample_predictions\n",
- " print('run_sample_predictions {}'.format(run_sample_predictions))\n",
+ " print(\"run_sample_predictions {}\".format(run_sample_predictions))\n",
" input_data_config = args.input_data_config\n",
- " print('input_data_config {}'.format(input_data_config))\n",
- " \n",
- " # Determine if PipeMode is enabled \n",
- " pipe_mode = (input_data_config.find('Pipe') >= 0)\n",
- " print('Using pipe_mode: {}'.format(pipe_mode))\n",
- " \n",
- " # Model Output \n",
- " transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')\n",
+ " print(\"input_data_config {}\".format(input_data_config))\n",
+ "\n",
+ " # Determine if PipeMode is enabled\n",
+ " pipe_mode = input_data_config.find(\"Pipe\") >= 0\n",
+ " print(\"Using pipe_mode: {}\".format(pipe_mode))\n",
+ "\n",
+ " # Model Output\n",
+ " transformer_fine_tuned_model_path = os.path.join(local_model_dir, \"transformers/fine-tuned/\")\n",
" os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)\n",
"\n",
" # SavedModel Output\n",
- " tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')\n",
- " os.makedirs(tensorflow_saved_model_path, exist_ok=True) \n",
- " \n",
+ " tensorflow_saved_model_path = os.path.join(local_model_dir, \"tensorflow/saved_model/0\")\n",
+ " os.makedirs(tensorflow_saved_model_path, exist_ok=True)\n",
+ "\n",
" distributed_strategy = tf.distribute.MirroredStrategy()\n",
"\n",
" with distributed_strategy.scope():\n",
" tf.config.optimizer.set_jit(use_xla)\n",
" tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": use_amp})\n",
"\n",
- " train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))\n",
- " print('train_data_filenames {}'.format(train_data_filenames))\n",
+ " train_data_filenames = glob(os.path.join(train_data, \"*.tfrecord\"))\n",
+ " print(\"train_data_filenames {}\".format(train_data_filenames))\n",
" train_dataset = file_based_input_dataset_builder(\n",
- " channel='train',\n",
+ " channel=\"train\",\n",
" input_filenames=train_data_filenames,\n",
" pipe_mode=pipe_mode,\n",
" is_training=True,\n",
@@ -290,7 +291,8 @@
" batch_size=train_batch_size,\n",
" epochs=epochs,\n",
" steps_per_epoch=train_steps_per_epoch,\n",
- " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n",
+ " max_seq_length=max_seq_length,\n",
+ " ).map(select_data_and_label_from_record)\n",
"\n",
" tokenizer = None\n",
" config = None\n",
@@ -298,48 +300,46 @@
"\n",
" successful_download = False\n",
" retries = 0\n",
- " while (retries < 5 and not successful_download):\n",
+ " while retries < 5 and not successful_download:\n",
" try:\n",
- " tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
- " config = DistilBertConfig.from_pretrained('distilbert-base-uncased',\n",
- " num_labels=len(CLASSES))\n",
- " model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',\n",
- " config=config)\n",
+ " tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
+ " config = DistilBertConfig.from_pretrained(\"distilbert-base-uncased\", num_labels=len(CLASSES))\n",
+ " model = TFDistilBertForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", config=config)\n",
" successful_download = True\n",
- " print('Sucessfully downloaded after {} retries.'.format(retries))\n",
+ " print(\"Sucessfully downloaded after {} retries.\".format(retries))\n",
" except:\n",
" retries = retries + 1\n",
" random_sleep = random.randint(1, 30)\n",
- " print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))\n",
+ " print(\"Retry #{}. Sleeping for {} seconds\".format(retries, random_sleep))\n",
" time.sleep(random_sleep)\n",
"\n",
- " callbacks = [] \n",
- " initial_epoch_number = 0 \n",
+ " callbacks = []\n",
+ " initial_epoch_number = 0\n",
"\n",
" if not tokenizer or not model or not config:\n",
- " print('Not properly initialized...')\n",
+ " print(\"Not properly initialized...\")\n",
"\n",
" optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n",
- " print('** use_amp {}'.format(use_amp)) \n",
+ " print(\"** use_amp {}\".format(use_amp))\n",
" if use_amp:\n",
" # loss scaling is currently required when using mixed precision\n",
- " optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')\n",
- " \n",
- " print('*** OPTIMIZER {} ***'.format(optimizer))\n",
- " \n",
+ " optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, \"dynamic\")\n",
+ "\n",
+ " print(\"*** OPTIMIZER {} ***\".format(optimizer))\n",
+ "\n",
" loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
- " metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
+ " metric = tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")\n",
"\n",
" model.compile(optimizer=optimizer, loss=loss, metrics=[metric])\n",
- " print('Compiled model {}'.format(model)) \n",
+ " print(\"Compiled model {}\".format(model))\n",
" model.layers[0].trainable = not freeze_bert_layer\n",
" print(model.summary())\n",
"\n",
" if run_validation:\n",
- " validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))\n",
- " print('validation_data_filenames {}'.format(validation_data_filenames))\n",
+ " validation_data_filenames = glob(os.path.join(validation_data, \"*.tfrecord\"))\n",
+ " print(\"validation_data_filenames {}\".format(validation_data_filenames))\n",
" validation_dataset = file_based_input_dataset_builder(\n",
- " channel='validation',\n",
+ " channel=\"validation\",\n",
" input_filenames=validation_data_filenames,\n",
" pipe_mode=pipe_mode,\n",
" is_training=False,\n",
@@ -347,34 +347,39 @@
" batch_size=validation_batch_size,\n",
" epochs=epochs,\n",
" steps_per_epoch=validation_steps,\n",
- " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n",
- " \n",
- " print('Starting Training and Validation...')\n",
+ " max_seq_length=max_seq_length,\n",
+ " ).map(select_data_and_label_from_record)\n",
+ "\n",
+ " print(\"Starting Training and Validation...\")\n",
" validation_dataset = validation_dataset.take(validation_steps)\n",
- " train_and_validation_history = model.fit(train_dataset,\n",
- " shuffle=True,\n",
- " epochs=epochs,\n",
- " initial_epoch=initial_epoch_number,\n",
- " steps_per_epoch=train_steps_per_epoch,\n",
- " validation_data=validation_dataset,\n",
- " validation_steps=validation_steps,\n",
- " callbacks=callbacks) \n",
+ " train_and_validation_history = model.fit(\n",
+ " train_dataset,\n",
+ " shuffle=True,\n",
+ " epochs=epochs,\n",
+ " initial_epoch=initial_epoch_number,\n",
+ " steps_per_epoch=train_steps_per_epoch,\n",
+ " validation_data=validation_dataset,\n",
+ " validation_steps=validation_steps,\n",
+ " callbacks=callbacks,\n",
+ " )\n",
" print(train_and_validation_history)\n",
- " else: # Not running validation\n",
- " print('Starting Training (Without Validation)...')\n",
- " train_history = model.fit(train_dataset,\n",
- " shuffle=True,\n",
- " epochs=epochs,\n",
- " initial_epoch=initial_epoch_number,\n",
- " steps_per_epoch=train_steps_per_epoch,\n",
- " callbacks=callbacks) \n",
+ " else: # Not running validation\n",
+ " print(\"Starting Training (Without Validation)...\")\n",
+ " train_history = model.fit(\n",
+ " train_dataset,\n",
+ " shuffle=True,\n",
+ " epochs=epochs,\n",
+ " initial_epoch=initial_epoch_number,\n",
+ " steps_per_epoch=train_steps_per_epoch,\n",
+ " callbacks=callbacks,\n",
+ " )\n",
" print(train_history)\n",
"\n",
" if run_test:\n",
- " test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))\n",
- " print('test_data_filenames {}'.format(test_data_filenames))\n",
+ " test_data_filenames = glob(os.path.join(test_data, \"*.tfrecord\"))\n",
+ " print(\"test_data_filenames {}\".format(test_data_filenames))\n",
" test_dataset = file_based_input_dataset_builder(\n",
- " channel='test',\n",
+ " channel=\"test\",\n",
" input_filenames=test_data_filenames,\n",
" pipe_mode=pipe_mode,\n",
" is_training=False,\n",
@@ -382,56 +387,50 @@
" batch_size=test_batch_size,\n",
" epochs=epochs,\n",
" steps_per_epoch=test_steps,\n",
- " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n",
- "\n",
- " print('Starting test...')\n",
- " test_history = model.evaluate(test_dataset,\n",
- " steps=test_steps,\n",
- " callbacks=callbacks)\n",
- " \n",
- " print('Test history {}'.format(test_history))\n",
- " \n",
+ " max_seq_length=max_seq_length,\n",
+ " ).map(select_data_and_label_from_record)\n",
+ "\n",
+ " print(\"Starting test...\")\n",
+ " test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)\n",
+ "\n",
+ " print(\"Test history {}\".format(test_history))\n",
+ "\n",
" # Save the Fine-Tuned Transformers Model as a New \"Pre-Trained\" Model\n",
- " print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) \n",
+ " print(\"transformer_fine_tuned_model_path {}\".format(transformer_fine_tuned_model_path))\n",
" model.save_pretrained(transformer_fine_tuned_model_path)\n",
"\n",
" # Save the TensorFlow SavedModel for Serving Predictions\n",
- " print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path)) \n",
- " model.save(tensorflow_saved_model_path, save_format='tf')\n",
- " \n",
+ " print(\"tensorflow_saved_model_path {}\".format(tensorflow_saved_model_path))\n",
+ " model.save(tensorflow_saved_model_path, save_format=\"tf\")\n",
+ "\n",
" if run_sample_predictions:\n",
- " loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path,\n",
- " id2label={\n",
- " 0: 1,\n",
- " 1: 2,\n",
- " 2: 3,\n",
- " 3: 4,\n",
- " 4: 5\n",
- " },\n",
- " label2id={\n",
- " 1: 0,\n",
- " 2: 1,\n",
- " 3: 2,\n",
- " 4: 3,\n",
- " 5: 4\n",
- " })\n",
- "\n",
- " tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
+ " loaded_model = TFDistilBertForSequenceClassification.from_pretrained(\n",
+ " transformer_fine_tuned_model_path,\n",
+ " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n",
+ " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n",
+ " )\n",
+ "\n",
+ " tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
"\n",
" if num_gpus >= 1:\n",
- " inference_device = 0 # GPU 0\n",
+ " inference_device = 0 # GPU 0\n",
" else:\n",
- " inference_device = -1 # CPU\n",
- " print('inference_device {}'.format(inference_device))\n",
+ " inference_device = -1 # CPU\n",
+ " print(\"inference_device {}\".format(inference_device))\n",
"\n",
- " inference_pipeline = TextClassificationPipeline(model=loaded_model, \n",
- " tokenizer=tokenizer,\n",
- " framework='tf',\n",
- " device=inference_device) \n",
+ " inference_pipeline = TextClassificationPipeline(\n",
+ " model=loaded_model, tokenizer=tokenizer, framework=\"tf\", device=inference_device\n",
+ " )\n",
"\n",
- " print(\"\"\"I loved it! I will recommend this to everyone.\"\"\", inference_pipeline(\"\"\"I loved it! I will recommend this to everyone.\"\"\"))\n",
+ " print(\n",
+ " \"\"\"I loved it! I will recommend this to everyone.\"\"\",\n",
+ " inference_pipeline(\"\"\"I loved it! I will recommend this to everyone.\"\"\"),\n",
+ " )\n",
" print(\"\"\"It's OK.\"\"\", inference_pipeline(\"\"\"It's OK.\"\"\"))\n",
- " print(\"\"\"Really bad. I hope they don't make this anymore.\"\"\", inference_pipeline(\"\"\"Really bad. I hope they don't make this anymore.\"\"\"))"
+ " print(\n",
+ " \"\"\"Really bad. I hope they don't make this anymore.\"\"\",\n",
+ " inference_pipeline(\"\"\"Really bad. I hope they don't make this anymore.\"\"\"),\n",
+ " )"
]
}
],
diff --git a/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb b/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb
index 99a730b0..1550a60d 100644
--- a/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb
+++ b/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb
@@ -44,7 +44,7 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "session = sagemaker.Session()\n",
+ "session = sagemaker.Session()\n",
"bucket = session.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -64,7 +64,7 @@
"metadata": {},
"outputs": [],
"source": [
- "processed_train_data_s3_uri='s3://fsx-container-demo/input/data/train'\n",
+ "processed_train_data_s3_uri = \"s3://fsx-container-demo/input/data/train\"\n",
"\n",
"!aws s3 ls $processed_train_data_s3_uri/"
]
@@ -75,7 +75,7 @@
"metadata": {},
"outputs": [],
"source": [
- "processed_validation_data_s3_uri='s3://fsx-container-demo/input/data/validation'\n",
+ "processed_validation_data_s3_uri = \"s3://fsx-container-demo/input/data/validation\"\n",
"\n",
"!aws s3 ls $processed_validation_data_s3_uri/"
]
@@ -88,7 +88,7 @@
},
"outputs": [],
"source": [
- "processed_test_data_s3_uri='s3://fsx-container-demo/input/data/test'\n",
+ "processed_test_data_s3_uri = \"s3://fsx-container-demo/input/data/test\"\n",
"\n",
"!aws s3 ls $processed_test_data_s3_uri/"
]
@@ -110,12 +110,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -135,22 +132,22 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=3\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=64\n",
- "test_batch_size=64\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=10\n",
- "test_steps=10\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "max_seq_length=64\n",
- "freeze_bert_layer=True\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 3\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 64\n",
+ "test_batch_size = 64\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 10\n",
+ "test_steps = 10\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "max_seq_length = 64\n",
+ "freeze_bert_layer = True\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -191,10 +188,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -225,36 +222,39 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='train.py',\n",
- " source_dir='code',\n",
- " role=role,\n",
- " instance_count=1,\n",
- " instance_type='ml.c5.9xlarge',\n",
- " use_spot_instances=True,\n",
- " max_run=3600,\n",
- " max_wait=3600,\n",
- " volume_size=1024,\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode='Pipe',\n",
- " metric_definitions=metrics_definitions\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"train.py\",\n",
+ " source_dir=\"code\",\n",
+ " role=role,\n",
+ " instance_count=1,\n",
+ " instance_type=\"ml.c5.9xlarge\",\n",
+ " use_spot_instances=True,\n",
+ " max_run=3600,\n",
+ " max_wait=3600,\n",
+ " volume_size=1024,\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=\"Pipe\",\n",
+ " metric_definitions=metrics_definitions,\n",
+ ")"
]
},
{
@@ -270,11 +270,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -284,7 +283,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -295,7 +294,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -306,7 +311,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -317,7 +328,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/07_train/container-demo/code/train.py b/07_train/container-demo/code/train.py
index 94bc8fc8..38f7e539 100644
--- a/07_train/container-demo/code/train.py
+++ b/07_train/container-demo/code/train.py
@@ -10,9 +10,9 @@
import os
import tensorflow as tf
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==2.8.0"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker-tensorflow==2.1.0.1.0.0"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
@@ -25,65 +25,64 @@
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
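+    # Repeat far beyond what is needed; epochs * steps_per_epoch bounds the records actually consumed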
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
if row_count == 1:
break
@@ -93,159 +92,114 @@ def _decode_record(record, name_to_features):
return dataset
-if __name__ == '__main__':
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('Listing /opt...')
- for root, subFolder, files in os.walk('/opt'):
+if __name__ == "__main__":
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("Listing /opt...")
+ for root, subFolder, files in os.walk("/opt"):
for item in files:
- print('{},{},{}'.format(root, subFolder, item))
- print('Done.')
-
+ print("{},{},{}".format(root, subFolder, item))
+ print("Done.")
+
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--input_data_config',
- type=str,
- default=os.environ['SM_INPUT_DATA_CONFIG'])
- parser.add_argument('--local_model_dir',
- type=str,
- default=os.environ['SM_MODEL_DIR'])
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=64)
- parser.add_argument('--test_batch_size',
- type=int,
- default=64)
- parser.add_argument('--epochs',
- type=int,
- default=3)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00001)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=100)
- parser.add_argument('--validation_steps',
- type=int,
- default=10)
- parser.add_argument('--test_steps',
- type=int,
- default=10)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--input_data_config", type=str, default=os.environ["SM_INPUT_DATA_CONFIG"])
+ parser.add_argument("--local_model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=64)
+ parser.add_argument("--test_batch_size", type=int, default=64)
+ parser.add_argument("--epochs", type=int, default=3)
+ parser.add_argument("--learning_rate", type=float, default=0.00001)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=100)
+ parser.add_argument("--validation_steps", type=int, default=10)
+ parser.add_argument("--test_steps", type=int, default=10)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
+ print("test_data {}".format(test_data))
local_model_dir = args.local_model_dir
- print('local_model_dir {}'.format(local_model_dir))
+ print("local_model_dir {}".format(local_model_dir))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
+ print("num_gpus {}".format(num_gpus))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
input_data_config = args.input_data_config
- print('input_data_config {}'.format(input_data_config))
-
-
- # Determine if PipeMode is enabled
- pipe_mode = (input_data_config.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ print("input_data_config {}".format(input_data_config))
+
+ # Determine if PipeMode is enabled
+ pipe_mode = input_data_config.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
- os.makedirs(tensorflow_saved_model_path, exist_ok=True)
-
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
+ os.makedirs(tensorflow_saved_model_path, exist_ok=True)
+
distributed_strategy = tf.distribute.MirroredStrategy()
-
+
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -253,7 +207,8 @@ def _decode_record(record, name_to_features):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -261,50 +216,47 @@ def _decode_record(record, name_to_features):
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
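+    # Retry the pretrained model download a few times; hub downloads can fail transiently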
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES))
- model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
- config=config)
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=len(CLASSES))
+ model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
+ print("Compiled model {}".format(model))
model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -312,34 +264,39 @@ def _decode_record(record, name_to_features):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -347,53 +304,47 @@ def _decode_record(record, name_to_features):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
# Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
model.save_pretrained(transformer_fine_tuned_model_path)
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path, save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, save_format="tf")
+
if run_sample_predictions:
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path,
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
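+    # id2label/label2id map the model's 0-4 class indices back to 1-5 star ratings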
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
+ transformer_fine_tuned_model_path,
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
+
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
if num_gpus >= 1:
- inference_device = 0 # GPU 0
+ inference_device = 0 # GPU 0
else:
- inference_device = -1 # CPU
- print('inference_device {}'.format(inference_device))
+ inference_device = -1 # CPU
+ print("inference_device {}".format(inference_device))
- inference_pipeline = TextClassificationPipeline(model=loaded_model,
- tokenizer=tokenizer,
- framework='tf',
- device=inference_device)
+ inference_pipeline = TextClassificationPipeline(
+ model=loaded_model, tokenizer=tokenizer, framework="tf", device=inference_device
+ )
- print("""I loved it! I will recommend this to everyone.""", inference_pipeline("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ inference_pipeline("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", inference_pipeline("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", inference_pipeline("""Really bad. I hope they don't make this anymore."""))
\ No newline at end of file
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ inference_pipeline("""Really bad. I hope they don't make this anymore."""),
+ )
diff --git a/07_train/evaluate_model_metrics.py b/07_train/evaluate_model_metrics.py
index 024afdec..f3523174 100644
--- a/07_train/evaluate_model_metrics.py
+++ b/07_train/evaluate_model_metrics.py
@@ -4,13 +4,16 @@
from datetime import datetime
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+
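+# Install pinned dependencies at runtime, before the imports below that depend on them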
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
import pandas as pd
import os
import re
@@ -33,99 +36,99 @@
from sklearn.utils import resample
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
CLASSES = [1, 2, 3, 4, 5]
-config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
+config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+)
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--input-model', type=str,
- default='/opt/ml/processing/input/model',
+ parser.add_argument(
+ "--input-model",
+ type=str,
+ default="/opt/ml/processing/input/model",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
-
+ )
+
return parser.parse_args()
-
+
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- print('input_data: {}'.format(args.input_data))
- print('input_model: {}'.format(args.input_model))
-
- print('Listing contents of input model dir: {}'.format(args.input_model))
+ print("Current host: {}".format(args.current_host))
+
+ print("input_data: {}".format(args.input_data))
+ print("input_model: {}".format(args.input_model))
+
+ print("Listing contents of input model dir: {}".format(args.input_model))
input_files = os.listdir(args.input_model)
for file in input_files:
print(file)
- model_tar_path = '{}/model.tar.gz'.format(args.input_model)
+ model_tar_path = "{}/model.tar.gz".format(args.input_model)
model_tar = tarfile.open(model_tar_path)
model_tar.extractall(args.input_model)
- model_tar.close()
+ model_tar.close()
- model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model))
+ model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model))
print(model)
-
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=args.max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -133,81 +136,86 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
-
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
###########################################################################################
# TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
- ###########################################################################################
-# evaluation_data_path = '/opt/ml/processing/input/data/'
-
- print('Listing contents of input data dir: {}'.format(args.input_data))
+ ###########################################################################################
+ # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+ print("Listing contents of input data dir: {}".format(args.input_data))
input_files = os.listdir(args.input_data)
- test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
- print('Using only {} to evaluate.'.format(test_data_path))
- df_test_reviews = pd.read_csv(test_data_path,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+ print("Using only {} to evaluate.".format(test_data_path))
+ df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+ ["review_body", "star_rating"]
+ ]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
- y_test = df_test_reviews['review_body'].map(predict)
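+    # Note: y_test holds the model's predictions and y_actual the ground-truth star ratings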
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
- y_actual = df_test_reviews['star_rating']
+ y_actual = df_test_reviews["star_rating"]
y_actual
    print(classification_report(y_true=y_actual, y_pred=y_test))
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
def plot_conf_mat(cm, classes, title, cmap):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=CLASSES,
- title='Confusion Matrix',
- cmap=plt.cm.Greens)
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
- # Model Output
- metrics_path = os.path.join(args.output_data, 'metrics/')
+ # Model Output
+ metrics_path = os.path.join(args.output_data, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
report_dict = {
"metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
evaluation_path = "{}/evaluation.json".format(metrics_path)
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))
-
- print('Listing contents of output dir: {}'.format(args.output_data))
+
+ print("Listing contents of output dir: {}".format(args.output_data))
output_files = os.listdir(args.output_data)
for file in output_files:
print(file)
- print('Listing contents of output/metrics dir: {}'.format(metrics_path))
- output_files = os.listdir('{}'.format(metrics_path))
+ print("Listing contents of output/metrics dir: {}".format(metrics_path))
+ output_files = os.listdir("{}".format(metrics_path))
for file in output_files:
print(file)
- print('Complete')
-
-
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
- process(args)
+ process(args)
diff --git a/07_train/src/inference.py b/07_train/src/inference.py
index 2975dc2d..53196737 100644
--- a/07_train/src/inference.py
+++ b/07_train/src/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e., 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
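+    # TensorFlow Serving REST predict request: a signature name plus the list of instances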
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
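+        # Convert the returned logits to probabilities and take the argmax as the class index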
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
diff --git a/07_train/src/tf_bert_reviews.py b/07_train/src/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/07_train/src/tf_bert_reviews.py
+++ b/07_train/src/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
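+    # Checkpoint filenames end in "_<epoch>.h5"; parse the epoch so training can resume from it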
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+ print("loaded_model {}".format(loaded_model))
+ print("initial_epoch_number {}".format(initial_epoch_number))
- print('loaded_model {}'.format(loaded_model))
- print('initial_epoch_number {}'.format(initial_epoch_number))
-
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
    # Commented out due to a possible incompatibility with the transformers library
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
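+            # id2label/label2id map the model's 0-indexed output classes to the
+            # 1-5 star-rating labels (and back) so predictions surface as star ratings.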
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
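+            # model.layers[:3] are the two Input layers and the DistilBERT encoder;
+            # freezing them leaves only the BiLSTM/Dense head trainable.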
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
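+        # Caution: the final Dense layer applies a sigmoid, while this loss is built
+        # with from_logits=True, which expects raw (unactivated) outputs.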
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
    # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
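+        # The trailing "0" in the SavedModel path is the numeric model version directory
+        # that TensorFlow Serving expects under the model root.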
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(inference_path))
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
-        print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
+
+        print(
+            """I loved it! I will recommend this to everyone.""",
+            predict("""I loved it! I will recommend this to everyone."""),
+        )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
        print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+        accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
        cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb b/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
index 0edeb425..2250389d 100644
--- a/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
+++ b/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
@@ -33,12 +33,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -72,11 +72,11 @@
"source": [
"try:\n",
" processed_train_data_s3_uri\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -105,11 +105,11 @@
"source": [
"try:\n",
" processed_validation_data_s3_uri\n",
- " print('[OK]') \n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous sections before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous sections before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -138,11 +138,11 @@
"source": [
"try:\n",
" processed_test_data_s3_uri\n",
- " print('[OK]') \n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous sections before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous sections before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -205,12 +205,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -253,11 +250,11 @@
"source": [
"try:\n",
" max_seq_length\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -277,26 +274,26 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=3\n",
- "epsilon=0.00000001\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.4xlarge' #evt\n",
- "#train_instance_type='ml.m5.4xlarge' #bur\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "enable_sagemaker_debugger=False\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 3\n",
+ "epsilon = 0.00000001\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.4xlarge\" # evt\n",
+ "# train_instance_type='ml.m5.4xlarge' #bur\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "enable_sagemaker_debugger = False\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -323,11 +320,11 @@
"source": [
"try:\n",
" experiment_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -356,11 +353,11 @@
"source": [
"try:\n",
" trial_name\n",
- " print('[OK]') \n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -381,7 +378,7 @@
"import time\n",
"from smexperiments.trial import Trial\n",
"\n",
- "timestamp = '{}'.format(int(time.time()))\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
"\n",
"trial = Trial.load(trial_name=trial_name)\n",
"print(trial)"
@@ -395,11 +392,10 @@
"source": [
"from smexperiments.tracker import Tracker\n",
"\n",
- "tracker_optimize = Tracker.create(display_name='optimize-1', \n",
- " sagemaker_boto_client=sm)\n",
+ "tracker_optimize = Tracker.create(display_name=\"optimize-1\", sagemaker_boto_client=sm)\n",
"\n",
"optimize_trial_component_name = tracker_optimize.trial_component.trial_component_name\n",
- "print('Optimize trial component name {}'.format(optimize_trial_component_name))"
+ "print(\"Optimize trial component name {}\".format(optimize_trial_component_name))"
]
},
{
@@ -435,11 +431,11 @@
"from sagemaker.tuner import ContinuousParameter\n",
"from sagemaker.tuner import CategoricalParameter\n",
"from sagemaker.tuner import HyperparameterTuner\n",
- " \n",
+ "\n",
"hyperparameter_ranges = {\n",
- " 'learning_rate': ContinuousParameter(0.00001, 0.00005, scaling_type='Linear'),\n",
- " 'train_batch_size': CategoricalParameter([128, 256]),\n",
- " 'freeze_bert_layer': CategoricalParameter([True, False])\n",
+ " \"learning_rate\": ContinuousParameter(0.00001, 0.00005, scaling_type=\"Linear\"),\n",
+ " \"train_batch_size\": CategoricalParameter([128, 256]),\n",
+ " \"freeze_bert_layer\": CategoricalParameter([True, False]),\n",
"}"
]
},
@@ -457,10 +453,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -472,34 +468,37 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py37',\n",
- " framework_version='2.3.1',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'epsilon': epsilon,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp,\n",
- " 'max_seq_length': max_seq_length,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger, \n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " metric_definitions=metrics_definitions,\n",
- "# max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py37\",\n",
+ " framework_version=\"2.3.1\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " metric_definitions=metrics_definitions,\n",
+ " # max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n",
+ ")"
]
},
{
@@ -515,18 +514,18 @@
"metadata": {},
"outputs": [],
"source": [
- "objective_metric_name = 'train:accuracy'\n",
+ "objective_metric_name = \"train:accuracy\"\n",
"\n",
"tuner = HyperparameterTuner(\n",
" estimator=estimator,\n",
- " objective_type='Maximize',\n",
+ " objective_type=\"Maximize\",\n",
" objective_metric_name=objective_metric_name,\n",
" hyperparameter_ranges=hyperparameter_ranges,\n",
" metric_definitions=metrics_definitions,\n",
" max_jobs=2,\n",
" max_parallel_jobs=1,\n",
- " strategy='Bayesian',\n",
- " early_stopping_type='Auto'\n",
+ " strategy=\"Bayesian\",\n",
+ " early_stopping_type=\"Auto\",\n",
")"
]
},
@@ -543,12 +542,11 @@
"metadata": {},
"outputs": [],
"source": [
- "tuner.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " include_cls_metadata=False,\n",
- " wait=False)"
+ "tuner.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " include_cls_metadata=False,\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -579,8 +577,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Hyper-Parameter Tuning Job'.format(region, tuning_job_name)))"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Hyper-Parameter Tuning Job'.format(\n",
+ " region, tuning_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -624,10 +628,7 @@
"source": [
"from sagemaker.analytics import HyperparameterTuningJobAnalytics\n",
"\n",
- "hp_results = HyperparameterTuningJobAnalytics(\n",
- " sagemaker_session=sess, \n",
- " hyperparameter_tuning_job_name=tuning_job_name\n",
- ")\n",
+ "hp_results = HyperparameterTuningJobAnalytics(sagemaker_session=sess, hyperparameter_tuning_job_name=tuning_job_name)\n",
"\n",
"df_results = hp_results.dataframe()\n",
"df_results.shape"
@@ -639,7 +640,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_results.sort_values('FinalObjectiveValue', ascending=0)"
+ "df_results.sort_values(\"FinalObjectiveValue\", ascending=0)"
]
},
{
@@ -655,7 +656,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)"
+ "df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)"
]
},
{
@@ -673,7 +674,7 @@
"metadata": {},
"outputs": [],
"source": [
- "best_learning_rate = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['learning_rate']\n",
+ "best_learning_rate = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"learning_rate\"]\n",
"print(best_learning_rate)"
]
},
@@ -683,7 +684,7 @@
"metadata": {},
"outputs": [],
"source": [
- "best_accuracy = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['FinalObjectiveValue']\n",
+ "best_accuracy = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"FinalObjectiveValue\"]\n",
"print(best_accuracy)"
]
},
@@ -693,9 +694,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tracker_optimize.log_parameters({\n",
- " 'learning_rate': float(best_learning_rate)\n",
- "})\n",
+ "tracker_optimize.log_parameters({\"learning_rate\": float(best_learning_rate)})\n",
"\n",
"# must save after logging\n",
"tracker_optimize.trial_component.save()"
@@ -707,7 +706,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tracker_optimize.log_metric('accuracy', float(best_accuracy))\n",
+ "tracker_optimize.log_metric(\"accuracy\", float(best_accuracy))\n",
"\n",
"# must save after logging\n",
"tracker_optimize.trial_component.save()"
@@ -731,7 +730,7 @@
"lineage_table = ExperimentAnalytics(\n",
" sagemaker_session=sess,\n",
" experiment_name=experiment_name,\n",
- " metric_names=['validation:accuracy'],\n",
+ " metric_names=[\"validation:accuracy\"],\n",
" sort_by=\"CreationTime\",\n",
" sort_order=\"Descending\",\n",
")\n",
diff --git a/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb b/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
index c07a2273..2e7e7cfd 100644
--- a/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
+++ b/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb
@@ -32,12 +32,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -66,11 +66,11 @@
"source": [
"try:\n",
" tuning_job_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the previous Hyperparameter Tuning notebook.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -95,9 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
- "job_description = sm.describe_hyper_parameter_tuning_job(\n",
- " HyperParameterTuningJobName=tuning_job_name\n",
- ")"
+ "job_description = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)"
]
},
{
@@ -107,15 +105,15 @@
"outputs": [],
"source": [
"if not bool(job_description):\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++') \n",
- " print('[ERROR] Please run the previous Hyperparameter Tuning notebook before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++') \n",
- "elif job_description['HyperParameterTuningJobStatus'] == 'Completed':\n",
- " print('[OK] Previous Tuning Job has completed. Please continue.')\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ "elif job_description[\"HyperParameterTuningJobStatus\"] == \"Completed\":\n",
+ " print(\"[OK] Previous Tuning Job has completed. Please continue.\")\n",
"else:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the previous Hyperparameter Tuning notebook.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -142,11 +140,11 @@
"source": [
"try:\n",
" processed_train_data_s3_uri\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -175,11 +173,11 @@
"source": [
"try:\n",
" processed_validation_data_s3_uri\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -208,11 +206,11 @@
"source": [
"try:\n",
" processed_test_data_s3_uri\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -262,12 +260,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -310,11 +305,11 @@
"source": [
"try:\n",
" max_seq_length\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -332,26 +327,26 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=3\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.4xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "enable_sagemaker_debugger=False\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 3\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.4xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "enable_sagemaker_debugger = False\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -378,11 +373,11 @@
"source": [
"try:\n",
" experiment_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -411,11 +406,11 @@
"source": [
"try:\n",
" trial_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -436,7 +431,7 @@
"import time\n",
"from smexperiments.trial import Trial\n",
"\n",
- "timestamp = '{}'.format(int(time.time()))\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
"\n",
"trial = Trial.load(trial_name=trial_name)\n",
"print(trial)"
@@ -450,11 +445,10 @@
"source": [
"from smexperiments.tracker import Tracker\n",
"\n",
- "tracker_optimize = Tracker.create(display_name='optimize-2', \n",
- " sagemaker_boto_client=sm)\n",
+ "tracker_optimize = Tracker.create(display_name=\"optimize-2\", sagemaker_boto_client=sm)\n",
"\n",
"optimize_trial_component_name = tracker_optimize.trial_component.trial_component_name\n",
- "print('Optimize trial component name {}'.format(optimize_trial_component_name))"
+ "print(\"Optimize trial component name {}\".format(optimize_trial_component_name))"
]
},
{
@@ -491,11 +485,11 @@
"from sagemaker.tuner import ContinuousParameter\n",
"from sagemaker.tuner import CategoricalParameter\n",
"from sagemaker.tuner import HyperparameterTuner\n",
- " \n",
+ "\n",
"hyperparameter_ranges = {\n",
- " 'learning_rate': ContinuousParameter(0.00015, 0.00075, scaling_type='Linear'),\n",
- " 'train_batch_size': CategoricalParameter([64, 128]), \n",
- " 'freeze_bert_layer': CategoricalParameter([True, False]),\n",
+ " \"learning_rate\": ContinuousParameter(0.00015, 0.00075, scaling_type=\"Linear\"),\n",
+ " \"train_batch_size\": CategoricalParameter([64, 128]),\n",
+ " \"freeze_bert_layer\": CategoricalParameter([True, False]),\n",
"}"
]
},
@@ -532,10 +526,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -547,34 +541,37 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py37',\n",
- " framework_version='2.3.1',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'epsilon': epsilon,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp,\n",
- " 'max_seq_length': max_seq_length,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger, \n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " metric_definitions=metrics_definitions,\n",
- "# max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py37\",\n",
+ " framework_version=\"2.3.1\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " metric_definitions=metrics_definitions,\n",
+ " # max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n",
+ ")"
]
},
{
@@ -598,7 +595,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Previous Tuning Job Name: {}'.format(tuning_job_name))"
+ "print(\"Previous Tuning Job Name: {}\".format(tuning_job_name))"
]
},
{
@@ -610,8 +607,9 @@
"from sagemaker.tuner import WarmStartConfig\n",
"from sagemaker.tuner import WarmStartTypes\n",
"\n",
- "warm_start_config = WarmStartConfig(warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM, \n",
- " parents={tuning_job_name})"
+ "warm_start_config = WarmStartConfig(\n",
+ " warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM, parents={tuning_job_name}\n",
+ ")"
]
},
{
@@ -627,19 +625,19 @@
"metadata": {},
"outputs": [],
"source": [
- "objective_metric_name = 'train:accuracy'\n",
+ "objective_metric_name = \"train:accuracy\"\n",
"\n",
"tuner = HyperparameterTuner(\n",
" estimator=estimator,\n",
- " objective_type='Maximize',\n",
+ " objective_type=\"Maximize\",\n",
" objective_metric_name=objective_metric_name,\n",
" hyperparameter_ranges=hyperparameter_ranges,\n",
" metric_definitions=metrics_definitions,\n",
" max_jobs=2,\n",
" max_parallel_jobs=1,\n",
- " strategy='Bayesian',\n",
- " early_stopping_type='Auto',\n",
- " warm_start_config=warm_start_config\n",
+ " strategy=\"Bayesian\",\n",
+ " early_stopping_type=\"Auto\",\n",
+ " warm_start_config=warm_start_config,\n",
")"
]
},
@@ -658,12 +656,11 @@
},
"outputs": [],
"source": [
- "tuner.fit({'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " include_cls_metadata=False,\n",
- " wait=False)"
+ "tuner.fit(\n",
+ " {\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " include_cls_metadata=False,\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -692,27 +689,25 @@
"\n",
"tuning_job_name = tuner.latest_tuning_job.job_name\n",
"\n",
- "job_description = sm.describe_hyper_parameter_tuning_job(\n",
- " HyperParameterTuningJobName=tuning_job_name\n",
- ")\n",
+ "job_description = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)\n",
"\n",
- "status = job_description['HyperParameterTuningJobStatus']\n",
+ "status = job_description[\"HyperParameterTuningJobStatus\"]\n",
"\n",
- "print('\\n')\n",
+ "print(\"\\n\")\n",
"print(status)\n",
- "print('\\n')\n",
+ "print(\"\\n\")\n",
"pprint(job_description)\n",
"\n",
- "if status != 'Completed':\n",
- " job_count = job_description['TrainingJobStatusCounters']['Completed']\n",
- " print('Not yet complete, but {} jobs have completed.'.format(job_count))\n",
- " \n",
- " if job_description.get('BestTrainingJob', None):\n",
+ "if status != \"Completed\":\n",
+ " job_count = job_description[\"TrainingJobStatusCounters\"][\"Completed\"]\n",
+ " print(\"Not yet complete, but {} jobs have completed.\".format(job_count))\n",
+ "\n",
+ " if job_description.get(\"BestTrainingJob\", None):\n",
" print(\"Best candidate:\")\n",
- " pprint(job_description['BestTrainingJob']['TrainingJobName'])\n",
- " pprint(job_description['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric'])\n",
+ " pprint(job_description[\"BestTrainingJob\"][\"TrainingJobName\"])\n",
+ " pprint(job_description[\"BestTrainingJob\"][\"FinalHyperParameterTuningJobObjectiveMetric\"])\n",
" else:\n",
- " print(\"No training jobs have reported results yet.\") "
+ " print(\"No training jobs have reported results yet.\")"
]
},
{
@@ -724,8 +719,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Hyper-Parameter Tuning Job'.format(region, tuning_job_name)))"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Hyper-Parameter Tuning Job'.format(\n",
+ " region, tuning_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -760,10 +761,7 @@
"source": [
"from sagemaker.analytics import HyperparameterTuningJobAnalytics\n",
"\n",
- "hp_results = HyperparameterTuningJobAnalytics(\n",
- " sagemaker_session=sess, \n",
- " hyperparameter_tuning_job_name=tuning_job_name\n",
- ")\n",
+ "hp_results = HyperparameterTuningJobAnalytics(sagemaker_session=sess, hyperparameter_tuning_job_name=tuning_job_name)\n",
"\n",
"df_results = hp_results.dataframe()\n",
"df_results.shape"
@@ -775,7 +773,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_results.sort_values('FinalObjectiveValue', ascending=0)"
+ "df_results.sort_values(\"FinalObjectiveValue\", ascending=0)"
]
},
{
@@ -793,7 +791,7 @@
},
"outputs": [],
"source": [
- "df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)"
+ "df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)"
]
},
{
@@ -804,7 +802,7 @@
},
"outputs": [],
"source": [
- "best_candidate_tuning_job_name = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['TrainingJobName']"
+ "best_candidate_tuning_job_name = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"TrainingJobName\"]"
]
},
{
@@ -822,7 +820,7 @@
"metadata": {},
"outputs": [],
"source": [
- "best_learning_rate = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['learning_rate']\n",
+ "best_learning_rate = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"learning_rate\"]\n",
"print(best_learning_rate)"
]
},
@@ -832,7 +830,7 @@
"metadata": {},
"outputs": [],
"source": [
- "best_accuracy = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['FinalObjectiveValue']\n",
+ "best_accuracy = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"FinalObjectiveValue\"]\n",
"print(best_accuracy)"
]
},
@@ -842,9 +840,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tracker_optimize.log_parameters({\n",
- " 'learning_rate': float(best_learning_rate)\n",
- "})\n",
+ "tracker_optimize.log_parameters({\"learning_rate\": float(best_learning_rate)})\n",
"\n",
"# must save after logging\n",
"tracker_optimize.trial_component.save()"
@@ -856,7 +852,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tracker_optimize.log_metric('accuracy', float(best_accuracy))\n",
+ "tracker_optimize.log_metric(\"accuracy\", float(best_accuracy))\n",
"\n",
"tracker_optimize.trial_component.save()"
]
@@ -886,9 +882,9 @@
"lineage_table = ExperimentAnalytics(\n",
" sagemaker_session=sess,\n",
" experiment_name=experiment_name,\n",
- " metric_names=['validation:accuracy'],\n",
+ " metric_names=[\"validation:accuracy\"],\n",
" sort_by=\"CreationTime\",\n",
- " sort_order=\"Descending\"\n",
+ " sort_order=\"Descending\",\n",
")\n",
"\n",
"lineage_df = lineage_table.dataframe()\n",
diff --git a/08_optimize/src/inference.py b/08_optimize/src/inference.py
index 2975dc2d..53196737 100644
--- a/08_optimize/src/inference.py
+++ b/08_optimize/src/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
         # features[1..n] are others (i.e., 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
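
The reworked inference.py above follows the SageMaker TensorFlow Serving container contract: `input_handler` converts newline-delimited JSON requests into a TFS `instances` payload, and `output_handler` maps the returned log-probabilities back to 1-5 star labels. A quick local smoke test for the input path, assuming only the handler code above (`io.BytesIO` stands in for the stream-like `data` object the container passes in):

    import io
    import json

    # One JSON line in the {"features": [...]} shape the handler expects.
    request_body = io.BytesIO(json.dumps({"features": ["This is great!"]}).encode("utf-8"))

    tfs_request = input_handler(request_body, context=None)  # context is unused on the input path
    print(json.loads(tfs_request)["signature_name"])      # serving_default
    print(list(json.loads(tfs_request)["instances"][0]))  # ['input_ids', 'input_mask']
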
diff --git a/08_optimize/src/tf_bert_reviews.py b/08_optimize/src/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/08_optimize/src/tf_bert_reviews.py
+++ b/08_optimize/src/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+ print("loaded_model {}".format(loaded_model))
+ print("initial_epoch_number {}".format(initial_epoch_number))
- print('loaded_model {}'.format(loaded_model))
- print('initial_epoch_number {}'.format(initial_epoch_number))
-
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to incompatibility with transformers library (possibly)
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+            print("Successfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
     # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(inference_path))
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
+
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
         print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+                color="white" if cm[i, j] > thresh else "black",  # white text on dark cells
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
     cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb b/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb
index 83315587..0f898a39 100644
--- a/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb
+++ b/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb
@@ -51,12 +51,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
-    "sess   = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -65,15 +65,15 @@
"metadata": {},
"outputs": [],
"source": [
- "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n",
- " print(' [OK] AthenaML IS SUPPORTED IN {}'.format(region))\n",
- " print(' [OK] Please proceed with this notebook.')\n",
+ "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n",
+ " print(\" [OK] AthenaML IS SUPPORTED IN {}\".format(region))\n",
+ " print(\" [OK] Please proceed with this notebook.\")\n",
"else:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print(' [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!'.format(region))\n",
- " print(' [INFO] This is OK. SKIP this notebook and move ahead with the workshop.' )\n",
- " print(' [INFO] This notebook is not required for the rest of this workshop.' )\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\" [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!\".format(region))\n",
+ " print(\" [INFO] This is OK. SKIP this notebook and move ahead with the workshop.\")\n",
+ " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -102,13 +102,13 @@
"source": [
"try:\n",
" autopilot_endpoint_name\n",
- " print('[OK]') \n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print('[ERROR] There is no Autopilot Model Endpoint deployed.')\n",
- " print('[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.')\n",
- " print('[INFO] This notebook is not required for the rest of this workshop.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' ) "
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] There is no Autopilot Model Endpoint deployed.\")\n",
+ " print(\"[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.\")\n",
+ " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -128,25 +128,25 @@
"source": [
"try:\n",
" resp = sm.describe_endpoint(EndpointName=autopilot_endpoint_name)\n",
- " status = resp['EndpointStatus']\n",
- " if status == 'InService':\n",
- " print('[OK] Your Autopilot Model Endpoint is in status: {}'.format(status))\n",
- " elif status == 'Creating':\n",
- " print('[INFO] Your Autopilot Model Endpoint is in status: {}'.format(status))\n",
- " print('[INFO] Waiting for the endpoint to be InService. Please be patient. This might take a few minutes.')\n",
- " sm.get_waiter('endpoint_in_service').wait(EndpointName=autopilot_endpoint_name) \n",
- " else: \n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print('[ERROR] Your Autopilot Model is in status: {}'.format(status))\n",
- " print('[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.')\n",
- " print('[INFO] This notebook is not required for the rest of this workshop.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
+ " status = resp[\"EndpointStatus\"]\n",
+ " if status == \"InService\":\n",
+ " print(\"[OK] Your Autopilot Model Endpoint is in status: {}\".format(status))\n",
+ " elif status == \"Creating\":\n",
+ " print(\"[INFO] Your Autopilot Model Endpoint is in status: {}\".format(status))\n",
+ " print(\"[INFO] Waiting for the endpoint to be InService. Please be patient. This might take a few minutes.\")\n",
+ " sm.get_waiter(\"endpoint_in_service\").wait(EndpointName=autopilot_endpoint_name)\n",
+ " else:\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Your Autopilot Model is in status: {}\".format(status))\n",
+ " print(\"[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.\")\n",
+ " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"except:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print('[ERROR] There is no Autopilot Model Endpoint deployed.')\n",
- " print('[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.')\n",
- " print('[INFO] This notebook is not required for the rest of this workshop.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' ) "
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] There is no Autopilot Model Endpoint deployed.\")\n",
+ " print(\"[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.\")\n",
+ " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -197,9 +197,9 @@
"try:\n",
" ingest_create_athena_table_tsv_passed\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -218,11 +218,11 @@
"outputs": [],
"source": [
"if not ingest_create_athena_table_tsv_passed:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -231,7 +231,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_staging_dir = 's3://{}/athena/staging'.format(bucket)"
+ "s3_staging_dir = \"s3://{}/athena/staging\".format(bucket)"
]
},
{
@@ -240,10 +240,10 @@
"metadata": {},
"outputs": [],
"source": [
- "tsv_prefix = 'amazon-reviews-pds/tsv'\n",
- "database_name = 'dsoaws'\n",
- "table_name_tsv = 'amazon_reviews_tsv'\n",
- "table_name = 'product_reviews'"
+ "tsv_prefix = \"amazon-reviews-pds/tsv\"\n",
+ "database_name = \"dsoaws\"\n",
+ "table_name_tsv = \"amazon_reviews_tsv\"\n",
+ "table_name = \"product_reviews\""
]
},
{
@@ -256,7 +256,9 @@
"CREATE TABLE IF NOT EXISTS {}.{} AS \n",
"SELECT review_id, review_body \n",
"FROM {}.{}\n",
- "\"\"\".format(database_name, table_name, database_name, table_name_tsv)\n",
+ "\"\"\".format(\n",
+ " database_name, table_name, database_name, table_name_tsv\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -269,17 +271,17 @@
"source": [
"import pandas as pd\n",
"\n",
- "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n",
+ "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n",
" conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)\n",
" pd.read_sql(statement, conn)\n",
"\n",
- " print('[OK]')\n",
- "else: \n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
- " print(' [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!'.format(region))\n",
- " print(' [INFO] This is OK. SKIP this notebook and move ahead with the workshop.' )\n",
- " print(' [INFO] This notebook is not required for the rest of this workshop.' )\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )"
+ " print(\"[OK]\")\n",
+ "else:\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\" [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!\".format(region))\n",
+ " print(\" [INFO] This is OK. SKIP this notebook and move ahead with the workshop.\")\n",
+ " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -288,8 +290,8 @@
"metadata": {},
"outputs": [],
"source": [
- "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n",
- " statement = 'SELECT * FROM {}.{} LIMIT 10'.format(database_name, table_name)\n",
+ "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n",
+ " statement = \"SELECT * FROM {}.{} LIMIT 10\".format(database_name, table_name)\n",
" conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)\n",
" df_table = pd.read_sql(statement, conn)\n",
" print(df_table)"
@@ -310,17 +312,17 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "client = boto3.client('athena')\n",
+ "client = boto3.client(\"athena\")\n",
"\n",
- "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n",
+ "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n",
" try:\n",
- " response = client.create_work_group(Name='AmazonAthenaPreviewFunctionality') \n",
+ " response = client.create_work_group(Name=\"AmazonAthenaPreviewFunctionality\")\n",
" print(response)\n",
" except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'InvalidRequestException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"InvalidRequestException\":\n",
" print(\"[OK] Workgroup already exists.\")\n",
" else:\n",
- " print('[ERROR] {}'.format(e))"
+ " print(\"[ERROR] {}\".format(e))"
]
},
{
@@ -352,7 +354,9 @@
")\n",
"SELECT review_id, review_body, predict_star_rating(REPLACE(review_body, ',', ' ')) AS predicted_star_rating \n",
" FROM {}.{} LIMIT 10\n",
- " \"\"\".format(autopilot_endpoint_name, database_name, table_name)\n",
+ " \"\"\".format(\n",
+ " autopilot_endpoint_name, database_name, table_name\n",
+ ")\n",
"\n",
"print(statement)"
]
@@ -370,8 +374,8 @@
"metadata": {},
"outputs": [],
"source": [
- "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n",
- " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir, work_group='AmazonAthenaPreviewFunctionality')\n",
+ "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n",
+ " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir, work_group=\"AmazonAthenaPreviewFunctionality\")\n",
" df = pd.read_sql(statement, conn)\n",
" print(df)"
]
@@ -389,12 +393,10 @@
"metadata": {},
"outputs": [],
"source": [
- "sm = boto3.client('sagemaker')\n",
+ "sm = boto3.client(\"sagemaker\")\n",
"\n",
"if autopilot_endpoint_name:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=autopilot_endpoint_name\n",
- " )"
+ " sm.delete_endpoint(EndpointName=autopilot_endpoint_name)"
]
},
{
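
For context on the notebook above: `predict_star_rating` is an Athena ML external function bound to the Autopilot endpoint, and the hunks only show fragments of the statement template. The full statement is built roughly like this (a sketch following the Athena ML preview syntax; the VARCHAR column types are an assumption):

    statement = """
    USING EXTERNAL FUNCTION predict_star_rating(review_body VARCHAR)
        RETURNS VARCHAR
        SAGEMAKER '{}'
    SELECT review_id, review_body, predict_star_rating(REPLACE(review_body, ',', ' ')) AS predicted_star_rating
        FROM {}.{} LIMIT 10
    """.format(
        autopilot_endpoint_name, database_name, table_name
    )
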
diff --git a/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb b/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb
index 8d482558..222dc164 100644
--- a/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb
+++ b/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb
@@ -34,12 +34,12 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
-    "sess   = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -94,10 +94,12 @@
"source": [
"class StarRatingPredictor(Predictor):\n",
" def __init__(self, endpoint_name, sagemaker_session):\n",
- " super().__init__(endpoint_name, \n",
- " sagemaker_session=sagemaker_session, \n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer())"
+ " super().__init__(\n",
+ " endpoint_name,\n",
+ " sagemaker_session=sagemaker_session,\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ " )"
]
},
{
@@ -107,9 +109,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "pytorch_model_name = '{}-{}-{}'.format(training_job_name, 'pt', timestamp)\n",
+ "pytorch_model_name = \"{}-{}-{}\".format(training_job_name, \"pt\", timestamp)\n",
"\n",
"print(pytorch_model_name)"
]
@@ -120,14 +123,16 @@
"metadata": {},
"outputs": [],
"source": [
- "model = PyTorchModel(model_data=transformer_pytorch_model_dir_s3_uri + 'model.tar.gz',\n",
- " name=pytorch_model_name,\n",
- " role=role, \n",
- " entry_point='inference.py',\n",
- " source_dir='code-pytorch',\n",
- " framework_version='1.6.0',\n",
- " py_version='py3',\n",
- " predictor_cls=StarRatingPredictor)"
+ "model = PyTorchModel(\n",
+ " model_data=transformer_pytorch_model_dir_s3_uri + \"model.tar.gz\",\n",
+ " name=pytorch_model_name,\n",
+ " role=role,\n",
+ " entry_point=\"inference.py\",\n",
+ " source_dir=\"code-pytorch\",\n",
+ " framework_version=\"1.6.0\",\n",
+ " py_version=\"py3\",\n",
+ " predictor_cls=StarRatingPredictor,\n",
+ ")"
]
},
{
@@ -138,7 +143,7 @@
"source": [
"import time\n",
"\n",
- "pytorch_endpoint_name = '{}-{}-{}'.format(training_job_name, 'pt', timestamp)\n",
+ "pytorch_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"pt\", timestamp)\n",
"\n",
"print(pytorch_endpoint_name)"
]
@@ -149,10 +154,9 @@
"metadata": {},
"outputs": [],
"source": [
- "predictor = model.deploy(initial_instance_count=1, \n",
- " instance_type='ml.m5.4xlarge', \n",
- " endpoint_name=pytorch_endpoint_name, \n",
- " wait=False)"
+ "predictor = model.deploy(\n",
+ " initial_instance_count=1, instance_type=\"ml.m5.4xlarge\", endpoint_name=pytorch_endpoint_name, wait=False\n",
+ ")"
]
},
{
@@ -172,7 +176,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
-    "display(HTML('Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}\">SageMaker REST Endpoint</a>'.format(region, pytorch_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+    "        'Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}\">SageMaker REST Endpoint</a>'.format(\n",
+ " region, pytorch_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -183,7 +193,7 @@
"source": [
"%%time\n",
"\n",
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=pytorch_endpoint_name)"
]
},
@@ -200,7 +210,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pytorch_endpoint_arn = sm.describe_endpoint(EndpointName=pytorch_endpoint_name)['EndpointArn']\n",
+ "pytorch_endpoint_arn = sm.describe_endpoint(EndpointName=pytorch_endpoint_name)[\"EndpointArn\"]\n",
"print(pytorch_endpoint_arn)"
]
},
@@ -233,15 +243,12 @@
"source": [
"import json\n",
"\n",
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -260,12 +267,14 @@
"import csv\n",
"import pandas as pd\n",
"\n",
- "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_reviews = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"\n",
- "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=50)\n",
+ "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n",
"df_sample_reviews = df_sample_reviews.reset_index()\n",
"df_sample_reviews.shape"
]
@@ -278,14 +287,14 @@
"source": [
"import pandas as pd\n",
"\n",
+ "\n",
"def predict(review_body):\n",
- " inputs = [\n",
- " {\"features\": [review_body]}\n",
- " ]\n",
+ " inputs = [{\"features\": [review_body]}]\n",
" predicted_classes = predictor.predict(inputs)\n",
- " return predicted_classes[0]['predicted_label']\n",
- " \n",
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ " return predicted_classes[0][\"predicted_label\"]\n",
+ "\n",
+ "\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews.head(5)"
]
},
diff --git a/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
index 483b2f83..8f415cc4 100644
--- a/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
+++ b/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
@@ -28,12 +28,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -53,11 +53,11 @@
"source": [
"try:\n",
" training_job_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -162,7 +162,7 @@
"\n",
"timestamp = int(time.time())\n",
"\n",
- "tensorflow_model_name = '{}-{}-{}'.format(training_job_name, 'tf', timestamp)\n",
+ "tensorflow_model_name = \"{}-{}-{}\".format(training_job_name, \"tf\", timestamp)\n",
"\n",
"print(tensorflow_model_name)"
]
@@ -185,7 +185,7 @@
"outputs": [],
"source": [
"# requires enough disk space for tensorflow, transformers, and bert downloads\n",
- "instance_type = 'ml.m5.4xlarge' # evt "
+ "instance_type = \"ml.m5.4xlarge\" # evt"
]
},
{
@@ -196,12 +196,14 @@
"source": [
"from sagemaker.tensorflow.model import TensorFlowModel\n",
"\n",
- "tensorflow_model = TensorFlowModel(name=tensorflow_model_name,\n",
- " source_dir='code',\n",
- " entry_point='inference.py',\n",
- " model_data='s3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name),\n",
- " role=role,\n",
- " framework_version='2.3.1')"
+ "tensorflow_model = TensorFlowModel(\n",
+ " name=tensorflow_model_name,\n",
+ " source_dir=\"code\",\n",
+ " entry_point=\"inference.py\",\n",
+ " model_data=\"s3://{}/{}/output/model.tar.gz\".format(bucket, training_job_name),\n",
+ " role=role,\n",
+ " framework_version=\"2.3.1\",\n",
+ ")"
]
},
{
@@ -210,7 +212,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tensorflow_endpoint_name = '{}-{}-{}'.format(training_job_name, 'tf', timestamp)\n",
+ "tensorflow_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"tf\", timestamp)\n",
"\n",
"print(tensorflow_endpoint_name)"
]
@@ -223,10 +225,12 @@
},
"outputs": [],
"source": [
- "tensorflow_model.deploy(endpoint_name=tensorflow_endpoint_name,\n",
- " initial_instance_count=1, # Should use >=2 for high(er) availability \n",
- " instance_type=instance_type,\n",
- " wait=False)"
+ "tensorflow_model.deploy(\n",
+ " endpoint_name=tensorflow_endpoint_name,\n",
+ " initial_instance_count=1, # Should use >=2 for high(er) availability\n",
+ " instance_type=instance_type,\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -239,7 +243,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, tensorflow_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -257,7 +267,7 @@
"source": [
"%%time\n",
"\n",
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=tensorflow_endpoint_name)"
]
},
@@ -274,7 +284,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tensorflow_endpoint_arn = sm.describe_endpoint(EndpointName=tensorflow_endpoint_name)['EndpointArn']\n",
+ "tensorflow_endpoint_arn = sm.describe_endpoint(EndpointName=tensorflow_endpoint_name)[\"EndpointArn\"]\n",
"print(tensorflow_endpoint_arn)"
]
},
@@ -315,15 +325,17 @@
"from sagemaker.tensorflow.model import TensorFlowPredictor\n",
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
- " \n",
- "predictor = TensorFlowPredictor(endpoint_name=tensorflow_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer())"
+ "\n",
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=tensorflow_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
@@ -357,15 +369,12 @@
"metadata": {},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -383,11 +392,13 @@
"source": [
"import csv\n",
"\n",
- "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
- "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=5)\n",
+ "df_reviews = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
+ "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=5)\n",
"df_sample_reviews = df_sample_reviews.reset_index()\n",
"df_sample_reviews.shape"
]
@@ -400,14 +411,14 @@
"source": [
"import pandas as pd\n",
"\n",
+ "\n",
"def predict(review_body):\n",
- " inputs = [\n",
- " {\"features\": [review_body]}\n",
- " ]\n",
+ " inputs = [{\"features\": [review_body]}]\n",
" predicted_classes = predictor.predict(inputs)\n",
- " return predicted_classes[0]['predicted_label']\n",
- " \n",
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ " return predicted_classes[0][\"predicted_label\"]\n",
+ "\n",
+ "\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews.head(5)"
]
},
diff --git a/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
index efd0e83a..0145e5a8 100644
--- a/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
+++ b/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
@@ -17,13 +17,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "autoscale = boto3.Session().client(service_name='application-autoscaling', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "autoscale = boto3.Session().client(service_name=\"application-autoscaling\", region_name=region)"
]
},
{
@@ -34,7 +34,7 @@
},
"outputs": [],
"source": [
- "%store -r tensorflow_endpoint_name "
+ "%store -r tensorflow_endpoint_name"
]
},
{
@@ -45,11 +45,11 @@
"source": [
"try:\n",
" tensorflow_endpoint_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the previous notebook before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the previous notebook before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -77,17 +77,17 @@
"outputs": [],
"source": [
"autoscale.register_scalable_target(\n",
- " ServiceNamespace='sagemaker',\n",
+ " ServiceNamespace=\"sagemaker\",\n",
" ResourceId=\"endpoint/\" + tensorflow_endpoint_name + \"/variant/AllTraffic\",\n",
- " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n",
+ " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n",
" MinCapacity=1,\n",
" MaxCapacity=2,\n",
" RoleARN=role,\n",
" SuspendedState={\n",
- " 'DynamicScalingInSuspended': False,\n",
- " 'DynamicScalingOutSuspended': False,\n",
- " 'ScheduledScalingSuspended': False\n",
- " }\n",
+ " \"DynamicScalingInSuspended\": False,\n",
+ " \"DynamicScalingOutSuspended\": False,\n",
+ " \"ScheduledScalingSuspended\": False,\n",
+ " },\n",
")"
]
},
@@ -99,7 +99,7 @@
"source": [
"# check the target is available\n",
"autoscale.describe_scalable_targets(\n",
- " ServiceNamespace='sagemaker',\n",
+ " ServiceNamespace=\"sagemaker\",\n",
" MaxResults=100,\n",
")"
]
@@ -111,19 +111,19 @@
"outputs": [],
"source": [
"autoscale.put_scaling_policy(\n",
- " PolicyName='bert-reviews-autoscale-policy',\n",
- " ServiceNamespace='sagemaker',\n",
+ " PolicyName=\"bert-reviews-autoscale-policy\",\n",
+ " ServiceNamespace=\"sagemaker\",\n",
" ResourceId=\"endpoint/\" + tensorflow_endpoint_name + \"/variant/AllTraffic\",\n",
- " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n",
- " PolicyType='TargetTrackingScaling',\n",
+ " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n",
+ " PolicyType=\"TargetTrackingScaling\",\n",
" TargetTrackingScalingPolicyConfiguration={\n",
- " 'TargetValue': 2.0,\n",
- " 'PredefinedMetricSpecification': {\n",
- " 'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance',\n",
+ " \"TargetValue\": 2.0,\n",
+ " \"PredefinedMetricSpecification\": {\n",
+ " \"PredefinedMetricType\": \"SageMakerVariantInvocationsPerInstance\",\n",
" },\n",
- " 'ScaleOutCooldown': 60,\n",
- " 'ScaleInCooldown': 300,\n",
- " }\n",
+ " \"ScaleOutCooldown\": 60,\n",
+ " \"ScaleInCooldown\": 300,\n",
+ " },\n",
")"
]
},
@@ -137,7 +137,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, tensorflow_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -148,7 +154,7 @@
"source": [
"%%time\n",
"\n",
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=tensorflow_endpoint_name)"
]
},
@@ -170,14 +176,16 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=tensorflow_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer()) "
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=tensorflow_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
@@ -213,7 +221,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, tensorflow_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -224,16 +238,13 @@
},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"for i in range(0, 100000):\n",
" predicted_classes = predictor.predict(inputs)\n",
"\n",
" for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
diff --git a/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb b/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb
index 03f149e9..b6370504 100644
--- a/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb
+++ b/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb
@@ -47,13 +47,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "cw = boto3.Session().client(service_name='cloudwatch', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "cw = boto3.Session().client(service_name=\"cloudwatch\", region_name=region)"
]
},
{
@@ -78,14 +78,12 @@
"metadata": {},
"outputs": [],
"source": [
- "try: \n",
+ "try:\n",
" autopilot_endpoint_name\n",
- " sm.delete_endpoint(\n",
- " EndpointName=autopilot_endpoint_name\n",
- " )\n",
- " print('Autopilot Endpoint has been deleted to save resources. This is good.') \n",
+ " sm.delete_endpoint(EndpointName=autopilot_endpoint_name)\n",
+ " print(\"Autopilot Endpoint has been deleted to save resources. This is good.\")\n",
"except:\n",
- " print('Endpoints are cleaned up. This is good. Keep moving forward!')"
+ " print(\"Endpoints are cleaned up. This is good. Keep moving forward!\")"
]
},
{
@@ -193,7 +191,7 @@
" version=\"2.3.1\",\n",
" py_version=\"py37\",\n",
" instance_type=\"ml.m5.4xlarge\",\n",
- " image_scope=\"inference\"\n",
+ " image_scope=\"inference\",\n",
")\n",
"print(inference_image_uri)"
]
@@ -205,14 +203,14 @@
"outputs": [],
"source": [
"import time\n",
- "timestamp = '{}'.format(int(time.time()))\n",
"\n",
- "model_a_name = '{}-{}-{}'.format(training_job_name, 'varianta', timestamp)\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
+ "\n",
+ "model_a_name = \"{}-{}-{}\".format(training_job_name, \"varianta\", timestamp)\n",
"\n",
- "sess.create_model_from_job(name=model_a_name,\n",
- " training_job_name=training_job_name,\n",
- " role=role,\n",
- " image_uri=inference_image_uri)"
+ "sess.create_model_from_job(\n",
+ " name=model_a_name, training_job_name=training_job_name, role=role, image_uri=inference_image_uri\n",
+ ")"
]
},
{
@@ -234,12 +232,11 @@
"metadata": {},
"outputs": [],
"source": [
- "model_b_name = '{}-{}-{}'.format(training_job_name, 'variantb', timestamp)\n",
+ "model_b_name = \"{}-{}-{}\".format(training_job_name, \"variantb\", timestamp)\n",
"\n",
- "sess.create_model_from_job(name=model_b_name,\n",
- " training_job_name=training_job_name,\n",
- " role=role,\n",
- " image_uri=inference_image_uri)"
+ "sess.create_model_from_job(\n",
+ " name=model_b_name, training_job_name=training_job_name, role=role, image_uri=inference_image_uri\n",
+ ")"
]
},
{
@@ -271,25 +268,28 @@
"source": [
"from sagemaker.session import production_variant\n",
"\n",
- "timestamp = '{}'.format(int(time.time()))\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
"\n",
- "endpoint_config_name = '{}-{}-{}'.format(training_job_name, 'abtest', timestamp)\n",
+ "endpoint_config_name = \"{}-{}-{}\".format(training_job_name, \"abtest\", timestamp)\n",
"\n",
- "variantA = production_variant(model_name=model_a_name,\n",
- " instance_type='ml.m5.4xlarge',\n",
- " initial_instance_count=1,\n",
- " variant_name='VariantA',\n",
- " initial_weight=50)\n",
+ "variantA = production_variant(\n",
+ " model_name=model_a_name,\n",
+ " instance_type=\"ml.m5.4xlarge\",\n",
+ " initial_instance_count=1,\n",
+ " variant_name=\"VariantA\",\n",
+ " initial_weight=50,\n",
+ ")\n",
"\n",
- "variantB = production_variant(model_name=model_b_name,\n",
- " instance_type='ml.m5.4xlarge',\n",
- " initial_instance_count=1,\n",
- " variant_name='VariantB',\n",
- " initial_weight=50)\n",
+ "variantB = production_variant(\n",
+ " model_name=model_b_name,\n",
+ " instance_type=\"ml.m5.4xlarge\",\n",
+ " initial_instance_count=1,\n",
+ " variant_name=\"VariantB\",\n",
+ " initial_weight=50,\n",
+ ")\n",
"\n",
"endpoint_config = sm.create_endpoint_config(\n",
- " EndpointConfigName=endpoint_config_name,\n",
- " ProductionVariants=[variantA, variantB]\n",
+ " EndpointConfigName=endpoint_config_name, ProductionVariants=[variantA, variantB]\n",
")"
]
},
@@ -301,7 +301,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review REST Endpoint Configuration'.format(region, endpoint_config_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review REST Endpoint Configuration'.format(\n",
+ " region, endpoint_config_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -310,11 +316,9 @@
"metadata": {},
"outputs": [],
"source": [
- "model_ab_endpoint_name = '{}-{}-{}'.format(training_job_name, 'abtest', timestamp)\n",
+ "model_ab_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"abtest\", timestamp)\n",
"\n",
- "endpoint_response = sm.create_endpoint(\n",
- " EndpointName=model_ab_endpoint_name,\n",
- " EndpointConfigName=endpoint_config_name)"
+ "endpoint_response = sm.create_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=endpoint_config_name)"
]
},
{
@@ -384,7 +388,7 @@
"source": [
"from smexperiments.trial import Trial\n",
"\n",
- "timestamp = '{}'.format(int(time.time()))\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
"\n",
"trial = Trial.load(trial_name=trial_name)\n",
"print(trial)"
@@ -398,11 +402,10 @@
"source": [
"from smexperiments.tracker import Tracker\n",
"\n",
- "tracker_deploy = Tracker.create(display_name='deploy', \n",
- " sagemaker_boto_client=sm)\n",
+ "tracker_deploy = Tracker.create(display_name=\"deploy\", sagemaker_boto_client=sm)\n",
"\n",
"deploy_trial_component_name = tracker_deploy.trial_component.trial_component_name\n",
- "print('Deploy trial component name {}'.format(deploy_trial_component_name))"
+ "print(\"Deploy trial component name {}\".format(deploy_trial_component_name))"
]
},
{
@@ -434,9 +437,11 @@
"metadata": {},
"outputs": [],
"source": [
- "tracker_deploy.log_parameters({\n",
- " 'endpoint_name': model_ab_endpoint_name,\n",
- "})\n",
+ "tracker_deploy.log_parameters(\n",
+ " {\n",
+ " \"endpoint_name\": model_ab_endpoint_name,\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"tracker_deploy.trial_component.save()"
@@ -453,7 +458,7 @@
"lineage_table = ExperimentAnalytics(\n",
" sagemaker_session=sess,\n",
" experiment_name=experiment_name,\n",
- " metric_names=['validation:accuracy'],\n",
+ " metric_names=[\"validation:accuracy\"],\n",
" sort_by=\"CreationTime\",\n",
" sort_order=\"Ascending\",\n",
")\n",
@@ -479,7 +484,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review REST Endpoint'.format(\n",
+ " region, model_ab_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -495,7 +506,7 @@
"metadata": {},
"outputs": [],
"source": [
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=model_ab_endpoint_name)"
]
},
@@ -523,14 +534,16 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=model_ab_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer()) "
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=model_ab_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
@@ -564,15 +577,12 @@
"metadata": {},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -590,11 +600,13 @@
"source": [
"import csv\n",
"\n",
- "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
- "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=50)\n",
+ "df_reviews = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
+ "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n",
"df_sample_reviews = df_sample_reviews.reset_index()\n",
"df_sample_reviews.shape"
]
@@ -607,14 +619,14 @@
"source": [
"import pandas as pd\n",
"\n",
+ "\n",
"def predict(review_body):\n",
- " inputs = [\n",
- " {\"features\": [review_body]}\n",
- " ]\n",
+ " inputs = [{\"features\": [review_body]}]\n",
" predicted_classes = predictor.predict(inputs)\n",
- " return predicted_classes[0]['predicted_label']\n",
- " \n",
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ " return predicted_classes[0][\"predicted_label\"]\n",
+ "\n",
+ "\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews.head(5)"
]
},
@@ -633,7 +645,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review REST Endpoint Performance Metrics'.format(region, model_ab_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review REST Endpoint Performance Metrics'.format(\n",
+ " region, model_ab_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -656,12 +674,10 @@
"import boto3\n",
"import pandas as pd\n",
"\n",
- "def get_invocation_metrics_for_endpoint_variant(endpoint_name,\n",
- " namespace_name,\n",
- " metric_name,\n",
- " variant_name,\n",
- " start_time,\n",
- " end_time):\n",
+ "\n",
+ "def get_invocation_metrics_for_endpoint_variant(\n",
+ " endpoint_name, namespace_name, metric_name, variant_name, start_time, end_time\n",
+ "):\n",
" metrics = cw.get_metric_statistics(\n",
" Namespace=namespace_name,\n",
" MetricName=metric_name,\n",
@@ -669,55 +685,48 @@
" EndTime=end_time,\n",
" Period=60,\n",
" Statistics=[\"Sum\"],\n",
- " Dimensions=[\n",
- " {\n",
- " \"Name\": \"EndpointName\",\n",
- " \"Value\": endpoint_name\n",
- " },\n",
- " {\n",
- " \"Name\": \"VariantName\",\n",
- " \"Value\": variant_name\n",
- " }\n",
- " ]\n",
+ " Dimensions=[{\"Name\": \"EndpointName\", \"Value\": endpoint_name}, {\"Name\": \"VariantName\", \"Value\": variant_name}],\n",
" )\n",
"\n",
- " if metrics['Datapoints']:\n",
- " return pd.DataFrame(metrics[\"Datapoints\"])\\\n",
- " .sort_values(\"Timestamp\")\\\n",
- " .set_index(\"Timestamp\")\\\n",
- " .drop(\"Unit\", axis=1)\\\n",
- " .rename(columns={\"Sum\": variant_name})\n",
+ " if metrics[\"Datapoints\"]:\n",
+ " return (\n",
+ " pd.DataFrame(metrics[\"Datapoints\"])\n",
+ " .sort_values(\"Timestamp\")\n",
+ " .set_index(\"Timestamp\")\n",
+ " .drop(\"Unit\", axis=1)\n",
+ " .rename(columns={\"Sum\": variant_name})\n",
+ " )\n",
" else:\n",
" return pd.DataFrame()\n",
"\n",
"\n",
- "def plot_endpoint_metrics_for_variants(endpoint_name,\n",
- " namespace_name,\n",
- " metric_name,\n",
- " start_time=None):\n",
+ "def plot_endpoint_metrics_for_variants(endpoint_name, namespace_name, metric_name, start_time=None):\n",
" try:\n",
" start_time = start_time or datetime.now() - timedelta(minutes=60)\n",
" end_time = datetime.now()\n",
"\n",
- " metrics_variantA = get_invocation_metrics_for_endpoint_variant(endpoint_name=model_ab_endpoint_name, \n",
- " namespace_name=namespace_name,\n",
- " metric_name=metric_name,\n",
- " variant_name=variantA[\"VariantName\"], \n",
- " start_time=start_time, \n",
- " end_time=end_time)\n",
- "\n",
- " metrics_variantB = get_invocation_metrics_for_endpoint_variant(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name=namespace_name,\n",
- " metric_name=metric_name, \n",
- " variant_name=variantB[\"VariantName\"], \n",
- " start_time=start_time, \n",
- " end_time=end_time)\n",
+ " metrics_variantA = get_invocation_metrics_for_endpoint_variant(\n",
+ " endpoint_name=model_ab_endpoint_name,\n",
+ " namespace_name=namespace_name,\n",
+ " metric_name=metric_name,\n",
+ " variant_name=variantA[\"VariantName\"],\n",
+ " start_time=start_time,\n",
+ " end_time=end_time,\n",
+ " )\n",
+ "\n",
+ " metrics_variantB = get_invocation_metrics_for_endpoint_variant(\n",
+ " endpoint_name=model_ab_endpoint_name,\n",
+ " namespace_name=namespace_name,\n",
+ " metric_name=metric_name,\n",
+ " variant_name=variantB[\"VariantName\"],\n",
+ " start_time=start_time,\n",
+ " end_time=end_time,\n",
+ " )\n",
"\n",
" metrics_variants = metrics_variantA.join(metrics_variantB, how=\"outer\")\n",
" metrics_variants.plot()\n",
" except:\n",
- " pass\n",
- " "
+ " pass"
]
},
{
@@ -737,13 +746,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(20)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='/aws/sagemaker/Endpoints',\n",
- " metric_name='CPUUtilization')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n",
+ ")"
]
},
{
@@ -753,13 +763,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='Invocations')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n",
+ ")"
]
},
{
@@ -769,13 +780,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='InvocationsPerInstance')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n",
+ ")"
]
},
{
@@ -785,13 +797,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='ModelLatency')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n",
+ ")"
]
},
{
@@ -812,13 +825,13 @@
"source": [
"updated_endpoint_config = [\n",
" {\n",
- " 'VariantName': variantA['VariantName'],\n",
- " 'DesiredWeight': 0,\n",
+ " \"VariantName\": variantA[\"VariantName\"],\n",
+ " \"DesiredWeight\": 0,\n",
" },\n",
" {\n",
- " 'VariantName': variantB['VariantName'],\n",
- " 'DesiredWeight': 100,\n",
- " }\n",
+ " \"VariantName\": variantB[\"VariantName\"],\n",
+ " \"DesiredWeight\": 100,\n",
+ " },\n",
"]"
]
},
@@ -829,8 +842,7 @@
"outputs": [],
"source": [
"sm.update_endpoint_weights_and_capacities(\n",
- " EndpointName=model_ab_endpoint_name,\n",
- " DesiredWeightsAndCapacities=updated_endpoint_config\n",
+ " EndpointName=model_ab_endpoint_name, DesiredWeightsAndCapacities=updated_endpoint_config\n",
")"
]
},
@@ -844,7 +856,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review REST Endpoint'.format(\n",
+ " region, model_ab_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -861,7 +879,7 @@
"metadata": {},
"outputs": [],
"source": [
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=model_ab_endpoint_name)"
]
},
@@ -878,7 +896,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews.head(5)"
]
},
@@ -899,13 +917,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(20)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='/aws/sagemaker/Endpoints',\n",
- " metric_name='CPUUtilization')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n",
+ ")"
]
},
{
@@ -915,13 +934,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='Invocations')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n",
+ ")"
]
},
{
@@ -931,13 +951,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='InvocationsPerInstance')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n",
+ ")"
]
},
{
@@ -947,13 +968,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='ModelLatency')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n",
+ ")"
]
},
{
@@ -975,21 +997,23 @@
"outputs": [],
"source": [
"import time\n",
- "timestamp = '{}'.format(int(time.time()))\n",
"\n",
- "updated_endpoint_config_name = '{}-{}'.format(training_job_name, timestamp)\n",
+ "timestamp = \"{}\".format(int(time.time()))\n",
+ "\n",
+ "updated_endpoint_config_name = \"{}-{}\".format(training_job_name, timestamp)\n",
"\n",
"updated_endpoint_config = sm.create_endpoint_config(\n",
" EndpointConfigName=updated_endpoint_config_name,\n",
" ProductionVariants=[\n",
" {\n",
- " 'VariantName': variantB['VariantName'],\n",
- " 'ModelName': model_b_name, # Only specify variant B to remove variant A\n",
- " 'InstanceType':'ml.m5.4xlarge',\n",
- " 'InitialInstanceCount': 1,\n",
- " 'InitialVariantWeight': 100\n",
+ " \"VariantName\": variantB[\"VariantName\"],\n",
+ " \"ModelName\": model_b_name, # Only specify variant B to remove variant A\n",
+ " \"InstanceType\": \"ml.m5.4xlarge\",\n",
+ " \"InitialInstanceCount\": 1,\n",
+ " \"InitialVariantWeight\": 100,\n",
" }\n",
- " ])"
+ " ],\n",
+ ")"
]
},
{
@@ -1000,10 +1024,7 @@
},
"outputs": [],
"source": [
- "sm.update_endpoint(\n",
- " EndpointName=model_ab_endpoint_name,\n",
- " EndpointConfigName=updated_endpoint_config_name\n",
- ")"
+ "sm.update_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=updated_endpoint_config_name)"
]
},
{
@@ -1021,7 +1042,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review REST Endpoint'.format(\n",
+ " region, model_ab_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1040,7 +1067,7 @@
},
"outputs": [],
"source": [
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=model_ab_endpoint_name)"
]
},
@@ -1057,7 +1084,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews"
]
},
@@ -1078,13 +1105,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(20)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='/aws/sagemaker/Endpoints',\n",
- " metric_name='CPUUtilization')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n",
+ ")"
]
},
{
@@ -1094,13 +1122,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='Invocations')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n",
+ ")"
]
},
{
@@ -1110,13 +1139,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='InvocationsPerInstance')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n",
+ ")"
]
},
{
@@ -1126,13 +1156,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
+ "\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"time.sleep(5)\n",
- "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
- " namespace_name='AWS/SageMaker', \n",
- " metric_name='ModelLatency')"
+ "plot_endpoint_metrics_for_variants(\n",
+ " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n",
+ ")"
]
},
{
@@ -1160,9 +1191,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sm.delete_endpoint(\n",
- " EndpointName=model_ab_endpoint_name\n",
- ")"
+ "sm.delete_endpoint(EndpointName=model_ab_endpoint_name)"
]
},
{
diff --git a/09_deploy/code-pytorch/inference.py b/09_deploy/code-pytorch/inference.py
index 72075ffd..85ce8084 100644
--- a/09_deploy/code-pytorch/inference.py
+++ b/09_deploy/code-pytorch/inference.py
@@ -10,67 +10,69 @@
logger.addHandler(logging.StreamHandler(sys.stdout))
###################################
-### VARIABLES
+### VARIABLES
###################################
max_seq_length = 64
classes = [1, 2, 3, 4, 5]
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
###################################
-### SAGEMKAER LOAD MODEL FUNCTION
-###################################
+### SAGEMAKER LOAD MODEL FUNCTION
+###################################
-# You need to put in config.json from saved fine-tuned Hugging Face model in code/
+# You need to put the config.json from the saved fine-tuned Hugging Face model into code/
# Reference it in the inference container at /opt/ml/model/code
# The model needs to be called 'model.pth' per https://github.com/aws/sagemaker-pytorch-inference-toolkit/blob/6936c08581e26ff3bac26824b1e4946ec68ffc85/src/sagemaker_pytorch_serving_container/torchserve.py#L45
+
def model_fn(model_dir):
- config = DistilBertConfig.from_json_file('/opt/ml/model/code/config.json')
-
- model_path = '{}/{}'.format(model_dir, 'model.pth')
+ config = DistilBertConfig.from_json_file("/opt/ml/model/code/config.json")
+
+ model_path = "{}/{}".format(model_dir, "model.pth")
model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
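+ # Run on the GPU when available; otherwise fall back to the CPU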
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
-
+
return model
###################################
-### SAGEMKAER PREDICT FUNCTION
-###################################
+### SAGEMAKER PREDICT FUNCTION
+###################################
+
def predict_fn(input_data, model):
model.eval()
- print('input_data: {}'.format(input_data))
- print('type(input_data): {}'.format(type(input_data)))
-
- data_str = input_data.decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ print("input_data: {}".format(input_data))
+ print("type(input_data): {}".format(type(input_data)))
+
+ data_str = input_data.decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
predicted_classes = []
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0]: review_body
# features[1..n]: is anything else (we can define the order ourselves)
- # Example:
- # {"features": ["The best gift ever", "Gift Cards"]}
+ # Example:
+ # {"features": ["The best gift ever", "Gift Cards"]}
#
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
+
encode_plus_token = tokenizer.encode_plus(
review_body,
max_length=max_seq_length,
@@ -78,49 +80,53 @@ def predict_fn(input_data, model):
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
- return_tensors='pt',
- truncation=True)
+ return_tensors="pt",
+ truncation=True,
+ )
- input_ids = encode_plus_token['input_ids']
- attention_mask = encode_plus_token['attention_mask']
+ input_ids = encode_plus_token["input_ids"]
+ attention_mask = encode_plus_token["attention_mask"]
output = model(input_ids, attention_mask)
- print('output: {}'.format(output))
+ print("output: {}".format(output))
- # output is a tuple:
+ # output is a tuple:
# output: (tensor([[-1.9840, -0.9870, 2.8947]], grad_fn=),
- # for torch.max() you need to pass in the tensor, output[0]
+ # for torch.max() you need to pass in the tensor, output[0]
_, prediction = torch.max(output[0], dim=1)
predicted_class_idx = prediction.item()
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
+ prediction_dict["predicted_label"] = predicted_class
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
+ print("jsonline: {}".format(jsonline))
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
return predicted_classes_jsonlines
###################################
-### SAGEMKAER MODEL INPUT FUNCTION
-###################################
+### SAGEMAKER MODEL INPUT FUNCTION
+###################################
+
-def input_fn(serialized_input_data, content_type='application/jsonlines'):
+def input_fn(serialized_input_data, content_type="application/jsonlines"):
return serialized_input_data
+
+###################################
+### SAGEMAKER MODEL OUTPUT FUNCTION
###################################
-### SAGEMKAER MODEL OUTPUT FUNCTION
-###################################
-def output_fn(prediction_output, accept='application/jsonlines'):
+
+def output_fn(prediction_output, accept="application/jsonlines"):
return prediction_output, accept
diff --git a/09_deploy/code/inference.py b/09_deploy/code/inference.py
index 2975dc2d..53196737 100644
--- a/09_deploy/code/inference.py
+++ b/09_deploy/code/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
-max_seq_length=64
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
+
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
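+ # Each request arrives as JSON Lines: one {"features": [review_body, ...]} object per line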
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
# features[1..n] are others (ie. 1: product_category, etc)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
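+ # Map each log-probability vector to a star rating via softmax + argmax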
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
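+ # e.g. a softmax peaking at index 3 maps to classes[3] == 4 stars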
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
diff --git a/09_deploy/common/docker_utils.py b/09_deploy/common/docker_utils.py
index 5c34c7b0..54870328 100644
--- a/09_deploy/common/docker_utils.py
+++ b/09_deploy/common/docker_utils.py
@@ -29,7 +29,7 @@
IMAGE_TEMPLATE = "{account}.dkr.ecr.{region}.amazonaws.com/{image_name}:{version}"
-def build_and_push_docker_image(repository_name, dockerfile='Dockerfile', build_args={}):
+def build_and_push_docker_image(repository_name, dockerfile="Dockerfile", build_args={}):
"""Builds a docker image from the specified dockerfile, and pushes it to
ECR. Handles things like ECR login, creating the repository.
@@ -42,15 +42,15 @@ def build_and_push_docker_image(repository_name, dockerfile='Dockerfile', build_
return ecr_tag
-def _build_from_dockerfile(repository_name, dockerfile='Dockerfile', build_args={}):
- build_cmd = ['docker', 'build', '-t', repository_name, '-f', dockerfile, '.']
- for k,v in build_args.items():
- build_cmd += ['--build-arg', '%s=%s' % (k,v)]
+def _build_from_dockerfile(repository_name, dockerfile="Dockerfile", build_args={}):
+ build_cmd = ["docker", "build", "-t", repository_name, "-f", dockerfile, "."]
+ for k, v in build_args.items():
+ build_cmd += ["--build-arg", "%s=%s" % (k, v)]
print("Building docker image %s from %s" % (repository_name, dockerfile))
_execute(build_cmd)
print("Done building docker image %s" % repository_name)
-
+
def _find_base_image_in_dockerfile(dockerfile):
dockerfile_lines = open(dockerfile).readlines()
@@ -72,14 +72,14 @@ def push(tag, aws_account=None, aws_region=None):
(string): ECR repo image that was pushed
"""
session = boto3.Session()
- aws_account = aws_account or session.client("sts").get_caller_identity()['Account']
+ aws_account = aws_account or session.client("sts").get_caller_identity()["Account"]
aws_region = aws_region or session.region_name
try:
- repository_name, version = tag.split(':')
+ repository_name, version = tag.split(":")
except ValueError: # split failed because no :
repository_name = tag
version = "latest"
- ecr_client = session.client('ecr', region_name=aws_region)
+ ecr_client = session.client("ecr", region_name=aws_region)
_create_ecr_repo(ecr_client, repository_name)
_ecr_login(ecr_client, aws_account)
@@ -89,11 +89,11 @@ def push(tag, aws_account=None, aws_region=None):
def _push(aws_account, aws_region, tag):
- ecr_repo = '%s.dkr.ecr.%s.amazonaws.com' % (aws_account, aws_region)
- ecr_tag = '%s/%s' % (ecr_repo, tag)
- _execute(['docker', 'tag', tag, ecr_tag])
+ ecr_repo = "%s.dkr.ecr.%s.amazonaws.com" % (aws_account, aws_region)
+ ecr_tag = "%s/%s" % (ecr_repo, tag)
+ _execute(["docker", "tag", tag, ecr_tag])
print("Pushing docker image to ECR repository %s/%s\n" % (ecr_repo, tag))
- _execute(['docker', 'push', ecr_tag])
+ _execute(["docker", "push", ecr_tag])
print("Done pushing %s" % ecr_tag)
return ecr_tag
@@ -111,34 +111,34 @@ def _create_ecr_repo(ecr_client, repository_name):
def _ecr_login(ecr_client, aws_account):
auth = ecr_client.get_authorization_token(registryIds=[aws_account])
- authorization_data = auth['authorizationData'][0]
+ authorization_data = auth["authorizationData"][0]
- raw_token = base64.b64decode(authorization_data['authorizationToken'])
- token = raw_token.decode('utf-8').strip('AWS:')
- ecr_url = auth['authorizationData'][0]['proxyEndpoint']
+ raw_token = base64.b64decode(authorization_data["authorizationToken"])
+ token = raw_token.decode("utf-8").strip("AWS:")
+ ecr_url = auth["authorizationData"][0]["proxyEndpoint"]
- cmd = ['docker', 'login', '-u', 'AWS', '-p', token, ecr_url]
+ cmd = ["docker", "login", "-u", "AWS", "-p", token, ecr_url]
_execute(cmd, quiet=True)
print("Logged into ECR")
def _ecr_login_if_needed(image):
- ecr_client = boto3.client('ecr')
+ ecr_client = boto3.client("ecr")
# Only ECR images need login
- if not ('dkr.ecr' in image and 'amazonaws.com' in image):
+ if not ("dkr.ecr" in image and "amazonaws.com" in image):
return
# do we have the image?
- if _check_output('docker images -q %s' % image).strip():
+ if _check_output("docker images -q %s" % image).strip():
return
- aws_account = image.split('.')[0]
+ aws_account = image.split(".")[0]
_ecr_login(ecr_client, aws_account)
@contextlib.contextmanager
-def _tmpdir(suffix='', prefix='tmp', dir=None): # type: (str, str, str) -> None
+def _tmpdir(suffix="", prefix="tmp", dir=None): # type: (str, str, str) -> None
"""Create a temporary directory with a context manager. The file is deleted when the context exits.
The prefix, suffix, and dir arguments are the same as for mkstemp().
@@ -160,10 +160,8 @@ def _tmpdir(suffix='', prefix='tmp', dir=None): # type: (str, str, str) -> None
def _execute(command, quiet=False):
if not quiet:
- print("$ %s" % ' '.join(command))
- process = subprocess.Popen(command,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT)
+ print("$ %s" % " ".join(command))
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
try:
_stream_output(process)
except RuntimeError as e:
diff --git a/09_deploy/common/env_utils.py b/09_deploy/common/env_utils.py
index 516af621..6d512731 100644
--- a/09_deploy/common/env_utils.py
+++ b/09_deploy/common/env_utils.py
@@ -6,10 +6,12 @@
gym.logger.set_level(40)
-class VectoredGymEnvironment():
+
+class VectoredGymEnvironment:
"""
Environment class to run multiple simulations and collect rollout data
"""
+
def __init__(self, registered_gym_env, num_of_envs=1):
self.envs_initialized = False
self.initialized_envs = {}
@@ -19,14 +21,11 @@ def __init__(self, registered_gym_env, num_of_envs=1):
self.data_rows = []
self.initialize_envs(num_of_envs, registered_gym_env)
-
+
def is_initialized(self):
return self.envs_initialized
-
- def initialize_envs(
- self,
- num_of_envs,
- registered_gym_env):
+
+ def initialize_envs(self, num_of_envs, registered_gym_env):
"""Initialize multiple Openai gym environments.
Each envrionment will start with a different random seed.
@@ -51,7 +50,7 @@ def get_environment_states(self):
def dump_environment_states(self, dir_path, file_name):
"""Dumping current states of all the envrionments into file
-
+
Arguments:
dir_path {str} -- Directory path of the target file
file_name {str} -- File name of the target file
@@ -59,43 +58,42 @@ def dump_environment_states(self, dir_path, file_name):
data_folder = Path(dir_path)
file_path = data_folder / file_name
- with open(file_path, 'w') as outfile:
+ with open(file_path, "w") as outfile:
for state in self.env_states.values():
json.dump(list(state), outfile)
- outfile.write('\n')
+ outfile.write("\n")
def get_environment_ids(self):
return list(self.initialized_envs.keys())
-
+
def step(self, environment_id, action):
local_env = self.initialized_envs[environment_id]
observation, reward, done, info = local_env.step(action)
self.env_states[environment_id] = observation
return observation, reward, done, info
-
+
def reset(self, environment_id):
- self.env_states[environment_id] = \
- self.initialized_envs[environment_id].reset()
+ self.env_states[environment_id] = self.initialized_envs[environment_id].reset()
return self.env_states[environment_id]
def reset_all_envs(self):
print("Resetting all the environments...")
- for i in range(0, self.num_of_envs):
+ for i in range(0, self.num_of_envs):
environment_id = "environment_" + str(i)
self.reset(environment_id)
-
+
def close(self, environment_id):
self.initialized_envs[environment_id].close()
return
-
+
def render(self, environment_id):
self.initialized_envs[environment_id].render()
return
def collect_rollouts_for_single_env_with_given_episodes(self, environment_id, action_prob, num_episodes):
"""Collect rollouts with given steps from one environment
-
+
Arguments:
environment_id {str} -- Environment id for the environment
action_prob {list} -- Action probabilities of the simulated policy
@@ -116,8 +114,9 @@ def collect_rollouts_for_single_env_with_given_episodes(self, environment_id, ac
cur_state_features = self.env_states[environment_id]
_, reward, done, _ = self.step(environment_id, action)
cumulative_rewards += reward
- episode_id = int(environment_id.split('_')[-1]) + \
- self.num_of_envs * self.env_reset_counter[environment_id]
+ episode_id = (
+ int(environment_id.split("_")[-1]) + self.num_of_envs * self.env_reset_counter[environment_id]
+ )
if not done:
data_item.extend([action, action_prob, episode_id, reward, 0.0])
else:
@@ -131,7 +130,7 @@ def collect_rollouts_for_single_env_with_given_episodes(self, environment_id, ac
def collect_rollouts_for_single_env_with_given_steps(self, environment_id, action_prob, num_steps):
"""Collect rollouts with given steps from one environment
-
+
Arguments:
environment_id {str} -- Environment id for the environment
action_prob {list} -- Action probabilities of the simulated policy
@@ -148,8 +147,7 @@ def collect_rollouts_for_single_env_with_given_steps(self, environment_id, actio
action = np.random.choice(len(action_prob), p=action_prob)
cur_state_features = self.env_states[environment_id]
_, reward, done, _ = self.step(environment_id, action)
- episode_id = int(environment_id.split('_')[-1]) + \
- self.num_of_envs * self.env_reset_counter[environment_id]
+ episode_id = int(environment_id.split("_")[-1]) + self.num_of_envs * self.env_reset_counter[environment_id]
data_item.extend([action, action_prob, episode_id, reward])
for j in range(len(cur_state_features)):
data_item.append(cur_state_features[j])
@@ -158,25 +156,27 @@ def collect_rollouts_for_single_env_with_given_steps(self, environment_id, actio
self.reset(environment_id)
self.env_reset_counter[environment_id] += 1
- def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes=None, action_probs=None, file_name=None):
+ def collect_rollouts_with_given_action_probs(
+ self, num_steps=None, num_episodes=None, action_probs=None, file_name=None
+ ):
"""Collect rollouts from all the initiated environments with given action probs
-
+
Keyword Arguments:
num_steps {int} -- Number of steps to run rollouts (default: {None})
num_episodes {int} -- Number of episodes to run rollouts (default: {None})
action_probs {list} -- Action probs for the policy (default: {None})
file_name {str} -- Batch transform output that contain predictions of probs (default: {None})
-
+
Returns:
[Dataframe] -- Dataframe that contains the rollout data from all envs
"""
if file_name is not None:
assert action_probs is None
- json_lines = [json.loads(line.rstrip('\n')) for line in open(file_name) if line is not '']
+            json_lines = [json.loads(line.rstrip("\n")) for line in open(file_name) if line != ""]
action_probs = []
for line in json_lines:
- if line.get('SageMakerOutput') is not None:
- action_probs.append(line['SageMakerOutput'].get("predictions")[0])
+ if line.get("SageMakerOutput") is not None:
+ action_probs.append(line["SageMakerOutput"].get("predictions")[0])
else:
action_probs.append(line.get("predictions")[0])
@@ -184,9 +184,7 @@ def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes=
for index, environment_id in enumerate(self.get_environment_ids()):
if num_steps is not None:
assert num_episodes is None
- self.collect_rollouts_for_single_env_with_given_steps(
- environment_id, action_probs[index], num_steps
- )
+ self.collect_rollouts_for_single_env_with_given_steps(environment_id, action_probs[index], num_steps)
else:
assert num_episodes is not None
self.collect_rollouts_for_single_env_with_given_episodes(
@@ -194,18 +192,18 @@ def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes=
)
col_names = self._create_col_names()
- df = pd.DataFrame(self.data_rows, columns = col_names)
+ df = pd.DataFrame(self.data_rows, columns=col_names)
return df
def _create_col_names(self):
"""Create column names of dataframe that can be consumed by Coach
-
+
Returns:
[list] -- List of column names
"""
- col_names = ['action', 'all_action_probabilities', 'episode_id', 'reward', 'cumulative_rewards']
+ col_names = ["action", "all_action_probabilities", "episode_id", "reward", "cumulative_rewards"]
for i in range(self.state_dims):
- col_names.append('state_feature_' + str(i))
+ col_names.append("state_feature_" + str(i))
- return col_names
\ No newline at end of file
+ return col_names
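As a usage note, a minimal sketch of driving VectoredGymEnvironment with a fixed random policy; the gym environment id and probabilities are illustrative:

# Two parallel CartPole simulations under a uniform two-action policy.
envs = VectoredGymEnvironment("CartPole-v0", num_of_envs=2)
rollouts = envs.collect_rollouts_with_given_action_probs(
    num_steps=50, action_probs=[[0.5, 0.5]] * 2
)
# Columns follow _create_col_names(): action, all_action_probabilities,
# episode_id, reward, cumulative_rewards, state_feature_0, ...
print(rollouts.head())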
diff --git a/09_deploy/common/markdown_helper.py b/09_deploy/common/markdown_helper.py
index f545cffd..66d67260 100644
--- a/09_deploy/common/markdown_helper.py
+++ b/09_deploy/common/markdown_helper.py
@@ -11,6 +11,7 @@
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
+
def generate_s3_write_permission_for_sagemaker_role(role):
role_name = role.split("/")[-1]
url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name
@@ -19,6 +20,7 @@ def generate_s3_write_permission_for_sagemaker_role(role):
text += "3. Search and select `AmazonKinesisVideoStreamsFullAccess` policy\n"
return text
+
def generate_kinesis_create_permission_for_sagemaker_role(role):
role_name = role.split("/")[-1]
url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name
@@ -27,6 +29,7 @@ def generate_kinesis_create_permission_for_sagemaker_role(role):
text += "3. Search and select `AmazonS3FullAccess` policy\n"
return text
+
def generate_help_for_s3_endpoint_permissions(role):
role_name = role.split("/")[-1]
url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name
@@ -138,12 +141,14 @@ def generate_robomaker_links(job_arns, aws_region):
simulation_ids = [job_arn.split("/")[-1] for job_arn in job_arns]
robomaker_links = []
for simulation_id in simulation_ids:
- robomaker_link = "https://%s.console.aws.amazon.com/robomaker/home?region=%s#simulationJobs/%s" % (aws_region,
- aws_region,
- simulation_id)
+ robomaker_link = "https://%s.console.aws.amazon.com/robomaker/home?region=%s#simulationJobs/%s" % (
+ aws_region,
+ aws_region,
+ simulation_id,
+ )
robomaker_links.append(robomaker_link)
- markdown_content = '> Click on the following links for visualization of simulation jobs on RoboMaker Console\n'
+ markdown_content = "> Click on the following links for visualization of simulation jobs on RoboMaker Console\n"
for i in range(len(robomaker_links)):
markdown_content += "- [Simulation %s](%s) \n" % (i + 1, robomaker_links[i])
@@ -152,12 +157,16 @@ def generate_robomaker_links(job_arns, aws_region):
def create_s3_endpoint_manually(aws_region, default_vpc):
- url = "https://%s.console.aws.amazon.com/vpc/home?region=%s#Endpoints:sort=vpcEndpointId" % (aws_region, aws_region)
+ url = "https://%s.console.aws.amazon.com/vpc/home?region=%s#Endpoints:sort=vpcEndpointId" % (
+ aws_region,
+ aws_region,
+ )
text = ">VPC S3 endpoint creation failed. Please do the following to create an endpoint manually:\n"
text += "1. Go to [VPC console | Endpoints](%s)\n" % url
text += "2. Click on `Create Endpoint`. Select Service Name as `com.amazonaws.%s.s3`.\n" % (aws_region)
text += "3. Next, select your Default VPC: `%s` and click the checkbox against the main Route Table ID\n" % (
- default_vpc)
+ default_vpc
+ )
text += "4. Select `Full Access` in policy and click on `Create Endpoint`\n"
text += "5. That should be it! Now wait for a few seconds before proceeding to the next cell."
return text
@@ -174,6 +183,7 @@ def generate_help_for_administrator_policy(role):
text += "6. Once this is complete, you are all set."
return text
+
def generate_help_for_experiment_manager_permissions(role):
role_name = role.split("/")[-1]
url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name
@@ -222,4 +232,3 @@ def generate_help_for_experiment_manager_permissions(role):
},```\n"""
text += "4. Now wait for a few minutes before executing this cell again!"
return text
-
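These helpers only build Markdown strings; a hypothetical notebook usage (the role ARN is illustrative):

from IPython.display import Markdown, display

role = "arn:aws:iam::123456789012:role/service-role/MySageMakerRole"
display(Markdown(generate_s3_write_permission_for_sagemaker_role(role)))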
diff --git a/09_deploy/common/misc.py b/09_deploy/common/misc.py
index 45ad3c78..e51551bd 100644
--- a/09_deploy/common/misc.py
+++ b/09_deploy/common/misc.py
@@ -26,10 +26,19 @@
import boto3
import json
-
-def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='',
- aws_account=None, aws_region=None, timeout=1200, limit=20,
- fetch_only=None, training_job_name=None):
+
+def wait_for_s3_object(
+ s3_bucket,
+ key,
+ local_dir,
+ local_prefix="",
+ aws_account=None,
+ aws_region=None,
+ timeout=1200,
+ limit=20,
+ fetch_only=None,
+ training_job_name=None,
+):
"""
Keep polling the S3 object until it appears.
Pull the latest data down to the local directory under its short key.
@@ -50,15 +59,15 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='',
A list of all downloaded files, as local filenames
"""
session = boto3.Session()
- aws_account = aws_account or session.client("sts").get_caller_identity()['Account']
+ aws_account = aws_account or session.client("sts").get_caller_identity()["Account"]
aws_region = aws_region or session.region_name
- s3 = session.resource('s3')
- sagemaker = session.client('sagemaker')
+ s3 = session.resource("s3")
+ sagemaker = session.client("sagemaker")
bucket = s3.Bucket(s3_bucket)
objects = []
- print("Waiting for s3://%s/%s..." % (s3_bucket, key), end='', flush=True)
+ print("Waiting for s3://%s/%s..." % (s3_bucket, key), end="", flush=True)
start_time = time.time()
cnt = 0
while len(objects) == 0:
@@ -67,7 +76,7 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='',
objects = list(filter(fetch_only, objects))
if objects:
continue
- print('.', end='', flush=True)
+ print(".", end="", flush=True)
time.sleep(5)
cnt += 1
if cnt % 80 == 0:
@@ -75,12 +84,17 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='',
if time.time() > start_time + timeout:
raise FileNotFoundError("S3 object s3://%s/%s never appeared after %d seconds" % (s3_bucket, key, timeout))
if training_job_name:
- training_job_status = sagemaker.describe_training_job(TrainingJobName=training_job_name)['TrainingJobStatus']
- if training_job_status == 'Failed':
- raise RuntimeError("Training job {} failed while waiting for S3 object s3://{}/{}"
- .format(training_job_name, s3_bucket, key))
-
- print('\n', end='', flush=True)
+ training_job_status = sagemaker.describe_training_job(TrainingJobName=training_job_name)[
+ "TrainingJobStatus"
+ ]
+ if training_job_status == "Failed":
+ raise RuntimeError(
+ "Training job {} failed while waiting for S3 object s3://{}/{}".format(
+ training_job_name, s3_bucket, key
+ )
+ )
+
+ print("\n", end="", flush=True)
if len(objects) > limit:
print("Only downloading %d of %d files" % (limit, len(objects)))
@@ -89,7 +103,7 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='',
fetched_files = []
for obj in objects:
print("Downloading %s" % obj.key)
- local_path = os.path.join(local_dir, local_prefix, obj.key.split('/')[-1])
+ local_path = os.path.join(local_dir, local_prefix, obj.key.split("/")[-1])
obj.Object().download_file(local_path)
fetched_files.append(local_path)
@@ -106,38 +120,30 @@ def get_execution_role(role_name="sagemaker", aws_account=None, aws_region=None)
aws_region (string): aws region where the repo is located
"""
session = boto3.Session()
- aws_account = aws_account or session.client("sts").get_caller_identity()['Account']
+ aws_account = aws_account or session.client("sts").get_caller_identity()["Account"]
aws_region = aws_region or session.region_name
- assume_role_policy_document = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Principal": {
- "Service": ["sagemaker.amazonaws.com", "robomaker.amazonaws.com"]
- },
- "Action": "sts:AssumeRole"
- }
- ]
- })
-
- client = session.client('iam')
+ assume_role_policy_document = json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {"Service": ["sagemaker.amazonaws.com", "robomaker.amazonaws.com"]},
+ "Action": "sts:AssumeRole",
+ }
+ ],
+ }
+ )
+
+ client = session.client("iam")
try:
client.get_role(RoleName=role_name)
except client.exceptions.NoSuchEntityException:
- client.create_role(
- RoleName=role_name,
- AssumeRolePolicyDocument=str(assume_role_policy_document)
- )
+ client.create_role(RoleName=role_name, AssumeRolePolicyDocument=str(assume_role_policy_document))
print("Created new sagemaker execution role: %s" % role_name)
- client.attach_role_policy(
- PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess',
- RoleName=role_name
- )
-
- return client.get_role(RoleName=role_name)['Role']['Arn']
-
+ client.attach_role_policy(PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", RoleName=role_name)
+ return client.get_role(RoleName=role_name)["Role"]["Arn"]
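A usage sketch of the two helpers above, under assumed bucket and key names:

# Ensure an execution role exists, then block until the training output
# lands in S3 and has been downloaded locally.
role_arn = get_execution_role(role_name="sagemaker")
files = wait_for_s3_object(
    s3_bucket="my-rl-bucket",
    key="experiments/output/model.tar.gz",
    local_dir=".",
    timeout=600,
)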
diff --git a/09_deploy/common/sagemaker_rl/coach_launcher.py b/09_deploy/common/sagemaker_rl/coach_launcher.py
index 4eff7a5f..4708e8d3 100644
--- a/09_deploy/common/sagemaker_rl/coach_launcher.py
+++ b/09_deploy/common/sagemaker_rl/coach_launcher.py
@@ -5,7 +5,7 @@
from rl_coach.base_parameters import VisualizationParameters, TaskParameters, Frameworks
from rl_coach.utils import short_dynamic_import
from rl_coach.core_types import SelectedPhaseOnlyDumpFilter, MaxDumpFilter, RunPhase
-import rl_coach.core_types
+import rl_coach.core_types
from rl_coach import logger
from rl_coach.logger import screen
import argparse
@@ -22,22 +22,22 @@
screen.set_use_colors(False) # Simple text logging so it looks good in CloudWatch
+
class CoachConfigurationList(ConfigurationList):
- """Helper Object for converting CLI arguments (or SageMaker hyperparameters)
+ """Helper Object for converting CLI arguments (or SageMaker hyperparameters)
into Coach configuration.
"""
# Being security-paranoid and not instantiating any arbitrary string the customer passes in
ALLOWED_TYPES = {
- 'Frames': rl_coach.core_types.Frames,
- 'EnvironmentSteps': rl_coach.core_types.EnvironmentSteps,
- 'EnvironmentEpisodes': rl_coach.core_types.EnvironmentEpisodes,
- 'TrainingSteps': rl_coach.core_types.TrainingSteps,
- 'Time': rl_coach.core_types.Time,
+ "Frames": rl_coach.core_types.Frames,
+ "EnvironmentSteps": rl_coach.core_types.EnvironmentSteps,
+ "EnvironmentEpisodes": rl_coach.core_types.EnvironmentEpisodes,
+ "TrainingSteps": rl_coach.core_types.TrainingSteps,
+ "Time": rl_coach.core_types.Time,
}
-
class SageMakerCoachPresetLauncher(CoachLauncher):
"""Base class for training RL tasks using RL-Coach.
Customers subclass this to define specific kinds of workloads, overriding these methods as needed.
@@ -47,7 +47,6 @@ def __init__(self):
super().__init__()
self.hyperparams = None
-
def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
"""Overrides the default CLI parsing.
Sets the configuration parameters for what a SageMaker run should do.
@@ -58,20 +57,20 @@ def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace
args, _ = parser.parse_known_args(args=empty_arg_list)
parser = self.sagemaker_argparser()
sage_args, unknown = parser.parse_known_args()
-
+
# Now fill in the args that we care about.
sagemaker_job_name = os.environ.get("sagemaker_job_name", "sagemaker-experiment")
args.experiment_name = logger.get_experiment_name(sagemaker_job_name)
-
+
# Override experiment_path used for outputs
- args.experiment_path = '/opt/ml/output/intermediate'
- rl_coach.logger.experiment_path = '/opt/ml/output/intermediate' # for gifs
+ args.experiment_path = "/opt/ml/output/intermediate"
+ rl_coach.logger.experiment_path = "/opt/ml/output/intermediate" # for gifs
- args.checkpoint_save_dir = '/opt/ml/output/data/checkpoint'
- args.checkpoint_save_secs = 10 # should avoid hardcoding
+ args.checkpoint_save_dir = "/opt/ml/output/data/checkpoint"
+ args.checkpoint_save_secs = 10 # should avoid hardcoding
# onnx for deployment for mxnet (not tensorflow)
- save_model = (sage_args.save_model == 1)
- backend = os.getenv('COACH_BACKEND', 'tensorflow')
+ save_model = sage_args.save_model == 1
+ backend = os.getenv("COACH_BACKEND", "tensorflow")
if save_model and backend == "mxnet":
args.export_onnx_graph = True
@@ -92,7 +91,7 @@ def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace
name = name[2:]
else:
raise ValueError("Unknown command-line argument %s" % name)
- val = unknown[i+1]
+ val = unknown[i + 1]
self.map_hyperparameter(name, val)
return args
@@ -107,29 +106,29 @@ def map_hyperparameter(self, name, value):
else:
raise ValueError("Unknown hyperparameter %s" % name)
-
def apply_hyperparameter(self, name, value):
"""Save this hyperparameter to be applied to the graph_manager object when
it's ready.
"""
- print("Applying RL hyperparameter %s=%s" % (name,value))
+ print("Applying RL hyperparameter %s=%s" % (name, value))
self.hyperparameters.store(name, value)
-
def default_preset_name(self):
"""
Sub-classes will typically return a single hard-coded string.
"""
try:
- #TODO: remove this after converting all samples.
+ # TODO: remove this after converting all samples.
default_preset = self.DEFAULT_PRESET
screen.warning("Deprecated configuration of default preset. Please implement default_preset_name()")
return default_preset
except:
pass
- raise NotImplementedError("Sub-classes must specify the name of the default preset "+
- "for this RL problem. This will be the name of a python "+
- "file (without .py) that defines a graph_manager variable")
+ raise NotImplementedError(
+ "Sub-classes must specify the name of the default preset "
+ + "for this RL problem. This will be the name of a python "
+ + "file (without .py) that defines a graph_manager variable"
+ )
def sagemaker_argparser(self) -> argparse.ArgumentParser:
"""
@@ -138,27 +137,32 @@ def sagemaker_argparser(self) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
# Arguably this would be cleaner if we copied the config from the base class argparser.
- parser.add_argument('-n', '--num_workers',
- help="(int) Number of workers for multi-process based agents, e.g. A3C",
- default=1,
- type=int)
- parser.add_argument('-p', '--RLCOACH_PRESET',
- help="(string) Name of the file with the RLCoach preset",
- default=self.default_preset_name(),
- type=str)
- parser.add_argument('--save_model',
- help="(int) Flag to save model artifact after training finish",
- default=0,
- type=int)
+ parser.add_argument(
+ "-n",
+ "--num_workers",
+ help="(int) Number of workers for multi-process based agents, e.g. A3C",
+ default=1,
+ type=int,
+ )
+ parser.add_argument(
+ "-p",
+ "--RLCOACH_PRESET",
+ help="(string) Name of the file with the RLCoach preset",
+ default=self.default_preset_name(),
+ type=str,
+ )
+ parser.add_argument(
+ "--save_model", help="(int) Flag to save model artifact after training finish", default=0, type=int
+ )
return parser
def path_of_main_launcher(self):
"""
A bit of python magic to find the path of the file that launched the current process.
"""
- main_mod = sys.modules['__main__']
+ main_mod = sys.modules["__main__"]
try:
- launcher_file = os.path.abspath(sys.modules['__main__'].__file__)
+ launcher_file = os.path.abspath(sys.modules["__main__"].__file__)
return os.path.dirname(launcher_file)
except AttributeError:
# If __main__.__file__ is missing, then we're probably in an interactive python shell
@@ -167,7 +171,7 @@ def path_of_main_launcher(self):
def preset_from_name(self, preset_name):
preset_path = self.path_of_main_launcher()
print("Loading preset %s from %s" % (preset_name, preset_path))
- preset_path = os.path.join(self.path_of_main_launcher(),preset_name) + '.py:graph_manager'
+ preset_path = os.path.join(self.path_of_main_launcher(), preset_name) + ".py:graph_manager"
graph_manager = short_dynamic_import(preset_path, ignore_module_case=True)
return graph_manager
@@ -178,56 +182,63 @@ def get_graph_manager_from_args(self, args):
self.hyperparameters.apply_subset(graph_manager, "rl.")
# Set framework
# Note: Some graph managers (e.g. HAC preset) create multiple agents and the attribute is called agents_params
- if hasattr(graph_manager, 'agent_params'):
+ if hasattr(graph_manager, "agent_params"):
for network_parameters in graph_manager.agent_params.network_wrappers.values():
network_parameters.framework = args.framework
- elif hasattr(graph_manager, 'agents_params'):
+ elif hasattr(graph_manager, "agents_params"):
for ap in graph_manager.agents_params:
for network_parameters in ap.network_wrappers.values():
network_parameters.framework = args.framework
return graph_manager
def _save_tf_model(self):
- ckpt_dir = '/opt/ml/output/data/checkpoint'
- model_dir = '/opt/ml/model'
+ ckpt_dir = "/opt/ml/output/data/checkpoint"
+ model_dir = "/opt/ml/model"
# Re-Initialize from the checkpoint so that you will have the latest models up.
- tf.train.init_from_checkpoint(ckpt_dir,
- {'main_level/agent/online/network_0/': 'main_level/agent/online/network_0'})
- tf.train.init_from_checkpoint(ckpt_dir,
- {'main_level/agent/online/network_1/': 'main_level/agent/online/network_1'})
+ tf.train.init_from_checkpoint(
+ ckpt_dir, {"main_level/agent/online/network_0/": "main_level/agent/online/network_0"}
+ )
+ tf.train.init_from_checkpoint(
+ ckpt_dir, {"main_level/agent/online/network_1/": "main_level/agent/online/network_1"}
+ )
# Create a new session with a new tf graph.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(tf.global_variables_initializer()) # initialize the checkpoint.
# This is the node that will accept the input.
- input_nodes = tf.get_default_graph().get_tensor_by_name('main_level/agent/main/online/' + \
- 'network_0/observation/observation:0')
+ input_nodes = tf.get_default_graph().get_tensor_by_name(
+ "main_level/agent/main/online/" + "network_0/observation/observation:0"
+ )
# This is the node that will produce the output.
- output_nodes = tf.get_default_graph().get_operation_by_name('main_level/agent/main/online/' + \
- 'network_1/ppo_head_0/policy')
+ output_nodes = tf.get_default_graph().get_operation_by_name(
+ "main_level/agent/main/online/" + "network_1/ppo_head_0/policy"
+ )
# Save the model as a servable model.
- tf.saved_model.simple_save(session=sess,
- export_dir='model',
- inputs={"observation": input_nodes},
- outputs={"policy": output_nodes.outputs[0]})
+ tf.saved_model.simple_save(
+ session=sess,
+ export_dir="model",
+ inputs={"observation": input_nodes},
+ outputs={"policy": output_nodes.outputs[0]},
+ )
# Move to the appropriate folder. Don't mind the directory, this just works.
# rl-cart-pole is the name of the model. Remember it.
- shutil.move('model/', model_dir + '/model/tf-model/00000001/')
+ shutil.move("model/", model_dir + "/model/tf-model/00000001/")
# EASE will pick it up and upload to the right path.
print("Success")
def _save_onnx_model(self):
from .onnx_utils import fix_onnx_model
- ckpt_dir = '/opt/ml/output/data/checkpoint'
- model_dir = '/opt/ml/model'
+
+ ckpt_dir = "/opt/ml/output/data/checkpoint"
+ model_dir = "/opt/ml/model"
# find latest onnx file
# currently done by name, expected to be changed in future release of coach.
- glob_pattern = os.path.join(ckpt_dir, '*.onnx')
+ glob_pattern = os.path.join(ckpt_dir, "*.onnx")
onnx_files = [file for file in glob.iglob(glob_pattern, recursive=True)]
if len(onnx_files) > 0:
- extract_step = lambda string: int(re.search('/(\d*)_Step.*', string, re.IGNORECASE).group(1))
+            extract_step = lambda string: int(re.search(r"/(\d*)_Step.*", string, re.IGNORECASE).group(1))
onnx_files.sort(key=extract_step)
latest_onnx_file = onnx_files[-1]
# move to model directory
@@ -237,10 +248,10 @@ def _save_onnx_model(self):
fix_onnx_model(filepath_to)
else:
screen.warning("No ONNX files found in {}".format(ckpt_dir))
-
+
@classmethod
def train_main(cls):
- """Entrypoint for training.
+ """Entrypoint for training.
Parses command-line arguments and starts training.
"""
trainer = cls()
@@ -250,10 +261,10 @@ def train_main(cls):
parser = trainer.sagemaker_argparser()
sage_args, unknown = parser.parse_known_args()
if sage_args.save_model == 1:
- backend = os.getenv('COACH_BACKEND', 'tensorflow')
- if backend == 'tensorflow':
+ backend = os.getenv("COACH_BACKEND", "tensorflow")
+ if backend == "tensorflow":
trainer._save_tf_model()
- if backend == 'mxnet':
+ if backend == "mxnet":
trainer._save_onnx_model()
@@ -265,14 +276,15 @@ class SageMakerCoachLauncher(SageMakerCoachPresetLauncher):
def __init__(self):
super().__init__()
screen.warning("DEPRECATION WARNING: Please switch to SageMakerCoachPresetLauncher")
- #TODO: Remove this whole class when nobody's using it any more.
+ # TODO: Remove this whole class when nobody's using it any more.
def define_environment(self):
- return NotImplementedEror("Sub-class must define environment e.g. GymVectorEnvironment(level='your_module:YourClass')")
+        raise NotImplementedError(
+ "Sub-class must define environment e.g. GymVectorEnvironment(level='your_module:YourClass')"
+ )
def get_graph_manager_from_args(self, args):
- """Returns the GraphManager object for coach to use to train by calling improve()
- """
+ """Returns the GraphManager object for coach to use to train by calling improve()"""
# NOTE: TaskParameters are not configurable at this time.
# Visualization
@@ -306,8 +318,10 @@ def config_schedule(self, schedule_params):
pass
def define_agent(self):
- raise NotImplementedError("Subclass must create define_agent() method which returns an AgentParameters object. e.g.\n" \
- " return rl_coach.agents.dqn_agent.DQNAgentParameters()");
+ raise NotImplementedError(
+ "Subclass must create define_agent() method which returns an AgentParameters object. e.g.\n"
+ " return rl_coach.agents.dqn_agent.DQNAgentParameters()"
+ )
def config_visualization(self, vis_params):
vis_params.dump_gifs = True
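A minimal subclass sketch, assuming a preset module preset_cartpole_ppo.py sits next to the entrypoint and defines a graph_manager variable:

class CartpoleLauncher(SageMakerCoachPresetLauncher):
    def default_preset_name(self):
        # Name of the preset file (without .py) that defines graph_manager.
        return "preset_cartpole_ppo"

if __name__ == "__main__":
    CartpoleLauncher.train_main()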
diff --git a/09_deploy/common/sagemaker_rl/configuration_list.py b/09_deploy/common/sagemaker_rl/configuration_list.py
index 6768d0c3..ac4b92ce 100644
--- a/09_deploy/common/sagemaker_rl/configuration_list.py
+++ b/09_deploy/common/sagemaker_rl/configuration_list.py
@@ -8,14 +8,13 @@ class ConfigurationList(object):
def __init__(self):
"""Args:
- - arg_list [list]: list of arguments on the command-line like [key1, value1, key2, value2, ...]
- - prefix [str]: Prefix for every key that must be present, e.g. "--" for common command-line args
+ - arg_list [list]: list of arguments on the command-line like [key1, value1, key2, value2, ...]
+ - prefix [str]: Prefix for every key that must be present, e.g. "--" for common command-line args
"""
self.hp_dict = {}
def store(self, name, value):
- """Store a key/value hyperparameter combination
- """
+ """Store a key/value hyperparameter combination"""
self.hp_dict[name] = value
def apply_subset(self, config_object, prefix):
@@ -31,7 +30,7 @@ def apply_subset(self, config_object, prefix):
for key, val in list(self.hp_dict.items()):
if key.startswith(prefix):
logging.debug("Configuring %s with %s=%s" % (prefix, key, val))
- subkey = key[ len(prefix): ]
+ subkey = key[len(prefix) :]
msg = "%s%s=%s" % (prefix, subkey, val)
try:
self._set_rl_property_value(config_object, subkey, val, prefix)
@@ -41,20 +40,19 @@ def apply_subset(self, config_object, prefix):
del self.hp_dict[key]
def _set_rl_property_value(self, obj, key, val, path=""):
- """Sets a property on obj to val, or to a sub-object within obj if key looks like "foo.bar"
- """
+ """Sets a property on obj to val, or to a sub-object within obj if key looks like "foo.bar" """
if key.find(".") >= 0:
- top_key, sub_keys = key_list = key.split(".",1)
+ top_key, sub_keys = key_list = key.split(".", 1)
if top_key.startswith("__"):
raise ValueError("Attempting to set unsafe property name %s" % top_key)
- if isinstance(obj,dict):
+ if isinstance(obj, dict):
sub_obj = obj[top_key]
else:
sub_obj = obj.__dict__[top_key]
# Recurse
- return self._set_rl_property_value(sub_obj, sub_keys, val, "%s.%s" % (path,top_key) )
+ return self._set_rl_property_value(sub_obj, sub_keys, val, "%s.%s" % (path, top_key))
else:
- key, val = self._parse_type(key,val)
+ key, val = self._parse_type(key, val)
if key.startswith("__"):
raise ValueError("Attempting to set unsafe property name %s" % key)
if isinstance(obj, dict):
@@ -63,8 +61,7 @@ def _set_rl_property_value(self, obj, key, val, path=""):
obj.__dict__[key] = val
def _autotype(self, val):
- """Converts string to an int or float as possible.
- """
+ """Converts string to an int or float as possible."""
if type(val) == dict:
return val
if type(val) == list:
@@ -96,6 +93,8 @@ def _parse_type(self, key, val):
key, obj_type = key.split(":", 1)
cls = self.ALLOWED_TYPES.get(obj_type)
if not cls:
- raise ValueError("Unrecognized object type %s. Allowed values are %s" % (obj_type, self.ALLOWED_TYPES.keys()))
+ raise ValueError(
+ "Unrecognized object type %s. Allowed values are %s" % (obj_type, self.ALLOWED_TYPES.keys())
+ )
val = cls(val)
return key, val
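To illustrate the key syntax _parse_type accepts, a short sketch using the CoachConfigurationList subclass from coach_launcher.py above (graph_manager is assumed to be a Coach GraphManager):

cfg = CoachConfigurationList()
# The ":EnvironmentEpisodes" suffix makes _parse_type wrap the value in
# rl_coach.core_types.EnvironmentEpisodes before it is assigned.
cfg.store("rl.steps_between_evaluation_periods:EnvironmentEpisodes", "5")
cfg.apply_subset(graph_manager, "rl.")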
diff --git a/09_deploy/common/sagemaker_rl/docker_utils.py b/09_deploy/common/sagemaker_rl/docker_utils.py
index d9111b6b..eced1d2f 100644
--- a/09_deploy/common/sagemaker_rl/docker_utils.py
+++ b/09_deploy/common/sagemaker_rl/docker_utils.py
@@ -1,6 +1,7 @@
import socket
import time
+
def get_ip_from_host(timeout=100, host_name=None):
counter = 0
ip_address = None
@@ -17,8 +18,11 @@ def get_ip_from_host(timeout=100, host_name=None):
time.sleep(1)
if counter == timeout and not ip_address:
- error_string = "Platform Error: Could not retrieve IP address \
- for %s in past %s seconds" % (host_name, timeout)
+        error_string = (
+            "Platform Error: Could not retrieve IP address "
+            "for %s in past %s seconds" % (host_name, timeout)
+        )
raise RuntimeError(error_string)
- return ip_address
\ No newline at end of file
+ return ip_address
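A tiny usage sketch, resolving the current container's own hostname:

import socket

# Retries resolution once per second for up to 30 seconds before raising.
ip_address = get_ip_from_host(timeout=30, host_name=socket.gethostname())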
diff --git a/09_deploy/common/sagemaker_rl/mpi_launcher.py b/09_deploy/common/sagemaker_rl/mpi_launcher.py
index 5fe9f169..5d8f0146 100644
--- a/09_deploy/common/sagemaker_rl/mpi_launcher.py
+++ b/09_deploy/common/sagemaker_rl/mpi_launcher.py
@@ -38,21 +38,18 @@ def _change_hostname(current_host):
def _start_ssh_daemon():
- """Starts the ssh deamon
- """
+ """Starts the ssh deamon"""
subprocess.Popen(["/usr/sbin/sshd", "-D"])
def _setup_mpi_environment(env):
- """Setup MPI environment, i.e. executing change hostname scrip and starting ssh deamon.
- """
+ """Setup MPI environment, i.e. executing change hostname scrip and starting ssh deamon."""
_change_hostname(env.current_host)
_start_ssh_daemon()
def _can_connect(host, port, s):
- """Checks if the connection to provided ``host`` and ``port`` is possible or not.
- """
+ """Checks if the connection to provided ``host`` and ``port`` is possible or not."""
try:
print("Testing connection to host {}".format(host))
s.connect((host, port))
@@ -86,15 +83,18 @@ def _create_mpi_script(env, train_script, train_script_args):
python_cmd.extend(hyperparameters)
python_cmd.extend(channels)
- content = textwrap.dedent("""#!/usr/bin/env bash
+ content = textwrap.dedent(
+ """#!/usr/bin/env bash
touch /mpi_is_running
%s
EXIT_CODE=$?
touch /mpi_is_finished
exit ${EXIT_CODE}
-""" % ' '.join(python_cmd))
+"""
+ % " ".join(python_cmd)
+ )
- with open(_MPI_SCRIPT, 'w') as w:
+ with open(_MPI_SCRIPT, "w") as w:
w.write(content)
st = os.stat(_MPI_SCRIPT)
@@ -104,11 +104,11 @@ def _create_mpi_script(env, train_script, train_script_args):
class MPIMaster(object):
"""MPI Master
- Args:
- env (TrainingEnv): an instance of the training environment.
- process_per_host (int): Number of processes per host to be executed by MPI
- instance_type (str): Type of instance used for this job. It will be "local" for local mode. Its used to
- perform different setup for local mode or sagemaker mode.
+ Args:
+ env (TrainingEnv): an instance of the training environment.
+ process_per_host (int): Number of processes per host to be executed by MPI
+        instance_type (str): Type of instance used for this job. It will be "local" for local mode. It's used to
+ perform different setup for local mode or sagemaker mode.
"""
def __init__(self, env, process_per_host, instance_type):
@@ -117,8 +117,7 @@ def __init__(self, env, process_per_host, instance_type):
self.instance_type = instance_type
def _wait_for_worker_nodes_to_start_sshd(self, hosts, interval=1, timeout_in_seconds=180):
- """Wait for worker nodes to start their ssh deamon to allow MPI communication.
- """
+ """Wait for worker nodes to start their ssh deamon to allow MPI communication."""
with timeout(seconds=timeout_in_seconds):
while hosts:
print("hosts that aren't SSHable yet: {}".format(str(hosts)))
@@ -130,8 +129,7 @@ def _wait_for_worker_nodes_to_start_sshd(self, hosts, interval=1, timeout_in_sec
time.sleep(interval)
def _run_mpi_on_all_nodes(self):
- """Run MPI command to execute MPI_SCRIPT on all hosts.
- """
+ """Run MPI command to execute MPI_SCRIPT on all hosts."""
mpi_command = self._build_mpi_command()
cmd = shlex.split(mpi_command)
@@ -139,44 +137,50 @@ def _run_mpi_on_all_nodes(self):
print("MPI Command: {}".format(mpi_command))
with open(_MPI_SCRIPT) as f:
- print('Running user script:\n\n%s', f.read())
+ print("Running user script:\n\n%s", f.read())
subprocess.check_call(cmd)
def _build_mpi_command(self):
- """Build MPI command.
- """
+ """Build MPI command."""
num_hosts = len(self.env.hosts)
num_processes = self.process_per_host * num_hosts
# By default, use one process per GPU, or one process per node (if training with CPU).
- host_list = self.env.hosts if self.process_per_host == 1 else \
- [host + ':{}'.format(self.process_per_host) for host in self.env.hosts]
-
- print("Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format(self.env.hosts, host_list,
- self.process_per_host,
- num_processes))
- credential_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN']
+ host_list = (
+ self.env.hosts
+ if self.process_per_host == 1
+ else [host + ":{}".format(self.process_per_host) for host in self.env.hosts]
+ )
+
+ print(
+ "Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format(
+ self.env.hosts, host_list, self.process_per_host, num_processes
+ )
+ )
+ credential_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"]
interface_name = self.env.network_interface_name
if self.instance_type == "local":
interface_name = "eth0"
- print('network interface name:' + interface_name + " " + str(self.instance_type))
-
- mpi_command = 'mpirun --host {}'.format(",".join(host_list)) \
- + " -np {} ".format(num_processes) \
- + " --allow-run-as-root" \
- + " --display-map" \
- + " --tag-output" \
- + " -mca btl_tcp_if_include {}".format(interface_name) \
- + " -mca oob_tcp_if_include {}".format(interface_name) \
- + " -x NCCL_SOCKET_IFNAME={}".format(interface_name) \
- + " --mca plm_rsh_no_tree_spawn 1" \
- + " -mca orte_abort_on_non_zero_status 1" \
- + " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO" \
- + " -x LD_LIBRARY_PATH -x PATH" \
- + " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY)
+ print("network interface name:" + interface_name + " " + str(self.instance_type))
+
+ mpi_command = (
+ "mpirun --host {}".format(",".join(host_list))
+ + " -np {} ".format(num_processes)
+ + " --allow-run-as-root"
+ + " --display-map"
+ + " --tag-output"
+ + " -mca btl_tcp_if_include {}".format(interface_name)
+ + " -mca oob_tcp_if_include {}".format(interface_name)
+ + " -x NCCL_SOCKET_IFNAME={}".format(interface_name)
+ + " --mca plm_rsh_no_tree_spawn 1"
+ + " -mca orte_abort_on_non_zero_status 1"
+ + " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO"
+ + " -x LD_LIBRARY_PATH -x PATH"
+ + " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY)
+ )
for v in credential_vars:
if v in os.environ:
@@ -194,8 +198,7 @@ def __call__(self):
self._run_mpi_on_all_nodes()
def is_master(self, hosts, current_host):
- """Checks if the current host is master or worker.
- """
+ """Checks if the current host is master or worker."""
print("Hosts: " + str(hosts) + " current host: " + str(current_host))
return current_host == sorted(list(hosts))[0]
@@ -205,14 +208,12 @@ class MPIWorker(object):
@retry(stop_max_delay=30000 * 1000, wait_fixed=1000, retry_on_result=lambda result: result is False)
def _wait_for_mpi_to_start_running(self):
- """Wait and retry loop until the MPI training starts on this worker.
- """
+ """Wait and retry loop until the MPI training starts on this worker."""
return os.path.isfile(_MPI_IS_RUNNING)
@retry(wait_fixed=5000, retry_on_result=lambda result: result is False)
def _wait_until_mpi_stops_running(self):
- """Wait and retry loop until the MPI training is finished on this worker.
- """
+ """Wait and retry loop until the MPI training is finished on this worker."""
return os.path.isfile(_MPI_IS_FINISHED)
def __call__(self, env):
@@ -248,7 +249,7 @@ def timeout(seconds=0, minutes=0, hours=0):
limit = seconds + 60 * minutes + 3600 * hours
def handler(signum, frame): # pylint: disable=W0613
- raise TimeoutError('timed out after {} seconds'.format(limit))
+ raise TimeoutError("timed out after {} seconds".format(limit))
try:
signal.signal(signal.SIGALRM, handler)
@@ -280,8 +281,7 @@ def __init__(self, train_script, train_script_args=None, num_of_processes_per_ho
def mpi_run(self):
env = sagemaker_containers.training_env()
- print("MPI requested with process per hosts: {}"
- .format(self._num_of_processes_per_host))
+ print("MPI requested with process per hosts: {}".format(self._num_of_processes_per_host))
_setup_mpi_environment(env)
_create_mpi_script(env, self._train_script, self._train_script_args)
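For orientation, a hedged sketch of the launcher entry point; the class name MPILauncher is assumed here, since only its __init__ and mpi_run signatures appear in this patch:

# Launch train.py under MPI with one process per host.
launcher = MPILauncher("train.py", train_script_args=["--epochs", "10"],
                       num_of_processes_per_host=1)
launcher.mpi_run()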
diff --git a/09_deploy/common/sagemaker_rl/onnx_utils.py b/09_deploy/common/sagemaker_rl/onnx_utils.py
index 1840bed7..d5712db4 100644
--- a/09_deploy/common/sagemaker_rl/onnx_utils.py
+++ b/09_deploy/common/sagemaker_rl/onnx_utils.py
@@ -23,23 +23,19 @@ def get_correct_outputs(model):
else:
raise Exception("Can't determine the RL Agent used from the ONNX graph provided.")
-
+
def make_output(node_name, shape):
"""
Given a node name and output shape, will construct the correct Protobuf object.
"""
- return helper.make_tensor_value_info(
- name=node_name,
- elem_type=TensorProto.FLOAT,
- shape=shape
- )
+ return helper.make_tensor_value_info(name=node_name, elem_type=TensorProto.FLOAT, shape=shape)
def ppo_continuous_outputs(model):
"""
Collects the output nodes for continuous PPO.
"""
- # determine number of actions
+ # determine number of actions
log_std_node_name = "generalmodel0_singlemodel1_scaledgradhead0_continuousppohead0_log_std"
log_std_node = [i for i in model.graph.input if i.name == log_std_node_name][0]
num_actions = log_std_node.type.tensor_type.shape.dim[0].dim_value
@@ -59,7 +55,7 @@ def ppo_discrete_outputs(model):
"""
Collects the output nodes for discrete PPO.
"""
- # determine number of actions
+ # determine number of actions
bias_node_name = "generalmodel0_singlemodel1_scaledgradhead0_discreteppohead0_dense0_bias"
bias_node = [i for i in model.graph.input if i.name == bias_node_name][0]
num_actions = bias_node.type.tensor_type.shape.dim[0].dim_value
@@ -77,21 +73,23 @@ def save_model(model, output_nodes, filepath):
"""
Given an in memory model, will save to disk at given filepath.
"""
- new_graph = helper.make_graph(nodes=model.graph.node,
- name='new_graph',
- inputs=model.graph.input,
- outputs=output_nodes,
- initializer=model.graph.initializer)
+ new_graph = helper.make_graph(
+ nodes=model.graph.node,
+ name="new_graph",
+ inputs=model.graph.input,
+ outputs=output_nodes,
+ initializer=model.graph.initializer,
+ )
checker.check_graph(new_graph)
new_model = helper.make_model(new_graph)
with open(filepath, "wb") as file_handle:
serialized = new_model.SerializeToString()
file_handle.write(serialized)
-
+
def fix_onnx_model(filepath):
"""
- Applies an inplace fix to ONNX file from Coach.
+    Applies an in-place fix to an ONNX file from Coach.
"""
model = onnx.load_model(filepath)
output_nodes = get_correct_outputs(model)
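fix_onnx_model rewrites the file in place, so typical usage is a single call after training; the path here is illustrative:

# Overwrites the export with a graph whose output nodes are corrected.
latest_export = "/opt/ml/output/data/checkpoint/0_Step-1000.onnx"
fix_onnx_model(latest_export)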
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py
index 8f3f3013..018c8d5c 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py
@@ -2,7 +2,8 @@
from boto3.dynamodb.conditions import Key
from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException
-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
+
class ExperimentDbClient(object):
def __init__(self, table_session):
@@ -10,151 +11,138 @@ def __init__(self, table_session):
def get_experiment_record(self, experiment_id):
response = self.table_session.query(
- ConsistentRead=True,
- KeyConditionExpression=Key('experiment_id').eq(experiment_id)
+ ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id)
)
- for i in response['Items']:
+ for i in response["Items"]:
return i
return None
def create_new_experiment_record(self, record):
try:
- self.table_session.put_item(
- Item=record,
- ConditionExpression='attribute_not_exists(experiment_id)'
- )
+ self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(experiment_id)")
except Exception as e:
if "ConditionalCheckFailedException" in str(e):
raise RecordAlreadyExistsException()
raise e
def update_experiment_record(self, record):
- self.table_session.put_item(
- Item=record
- )
+ self.table_session.put_item(Item=record)
def delete_item(self, experiment_id):
logger.warning("Deleting experiment record...")
- self.table_session.delete_item(
- Key={
- "experiment_id": experiment_id
- }
- )
+ self.table_session.delete_item(Key={"experiment_id": experiment_id})
#### Update states for training workflow
def update_training_workflow_metadata_with_validation(
- self,
- experiment_id,
- training_workflow_metadata,
- expected_current_next_model_to_train_id
- ):
- '''
+ self, experiment_id, training_workflow_metadata, expected_current_next_model_to_train_id
+ ):
+ """
Updates ExperimentDb record for experiment_id with new training_workflow_metadata,
while validating that next_model_to_train_id matches the old record.
- '''
+ """
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET training_workflow_metadata = :new_val',
- ConditionExpression='training_workflow_metadata.next_model_to_train_id = :exp_model_id',
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET training_workflow_metadata = :new_val",
+ ConditionExpression="training_workflow_metadata.next_model_to_train_id = :exp_model_id",
ExpressionAttributeValues={
- ':new_val': training_workflow_metadata,
- ':exp_model_id': expected_current_next_model_to_train_id
- }
+ ":new_val": training_workflow_metadata,
+ ":exp_model_id": expected_current_next_model_to_train_id,
+ },
)
def update_experiment_training_state(self, experiment_id, training_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET training_workflow_metadata.training_state = :val',
- ExpressionAttributeValues={':val': training_state}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET training_workflow_metadata.training_state = :val",
+ ExpressionAttributeValues={":val": training_state},
)
def update_experiment_last_trained_model_id(self, experiment_id, last_trained_model_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET training_workflow_metadata.last_trained_model_id = :val',
- ExpressionAttributeValues={':val': last_trained_model_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET training_workflow_metadata.last_trained_model_id = :val",
+ ExpressionAttributeValues={":val": last_trained_model_id},
)
def update_experiment_next_model_to_train_id(self, experiment_id, next_model_to_train_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET training_workflow_metadata.next_model_to_train_id = :val',
- ExpressionAttributeValues={':val': next_model_to_train_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET training_workflow_metadata.next_model_to_train_id = :val",
+ ExpressionAttributeValues={":val": next_model_to_train_id},
)
#### Update states for hosting workflow
def update_experiment_hosting_state(self, experiment_id, hosting_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET hosting_workflow_metadata.hosting_state = :val',
- ExpressionAttributeValues={':val': hosting_state}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET hosting_workflow_metadata.hosting_state = :val",
+ ExpressionAttributeValues={":val": hosting_state},
)
def update_experiment_last_hosted_model_id(self, experiment_id, last_hosted_model_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET hosting_workflow_metadata.last_hosted_model_id = :val',
- ExpressionAttributeValues={':val': last_hosted_model_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET hosting_workflow_metadata.last_hosted_model_id = :val",
+ ExpressionAttributeValues={":val": last_hosted_model_id},
)
def update_experiment_next_model_to_host_id(self, experiment_id, next_model_to_host_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET hosting_workflow_metadata.next_model_to_host_id = :val',
- ExpressionAttributeValues={':val': next_model_to_host_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET hosting_workflow_metadata.next_model_to_host_id = :val",
+ ExpressionAttributeValues={":val": next_model_to_host_id},
)
def update_experiment_hosting_endpoint(self, experiment_id, hosting_endpoint):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET hosting_workflow_metadata.hosting_endpoint = :val',
- ExpressionAttributeValues={':val': hosting_endpoint}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET hosting_workflow_metadata.hosting_endpoint = :val",
+ ExpressionAttributeValues={":val": hosting_endpoint},
)
#### Update states for joining workflow
def update_experiment_joining_state(self, experiment_id, joining_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET joining_workflow_metadata.joining_state = :val',
- ExpressionAttributeValues={':val': joining_state}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET joining_workflow_metadata.joining_state = :val",
+ ExpressionAttributeValues={":val": joining_state},
)
def update_experiment_last_joined_job_id(self, experiment_id, last_joined_job_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET joining_workflow_metadata.last_joined_job_id = :val',
- ExpressionAttributeValues={':val': last_joined_job_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET joining_workflow_metadata.last_joined_job_id = :val",
+ ExpressionAttributeValues={":val": last_joined_job_id},
)
def update_experiment_next_join_job_id(self, experiment_id, next_join_job_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET joining_workflow_metadata.next_join_job_id = :val',
- ExpressionAttributeValues={':val': next_join_job_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET joining_workflow_metadata.next_join_job_id = :val",
+ ExpressionAttributeValues={":val": next_join_job_id},
)
#### Update states for evaluation workflow
def update_experiment_evaluation_state(self, experiment_id, evaluation_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET evaluation_workflow_metadata.evaluation_state = :val',
- ExpressionAttributeValues={':val': evaluation_state}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET evaluation_workflow_metadata.evaluation_state = :val",
+ ExpressionAttributeValues={":val": evaluation_state},
)
def update_experiment_last_evaluation_job_id(self, experiment_id, last_evaluation_job_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET evaluation_workflow_metadata.last_evaluation_job_id = :val',
- ExpressionAttributeValues={':val': last_evaluation_job_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET evaluation_workflow_metadata.last_evaluation_job_id = :val",
+ ExpressionAttributeValues={":val": last_evaluation_job_id},
)
def update_experiment_next_evaluation_job_id(self, experiment_id, next_evaluation_job_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id},
- UpdateExpression=f'SET evaluation_workflow_metadata.next_evaluation_job_id = :val',
- ExpressionAttributeValues={':val': next_evaluation_job_id}
+ Key={"experiment_id": experiment_id},
+ UpdateExpression=f"SET evaluation_workflow_metadata.next_evaluation_job_id = :val",
+ ExpressionAttributeValues={":val": next_evaluation_job_id},
)
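A hedged usage sketch; the DynamoDB table name is illustrative, and the table is assumed to use experiment_id as its partition key:

import boto3

table = boto3.resource("dynamodb").Table("rl-experiment-table")
client = ExperimentDbClient(table)
record = client.get_experiment_record("my-experiment-001")
client.update_experiment_training_state("my-experiment-001", "TRAINING")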
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py
index 9299266e..9d7a7605 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py
@@ -2,7 +2,8 @@
from boto3.dynamodb.conditions import Key
from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException
-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
+
class JoinDbClient(object):
def __init__(self, table_session):
@@ -17,35 +18,29 @@ def check_join_job_record_exists(self, experiment_id, join_job_id):
def get_join_job_record(self, experiment_id, join_job_id):
response = self.table_session.query(
ConsistentRead=True,
- KeyConditionExpression=Key('experiment_id').eq(experiment_id) & Key('join_job_id').eq(join_job_id)
+ KeyConditionExpression=Key("experiment_id").eq(experiment_id) & Key("join_job_id").eq(join_job_id),
)
- for i in response['Items']:
+ for i in response["Items"]:
return i
return None
def create_new_join_job_record(self, record):
try:
- self.table_session.put_item(
- Item=record,
- ConditionExpression='attribute_not_exists(join_job_id)'
- )
+ self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(join_job_id)")
except Exception as e:
if "ConditionalCheckFailedException" in str(e):
raise RecordAlreadyExistsException()
raise e
def update_join_job_record(self, record):
- self.table_session.put_item(
- Item=record
- )
+ self.table_session.put_item(Item=record)
def get_all_join_job_records_of_experiment(self, experiment_id):
response = self.table_session.query(
- ConsistentRead=True,
- KeyConditionExpression=Key('experiment_id').eq(experiment_id)
+ ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id)
)
- if response['Items']:
- return response['Items']
+ if response["Items"]:
+ return response["Items"]
else:
return None
@@ -54,69 +49,64 @@ def batch_delete_items(self, experiment_id, join_job_id_list):
with self.table_session.batch_writer() as batch:
for join_job_id in join_job_id_list:
logger.debug(f"Deleting join job record {join_job_id}...")
- batch.delete_item(
- Key={
- 'experiment_id': experiment_id,
- 'join_job_id': join_job_id
- }
- )
+ batch.delete_item(Key={"experiment_id": experiment_id, "join_job_id": join_job_id})
def update_join_job_current_state(self, experiment_id, join_job_id, current_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET current_state = :val',
- ExpressionAttributeValues={':val': current_state}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET current_state = :val",
+ ExpressionAttributeValues={":val": current_state},
)
- def update_join_job_input_obs_data_s3_path(self, experiment_id,
- join_job_id, input_obs_data_s3_path):
+ def update_join_job_input_obs_data_s3_path(self, experiment_id, join_job_id, input_obs_data_s3_path):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET input_obs_data_s3_path = :val',
- ExpressionAttributeValues={':val': input_obs_data_s3_path}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET input_obs_data_s3_path = :val",
+ ExpressionAttributeValues={":val": input_obs_data_s3_path},
)
-
- def update_join_job_input_reward_data_s3_path(self, experiment_id,
- join_job_id, input_reward_data_s3_path):
+
+ def update_join_job_input_reward_data_s3_path(self, experiment_id, join_job_id, input_reward_data_s3_path):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET input_reward_data_s3_path = :val',
- ExpressionAttributeValues={':val': input_reward_data_s3_path}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET input_reward_data_s3_path = :val",
+ ExpressionAttributeValues={":val": input_reward_data_s3_path},
)
def update_join_job_join_query_ids(self, experiment_id, join_job_id, join_query_ids):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET join_query_ids = :val',
- ExpressionAttributeValues={':val': join_query_ids}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET join_query_ids = :val",
+ ExpressionAttributeValues={":val": join_query_ids},
)
def update_join_job_obs_end_time(self, experiment_id, join_job_id, obs_end_time):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET obs_end_time = :val',
- ExpressionAttributeValues={':val': obs_end_time}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET obs_end_time = :val",
+ ExpressionAttributeValues={":val": obs_end_time},
)
def update_join_job_obs_start_time(self, experiment_id, join_job_id, obs_start_time):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET obs_start_time = :val',
- ExpressionAttributeValues={':val': obs_start_time}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET obs_start_time = :val",
+ ExpressionAttributeValues={":val": obs_start_time},
)
- def update_join_job_output_joined_eval_data_s3_path(self, experiment_id,
- join_job_id, output_joined_eval_data_s3_path):
+ def update_join_job_output_joined_eval_data_s3_path(
+ self, experiment_id, join_job_id, output_joined_eval_data_s3_path
+ ):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET output_joined_eval_data_s3_path = :val',
- ExpressionAttributeValues={':val': output_joined_eval_data_s3_path}
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET output_joined_eval_data_s3_path = :val",
+ ExpressionAttributeValues={":val": output_joined_eval_data_s3_path},
)
- def update_join_job_output_joined_train_data_s3_path(self, experiment_id,
- join_job_id, output_joined_train_data_s3_path):
+ def update_join_job_output_joined_train_data_s3_path(
+ self, experiment_id, join_job_id, output_joined_train_data_s3_path
+ ):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'join_job_id': join_job_id},
- UpdateExpression=f'SET output_joined_train_data_s3_path = :val',
- ExpressionAttributeValues={':val': output_joined_train_data_s3_path}
- )
\ No newline at end of file
+ Key={"experiment_id": experiment_id, "join_job_id": join_job_id},
+ UpdateExpression=f"SET output_joined_train_data_s3_path = :val",
+ ExpressionAttributeValues={":val": output_joined_train_data_s3_path},
+ )
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py
index 11510e59..7d14d496 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py
@@ -4,12 +4,14 @@
from boto3.dynamodb.conditions import Key
from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException
-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
+
class ModelDbClient:
"""
- TODO: Deprecate and embed this class in ModelRecord.
+ TODO: Deprecate and embed this class in ModelRecord.
"""
+
def __init__(self, table_session):
self.table_session = table_session
@@ -22,9 +24,9 @@ def check_model_record_exists(self, experiment_id, model_id):
def get_model_record(self, experiment_id, model_id):
response = self.table_session.query(
ConsistentRead=True,
- KeyConditionExpression=Key('experiment_id').eq(experiment_id) & Key('model_id').eq(model_id)
+ KeyConditionExpression=Key("experiment_id").eq(experiment_id) & Key("model_id").eq(model_id),
)
- for i in response['Items']:
+ for i in response["Items"]:
return i
return None
@@ -38,52 +40,46 @@ def get_model_record_with_retry(self, experiment_id, model_id, retry_gap=5):
def create_new_model_record(self, record):
try:
- self.table_session.put_item(
- Item=record,
- ConditionExpression='attribute_not_exists(model_id)'
- )
+ self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(model_id)")
except Exception as e:
if "ConditionalCheckFailedException" in str(e):
raise RecordAlreadyExistsException()
raise e
-
+
def update_model_job_state(self, model_record):
self.update_model_record(model_record)
-
+
def update_model_as_pending(self, model_record):
# TODO: a model can only be put to pending, from pending state.
self.update_model_record(model_record)
-
+
def update_model_as_failed(self, model_record):
self.update_model_record(model_record)
def update_model_eval_job_state(self, model_record):
- # TODO: conditional check to verify model is in *ing state while updating...
+ # TODO: conditional check to verify model is in *ing state while updating...
# Not Trained or some final state.
self.update_model_record(model_record)
def update_model_eval_as_pending(self, model_record):
- # TODO: a model eval_state can only be put to pending, from pending state
+ # TODO: a model eval_state can only be put to pending, from pending state
        # or a final state. (because of reruns of evaluation)
self.update_model_record(model_record)
def update_model_eval_as_failed(self, model_record):
- # TODO: conditional check to verify model is in *ing state while updating...
+ # TODO: conditional check to verify model is in *ing state while updating...
# Not Trained or some final state.
self.update_model_record(model_record)
def update_model_record(self, record):
- self.table_session.put_item(
- Item=record
- )
+ self.table_session.put_item(Item=record)
def get_all_model_records_of_experiment(self, experiment_id):
response = self.table_session.query(
- ConsistentRead=True,
- KeyConditionExpression=Key('experiment_id').eq(experiment_id)
+ ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id)
)
- if response['Items']:
- return response['Items']
+ if response["Items"]:
+ return response["Items"]
else:
return None
@@ -92,88 +88,82 @@ def batch_delete_items(self, experiment_id, model_id_list):
with self.table_session.batch_writer() as batch:
for model_id in model_id_list:
logger.debug(f"Deleting model record '{model_id}'...")
- batch.delete_item(
- Key={
- 'experiment_id': experiment_id,
- 'model_id': model_id
- }
- )
+ batch.delete_item(Key={"experiment_id": experiment_id, "model_id": model_id})
def update_model_input_model_id(self, experiment_id, model_id, input_model_id):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET input_model_id = :val',
- ExpressionAttributeValues={':val': input_model_id}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET input_model_id = :val",
+ ExpressionAttributeValues={":val": input_model_id},
)
def update_model_input_data_s3_prefix(self, experiment_id, model_id, input_data_s3_prefix):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET input_data_s3_prefix = :val',
- ExpressionAttributeValues={':val': input_data_s3_prefix}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET input_data_s3_prefix = :val",
+ ExpressionAttributeValues={":val": input_data_s3_prefix},
)
+
def update_model_s3_model_output_path(self, experiment_id, model_id, s3_model_output_path):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET s3_model_output_path = :val',
- ExpressionAttributeValues={':val': s3_model_output_path}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET s3_model_output_path = :val",
+ ExpressionAttributeValues={":val": s3_model_output_path},
)
def update_model_train_state(self, experiment_id, model_id, train_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET train_state = :val',
- ExpressionAttributeValues={':val': train_state}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET train_state = :val",
+ ExpressionAttributeValues={":val": train_state},
)
-
+
def update_model_eval_state(self, experiment_id, model_id, eval_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET eval_state = :val',
- ExpressionAttributeValues={':val': eval_state}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET eval_state = :val",
+ ExpressionAttributeValues={":val": eval_state},
)
def update_model_eval_scores(self, experiment_id, model_id, eval_scores):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET eval_scores = :val',
- ExpressionAttributeValues={':val': eval_scores}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET eval_scores = :val",
+ ExpressionAttributeValues={":val": eval_scores},
)
def update_model_eval_scores_and_state(self, experiment_id, model_id, eval_scores, eval_state):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET eval_scores = :score_val, eval_state = :state_val',
- ExpressionAttributeValues={
- ':score_val': eval_scores,
- ':state_val': eval_state
- }
- )
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET eval_scores = :score_val, eval_state = :state_val",
+ ExpressionAttributeValues={":score_val": eval_scores, ":state_val": eval_state},
+ )
def update_model_training_start_time(self, experiment_id, model_id, training_start_time):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET training_start_time = :val',
- ExpressionAttributeValues={':val': training_start_time}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET training_start_time = :val",
+ ExpressionAttributeValues={":val": training_start_time},
)
def update_model_training_end_time(self, experiment_id, model_id, training_end_time):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
- UpdateExpression=f'SET training_end_time = :val',
- ExpressionAttributeValues={':val': training_end_time}
+ Key={"experiment_id": experiment_id, "model_id": model_id},
+ UpdateExpression=f"SET training_end_time = :val",
+ ExpressionAttributeValues={":val": training_end_time},
)
- def update_model_training_stats(self, experiment_id, model_id,
- s3_model_output_path, training_start_time, training_end_time, train_state):
+ def update_model_training_stats(
+ self, experiment_id, model_id, s3_model_output_path, training_start_time, training_end_time, train_state
+ ):
self.table_session.update_item(
- Key={'experiment_id': experiment_id, 'model_id': model_id},
+ Key={"experiment_id": experiment_id, "model_id": model_id},
UpdateExpression=f"SET s3_model_output_path = :path_val, training_start_time = :start_time_val, "
f"training_end_time = :end_time_val, train_state = :state_val",
ExpressionAttributeValues={
- ':path_val': s3_model_output_path,
- ':start_time_val': training_start_time,
- ':end_time_val': training_end_time,
- ':state_val': train_state
- }
- )
\ No newline at end of file
+ ":path_val": s3_model_output_path,
+ ":start_time_val": training_start_time,
+ ":end_time_val": training_end_time,
+ ":state_val": train_state,
+ },
+ )
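
create_new_model_record above turns a DynamoDB conditional write into RecordAlreadyExistsException. A slightly stricter sketch of the same idea that matches on botocore's typed ClientError instead of the exception's string form; the table name is hypothetical:

    import boto3
    from botocore.exceptions import ClientError

    from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException

    table = boto3.resource("dynamodb").Table("model-db")  # hypothetical table name

    def create_once(item):
        try:
            # The write succeeds only if no item with this model_id exists yet.
            table.put_item(Item=item, ConditionExpression="attribute_not_exists(model_id)")
        except ClientError as e:
            if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
                raise RecordAlreadyExistsException()
            raise
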
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py
index 1855e790..d22cc96a 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py
@@ -1,8 +1,10 @@
class RecordAlreadyExistsException(Exception):
pass
+
class ConcurrentModificationException(Exception):
pass
+
class ConditionalCheckFailure(Exception):
- pass
\ No newline at end of file
+ pass
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py
index a46a65f1..334226f7 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py
@@ -1,20 +1,26 @@
class UnhandledWorkflowException(Exception):
pass
+
class SageMakerTrainingJobException(Exception):
pass
+
class SageMakerHostingException(Exception):
pass
+
class WorkflowJoiningJobException(Exception):
pass
+
class EvalScoreNotAvailableException(Exception):
pass
+
class JoinQueryIdsNotAvailableException(Exception):
pass
+
class InvalidUsageException(Exception):
- pass
\ No newline at end of file
+ pass
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py
index b09fb0a0..001cfeed 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py
@@ -17,18 +17,15 @@
logger = logging.getLogger(__name__)
+
class ResourceManager(object):
"""A resource manager entity to manage computing resource creation
and cleanup for the experiment.
"""
- def __init__(
- self,
- resource_config,
- boto_session=None
- ):
+ def __init__(self, resource_config, boto_session=None):
"""Initialize a resource manager entity given a resource config
-
+
Args:
resource_config (dict): A dictionary containing configuration
of the computing resource
@@ -56,7 +53,7 @@ def __init__(
@property
def firehose_bucket(self):
- if hasattr(self, 'firehose_s3_bucket_name'):
+ if hasattr(self, "firehose_s3_bucket_name"):
return self.firehose_s3_bucket_name
account = self.boto_session.client("sts").get_caller_identity()["Account"]
region = self.boto_session.region_name
@@ -71,32 +68,36 @@ def create_shared_resource_if_not_exist(self):
and IAM role to grant relevant resource permission
"""
if self._usable_shared_cf_stack_exists():
- logger.info("Using Resources in CloudFormation stack named: {} " \
- "for Shared Resources.".format(self.shared_resource_stack_name))
+ logger.info(
+ "Using Resources in CloudFormation stack named: {} "
+ "for Shared Resources.".format(self.shared_resource_stack_name)
+ )
else:
- logger.info("Creating a new CloudFormation stack for Shared Resources. " \
- "You can always reuse this StackName in your other experiments")
+ logger.info(
+ "Creating a new CloudFormation stack for Shared Resources. "
+ "You can always reuse this StackName in your other experiments"
+ )
self._create_new_cloudformation_stack()
        # use output resource names from the CloudFormation stack
- self.exp_db_table_name = self._get_cf_output_by_key('ExperimentDbTableName')
- self.join_db_table_name = self._get_cf_output_by_key('JoinDbTableName')
+ self.exp_db_table_name = self._get_cf_output_by_key("ExperimentDbTableName")
+ self.join_db_table_name = self._get_cf_output_by_key("JoinDbTableName")
self.model_db_table_name = self._get_cf_output_by_key("ModelDbTableName")
- self.iam_role_arn = self._get_cf_output_by_key('IAMRoleArn')
-
+ self.iam_role_arn = self._get_cf_output_by_key("IAMRoleArn")
+
# initialize DynamoDb clients!
- experiment_db_session = self.boto_session.resource('dynamodb').Table(self.exp_db_table_name)
+ experiment_db_session = self.boto_session.resource("dynamodb").Table(self.exp_db_table_name)
self.exp_db_client = ExperimentDbClient(experiment_db_session)
- join_db_session = self.boto_session.resource('dynamodb').Table(self.join_db_table_name)
+ join_db_session = self.boto_session.resource("dynamodb").Table(self.join_db_table_name)
self.join_db_client = JoinDbClient(join_db_session)
- model_db_session = self.boto_session.resource('dynamodb').Table(self.model_db_table_name)
+ model_db_session = self.boto_session.resource("dynamodb").Table(self.model_db_table_name)
self.model_db_client = ModelDbClient(model_db_session)
def _usable_shared_cf_stack_exists(self):
"""Check if the shared cf stack exist and is usable
-
+
Returns:
bool: Whether the shared cf stack is usable
"""
@@ -104,44 +105,54 @@ def _usable_shared_cf_stack_exists(self):
# CF stack in one of [CREATE|UPDATE|ROLLBACK]_COMPLETE state
try:
stack_name = self.shared_resource_stack_name
- response = self.cf_client.describe_stacks(
- StackName=stack_name)["Stacks"]
+ response = self.cf_client.describe_stacks(StackName=stack_name)["Stacks"]
if len(response) == 0:
return False
except Exception as e:
if "UnauthorizedOperation" in str(e):
- raise Exception("You are unauthorized to describe a CloudFormation Stack. Please update your Role with "
- " appropriate permissions.")
+ raise Exception(
+ "You are unauthorized to describe a CloudFormation Stack. Please update your Role with "
+ " appropriate permissions."
+ )
elif "ValidationError" in str(e):
                # stack doesn't exist
return False
else:
raise e
-
+
stack_details = response[0]
- stack_status = stack_details['StackStatus']
- if stack_status in ['UPDATE_COMPLETE', 'CREATE_COMPLETE']:
+ stack_status = stack_details["StackStatus"]
+ if stack_status in ["UPDATE_COMPLETE", "CREATE_COMPLETE"]:
return True
elif stack_status in ["DELETE_COMPLETE"]:
return False
elif stack_status in ["ROLLBACK_COMPLETE"]:
- logger.error(f"Stack with name {stack_name} is in {stack_status} state! Please delete/ stabilize/ or "
- "or update Config.yaml to create a new stack")
- raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
- f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}"
+ logger.error(
+ f"Stack with name {stack_name} is in {stack_status} state! Please delete/ stabilize/ or "
+ "or update Config.yaml to create a new stack"
+ )
+ raise Exception(
+ f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
+ f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}"
)
elif "FAILED" in stack_status:
- logger.error(f"Stack with name {stack_name} in {stack_status} state! Please delete the stack"
- " or update Config.yaml to create a new stack")
- raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
- f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}"
+ logger.error(
+ f"Stack with name {stack_name} in {stack_status} state! Please delete the stack"
+ " or update Config.yaml to create a new stack"
+ )
+ raise Exception(
+ f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
+ f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}"
)
elif "DELETE" in stack_status:
# already checked DELETE_COMPLETE above
- logger.error("Stack with name {} is in {} state! Cannot continue further!" \
- " Please wait for the delete to complete".format(stack_name, stack_status))
- raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
- f"Please retry after the stack gets Deleted/or debug the stack here: {self._get_cf_stack_events_link()}"
+ logger.error(
+ "Stack with name {} is in {} state! Cannot continue further!"
+ " Please wait for the delete to complete".format(stack_name, stack_status)
+ )
+ raise Exception(
+ f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. "
+ f"Please retry after the stack gets Deleted/or debug the stack here: {self._get_cf_stack_events_link()}"
)
elif "CREATE" in stack_status:
# one of the create statuses!
@@ -150,15 +161,14 @@ def _usable_shared_cf_stack_exists(self):
self._wait_for_cf_stack_create_to_complete()
return True
else:
- # assume stack in modifying. wait for it to goto
+            # assume the stack is being modified; wait for it to reach a successful state
logger.info("Stack in {} state. Waiting for it's to end in successful state...".format(stack_status))
self._wait_for_cf_stack_update_to_complete()
return True
-
def _create_new_cloudformation_stack(self):
"""Create a new cloudformation stack
-
+
Returns:
            bool: whether a new CloudFormation stack was successfully created
"""
@@ -167,9 +177,9 @@ def _create_new_cloudformation_stack(self):
parameters = [
{
"ParameterKey": "IAMRoleName",
- "ParameterValue": self._get_iam_role_property('role_name', 'role_for_cl'),
+ "ParameterValue": self._get_iam_role_property("role_name", "role_for_cl"),
"UsePreviousValue": True,
- "ResolvedValue": "string"
+ "ResolvedValue": "string",
},
]
parameters.extend(self._get_cloudformation_parameters_for_db())
@@ -179,28 +189,30 @@ def _create_new_cloudformation_stack(self):
StackName=cf_stack_name,
TemplateBody=self._parse_template(),
Parameters=parameters,
- Capabilities=[
- 'CAPABILITY_NAMED_IAM'
- ]
+ Capabilities=["CAPABILITY_NAMED_IAM"],
)
logger.info("Creating CloudFormation Stack for shared resource!")
self._wait_for_cf_stack_create_to_complete()
return True
- except Exception as e:
+ except Exception as e:
if "UnauthorizedOperation" in str(e):
- raise Exception("You are unauthorized to create a CloudFormation Stack. Please update your Role with "
- " appropriate permissions.")
+ raise Exception(
+ "You are unauthorized to create a CloudFormation Stack. Please update your Role with "
+ " appropriate permissions."
+ )
elif "AlreadyExists" in str(e):
            # if we got here, the stack must be in one of the CREATING states
- logger.warn(f"A stack with name {cf_stack_name} already exists. Reusing the stack" \
- " resources for this experiment")
+            logger.warning(
+ f"A stack with name {cf_stack_name} already exists. Reusing the stack"
+ " resources for this experiment"
+ )
self._wait_for_cf_stack_create_to_complete()
return False
- raise(e)
+        raise e
def _get_cf_stack_events_link(self):
"""Get events link for the given shared cf stack
-
+
Returns:
str: events link for the cf stack
"""
@@ -209,43 +221,31 @@ def _get_cf_stack_events_link(self):
return f"https://{region}.console.aws.amazon.com/cloudformation/home?region={region}#/stacks/events?stackId={self.shared_resource_stack_name}"
def _wait_for_cf_stack_create_to_complete(self):
- """Wait until the cf stack creation complete
- """
- cf_waiter = self.cf_client.get_waiter('stack_create_complete')
+ """Wait until the cf stack creation complete"""
+ cf_waiter = self.cf_client.get_waiter("stack_create_complete")
logger.info("Waiting for stack to get to CREATE_COMPLETE state....")
try:
- cf_waiter.wait(
- StackName=self.shared_resource_stack_name,
- WaiterConfig={
- 'Delay': 10,
- 'MaxAttempts': 60
- }
- )
+ cf_waiter.wait(StackName=self.shared_resource_stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60})
except Exception as e:
logger.error(e)
logger.error("Failed to Create Stack with name {} ".format(self.shared_resource_stack_name))
- raise Exception(f"Failed to Create Shared Resource Stack. "
- f"Please debug the stack here: {self._get_cf_stack_events_link()}"
+ raise Exception(
+ f"Failed to Create Shared Resource Stack. "
+ f"Please debug the stack here: {self._get_cf_stack_events_link()}"
)
def _wait_for_cf_stack_update_to_complete(self):
- """Wait until the cf stack update complete
- """
- cf_waiter = self.cf_client.get_waiter('stack_update_complete')
+ """Wait until the cf stack update complete"""
+ cf_waiter = self.cf_client.get_waiter("stack_update_complete")
logger.info("Waiting for stack to get to Successful Update state....")
try:
- cf_waiter.wait(
- StackName=self.shared_resource_stack_name,
- WaiterConfig={
- 'Delay': 10,
- 'MaxAttempts': 6
- }
- )
+ cf_waiter.wait(StackName=self.shared_resource_stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 6})
except Exception as e:
logger.error(e)
logger.error("Failed to use Stack with name {} ".format(self.shared_resource_stack_name))
- raise Exception(f"The provided CloudFormation Stack for Shared Resource is unstable. "
- f"Please debug the stack here: {self._get_cf_stack_events_link()}"
+ raise Exception(
+ f"The provided CloudFormation Stack for Shared Resource is unstable. "
+ f"Please debug the stack here: {self._get_cf_stack_events_link()}"
)
def _parse_template(self):
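
Both _wait_for_cf_stack_*_to_complete helpers follow the standard boto3 waiter pattern. A minimal standalone sketch, assuming a CloudFormation client; the stack name is hypothetical:

    import boto3

    cf_client = boto3.client("cloudformation")
    waiter = cf_client.get_waiter("stack_create_complete")
    # Poll every 10 seconds, up to 60 attempts, mirroring the WaiterConfig above.
    waiter.wait(StackName="my-shared-resources", WaiterConfig={"Delay": 10, "MaxAttempts": 60})
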
@@ -275,32 +275,32 @@ def _get_cloudformation_parameters_for_db(self):
"ParameterKey": parameter_prefix + "Name",
"ParameterValue": self._get_resource_property(parameter_prefix, "table_name"),
"UsePreviousValue": True,
- "ResolvedValue": "string"
- },
+ "ResolvedValue": "string",
+ },
{
"ParameterKey": parameter_prefix + "RCU",
- "ParameterValue": self._get_resource_property(parameter_prefix, "rcu", '5'),
+ "ParameterValue": self._get_resource_property(parameter_prefix, "rcu", "5"),
"UsePreviousValue": True,
- "ResolvedValue": "string"
- },
+ "ResolvedValue": "string",
+ },
{
"ParameterKey": parameter_prefix + "WCU",
- "ParameterValue": self._get_resource_property(parameter_prefix, "wcu", '5'),
+ "ParameterValue": self._get_resource_property(parameter_prefix, "wcu", "5"),
"UsePreviousValue": True,
- "ResolvedValue": "string"
- }
+ "ResolvedValue": "string",
+ },
]
json_parameter_list.extend(json_params)
return json_parameter_list
def _get_resource_property(self, resource_name, property_name, default_value=None):
"""Get property value of given resource
-
+
Args:
- resource_name (str): Name of the resource
+ resource_name (str): Name of the resource
property_name (str): Name of the property
default_value (str): Default value of the property
-
+
Returns:
str: Property value of the resource
"""
@@ -320,43 +320,43 @@ def _get_experiment_db_property(self, property_name, default_value=None):
Args:
property_name (str): name of property
default_value (): default value of the property
-
+
Returns:
value of the property
"""
experiment_db_config = self._resource_config.get("shared_resource").get("experiment_db")
return experiment_db_config.get(property_name, default_value)
-
+
def _get_model_db_property(self, property_name, default_value=None):
"""Return property value of model table
Args:
property_name (str): name of property
default_value (): default value of the property
-
+
Returns:
value of the property
"""
model_db_config = self._resource_config.get("shared_resource").get("model_db")
return model_db_config.get(property_name, default_value)
- def _get_join_db_property(self, property_name,default_value=None):
+ def _get_join_db_property(self, property_name, default_value=None):
"""Return property value of join table
Args:
property_name (str): name of property
default_value (): default value of the property
-
+
Returns:
value of the property
- """
+ """
join_db_config = self._resource_config.get("shared_resource").get("join_db")
return join_db_config.get(property_name, default_value)
-
+
def _get_iam_role_property(self, property_name, default_value=None):
"""Return property value of iam role
Args:
property_name (str): name of property
default_value (): default value of the property
-
+
Returns:
value of the property
"""
@@ -365,25 +365,30 @@ def _get_iam_role_property(self, property_name, default_value=None):
def _get_cf_output_by_key(self, output_key):
"""Return cf output value of given output key
-
+
Args:
output_key (str): key of a specific output
-
+
Returns:
str: value of the output key
"""
- stack_json = self.cf_client.describe_stacks(
- StackName=self.shared_resource_stack_name
- )["Stacks"][0]
-
+ stack_json = self.cf_client.describe_stacks(StackName=self.shared_resource_stack_name)["Stacks"][0]
+
        # validate that the stack has been successfully updated
- if stack_json["StackStatus"] not in \
- ["CREATE_COMPLETE", "UPDATE_COMPLETE",
- "ROLLBACK_COMPLETE", "UPDATE_ROLLBACK_COMPLETE"]:
- logger.error("Looks like Resource CF Stack is in {} state. " \
- "Cannot continue forward. ".format(stack_json["StackStatus"]))
- raise Exception("Please wait while the Shared Resources Stack gets into a usable state." \
- "Currently in state {}!".format(stack_json["StackStatus"]))
+ if stack_json["StackStatus"] not in [
+ "CREATE_COMPLETE",
+ "UPDATE_COMPLETE",
+ "ROLLBACK_COMPLETE",
+ "UPDATE_ROLLBACK_COMPLETE",
+ ]:
+ logger.error(
+ "Looks like Resource CF Stack is in {} state. "
+ "Cannot continue forward. ".format(stack_json["StackStatus"])
+ )
+ raise Exception(
+ "Please wait while the Shared Resources Stack gets into a usable state."
+ "Currently in state {}!".format(stack_json["StackStatus"])
+ )
stack_outputs = stack_json["Outputs"]
for stack_output in stack_outputs:
@@ -393,33 +398,33 @@ def _get_cf_output_by_key(self, output_key):
def _wait_for_active_firehose(self, stream_name):
"""Wait until the firehose stream creation complete and be active
-
+
Args:
stream_name (str): stream name of the firehose
"""
- status = 'CREATING'
+ status = "CREATING"
timeout = 60 * 2
- while status != 'ACTIVE' and timeout >= 0:
+ while status != "ACTIVE" and timeout >= 0:
logger.info("Creating firehose delivery stream...")
try:
result = self.firehose_client.describe_delivery_stream(DeliveryStreamName=stream_name)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError(f"Failed to describe delivery stream '{stream_name}' "
- f"with error {error_code}: {message}")
- status = result['DeliveryStreamDescription']['DeliveryStreamStatus']
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError(
+ f"Failed to describe delivery stream '{stream_name}' " f"with error {error_code}: {message}"
+ )
+ status = result["DeliveryStreamDescription"]["DeliveryStreamStatus"]
time.sleep(10)
timeout = timeout - 10
- if status == 'ACTIVE':
+ if status == "ACTIVE":
logger.info(f"Successfully created delivery stream '{stream_name}'")
else:
raise RuntimeError(f"Failed to create delivery stream '{stream_name}'")
- def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix,
- buffer_size=128, buffer_time=60):
+ def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix, buffer_size=128, buffer_time=60):
"""Initiate a firehose stream with given config
-
+
Args:
stream_name (str): name of the firehose stream
s3_bucket (str): s3 bucket for delivering the firehose streaming data
@@ -429,42 +434,39 @@ def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix,
buffer_time (int): buffer time(s) in firehose before pushing
data to s3 destination
"""
- exist_delivery_streams = self.firehose_client.list_delivery_streams(Limit=1000)['DeliveryStreamNames']
+ exist_delivery_streams = self.firehose_client.list_delivery_streams(Limit=1000)["DeliveryStreamNames"]
if stream_name in exist_delivery_streams:
- logger.warning(f"Delivery stream {stream_name} already exist. "
- "No new delivery stream created.")
+ logger.warning(f"Delivery stream {stream_name} already exist. " "No new delivery stream created.")
else:
firehose_role_arn = self.iam_role_arn
s3_bucket_arn = f"arn:aws:s3:::{s3_bucket}"
s3_config = {
- 'BucketARN': s3_bucket_arn,
- 'RoleARN': firehose_role_arn,
- 'Prefix': s3_prefix.strip() + '/',
- 'BufferingHints': {
- 'IntervalInSeconds': buffer_time,
- 'SizeInMBs': buffer_size
- },
+ "BucketARN": s3_bucket_arn,
+ "RoleARN": firehose_role_arn,
+ "Prefix": s3_prefix.strip() + "/",
+ "BufferingHints": {"IntervalInSeconds": buffer_time, "SizeInMBs": buffer_size},
}
try:
self.firehose_client.create_delivery_stream(
DeliveryStreamName=stream_name,
- DeliveryStreamType='DirectPut',
- ExtendedS3DestinationConfiguration=s3_config
+ DeliveryStreamType="DirectPut",
+ ExtendedS3DestinationConfiguration=s3_config,
)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError(f"Failed to create delivery stream '{stream_name}' "
- f"with error {error_code}: {message}")
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError(
+ f"Failed to create delivery stream '{stream_name}' " f"with error {error_code}: {message}"
+ )
# check if delivery stream created
self._wait_for_active_firehose(stream_name)
def create_firehose_stream_if_not_exists(self, stream_name, s3_prefix):
"""Create firehose stream with given stream name
-
+
Arguments:
stream_name (str): name of the firehose stream
s3_prefix (str): s3 prefix path for delivering the firehose data
@@ -475,29 +477,28 @@ def create_firehose_stream_if_not_exists(self, stream_name, s3_prefix):
def delete_firehose_stream(self, stream_name):
"""Delete the firehose with given stream name
-
+
Args:
stream_name (str): name of the firehose stream
"""
logger.warning(f"Deleting firehose stream '{stream_name}'...")
try:
- self.firehose_client.delete_delivery_stream(
- DeliveryStreamName=stream_name
- )
+ self.firehose_client.delete_delivery_stream(DeliveryStreamName=stream_name)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError(f"Failed to delete delivery stream '{stream_name}' "
- f"with error {error_code}: {message}")
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError(
+ f"Failed to delete delivery stream '{stream_name}' " f"with error {error_code}: {message}"
+ )
def _create_s3_bucket_if_not_exist(self, prefix):
"""Create s3 bucket if not exist
-
+
Args:
prefix (str): A bucket name prefix, followed by region name
- and account id
-
+ and account id
+
Returns:
str: s3 bucket name
"""
@@ -513,9 +514,7 @@ def _create_s3_bucket_if_not_exist(self, prefix):
if region == "us-east-1":
s3.create_bucket(Bucket=s3_bucket_name)
else:
- s3.create_bucket(
- Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region}
- )
+ s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region})
logger.info("Successfully create S3 bucket '{}' for storing {} data".format(s3_bucket_name, prefix))
except ClientError as e:
error_code = e.response["Error"]["Code"]
@@ -523,9 +522,7 @@ def _create_s3_bucket_if_not_exist(self, prefix):
if error_code == "BucketAlreadyOwnedByYou":
pass
- elif (
- error_code == "OperationAborted" and "conflicting conditional operation" in message
- ):
+ elif error_code == "OperationAborted" and "conflicting conditional operation" in message:
# If this bucket is already being concurrently created, we don't need to create it again.
pass
elif error_code == "TooManyBuckets":
@@ -533,8 +530,8 @@ def _create_s3_bucket_if_not_exist(self, prefix):
s3.meta.client.head_bucket(Bucket=s3_bucket_name)
else:
raise
-
- s3_waiter = s3_client.get_waiter('bucket_exists')
+
+ s3_waiter = s3_client.get_waiter("bucket_exists")
s3_waiter.wait(Bucket=s3_bucket_name)
return s3_bucket_name
@@ -548,14 +545,16 @@ def __init__(self, endpoint_name, sagemaker_session=None):
with the Amazon SageMaker APIs and any other AWS services needed.
"""
self.endpoint_name = endpoint_name
- self._realtime_predictor = sagemaker.predictor.Predictor(endpoint_name,
- serializer=sagemaker.serializers.JSONSerializer(),
- deserializer=sagemaker.deserializers.JSONDeserializer(),
- sagemaker_session=sagemaker_session)
+ self._realtime_predictor = sagemaker.predictor.Predictor(
+ endpoint_name,
+ serializer=sagemaker.serializers.JSONSerializer(),
+ deserializer=sagemaker.deserializers.JSONDeserializer(),
+ sagemaker_session=sagemaker_session,
+ )
def get_action(self, obs=None):
"""Get prediction from the endpoint
-
+
Args:
obs (list/str): observation of the environment
@@ -567,32 +566,31 @@ def get_action(self, obs=None):
sample_prob: sample probability distribution used for data split
"""
payload = {}
- payload['request_type'] = "observation"
- payload['observation'] = obs
+ payload["request_type"] = "observation"
+ payload["observation"] = obs
response = self._realtime_predictor.predict(payload)
- action = response['action']
- action_prob = response['action_prob']
- event_id = response['event_id']
- model_id = response['model_id']
- sample_prob = response['sample_prob']
+ action = response["action"]
+ action_prob = response["action_prob"]
+ event_id = response["event_id"]
+ model_id = response["model_id"]
+ sample_prob = response["sample_prob"]
return action, event_id, model_id, action_prob, sample_prob
def get_hosted_model_id(self):
"""Return hostdd model id in the hosting endpoint
-
+
Returns:
str: model id of the model being hosted
"""
payload = {}
- payload['request_type'] = "model_id"
- payload['observation'] = None
+ payload["request_type"] = "model_id"
+ payload["observation"] = None
response = self._realtime_predictor.predict(payload)
- model_id = response['model_id']
+ model_id = response["model_id"]
return model_id
def delete_endpoint(self):
- """Delete the Sagemaker endpoint
- """
+ """Delete the Sagemaker endpoint"""
logger.warning(f"Deleting hosting endpoint '{self.endpoint_name}'...")
- self._realtime_predictor.delete_endpoint()
\ No newline at end of file
+ self._realtime_predictor.delete_endpoint()
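
The payload contract used by get_action and get_hosted_model_id can be exercised directly against an endpoint. A sketch assuming a deployed endpoint; the endpoint name and observation below are invented:

    import sagemaker

    predictor = sagemaker.predictor.Predictor(
        "my-bandit-endpoint",  # hypothetical endpoint name
        serializer=sagemaker.serializers.JSONSerializer(),
        deserializer=sagemaker.deserializers.JSONDeserializer(),
    )
    # request_type "observation" asks for an action; "model_id" asks which model is hosted.
    response = predictor.predict({"request_type": "observation", "observation": [0.1, 0.2]})
    action, event_id = response["action"], response["event_id"]
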
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py b/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py
index f907383a..d9d5a930 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py
@@ -2,34 +2,30 @@
import json
-class CloudWatchLogger():
-
+class CloudWatchLogger:
def __init__(self, cw_client, region_name):
self.region_name = region_name
self.cw_client = cw_client
-
+
def get_cloudwatch_dashboard_details(self, experiment_id):
# update for non-commercial region
cw_dashboard_url = f"https://{self.region_name}.console.aws.amazon.com/cloudwatch/home?region={self.region_name}#dashboards:name={experiment_id};start=PT1H"
text = f"You can monitor your Training/Hosting evaluation metrics on this [CloudWatch Dashboard]({cw_dashboard_url})"
- text += "\n\n(Note: This would need Trained/Hosted Models to be evaluated in order to publish Evaluation Scores)"
+ text += (
+ "\n\n(Note: This would need Trained/Hosted Models to be evaluated in order to publish Evaluation Scores)"
+ )
return text
-
- def publish_latest_hosting_information(
- self,
- experiment_id,
- latest_hosted_model_id,
- latest_hosted_model_score
- ):
+
+ def publish_latest_hosting_information(self, experiment_id, latest_hosted_model_id, latest_hosted_model_score):
self.cw_client.put_metric_data(
Namespace=experiment_id,
MetricData=[
{
"MetricName": "latest_hosted_model_id_continuous",
"Timestamp": time.time(),
- "Value": int(latest_hosted_model_id.split('-')[-1])
+ "Value": int(latest_hosted_model_id.split("-")[-1]),
}
- ]
+ ],
)
self.cw_client.put_metric_data(
Namespace=experiment_id,
@@ -37,26 +33,21 @@ def publish_latest_hosting_information(
{
"MetricName": "latest_hosted_model_score_continuous",
"Timestamp": time.time(),
- "Value": float(latest_hosted_model_score)
+ "Value": float(latest_hosted_model_score),
}
- ]
+ ],
)
-
- def publish_latest_training_information(
- self,
- experiment_id,
- latest_trained_model_id,
- latest_trained_model_score
- ):
+
+ def publish_latest_training_information(self, experiment_id, latest_trained_model_id, latest_trained_model_score):
self.cw_client.put_metric_data(
Namespace=experiment_id,
MetricData=[
{
"MetricName": "latest_trained_model_id_continuous",
"Timestamp": time.time(),
- "Value": int(latest_trained_model_id.split('-')[-1])
+ "Value": int(latest_trained_model_id.split("-")[-1]),
}
- ]
+ ],
)
self.cw_client.put_metric_data(
Namespace=experiment_id,
@@ -64,16 +55,13 @@ def publish_latest_training_information(
{
"MetricName": "latest_trained_model_score_continuous",
"Timestamp": time.time(),
- "Value": float(latest_trained_model_score)
+ "Value": float(latest_trained_model_score),
}
- ]
+ ],
)
-
+
def publish_newly_trained_model_eval_information(
- self,
- experiment_id,
- new_trained_model_id,
- new_trained_model_score
+ self, experiment_id, new_trained_model_id, new_trained_model_score
):
self.cw_client.put_metric_data(
Namespace=experiment_id,
@@ -81,9 +69,9 @@ def publish_newly_trained_model_eval_information(
{
"MetricName": "newly_trained_model_id",
"Timestamp": time.time(),
- "Value": int(new_trained_model_id.split('-')[-1])
+ "Value": int(new_trained_model_id.split("-")[-1]),
}
- ]
+ ],
)
self.cw_client.put_metric_data(
Namespace=experiment_id,
@@ -91,45 +79,28 @@ def publish_newly_trained_model_eval_information(
{
"MetricName": "newly_trained_model_score",
"Timestamp": time.time(),
- "Value": float(new_trained_model_score)
+ "Value": float(new_trained_model_score),
}
- ]
+ ],
)
-
- def publish_rewards_for_simulation(
- self,
- experiment_id,
- reported_rewards_sum
- ):
+
+ def publish_rewards_for_simulation(self, experiment_id, reported_rewards_sum):
self.cw_client.put_metric_data(
Namespace=experiment_id,
MetricData=[
{
"MetricName": "reported_rewards_score",
"Timestamp": time.time(),
- "Value": float(reported_rewards_sum)
+ "Value": float(reported_rewards_sum),
}
- ]
+ ],
)
- def create_cloudwatch_dashboard_from_experiment_id(
- self,
- experiment_id
- ):
- cw_json = self.get_cloudwatch_dashboard_json_for_experiment_id(
- experiment_id,
- self.region_name
- )
- self.cw_client.put_dashboard(
- DashboardName=experiment_id,
- DashboardBody=cw_json
- )
+ def create_cloudwatch_dashboard_from_experiment_id(self, experiment_id):
+ cw_json = self.get_cloudwatch_dashboard_json_for_experiment_id(experiment_id, self.region_name)
+ self.cw_client.put_dashboard(DashboardName=experiment_id, DashboardBody=cw_json)
- def get_cloudwatch_dashboard_json_for_experiment_id(
- self,
- experiment_id,
- region_name
- ):
+ def get_cloudwatch_dashboard_json_for_experiment_id(self, experiment_id, region_name):
dashboard_json = {
"widgets": [
{
@@ -143,17 +114,15 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
[
experiment_id,
"latest_hosted_model_id_continuous",
- {
- "label": "(ModelId suffix part only)"
- }
+ {"label": "(ModelId suffix part only)"},
]
],
"view": "singleValue",
"region": region_name,
"title": "Currently Hosted Model Id",
"period": 60,
- "stat": "Maximum"
- }
+ "stat": "Maximum",
+ },
},
{
"type": "metric",
@@ -162,19 +131,13 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"width": 9,
"height": 3,
"properties": {
- "metrics": [
- [
- experiment_id,
- "latest_hosted_model_score_continuous",
- {"label": "EvalScore" }
- ]
- ],
+ "metrics": [[experiment_id, "latest_hosted_model_score_continuous", {"label": "EvalScore"}]],
"view": "singleValue",
"region": region_name,
"title": "Currently Hosted Model Eval Score (On latest data)",
"period": 60,
- "stat": "Minimum"
- }
+ "stat": "Minimum",
+ },
},
{
"type": "metric",
@@ -184,11 +147,7 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"height": 3,
"properties": {
"metrics": [
- [
- experiment_id,
- "latest_trained_model_id_continuous",
- { "label": "(ModelId suffix only)" }
- ]
+ [experiment_id, "latest_trained_model_id_continuous", {"label": "(ModelId suffix only)"}]
],
"view": "singleValue",
"region": region_name,
@@ -196,8 +155,8 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"stat": "Maximum",
"period": 60,
"setPeriodToTimeRange": False,
- "stacked": True
- }
+ "stacked": True,
+ },
},
{
"type": "metric",
@@ -206,19 +165,13 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"width": 9,
"height": 3,
"properties": {
- "metrics": [
- [
- experiment_id,
- "latest_trained_model_score_continuous",
- { "label": "EvalScore" }
- ]
- ],
+ "metrics": [[experiment_id, "latest_trained_model_score_continuous", {"label": "EvalScore"}]],
"view": "singleValue",
"region": region_name,
"title": "Latest Trained Model Eval Score",
"period": 60,
- "stat": "Maximum"
- }
+ "stat": "Maximum",
+ },
},
{
"type": "metric",
@@ -227,26 +180,15 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"width": 9,
"height": 9,
"properties": {
- "metrics": [
- [
- experiment_id,
- "newly_trained_model_score",
- {"label": "EvalScore" }
- ]
- ],
+ "metrics": [[experiment_id, "newly_trained_model_score", {"label": "EvalScore"}]],
"view": "timeSeries",
"stacked": False,
"region": region_name,
"stat": "Maximum",
"period": 60,
"title": "New Model Eval Score Over Time",
- "yAxis": {
- "left": {
- "min": 0,
- "max": 1
- }
- }
- }
+ "yAxis": {"left": {"min": 0, "max": 1}},
+ },
},
{
"type": "metric",
@@ -255,31 +197,18 @@ def get_cloudwatch_dashboard_json_for_experiment_id(
"width": 9,
"height": 9,
"properties": {
- "metrics": [
- [
- experiment_id,
- "reported_rewards_score",
- {"label": "Rewards" }
- ]
- ],
+ "metrics": [[experiment_id, "reported_rewards_score", {"label": "Rewards"}]],
"view": "timeSeries",
"stacked": False,
"region": region_name,
"stat": "Maximum",
"period": 60,
"title": "Experiment's Reported Rewards",
- "yAxis": {
- "left": {
- "min": 0,
- "max": 1
- }
- },
+ "yAxis": {"left": {"min": 0, "max": 1}},
"liveData": True,
- "legend": {
- "position": "bottom"
- }
- }
- }
+ "legend": {"position": "bottom"},
+ },
+ },
]
}
return json.dumps(dashboard_json)
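
Each publish_* helper above reduces to the same one-metric put_metric_data call. A minimal sketch, assuming a boto3 CloudWatch client; the namespace and value are invented:

    import time

    import boto3

    cw_client = boto3.client("cloudwatch")
    cw_client.put_metric_data(
        Namespace="my-experiment",  # the experiment id doubles as the metric namespace
        MetricData=[
            {
                "MetricName": "latest_hosted_model_score_continuous",
                "Timestamp": time.time(),
                "Value": 0.42,
            }
        ],
    )
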
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py
index a1de2316..f018c352 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py
@@ -1,16 +1,17 @@
-class ExperimentRecord():
- '''
+class ExperimentRecord:
+ """
    This class captures all the data that is needed to run an experiment
    for Continuously Training and Updating models on SageMaker
- '''
+ """
+
def __init__(
- self,
- experiment_id,
- training_workflow_metadata={},
- hosting_workflow_metadata={},
- joining_workflow_metadata={},
- evaluation_workflow_metadata={}
- ):
+ self,
+ experiment_id,
+ training_workflow_metadata={},
+ hosting_workflow_metadata={},
+ joining_workflow_metadata={},
+ evaluation_workflow_metadata={},
+ ):
# unique id common across all experiments in the account
self.experiment_id = experiment_id
@@ -26,13 +27,13 @@ def __init__(
self._last_hosted_model_id = hosting_workflow_metadata.get("last_hosted_model_id", None)
self._next_model_to_host_id = hosting_workflow_metadata.get("next_model_to_host_id", None)
self._hosting_endpoint = hosting_workflow_metadata.get("hosting_endpoint", None)
-
+
# joining workflow metadata
self.joining_workflow_metadata = joining_workflow_metadata
self._joining_state = joining_workflow_metadata.get("joining_state", None)
self._last_joined_job_id = joining_workflow_metadata.get("last_joined_job_id", None)
self._next_join_job_id = joining_workflow_metadata.get("next_join_job_id", None)
-
+
# evaluation workflow metadata
self.evaluation_workflow_metadata = evaluation_workflow_metadata
self._evaluation_state = evaluation_workflow_metadata.get("evaluation_state", None)
@@ -58,11 +59,11 @@ def to_ddb_record(self):
self.evaluation_workflow_metadata["next_evaluation_job_id"] = self._next_evaluation_job_id
return {
- 'experiment_id': self.experiment_id,
- 'training_workflow_metadata': self.training_workflow_metadata,
- 'hosting_workflow_metadata': self.hosting_workflow_metadata,
- 'joining_workflow_metadata': self.joining_workflow_metadata,
- 'evaluation_workflow_metadata': self.evaluation_workflow_metadata
+ "experiment_id": self.experiment_id,
+ "training_workflow_metadata": self.training_workflow_metadata,
+ "hosting_workflow_metadata": self.hosting_workflow_metadata,
+ "joining_workflow_metadata": self.joining_workflow_metadata,
+ "evaluation_workflow_metadata": self.evaluation_workflow_metadata,
}
@classmethod
@@ -72,5 +73,5 @@ def load_from_ddb_record(cls, record):
record["training_workflow_metadata"],
record["hosting_workflow_metadata"],
record["joining_workflow_metadata"],
- record["evaluation_workflow_metadata"]
- )
\ No newline at end of file
+ record["evaluation_workflow_metadata"],
+ )
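
to_ddb_record and load_from_ddb_record form a plain dict round-trip. A quick sketch with an invented experiment id:

    record = ExperimentRecord("my-experiment-1")
    item = record.to_ddb_record()  # a plain dict, ready for Table.put_item
    clone = ExperimentRecord.load_from_ddb_record(item)
    assert clone.experiment_id == record.experiment_id
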
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py
index c6841a5d..5cab8320 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py
@@ -1,22 +1,25 @@
from datetime import datetime
-class JoinJobRecord():
- '''
+
+class JoinJobRecord:
+ """
This class captures all the data that is needed to run a joining job
    for Continuously Training and Updating models on SageMaker
- '''
+ """
+
def __init__(
- self,
- experiment_id,
- join_job_id,
- current_state=None,
- input_obs_data_s3_path=None,
- obs_start_time=None,
- obs_end_time=None,
- input_reward_data_s3_path=None,
- output_joined_train_data_s3_path=None,
- output_joined_eval_data_s3_path=None,
- join_query_ids=[]):
+ self,
+ experiment_id,
+ join_job_id,
+ current_state=None,
+ input_obs_data_s3_path=None,
+ obs_start_time=None,
+ obs_end_time=None,
+ input_reward_data_s3_path=None,
+ output_joined_train_data_s3_path=None,
+ output_joined_eval_data_s3_path=None,
+ join_query_ids=[],
+ ):
self.experiment_id = experiment_id
self.join_job_id = join_job_id
@@ -32,29 +35,31 @@ def __init__(
self._join_query_ids = join_query_ids
def to_ddb_record(self):
- obs_start_time_str = self._obs_start_time.strftime("%Y-%m-%d-%H") if \
- self._obs_start_time is not None else None
- obs_end_time_str = self._obs_end_time.strftime("%Y-%m-%d-%H") if \
- self._obs_end_time is not None else None
+ obs_start_time_str = self._obs_start_time.strftime("%Y-%m-%d-%H") if self._obs_start_time is not None else None
+ obs_end_time_str = self._obs_end_time.strftime("%Y-%m-%d-%H") if self._obs_end_time is not None else None
return {
- 'experiment_id': self.experiment_id,
- 'join_job_id': self.join_job_id,
- 'current_state': self._current_state,
- 'input_obs_data_s3_path': self._input_obs_data_s3_path,
- 'obs_start_time': obs_start_time_str,
- 'obs_end_time': obs_end_time_str,
- 'input_reward_data_s3_path': self._input_reward_data_s3_path,
- 'output_joined_train_data_s3_path': self._output_joined_train_data_s3_path,
- 'output_joined_eval_data_s3_path': self._output_joined_eval_data_s3_path,
- 'join_query_ids': self._join_query_ids
+ "experiment_id": self.experiment_id,
+ "join_job_id": self.join_job_id,
+ "current_state": self._current_state,
+ "input_obs_data_s3_path": self._input_obs_data_s3_path,
+ "obs_start_time": obs_start_time_str,
+ "obs_end_time": obs_end_time_str,
+ "input_reward_data_s3_path": self._input_reward_data_s3_path,
+ "output_joined_train_data_s3_path": self._output_joined_train_data_s3_path,
+ "output_joined_eval_data_s3_path": self._output_joined_eval_data_s3_path,
+ "join_query_ids": self._join_query_ids,
}
@classmethod
def load_from_ddb_record(cls, record):
- obs_start_time = datetime.strptime(record["obs_start_time"], "%Y-%m-%d-%H") if \
- record["obs_start_time"] is not None else None
- obs_end_time = datetime.strptime(record["obs_end_time"], "%Y-%m-%d-%H") if \
- record["obs_end_time"] is not None else None
+ obs_start_time = (
+ datetime.strptime(record["obs_start_time"], "%Y-%m-%d-%H")
+ if record["obs_start_time"] is not None
+ else None
+ )
+ obs_end_time = (
+ datetime.strptime(record["obs_end_time"], "%Y-%m-%d-%H") if record["obs_end_time"] is not None else None
+ )
return JoinJobRecord(
record["experiment_id"],
@@ -66,8 +71,8 @@ def load_from_ddb_record(cls, record):
record["input_reward_data_s3_path"],
record["output_joined_train_data_s3_path"],
record["output_joined_eval_data_s3_path"],
- record["join_query_ids"]
- )
+ record["join_query_ids"],
+ )
def get_input_obs_data_s3_path(self):
return self._input_obs_data_s3_path
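
The hour-resolution "%Y-%m-%d-%H" format used by to_ddb_record and load_from_ddb_record round-trips cleanly; a quick illustration with an invented timestamp:

    from datetime import datetime

    ts = datetime(2021, 2, 11, 15)
    s = ts.strftime("%Y-%m-%d-%H")  # "2021-02-11-15"
    assert datetime.strptime(s, "%Y-%m-%d-%H") == ts
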
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py
index 00ec9df8..14ba8589 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py
@@ -1,23 +1,25 @@
-class ModelRecord():
- '''
+class ModelRecord:
+ """
This class captures all the data that is needed to run a training job
    for Continuously Training and Updating models on SageMaker
- '''
+ """
+
def __init__(
- self,
- experiment_id,
- model_id,
- train_state=None,
- evaluation_job_name=None,
- eval_state=None,
- eval_scores={},
- input_model_id=None,
- input_data_s3_prefix=None,
- manifest_file_path=None,
- eval_data_s3_path=None,
- s3_model_output_path=None,
- training_start_time=None,
- training_end_time=None):
+ self,
+ experiment_id,
+ model_id,
+ train_state=None,
+ evaluation_job_name=None,
+ eval_state=None,
+ eval_scores={},
+ input_model_id=None,
+ input_data_s3_prefix=None,
+ manifest_file_path=None,
+ eval_data_s3_path=None,
+ s3_model_output_path=None,
+ training_start_time=None,
+ training_end_time=None,
+ ):
self.experiment_id = experiment_id
self.model_id = model_id
@@ -37,19 +39,19 @@ def __init__(
def to_ddb_record(self):
return {
- 'experiment_id': self.experiment_id,
- 'model_id': self.model_id,
- 'train_state': self._train_state,
- 'evaluation_job_name': self._evaluation_job_name,
- 'eval_state': self._eval_state,
- 'eval_scores': self._eval_scores,
- 'input_model_id': self._input_model_id,
- 'input_data_s3_prefix': self._input_data_s3_prefix,
- 'manifest_file_path': self._manifest_file_path,
- 'eval_data_s3_path': self._eval_data_s3_path,
- 's3_model_output_path': self._s3_model_output_path,
- 'training_start_time': self._training_start_time,
- 'training_end_time': self._training_end_time
+ "experiment_id": self.experiment_id,
+ "model_id": self.model_id,
+ "train_state": self._train_state,
+ "evaluation_job_name": self._evaluation_job_name,
+ "eval_state": self._eval_state,
+ "eval_scores": self._eval_scores,
+ "input_model_id": self._input_model_id,
+ "input_data_s3_prefix": self._input_data_s3_prefix,
+ "manifest_file_path": self._manifest_file_path,
+ "eval_data_s3_path": self._eval_data_s3_path,
+ "s3_model_output_path": self._s3_model_output_path,
+ "training_start_time": self._training_start_time,
+ "training_end_time": self._training_end_time,
}
@classmethod
@@ -67,15 +69,10 @@ def load_from_ddb_record(cls, record):
record["eval_data_s3_path"],
record["s3_model_output_path"],
record["training_start_time"],
- record["training_end_time"]
- )
-
- def add_new_training_job_info(
- self,
- input_model_id=None,
- input_data_s3_prefix=None,
- manifest_file_path=None
- ):
+ record["training_end_time"],
+ )
+
+ def add_new_training_job_info(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None):
self._input_model_id = input_model_id
self._input_data_s3_prefix = input_data_s3_prefix
self._manifest_file_path = manifest_file_path
@@ -87,10 +84,10 @@ def add_new_training_job_info(
self._eval_scores = {} # eval score for a new model would always be empty.
def add_new_evaluation_job_info(
- self,
- evaluation_job_name=None,
- eval_data_s3_path=None,
- ):
+ self,
+ evaluation_job_name=None,
+ eval_data_s3_path=None,
+ ):
self._evaluation_job_name = evaluation_job_name
self._eval_data_s3_path = eval_data_s3_path
@@ -107,20 +104,16 @@ def model_in_terminal_state(self):
return False
def update_model_job_status(
- self,
- training_start_time=None,
- training_end_time=None,
- train_state=None,
- s3_model_output_path=None
- ):
+ self, training_start_time=None, training_end_time=None, train_state=None, s3_model_output_path=None
+ ):
self._training_start_time = training_start_time
self._training_end_time = training_end_time
- self._train_state = train_state
+ self._train_state = train_state
self._s3_model_output_path = s3_model_output_path
def update_model_as_failed(self):
self._train_state = "Failed"
-
+
def eval_in_terminal_state(self):
if self._eval_state:
return self._eval_state.endswith("ed")
@@ -130,17 +123,15 @@ def add_model_eval_scores(self, eval_score):
if self._eval_scores is None:
self._eval_scores = {}
self._eval_scores[self._eval_data_s3_path] = eval_score
-
+
def update_eval_job_state(self, eval_state):
self._eval_state = eval_state
-
+
def update_eval_job_as_failed(self):
self._eval_state = "Failed"
def is_train_completed(self):
- if self._train_state and \
- self._train_state == "Completed" and \
- self._s3_model_output_path is not None:
+ if self._train_state and self._train_state == "Completed" and self._s3_model_output_path is not None:
return True
return False
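
One thing Black cannot fix: these record constructors take mutable default arguments (eval_scores={}, join_query_ids=[], and the *_workflow_metadata={} dicts), and Python evaluates such defaults once, sharing them across calls. A hypothetical hardening sketch, not part of this patch, showing the conventional None-default idiom:

    class Record:
        def __init__(self, experiment_id, eval_scores=None, join_query_ids=None):
            self.experiment_id = experiment_id
            # Fresh containers per instance avoid state leaking between records.
            self._eval_scores = {} if eval_scores is None else eval_scores
            self._join_query_ids = [] if join_query_ids is None else join_query_ids
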
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py
index 692e4c60..608b3860 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py
@@ -12,7 +12,7 @@
import sagemaker
logging.basicConfig()
-logger = logging.getLogger('orchestrator')
+logger = logging.getLogger("orchestrator")
logger.setLevel(logging.INFO)
try:
@@ -40,64 +40,68 @@
from orchestrator.resource_manager import ResourceManager
from orchestrator.utils.cloudwatch_logger import CloudWatchLogger
from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException
-from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, \
- SageMakerHostingException, SageMakerTrainingJobException, WorkflowJoiningJobException, \
- EvalScoreNotAvailableException, InvalidUsageException
-
-
+from orchestrator.exceptions.workflow_exceptions import (
+ UnhandledWorkflowException,
+ SageMakerHostingException,
+ SageMakerTrainingJobException,
+ WorkflowJoiningJobException,
+ EvalScoreNotAvailableException,
+ InvalidUsageException,
+)
class HostingState(str, Enum):
- PENDING = "PENDING" # A hosting update request is pending
- DEPLOYING = "DEPLOYING" # A hosting update request is in process
- DEPLOYED = "DEPLOYED" # Hosting update request was completed.
- FAILED = "FAILED" # hosting update request failed.
+ PENDING = "PENDING" # A hosting update request is pending
+ DEPLOYING = "DEPLOYING" # A hosting update request is in process
+ DEPLOYED = "DEPLOYED" # Hosting update request was completed.
+ FAILED = "FAILED" # hosting update request failed.
class TrainingState(str, Enum):
- PENDING = "PENDING" # A new model/training job create request is made
- TRAINING = "TRAINING" # Model/Training job is in status of 'Training'
- TRAINED = "TRAINED" # Model/Training job has been completed
- STOPPED = "STOPPED" # Model/Training job has been stopped
- FAILED = "FAILED" # Model/Training job has been failed
+ PENDING = "PENDING" # A new model/training job create request is made
+ TRAINING = "TRAINING" # Model/Training job is in status of 'Training'
+ TRAINED = "TRAINED" # Model/Training job has been completed
+ STOPPED = "STOPPED" # Model/Training job has been stopped
+ FAILED = "FAILED" # Model/Training job has been failed
class EvaluationState(str, Enum):
- PENDING = "PENDING" # A new evaluation job create request is made
- EVALUATING = "EVALUATING" # Evaluation job is in status of 'Evaluating'
- EVALUATED = "EVALUATED" # Evaluation job has been completed
- STOPPED = "STOPPED" # Evaluation job has been stopped
- FAILED = "FAILED" # Evaluation job has been failed
+ PENDING = "PENDING" # A new evaluation job create request is made
+ EVALUATING = "EVALUATING" # Evaluation job is in status of 'Evaluating'
+ EVALUATED = "EVALUATED" # Evaluation job has been completed
+ STOPPED = "STOPPED" # Evaluation job has been stopped
+ FAILED = "FAILED" # Evaluation job has been failed
class JoiningState(str, Enum):
- PENDING = "PENDING" # A joining request is pending
- RUNNING = "RUNNING" # A joining job is running
- SUCCEEDED = "SUCCEEDED" # A joining job has been completed
- FAILED = "FAILED" # A joining job has been failed
- CANCELLED = "CANCELLED" # A joining job has been cancelled
+ PENDING = "PENDING" # A joining request is pending
+ RUNNING = "RUNNING" # A joining job is running
+ SUCCEEDED = "SUCCEEDED" # A joining job has been completed
+    FAILED = "FAILED"  # A joining job has failed
+ CANCELLED = "CANCELLED" # A joining job has been cancelled
+
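# These states subclass str so the enum members behave as their own string
# values: they serialize to DynamoDB unchanged, compare equal to raw status
# strings, and support the suffix test the sync thread relies on to tell
# in-flight states ("...ING") from terminal ones. A minimal illustration:
from enum import Enum

class State(str, Enum):  # same str-mixin pattern as the classes above
    DEPLOYING = "DEPLOYING"
    DEPLOYED = "DEPLOYED"

assert State.DEPLOYING == "DEPLOYING"      # plain string equality holds
assert State.DEPLOYING.endswith("ING")     # in-flight
assert not State.DEPLOYED.endswith("ING")  # terminal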
-# Using SageMakerTrainingJob primary status
+# Using SageMakerTrainingJob primary status
TRAINING_JOB_STATUS_MAP = {
"Pending": TrainingState.PENDING,
"InProgress": TrainingState.TRAINING,
"Stopping": TrainingState.TRAINING,
"Stopped": TrainingState.STOPPED,
"Failed": TrainingState.FAILED,
- "Completed": TrainingState.TRAINED
+ "Completed": TrainingState.TRAINED,
}
-# Using SageMakerTrainingJob primary status
+# Using SageMakerTrainingJob primary status
EVALUATION_JOB_STATUS_MAP = {
"Pending": EvaluationState.PENDING,
"InProgress": EvaluationState.EVALUATING,
"Stopping": EvaluationState.EVALUATING,
"Stopped": EvaluationState.STOPPED,
"Failed": EvaluationState.FAILED,
- "Completed": EvaluationState.EVALUATED
+ "Completed": EvaluationState.EVALUATED,
}
-# Using SageMakerHostingEndpoint primary status
+# Using SageMakerHostingEndpoint primary status
HOSTING_ENDPOINT_STATUS_MAP = {
"OutOfService": HostingState.FAILED,
"Creating": HostingState.DEPLOYING,
@@ -106,7 +110,7 @@ class JoiningState(str, Enum):
"RollingBack": HostingState.DEPLOYING,
"InService": HostingState.DEPLOYED,
"Deleting": HostingState.DEPLOYING,
- "Failed": HostingState.FAILED
+ "Failed": HostingState.FAILED,
}
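# How these maps are consumed: the coarse SageMaker status string is looked up
# and translated into the workflow's own state machine. A minimal sketch,
# assuming a boto3 SageMaker client and a hypothetical job name:
import boto3

sm = boto3.client("sagemaker")
desc = sm.describe_training_job(TrainingJobName="my-training-job")  # hypothetical job
new_state = TRAINING_JOB_STATUS_MAP[desc["TrainingJobStatus"]]  # e.g. TrainingState.TRAINED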
@@ -118,10 +122,7 @@ class ExperimentManagerSyncThread(Thread):
for the latest state and update the table.
"""
- def __init__(
- self,
- experiment_manager
- ):
+ def __init__(self, experiment_manager):
"""Initialize a synchronization thread for the experiment
Args:
@@ -152,23 +153,23 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
        Four things happen here:
a) Checks if current TrainingWorkflowMetadata needs an update.
b) Fetches latest TrainingJob state from ModelDb for next_model_to_train
- c) Updates ExperimentDb TrainingWorkflowMetadata with latest information.
+ c) Updates ExperimentDb TrainingWorkflowMetadata with latest information.
d) Finally, updates the local ExperimentManager context to latest.
-
+
Args:
training_workflow_metadata (dict): A dictionary containing
training workflow related metadata
"""
if training_workflow_metadata is None:
- # A training request hasn't been made yet.
+ # A training request hasn't been made yet.
            # Nothing to process. Return.
return
-
+
next_model_to_train_id = training_workflow_metadata.get("next_model_to_train_id", None)
training_state = training_workflow_metadata.get("training_state", None)
if training_state is None:
- # A training request hasn't been made yet.
+ # A training request hasn't been made yet.
            # Nothing to process. Return.
return
elif not training_state.endswith("ING"):
@@ -177,14 +178,16 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
return
elif training_state.endswith("ING") and next_model_to_train_id is None:
# A training is in progress, but the training model-id is None!
- logger.warn(f"Model Training in {training_state}, while next_model_to_train_id is None. "
- "Training Workflow would be stuck if this continues."
+ logger.warn(
+ f"Model Training in {training_state}, while next_model_to_train_id is None. "
+ "Training Workflow would be stuck if this continues."
)
return
else:
# A training is in progress. Fetch the status of that training job from ModelDb.
training_job_record = self.model_db_client.get_model_record_with_retry(
- self.experiment_id, next_model_to_train_id)
+ self.experiment_id, next_model_to_train_id
+ )
# Get updated TrainingWorkflowState in {new_training_state}
if training_job_record is None:
@@ -199,56 +202,60 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
if train_state_from_modeldb is not None:
new_training_state = TRAINING_JOB_STATUS_MAP[train_state_from_modeldb]
else:
- # Since ModelDb training job state is None,
+ # Since ModelDb training job state is None,
# keep the ExperimentDb TrainingWorkflowState same.
- logger.warn(f"ModelDb has model-id {next_model_to_train_id} 's state as 'None'. "
- "Training Worklow would be stuck if this continues."
+ logger.warn(
+ f"ModelDb has model-id {next_model_to_train_id} 's state as 'None'. "
+                "Training Workflow would be stuck if this continues."
)
new_training_state = training_state
expected_next_model_to_train_id = next_model_to_train_id
# Generate new TrainingWorkflowState for ExperimentDb based on new_training_state
if new_training_state == TrainingState.TRAINED:
- training_workflow_metadata['last_trained_model_id'] = next_model_to_train_id
- training_workflow_metadata['next_model_to_train_id'] = None
- training_workflow_metadata['training_state'] = new_training_state
+ training_workflow_metadata["last_trained_model_id"] = next_model_to_train_id
+ training_workflow_metadata["next_model_to_train_id"] = None
+ training_workflow_metadata["training_state"] = new_training_state
elif new_training_state == TrainingState.FAILED or new_training_state == TrainingState.STOPPED:
# training_workflow_metadata['last_trained_model_id'] remains the same
# training_workflow_metadata['next_model_to_train_id'] remains the same or change to None
# update the ExperimentDb TrainingWorkflowState to Failed
- training_workflow_metadata['training_state'] = new_training_state
+ training_workflow_metadata["training_state"] = new_training_state
else:
# training_workflow_metadata['last_trained_model_id'] remains the same
# training_workflow_metadata['next_model_to_train_id'] remains the same
# update the ExperimentDb TrainingWorkflowState to new_training_state
- training_workflow_metadata['training_state'] = new_training_state
+ training_workflow_metadata["training_state"] = new_training_state
# Try to save the update in ExperimentDb
# This can update the status only if in the current record,
# next_model_to_train_id == expected_next_model_to_train_id
try:
self.exp_db_client.update_training_workflow_metadata_with_validation(
- self.experiment_id,
- training_workflow_metadata,
- expected_next_model_to_train_id
+ self.experiment_id, training_workflow_metadata, expected_next_model_to_train_id
)
except Exception as e:
if "ConditionalCheckFailedException" in str(e):
- # Most likely Sync Thread went out of sync :(
- # Just return here without updating local ExperimentManager.
- logger.warn("Sync Thread trying to update ExperimentDb with old state. This should "
- "get fixed in next run!"
+ # Most likely Sync Thread went out of sync :(
+ # Just return here without updating local ExperimentManager.
+ logger.warn(
+                    "Sync Thread trying to update ExperimentDb with old state. This should get fixed in next run!"
)
return
logger.error("Failed to update ExperimentDb with latest information: " + str(e))
- raise UnhandledWorkflowException("Some error occurred while update ExperimentDb record TrainingWorkflowMetadata")
+ raise UnhandledWorkflowException(
+                "Some error occurred while updating ExperimentDb record TrainingWorkflowMetadata"
+ )
# Finally, update local ExperimentManager with new states.
- self.experiment_manager.experiment_record._last_trained_model_id = training_workflow_metadata['last_trained_model_id']
- self.experiment_manager.experiment_record._next_model_to_train_id = training_workflow_metadata['next_model_to_train_id']
- self.experiment_manager.experiment_record._training_state = training_workflow_metadata['training_state']
-
+ self.experiment_manager.experiment_record._last_trained_model_id = training_workflow_metadata[
+ "last_trained_model_id"
+ ]
+ self.experiment_manager.experiment_record._next_model_to_train_id = training_workflow_metadata[
+ "next_model_to_train_id"
+ ]
+ self.experiment_manager.experiment_record._training_state = training_workflow_metadata["training_state"]
def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow_metadata):
"""
@@ -266,9 +273,8 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow
# some evaluation request is in progress
if evaluation_state is not None and evaluation_state.endswith("ING"):
- evaluation_model_id = next_evaluation_job_id.split('-eval-')[0]
- evaluation_job_record = self.model_db_client.get_model_record(
- self.experiment_id, evaluation_model_id)
+ evaluation_model_id = next_evaluation_job_id.split("-eval-")[0]
+ evaluation_job_record = self.model_db_client.get_model_record(self.experiment_id, evaluation_model_id)
# if evaluation model record exists in the model table
if evaluation_job_record is not None:
@@ -281,9 +287,7 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow
self.experiment_manager.experiment_record._evaluation_state = evaluation_state
# update table states via ddb client
- self.exp_db_client.update_experiment_evaluation_state(
- self.experiment_id, evaluation_state
- )
+ self.exp_db_client.update_experiment_evaluation_state(self.experiment_id, evaluation_state)
if evaluation_state == EvaluationState.EVALUATED:
self.experiment_manager.experiment_record._last_evaluation_job_id = next_evaluation_job_id
@@ -292,20 +296,18 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow
self.exp_db_client.update_experiment_last_evaluation_job_id(
self.experiment_id, next_evaluation_job_id
)
- self.exp_db_client.update_experiment_next_evaluation_job_id(
- self.experiment_id, None
- )
-
+ self.exp_db_client.update_experiment_next_evaluation_job_id(self.experiment_id, None)
+
# update latest_train/eval metrics to publish to CW
self._update_metrics_from_latest_eval_job(next_evaluation_job_id)
def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metadata):
"""Update the hosting workflow metadata in the experiment table
-
+
Args:
hosting_workflow_metadata (dict): A dictionary containing
hosting workflow related metadata
- """
+ """
if hosting_workflow_metadata is None:
return
@@ -321,12 +323,8 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad
model_id = predictor.get_hosted_model_id()
assert model_id == last_hosted_model_id
except Exception:
- self.exp_db_client.update_experiment_hosting_state(
- self.experiment_id, None
- )
- self.exp_db_client.update_experiment_hosting_endpoint(
- self.experiment_id, None
- )
+ self.exp_db_client.update_experiment_hosting_state(self.experiment_id, None)
+ self.exp_db_client.update_experiment_hosting_endpoint(self.experiment_id, None)
self.experiment_manager.experiment_record._hosting_state = None
self.experiment_manager.experiment_record._hosting_endpoint = None
@@ -337,9 +335,7 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad
# describe endpoint to get state of the deployment
try:
- sm_endpoint_info = self.sagemaker_client.describe_endpoint(
- EndpointName=self.experiment_id
- )
+ sm_endpoint_info = self.sagemaker_client.describe_endpoint(EndpointName=self.experiment_id)
except Exception:
# Do not raise exception
return
@@ -348,16 +344,14 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad
self.experiment_manager.experiment_record._hosting_state = hosting_state
# update table states via ddb client
- self.exp_db_client.update_experiment_hosting_state(
- self.experiment_id, hosting_state
- )
+ self.exp_db_client.update_experiment_hosting_state(self.experiment_id, hosting_state)
if hosting_state == HostingState.DEPLOYED:
# update local record
self.experiment_manager.experiment_record._hosting_endpoint = sm_endpoint_info.get("EndpointArn")
self.experiment_manager.experiment_record._last_hosted_model_id = next_model_to_host_id
self.experiment_manager.experiment_record._next_model_to_host_id = None
-
+
# update DynamoDB record
self.exp_db_client.update_experiment_hosting_endpoint(
self.experiment_id, sm_endpoint_info.get("EndpointArn")
@@ -365,9 +359,7 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad
self.exp_db_client.update_experiment_last_hosted_model_id(
self.experiment_id, next_model_to_host_id
)
- self.exp_db_client.update_experiment_next_model_to_host_id(
- self.experiment_id, None
- )
+ self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, None)
self._update_metrics_from_latest_hosting_update(next_model_to_host_id)
else:
@@ -394,31 +386,27 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad
self.experiment_manager.experiment_record._hosting_state = hosting_state
# update hosting_state in exp table
- self.exp_db_client.update_experiment_hosting_state(
- self.experiment_id, hosting_state
- )
+ self.exp_db_client.update_experiment_hosting_state(self.experiment_id, hosting_state)
if hosting_state == HostingState.DEPLOYED:
# update local record
self.experiment_manager.experiment_record._last_hosted_model_id = next_model_to_host_id
self.experiment_manager.experiment_record._next_model_to_host_id = None
-
+
# update DynamoDB record
self.exp_db_client.update_experiment_last_hosted_model_id(
self.experiment_id, next_model_to_host_id
)
- self.exp_db_client.update_experiment_next_model_to_host_id(
- self.experiment_id, None
- )
+ self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, None)
self._update_metrics_from_latest_hosting_update(next_model_to_host_id)
def _update_experiment_db_joining_workflow_metadata(self, joining_workflow_metadata):
"""Update the joining workflow metadata in the experiment table
-
+
Args:
joining_workflow_metadata (dict): A dictionary containing
joining workflow related metadata
- """
+ """
if joining_workflow_metadata is None:
return
@@ -427,8 +415,7 @@ def _update_experiment_db_joining_workflow_metadata(self, joining_workflow_metad
# some joining job request is in progress
if joining_state is not None and joining_state.endswith("ING"):
- join_job_record = self.join_db_client.get_join_job_record(
- self.experiment_id, next_join_job_id)
+ join_job_record = self.join_db_client.get_join_job_record(self.experiment_id, next_join_job_id)
# if join job record exists in the join table
if join_job_record is not None:
@@ -441,35 +428,29 @@ def _update_experiment_db_joining_workflow_metadata(self, joining_workflow_metad
self.experiment_manager.experiment_record._joining_state = joining_state
# update table states via ddb client
- self.exp_db_client.update_experiment_joining_state(
- self.experiment_id, joining_state
- )
+ self.exp_db_client.update_experiment_joining_state(self.experiment_id, joining_state)
if joining_state == JoiningState.SUCCEEDED:
self.experiment_manager.experiment_record._last_joined_job_id = next_join_job_id
self.experiment_manager.experiment_record._next_join_job_id = None
- self.exp_db_client.update_experiment_last_joined_job_id(
- self.experiment_id, next_join_job_id
- )
- self.exp_db_client.update_experiment_next_join_job_id(
- self.experiment_id, None
- )
+ self.exp_db_client.update_experiment_last_joined_job_id(self.experiment_id, next_join_job_id)
+ self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, None)
def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id):
"""
        Updates SyncThread's local information on every completed Evaluation Job run.
- Also Emit CW metric for New Model Evaluation Scores plot, while updating
+        Also emits a CW metric for the New Model Evaluation Scores plot, while updating
local latest_trained_model_* information, for continuous CW puts (for Number plots)
"""
try:
last_trained_model_id = self.experiment_manager.last_trained_model_id
currently_hosted_model_id = self.experiment_manager.last_hosted_model_id
-
+
if last_trained_model_id in latest_evaluation_job_id:
                # "in" is used because latest_evaluation_job_id has the format last_trained_model_id-eval-{timestamp}
- # If the EvaluationJob was for latest Trained Model
+ # If the EvaluationJob was for latest Trained Model
eval_score = self.get_latest_eval_score_for_model_id(last_trained_model_id)
if eval_score == "n.a.":
logger.debug("EvalScore from last run in n.a.")
@@ -481,9 +462,7 @@ def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id):
# Also publish this score once, for Eval Score over time Graph
self.experiment_manager.cw_logger.publish_newly_trained_model_eval_information(
- self.experiment_id,
- last_trained_model_id,
- eval_score
+ self.experiment_id, last_trained_model_id, eval_score
)
elif currently_hosted_model_id in latest_evaluation_job_id:
                # "in" is used because latest_evaluation_job_id has the format currently_hosted_model_id-eval-{timestamp}
@@ -497,8 +476,10 @@ def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id):
self.latest_hosted_model_eval_score = eval_score
else:
# Evaluation Job not for latest-trained-model
- logger.debug("Latest Evaluated Model doesn't match Latest Trained Model, or"
- " Currently Hosted Model. Skipping reporting EvalScore")
+ logger.debug(
+ "Latest Evaluated Model doesn't match Latest Trained Model, or"
+ " Currently Hosted Model. Skipping reporting EvalScore"
+ )
return
except Exception as e:
@@ -520,18 +501,13 @@ def _update_metrics_from_latest_hosting_update(self, latest_hosted_model_id):
# Also publish this score once, for Eval Score over time Graph
self.experiment_manager.cw_logger.publish_latest_hosting_information(
- self.experiment_id,
- latest_hosted_model_id,
- eval_score
+ self.experiment_id, latest_hosted_model_id, eval_score
)
except Exception as e:
logger.warn("Failed to emit latest training job eval metrics." + str(e))
def get_latest_eval_score_for_model_id(self, model_id):
- model_record = self.model_db_client.get_model_record(
- self.experiment_id,
- model_id
- )
+ model_record = self.model_db_client.get_model_record(self.experiment_id, model_id)
eval_score = "n.a."
if model_record is not None:
eval_keys = model_record["eval_scores"].keys()
@@ -540,36 +516,32 @@ def get_latest_eval_score_for_model_id(self, model_id):
return eval_score
# sort eval score by s3 prefix as joining job is ordered by time
eval_keys = sorted(eval_keys)
- return model_record["eval_scores"][eval_keys[-1]]
- else:
+ return model_record["eval_scores"][eval_keys[-1]]
+ else:
return eval_score
-
+
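# Sorting the eval-score keys lexicographically and taking the last one works
# because the joining-job prefixes embed fixed-width epoch timestamps, so
# string order matches time order. A minimal illustration with hypothetical keys:
eval_scores = {
    "exp-1-join-1612950000/eval": 0.61,
    "exp-1-join-1613030000/eval": 0.68,
}
latest_key = sorted(eval_scores)[-1]  # "exp-1-join-1613030000/eval"
assert eval_scores[latest_key] == 0.68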
def emit_cloudwatch_metrics_for_training_and_hosting(self):
try:
# emit CloudWatch Training metrics
if self.latest_trained_model_id and self.latest_trained_model_eval_score:
self.experiment_manager.cw_logger.publish_latest_training_information(
- self.experiment_id,
- self.latest_trained_model_id,
- self.latest_trained_model_eval_score
+ self.experiment_id, self.latest_trained_model_id, self.latest_trained_model_eval_score
)
else:
- #logger.debug("Train CW Metrics Not Set")
+ # logger.debug("Train CW Metrics Not Set")
pass
        except Exception as e:
logger.debug("Failed to publish CW Metrics for Training State")
logger.debug(e)
- try:
+ try:
# emit CloudWatch Hosting metrics
if self.latest_hosted_model_id and self.latest_hosted_model_eval_score:
self.experiment_manager.cw_logger.publish_latest_hosting_information(
- self.experiment_id,
- self.latest_hosted_model_id,
- self.latest_hosted_model_eval_score
+ self.experiment_id, self.latest_hosted_model_id, self.latest_hosted_model_eval_score
)
else:
- #logger.debug("Host CW Metrics Not Set")
+ # logger.debug("Host CW Metrics Not Set")
pass
        except Exception:
            logger.debug("Failed to publish CW Metrics for Hosting State")
@@ -600,7 +572,8 @@ def sync_experiment_state_with_ddb(self):
next_model_to_train = ModelManager(
model_db_client=self.model_db_client,
experiment_id=self.experiment_id,
- model_id=next_model_to_train_id)
+ model_id=next_model_to_train_id,
+ )
next_model_to_train.update_model_training_state()
time.sleep(1)
self._update_experiment_db_training_workflow_metadata(training_workflow_metadata)
@@ -615,12 +588,17 @@ def sync_experiment_state_with_ddb(self):
self.experiment_manager.next_model_to_evaluate.update_model_evaluation_state()
else:
# only init the ModelManager() if the evaluation job record already exists
- if self.model_db_client.get_model_record(self.experiment_id, \
- next_evaluation_job_id.split('-eval-')[0]) is not None:
+ if (
+ self.model_db_client.get_model_record(
+ self.experiment_id, next_evaluation_job_id.split("-eval-")[0]
+ )
+ is not None
+ ):
next_model_to_evaluate = ModelManager(
model_db_client=self.model_db_client,
experiment_id=self.experiment_id,
- model_id=next_evaluation_job_id.split('-eval-')[0])
+ model_id=next_evaluation_job_id.split("-eval-")[0],
+ )
next_model_to_evaluate.update_model_evaluation_state()
time.sleep(1)
self._update_experiment_db_evaluation_workflow_metadata(evaluation_workflow_metadata)
@@ -643,7 +621,8 @@ def sync_experiment_state_with_ddb(self):
next_join_job = JoinManager(
join_db_client=self.join_db_client,
experiment_id=self.experiment_id,
- join_job_id=next_join_job_id)
+ join_job_id=next_join_job_id,
+ )
next_join_job.update_join_job_state()
time.sleep(1)
self._update_experiment_db_joining_workflow_metadata(joining_workflow_metadata)
@@ -663,10 +642,10 @@ def run(self):
logger.error(e)
logger.warn("Resuming Sync in 10 seconds...")
time.sleep(10)
- time.sleep(.5)
+ time.sleep(0.5)
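# The sync loop above polls roughly twice a second, backs off ten seconds after
# an unhandled error, and (in SageMaker mode) runs as a daemon so it dies with
# the parent process. A minimal sketch of the same shape, with the DynamoDB
# polling stubbed out:
import threading
import time

def run_sync(poll_once):
    while True:
        try:
            poll_once()
        except Exception as e:
            print(f"{e} -- Resuming Sync in 10 seconds...")
            time.sleep(10)
        time.sleep(0.5)

thread = threading.Thread(target=run_sync, args=(lambda: None,), daemon=True)
thread.start()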
-class ExperimentManager():
+class ExperimentManager:
"""
    An experiment entity to manage different components in the continual learning
iteration loops. One experiment will be initiated to solve a single RL problem.
@@ -674,15 +653,16 @@ class ExperimentManager():
entity provides methods/functionalities for model training/evaluation/deployment
and data joining.
"""
-
- def __init__(self,
- config,
- experiment_id,
- training_workflow_metadata={},
- hosting_workflow_metadata={},
- joining_workflow_metadata={},
- evaluation_workflow_metadata={}
- ):
+
+ def __init__(
+ self,
+ config,
+ experiment_id,
+ training_workflow_metadata={},
+ hosting_workflow_metadata={},
+ joining_workflow_metadata={},
+ evaluation_workflow_metadata={},
+ ):
"""Initialize/Reload an experiment entity to manage the workflow
Args:
@@ -696,12 +676,12 @@ def __init__(self,
Return:
            sagemaker_rl.orchestrator.workflow.experiment_manager.ExperimentManager: An ``ExperimentManager`` object
to manage the workflow
- """
+ """
self.boto_session = boto3.Session()
self._region_name = self.boto_session.region_name
self.account = self.boto_session.client("sts").get_caller_identity()["Account"]
if self._region_name is None:
- raise ValueError('Must setup AWS configuration with a valid region')
+ raise ValueError("Must setup AWS configuration with a valid region")
# unique id common across all experiments in the account
self.experiment_id = experiment_id
@@ -709,7 +689,7 @@ def __init__(self,
# load configs
self.config = config
self.image = self.config.get("image", None).replace("{AWS_REGION}", self._region_name)
-
+
self.algor_config = self.config.get("algor", {})
self.local_mode = self.config.get("local_mode", True)
if self.local_mode:
@@ -721,62 +701,55 @@ def __init__(self,
self.soft_deployment = self.config.get("soft_deployment", False)
# load resource config and init shared resourced if not exists
- self.resource_manager = ResourceManager(self.config.get("resource", {}),
- boto_session=self.boto_session)
+ self.resource_manager = ResourceManager(self.config.get("resource", {}), boto_session=self.boto_session)
self.resource_manager.create_shared_resource_if_not_exist()
# init clients
self.exp_db_client = self.resource_manager.exp_db_client
self.model_db_client = self.resource_manager.model_db_client
self.join_db_client = self.resource_manager.join_db_client
- self.cw_logger = CloudWatchLogger(
- self.boto_session.client("cloudwatch"),
- self._region_name
- )
+ self.cw_logger = CloudWatchLogger(self.boto_session.client("cloudwatch"), self._region_name)
self.sagemaker_client = self.sagemaker_session.sagemaker_client
-
+
# init s3 client for rewards upload
- self.s3_client = self.boto_session.client('s3')
+ self.s3_client = self.boto_session.client("s3")
- # create a local JoinJobRecord object.
+ # create a local JoinJobRecord object.
self.experiment_record = ExperimentRecord(
experiment_id,
training_workflow_metadata,
hosting_workflow_metadata,
joining_workflow_metadata,
- evaluation_workflow_metadata
+ evaluation_workflow_metadata,
)
self.next_model_to_train = None
self.next_join_job = None
self.next_model_to_evaluate = None
- # Try to save new ExperimentRecord to ExperimentDb. If it throws
+ # Try to save new ExperimentRecord to ExperimentDb. If it throws
# RecordAlreadyExistsException, re-read the ExperimentRecord from ExperimentDb,
# and use it as initial state
try:
- self.exp_db_client.create_new_experiment_record(
- self.experiment_record.to_ddb_record()
- )
+ self.exp_db_client.create_new_experiment_record(self.experiment_record.to_ddb_record())
except RecordAlreadyExistsException:
- logger.warn(f"Experiment with name {self.experiment_id} already exists. "
- "Reusing current state from ExperimentDb.")
- experiment_record = self.exp_db_client.get_experiment_record(
- experiment_id
+ logger.warn(
+ f"Experiment with name {self.experiment_id} already exists. "
+ "Reusing current state from ExperimentDb."
)
+ experiment_record = self.exp_db_client.get_experiment_record(experiment_id)
self.experiment_record = ExperimentRecord.load_from_ddb_record(experiment_record)
except Exception as e:
logger.error("Unhandled Exception! " + str(e))
raise UnhandledWorkflowException("Something went wrong while creating a new experiment")
try:
- self.cw_logger.create_cloudwatch_dashboard_from_experiment_id(
- self.experiment_id
- )
+ self.cw_logger.create_cloudwatch_dashboard_from_experiment_id(self.experiment_id)
except Exception as e:
logger.error("Unable to create CloudWatch Dashboard." + str(e))
- logger.error("To see metrics on CloudWatch, run bandit_experiment."
- "cw_logger.create_cloudwatch_dashboard_from_experiment_id function again.")
-
+ logger.error(
+ "To see metrics on CloudWatch, run bandit_experiment."
+ "cw_logger.create_cloudwatch_dashboard_from_experiment_id function again."
+ )
# start a daemon thread to sync ExperimentDb states to local states
# the daemon thread will keep running till the session ends
@@ -784,8 +757,8 @@ def __init__(self,
# Run the thread in SageMaker mode only
if not self.local_mode:
- self.sync_thread.setDaemon(True)
- self.sync_thread.start()
+ self.sync_thread.setDaemon(True)
+ self.sync_thread.start()
def _sync_experiment_state_with_ddb(self):
"""
@@ -796,51 +769,51 @@ def _sync_experiment_state_with_ddb(self):
self.sync_thread.sync_experiment_state_with_ddb()
def _update_instance_type_for_local_mode(self):
- """Update the instance type if running in 'local' mode
- """
+ """Update the instance type if running in 'local' mode"""
self.config["resource"]["private_resource"]["hosting_fleet"]["instance_type"] = "local"
self.config["resource"]["private_resource"]["training_fleet"]["instance_type"] = "local"
self.config["resource"]["private_resource"]["evaluation_fleet"]["instance_type"] = "local"
def _jsonify(self):
- """Return a jsonify dict with metadata of the 'Experiment' object
- """
+ """Return a jsonify dict with metadata of the 'Experiment' object"""
return self.experiment_record.to_ddb_record()
def _get_prefix_and_relative_path(self, path_list):
"""Return shared prefix and relative paths given a list of paths
-
+
Args:
path_list (list): A list of string representing S3 paths
-
+
Returns:
(str, list): Return shared prefix and a list of relative paths
"""
# example of path: s3://custom-bucket/exp-1/exp-1-join-id-time-stamp/train
# use s3 bucket as prefix
# allow data from different experiments but in same account
- parts = path_list[0].split('/')
- shared_prefix = '/'.join(parts[0:3]) # s3://custom-bucket
+ parts = path_list[0].split("/")
+ shared_prefix = "/".join(parts[0:3]) # s3://custom-bucket
key_path_list = []
for path in path_list:
- parts = path.split('/')
- prefix = '/'.join(parts[0:3])
+ parts = path.split("/")
+ prefix = "/".join(parts[0:3])
if prefix != shared_prefix:
- logger.error(f" Prefix `{prefix}` is different from the shared prefix '{shared_prefix}'. "
- "Data in the list are not coming from same s3 bucket.")
- object_path = '/'.join(parts[3:])
+ logger.error(
+                f"Prefix `{prefix}` is different from the shared prefix '{shared_prefix}'. "
+                "Data in the list are not coming from the same S3 bucket."
+ )
+ object_path = "/".join(parts[3:])
key_path_list.append(object_path)
return shared_prefix, key_path_list
def _write_manifest_to_s3(self, manifest_file):
"""Upload manifest file to S3 bucket
-
+
Args:
manifest_file (dict): A json blob that contains manifest shared prefix
and list of relative paths
-
+
Returns:
str: S3 data path for the uploaded manifest file
"""
@@ -852,31 +825,27 @@ def _write_manifest_to_s3(self, manifest_file):
manifest_bucket_name = "sagemaker-{}-{}".format(region, account)
        timestamp = str(int(time.time()))
        manifest_s3_file_key = f"{self.experiment_id}/manifest_files/manifest-{timestamp}"
- body = b''
- body += str(json.dumps(manifest_file, sort_keys=True, indent=4)).encode('utf_8')
+ body = b""
+ body += str(json.dumps(manifest_file, sort_keys=True, indent=4)).encode("utf_8")
try:
- s3_client.put_object(Body=body,
- Bucket=manifest_bucket_name,
- Key=manifest_s3_file_key)
+ s3_client.put_object(Body=body, Bucket=manifest_bucket_name, Key=manifest_s3_file_key)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError("Failed to upload manifest data with error {}: {}".format(
- error_code, message
- ))
-
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError("Failed to upload manifest data with error {}: {}".format(error_code, message))
+
manifest_file_path = f"s3://{manifest_bucket_name}/{manifest_s3_file_key}"
logger.info(f"Successfully upload manifest file to s3 bucket path `{manifest_file_path}'")
return manifest_file_path
def _generate_manifest(self, input_data_path_list):
"""Generate manifest file and upload it to S3 bucket
-
+
Args:
input_data_path_list (list): A list of strings representing
input S3 data paths
-
+
Returns:
str: S3 data path for the uploaded manifest file
"""
@@ -890,7 +859,7 @@ def _generate_manifest(self, input_data_path_list):
manifest = []
shared_prefix, key_path_list = self._get_prefix_and_relative_path(input_data_path_list)
logger.info(f"Generating manifest file with shared prefix '{shared_prefix}/' ...")
- manifest.append({'prefix': shared_prefix + '/'})
+ manifest.append({"prefix": shared_prefix + "/"})
for relative_key_path in key_path_list:
manifest.append(relative_key_path)
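# The structure being assembled here is the S3 ManifestFile format SageMaker
# input channels accept: the first entry carries the shared prefix, every later
# entry is a key relative to it. A minimal illustration:
paths = [
    "s3://custom-bucket/exp-1/join-1/train",
    "s3://custom-bucket/exp-1/join-2/train",
]
shared_prefix = "/".join(paths[0].split("/")[0:3])  # "s3://custom-bucket"
manifest = [{"prefix": shared_prefix + "/"}]
manifest += ["/".join(p.split("/")[3:]) for p in paths]
# [{"prefix": "s3://custom-bucket/"}, "exp-1/join-1/train", "exp-1/join-2/train"]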
@@ -902,10 +871,13 @@ def last_trained_model_id(self):
if self.experiment_record._last_trained_model_id is None:
logger.warning("No model has been trained. Please check later.")
- if self.experiment_record._training_state is not None and \
- self.experiment_record._training_state.endswith("ING"):
- logger.warning(f"A training job with model id '{self.experiment_record._next_model_to_train_id}' "
- f"is running in state of '{self.experiment_record._training_state}'")
+ if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith(
+ "ING"
+ ):
+ logger.warning(
+ f"A training job with model id '{self.experiment_record._next_model_to_train_id}' "
+ f"is running in state of '{self.experiment_record._training_state}'"
+ )
return self.experiment_record._last_trained_model_id
@@ -914,10 +886,13 @@ def last_evaluation_job_id(self):
if self.experiment_record._last_evaluation_job_id is None:
logger.warning("No model has been evaluated. Please check later.")
- if self.experiment_record._evaluation_state is not None \
- and self.experiment_record._evaluation_state.endswith("ING"):
- logger.warning(f"A evaluation job with job id '{self.experiment_record._next_evaluation_job_id}' "
- f"is running in state of '{self.experiment_record._evaluation_state}'")
+ if self.experiment_record._evaluation_state is not None and self.experiment_record._evaluation_state.endswith(
+ "ING"
+ ):
+ logger.warning(
+                f"An evaluation job with job id '{self.experiment_record._next_evaluation_job_id}' "
+ f"is running in state of '{self.experiment_record._evaluation_state}'"
+ )
return self.experiment_record._last_evaluation_job_id
@@ -926,10 +901,11 @@ def last_hosted_model_id(self):
if self.experiment_record._last_hosted_model_id is None:
logger.warning("No model has been hosted. Please deploy a model and check later.")
- if self.experiment_record._hosting_state is not None \
- and self.experiment_record._hosting_state.endswith("ING"):
- logger.warning(f"A deployment with model id '{self.experiment_record._next_model_to_host_id}' "
- f"is running in state of '{self.experiment_record._hosting_state}'")
+ if self.experiment_record._hosting_state is not None and self.experiment_record._hosting_state.endswith("ING"):
+ logger.warning(
+ f"A deployment with model id '{self.experiment_record._next_model_to_host_id}' "
+ f"is running in state of '{self.experiment_record._hosting_state}'"
+ )
return self.experiment_record._last_hosted_model_id
@@ -938,15 +914,16 @@ def last_joined_job_id(self):
if self.experiment_record._last_joined_job_id is None:
logger.warning("No joining job has been completed. Please check later.")
- if self.experiment_record._joining_state is not None \
- and self.experiment_record._joining_state.endswith("ING"):
- logger.warning(f"A joining job with job id '{self.experiment_record._next_join_job_id}' "
- f"is running in state of '{self.experiment_record._joining_state}'")
+ if self.experiment_record._joining_state is not None and self.experiment_record._joining_state.endswith("ING"):
+ logger.warning(
+ f"A joining job with job id '{self.experiment_record._next_join_job_id}' "
+ f"is running in state of '{self.experiment_record._joining_state}'"
+ )
return self.experiment_record._last_joined_job_id
@property
- def last_joined_job_train_data(self):
+ def last_joined_job_train_data(self):
record = self.join_db_client.get_join_job_record(self.experiment_id, self.last_joined_job_id)
return record["output_joined_train_data_s3_path"]
@@ -957,31 +934,32 @@ def last_joined_job_eval_data(self):
def _get_hosting_environ_vars(self, model_id):
"""Return hosting endpoint environment variables
-
+
Args:
model_id (str): A unique string representing which model
to be hosted by the endpoint
-
+
Returns:
dict: A dictionary containing environment variables of hosting endpoint
"""
- environ_vars = {"AWS_DEFAULT_REGION": self._region_name,
- "EXPERIMENT_ID": self.experiment_id,
- "EXP_METADATA_DYNAMO_TABLE": self.resource_manager.exp_db_table_name,
- "MODEL_METADATA_DYNAMO_TABLE": self.resource_manager.model_db_table_name,
- "MODEL_ID": model_id,
- "AWS_REGION": self._region_name,
- "FIREHOSE_STREAM": None,
- # Set to true if inference logging is required.
- "LOG_INFERENCE_DATA": str(not self.local_mode).lower(),
- # For efficient soft model updates.
- "MODEL_METADATA_POLLING": str(self.soft_deployment).lower()
- }
+ environ_vars = {
+ "AWS_DEFAULT_REGION": self._region_name,
+ "EXPERIMENT_ID": self.experiment_id,
+ "EXP_METADATA_DYNAMO_TABLE": self.resource_manager.exp_db_table_name,
+ "MODEL_METADATA_DYNAMO_TABLE": self.resource_manager.model_db_table_name,
+ "MODEL_ID": model_id,
+ "AWS_REGION": self._region_name,
+ "FIREHOSE_STREAM": None,
+ # Set to true if inference logging is required.
+ "LOG_INFERENCE_DATA": str(not self.local_mode).lower(),
+ # For efficient soft model updates.
+ "MODEL_METADATA_POLLING": str(self.soft_deployment).lower(),
+ }
return environ_vars
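# Container environment variables must be strings, so the booleans above are
# serialized as "true"/"false" before being attached to the hosted model.
# A minimal illustration of the two derived flags:
local_mode = False
soft_deployment = True
env = {
    "LOG_INFERENCE_DATA": str(not local_mode).lower(),  # "true": log inference data
    "MODEL_METADATA_POLLING": str(soft_deployment).lower(),  # "true": poll for soft updates
}
assert env["LOG_INFERENCE_DATA"] == "true"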
def _setup_hosting_endpoint(self, model_id, wait, **kwargs):
"""Initiate a hosting endpoint deployment
-
+
Args:
model_id (str): A unique string representing which model to deploy
            wait (bool): Whether to wait until the deployment finishes
@@ -1004,24 +982,29 @@ def _setup_hosting_endpoint(self, model_id, wait, **kwargs):
name=model_id,
sagemaker_session=self.sagemaker_session,
env=environ_vars,
- **kwargs)
+ **kwargs,
+ )
hosting_instance_count = self.resource_manager.hosting_fleet_config.get("instance_count", 1)
hosting_instance_type = self.resource_manager.hosting_fleet_config.get("instance_type", "local")
try:
- sagemaker_model.deploy(initial_instance_count=hosting_instance_count,
- instance_type=hosting_instance_type,
- endpoint_name=self.experiment_id,
- wait=wait)
+ sagemaker_model.deploy(
+ initial_instance_count=hosting_instance_count,
+ instance_type=hosting_instance_type,
+ endpoint_name=self.experiment_id,
+ wait=wait,
+ )
except Exception as e:
logger.error(f"Failed to deploy experiment {self.experiment_id}: " + str(e))
- raise UnhandledWorkflowException( "Some error occurred while setting up hosting endpoint. "
- "Please check SageMaker console for more information.")
+ raise UnhandledWorkflowException(
+ "Some error occurred while setting up hosting endpoint. "
+ "Please check SageMaker console for more information."
+ )
def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True):
"""Update the model hosted in an existing endpoint
-
+
Args:
soft_deploy (bool): Whether to update the model hosted by the
endpoint with soft deployment support
@@ -1029,12 +1012,8 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True):
to deploy/update
"""
# update 'next_model_to_host_id' and 'hosting_state'
- self.exp_db_client.update_experiment_next_model_to_host_id(
- self.experiment_id, model_id
- )
- self.exp_db_client.update_experiment_hosting_state(
- self.experiment_id, HostingState.PENDING
- )
+ self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, model_id)
+ self.exp_db_client.update_experiment_hosting_state(self.experiment_id, HostingState.PENDING)
# soft deployment will happen once the 'next_model_host_id' is persisted into ExperimentDB
if not soft_deploy:
update_endpoint = True
@@ -1054,8 +1033,10 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True):
if closed:
logger.info("Closed docker container[s] that was already running (maybe from previous job)")
else:
- logger.exception("Failed to close a docker container that was already running (maybe from "
- "previous job). Please close it manually and retry.")
+ logger.exception(
+ "Failed to close a docker container that was already running (maybe from "
+ "previous job). Please close it manually and retry."
+ )
model_record = self.model_db_client.get_model_record(self.experiment_id, model_id)
sagemaker_model = sagemaker.model.Model(
@@ -1064,26 +1045,29 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True):
role=self.resource_manager.iam_role_arn,
name=model_id,
sagemaker_session=self.sagemaker_session,
- env=environ_vars)
+ env=environ_vars,
+ )
hosting_instance_count = self.resource_manager.hosting_fleet_config.get("instance_count", 1)
hosting_instance_type = self.resource_manager.hosting_fleet_config.get("instance_type", "local")
try:
- sagemaker_model.deploy(initial_instance_count=hosting_instance_count,
- instance_type=hosting_instance_type,
- endpoint_name=self.experiment_id,
- update_endpoint=update_endpoint,
- wait=wait)
+ sagemaker_model.deploy(
+ initial_instance_count=hosting_instance_count,
+ instance_type=hosting_instance_type,
+ endpoint_name=self.experiment_id,
+ update_endpoint=update_endpoint,
+ wait=wait,
+ )
except Exception as e:
logger.error(e)
pass
def _check_if_model_ready(self, model_id):
"""Check if the model exists and already trained
-
+
Args:
model_id (str): A unique string representing which model
to check
-
+
Returns:
bool: Whether the model exists and is already trained
"""
@@ -1093,24 +1077,23 @@ def _check_if_model_ready(self, model_id):
return False
# check if the model training is completed successfully to consume by next step
- model_exist = self.model_db_client.check_model_record_exists(
- self.experiment_id, model_id
- )
+ model_exist = self.model_db_client.check_model_record_exists(self.experiment_id, model_id)
if not model_exist:
- logger.error(f"Model with mode_id '{model_id}' was not found in model table. "
- "Please create a model first")
+ logger.error(
+                f"Model with model_id '{model_id}' was not found in model table. Please create a model first"
+ )
return False
# 'model_id' found in table, check if the 'model_id' is trained
model_to_deploy = ModelManager(
- model_db_client=self.model_db_client,
- experiment_id=self.experiment_id,
- model_id=model_id
- )
+ model_db_client=self.model_db_client, experiment_id=self.experiment_id, model_id=model_id
+ )
if not model_to_deploy.model_record.is_train_completed():
- logger.warning(f"Model '{model_id}' is in status of "
- f"{model_to_deploy.model_record._train_state}, Please check later.")
+ logger.warning(
+ f"Model '{model_id}' is in status of "
+ f"{model_to_deploy.model_record._train_state}, Please check later."
+ )
return False
return True
@@ -1118,21 +1101,23 @@ def _check_if_model_ready(self, model_id):
def deploy_model(self, model_id, wait=True, **kwargs):
"""Deploy a new model by creating a new hosting endpoint
or update the model hosted by an existing endpoint
-
+
Args:
model_id (str): A unique string representing which model
to deploy/update
            wait (bool): Whether to wait until the deployment finishes
"""
# TODO: add validation/instructions if multiple deployment
- # request happened in th same experiment
-
+        # request happened in the same experiment
+
# Sync experiment state if required
self._sync_experiment_state_with_ddb()
# check if 'model_id' is already hosted
- if self.experiment_record._last_hosted_model_id == model_id \
- and self.experiment_record._hosting_state == HostingState.DEPLOYED:
+ if (
+ self.experiment_record._last_hosted_model_id == model_id
+ and self.experiment_record._hosting_state == HostingState.DEPLOYED
+ ):
logger.info(f"Model {model_id} is already being hosted. No deployment needed.")
return
@@ -1152,39 +1137,39 @@ def deploy_model(self, model_id, wait=True, **kwargs):
if closed:
logger.info("Closed docker container[s] that was already running (maybe from previous job).")
else:
- logger.exception("Failed to close a docker container that was already running (maybe from "
- "previous job). Please close it manually and retry.")
+ logger.exception(
+ "Failed to close a docker container that was already running (maybe from "
+ "previous job). Please close it manually and retry."
+ )
else:
logger.info("No hosting endpoint found, creating a new hosting endpoint.")
# update 'next_model_to_host_id' and 'hosting_state'
- self.exp_db_client.update_experiment_next_model_to_host_id(
- self.experiment_id, model_id
- )
- self.exp_db_client.update_experiment_hosting_state(
- self.experiment_id, HostingState.PENDING
- )
-
+ self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, model_id)
+ self.exp_db_client.update_experiment_hosting_state(self.experiment_id, HostingState.PENDING)
+
# starting hosting endpoint
try:
self._setup_hosting_endpoint(model_id, wait=wait, **kwargs)
except Exception as e:
logger.error(e)
pass
-
+
else:
if self.experiment_record._hosting_state.endswith("ING"):
logger.warning("Some deployment request is in progress, canceled this one")
return
elif self.experiment_record._hosting_state.endswith("ED"):
self._update_model_in_endpoint(self.soft_deployment, model_id, wait=wait)
-
+
# wait until exp ddb table updated
if self.local_mode or wait:
- deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \
- and self.experiment_record._last_hosted_model_id == model_id \
- and self.experiment_record._next_model_to_host_id is None
-
+ deployed_state = (
+ self.experiment_record._hosting_state == HostingState.DEPLOYED
+ and self.experiment_record._last_hosted_model_id == model_id
+ and self.experiment_record._next_model_to_host_id is None
+ )
+
num_retries = 0
num_retries_blue_green_deployment = 0
max_retries = 100
@@ -1193,62 +1178,75 @@ def deploy_model(self, model_id, wait=True, **kwargs):
# local mode is fast, 'num_retries' increases exponentially
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table hosting status to be updated...")
-
+
if self.soft_deployment:
time.sleep(10 * max_retries)
- deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \
- and self.experiment_record._last_hosted_model_id == model_id \
- and self.experiment_record._next_model_to_host_id is None
+ deployed_state = (
+ self.experiment_record._hosting_state == HostingState.DEPLOYED
+ and self.experiment_record._last_hosted_model_id == model_id
+ and self.experiment_record._next_model_to_host_id is None
+ )
num_retries += 1
- if num_retries >= max_retries and self.local_mode:
- raise UnhandledWorkflowException(f"Deployment with model "
- f"'{self.experiment_record._next_model_to_host_id}' was in "
- f"state of '{self.experiment_record._hosting_state}'. Failed "
- "to sync table status.")
-
+ if num_retries >= max_retries and self.local_mode:
+ raise UnhandledWorkflowException(
+ f"Deployment with model "
+ f"'{self.experiment_record._next_model_to_host_id}' was in "
+ f"state of '{self.experiment_record._hosting_state}'. Failed "
+ "to sync table status."
+ )
+
else:
# blue-green deployment takes ~8 min, retry every 30 seconds
time.sleep(30)
- deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \
- and self.experiment_record._last_hosted_model_id == model_id \
- and self.experiment_record._next_model_to_host_id is None
+ deployed_state = (
+ self.experiment_record._hosting_state == HostingState.DEPLOYED
+ and self.experiment_record._last_hosted_model_id == model_id
+ and self.experiment_record._next_model_to_host_id is None
+ )
num_retries_blue_green_deployment += 1
-
- if num_retries_blue_green_deployment%2 == 0:
- logger.debug(f"Waited {int(num_retries_blue_green_deployment / 2)} "
- f"minutes for blue-green deployment...")
-
- if num_retries_blue_green_deployment >=30: # restrict maximum wait time to 15min
- raise UnhandledWorkflowException(f"Deployment with model "
- f"'{self.experiment_record._next_model_to_host_id}' was in "
- f"state of '{self.experiment_record._hosting_state}'. Failed "
- "to sync table status.")
-
+
+ if num_retries_blue_green_deployment % 2 == 0:
+ logger.debug(
+ f"Waited {int(num_retries_blue_green_deployment / 2)} "
+ f"minutes for blue-green deployment..."
+ )
+
+ if num_retries_blue_green_deployment >= 30: # restrict maximum wait time to 15min
+ raise UnhandledWorkflowException(
+ f"Deployment with model "
+ f"'{self.experiment_record._next_model_to_host_id}' was in "
+ f"state of '{self.experiment_record._hosting_state}'. Failed "
+ "to sync table status."
+ )
+
if self.experiment_record._hosting_state == HostingState.FAILED:
- raise SageMakerHostingException("Deployment with model "
- f"'{self.experiment_record._next_model_to_host_id}' ended "
- f"with state '{self.experiment_record._hosting_state}'. "
- "Please check Sagemaker log for more information.")
-
+ raise SageMakerHostingException(
+ "Deployment with model "
+ f"'{self.experiment_record._next_model_to_host_id}' ended "
+ f"with state '{self.experiment_record._hosting_state}'. "
+ "Please check Sagemaker log for more information."
+ )
+
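# The blue-green branch above polls every 30 seconds, logs once per minute
# (every second retry), and gives up after 30 retries, i.e. 15 minutes.
# A minimal sketch of that cadence with the endpoint check stubbed out:
import time

def wait_for_deployed(check_deployed, poll_seconds=30, max_retries=30):
    for attempt in range(1, max_retries + 1):
        if check_deployed():
            return True
        time.sleep(poll_seconds)
        if attempt % 2 == 0:
            print(f"Waited {attempt // 2} minutes for blue-green deployment...")
    raise TimeoutError("Deployment did not reach DEPLOYED within the retry budget")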
@property
def predictor(self):
if self.experiment_record._hosting_endpoint:
- return Predictor(endpoint_name=self.experiment_id,
- sagemaker_session=self.sagemaker_session)
+ return Predictor(endpoint_name=self.experiment_id, sagemaker_session=self.sagemaker_session)
else:
- logger.warning("Hosting endpoint is not ready yet. A deployment "
- f"with model id '{self.experiment_record._next_model_to_host_id}' is in state of "
- f"'{self.experiment_record._hosting_state}'. Please check later.")
+ logger.warning(
+ "Hosting endpoint is not ready yet. A deployment "
+ f"with model id '{self.experiment_record._next_model_to_host_id}' is in state of "
+ f"'{self.experiment_record._hosting_state}'. Please check later."
+ )
return None
def ingest_rewards(self, rewards_buffer):
"""Upload rewards data in a rewards buffer to S3 bucket
-
+
Args:
rewards_buffer (list): A list of json blobs containing
rewards data
-
+
Returns:
str: S3 data prefix path that contains the rewards file
"""
@@ -1256,37 +1254,33 @@ def ingest_rewards(self, rewards_buffer):
rewards_bucket_name = self.resource_manager._create_s3_bucket_if_not_exist("sagemaker")
        timestamp = str(int(time.time()))
        rewards_s3_file_key = f"{self.experiment_id}/rewards_data/{self.experiment_id}-{timestamp}/rewards-{timestamp}"
- body = b''
+ body = b""
for reward in rewards_buffer:
- body += str(json.dumps(reward) + '\n').encode('utf_8')
+ body += str(json.dumps(reward) + "\n").encode("utf_8")
try:
- self.s3_client.put_object(Body=body,
- Bucket=rewards_bucket_name,
- Key=rewards_s3_file_key)
+ self.s3_client.put_object(Body=body, Bucket=rewards_bucket_name, Key=rewards_s3_file_key)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError("Failed to upload rewards data with error {}: {}".format(
- error_code, message
- ))
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError("Failed to upload rewards data with error {}: {}".format(error_code, message))
rewards_file_path = f"s3://{rewards_bucket_name}/{rewards_s3_file_key}"
logger.info("Waiting for reward data to be uploaded.")
- waiter = self.s3_client.get_waiter('object_exists')
+ waiter = self.s3_client.get_waiter("object_exists")
waiter.wait(Bucket=rewards_bucket_name, Key=rewards_s3_file_key)
logger.info(f"Successfully upload reward files to s3 bucket path {rewards_file_path}")
- reward_s3_prefix = '/'.join(rewards_file_path.split('/')[:-1])
+ reward_s3_prefix = "/".join(rewards_file_path.split("/")[:-1])
return reward_s3_prefix
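# Rewards are shipped as JSON Lines: one blob per line, concatenated into a
# single put_object call and then confirmed with the object_exists waiter.
# A minimal sketch, assuming a hypothetical bucket and key:
import json
import boto3

s3 = boto3.client("s3")
rewards = [{"event_id": "e1", "reward": 1.0}, {"event_id": "e2", "reward": 0.0}]
body = b"".join((json.dumps(r) + "\n").encode("utf_8") for r in rewards)
s3.put_object(Body=body, Bucket="my-bucket", Key="exp-1/rewards/rewards-0")
s3.get_waiter("object_exists").wait(Bucket="my-bucket", Key="exp-1/rewards/rewards-0")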
def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
"""Upload joined data in joined data buffer to S3 bucket
-
+
Args:
joined_data_buffer (list): A list of json blobs containing
joined data
@@ -1297,28 +1291,27 @@ def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
# update next_join_job_id and joining state
next_join_job_id = JoinManager.name_next_join_job(experiment_id=self.experiment_id)
- self.exp_db_client.update_experiment_next_join_job_id(
- self.experiment_id,
- next_join_job_id)
- self.exp_db_client.update_experiment_joining_state(
- self.experiment_id,
- JoiningState.PENDING)
-
- self.next_join_job = JoinManager(join_db_client=self.join_db_client,
- experiment_id=self.experiment_id,
- join_job_id=next_join_job_id,
- input_obs_data_s3_path="local-join-does-not-apply",
- input_reward_data_s3_path="local-join-does-not-apply",
- boto_session=self.boto_session)
-
+ self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, next_join_job_id)
+ self.exp_db_client.update_experiment_joining_state(self.experiment_id, JoiningState.PENDING)
+
+ self.next_join_job = JoinManager(
+ join_db_client=self.join_db_client,
+ experiment_id=self.experiment_id,
+ join_job_id=next_join_job_id,
+ input_obs_data_s3_path="local-join-does-not-apply",
+ input_reward_data_s3_path="local-join-does-not-apply",
+ boto_session=self.boto_session,
+ )
+
logger.info("Started dummy local joining job...")
- self.next_join_job.start_dummy_join(joined_data_buffer=joined_data_buffer,
- ratio=ratio)
+ self.next_join_job.start_dummy_join(joined_data_buffer=joined_data_buffer, ratio=ratio)
# this method can be invoked either in local/SM mode
- succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
- and self.experiment_record._last_joined_job_id == next_join_job_id \
- and self.experiment_record._next_join_job_id is None
+ succeeded_state = (
+ self.experiment_record._joining_state == JoiningState.SUCCEEDED
+ and self.experiment_record._last_joined_job_id == next_join_job_id
+ and self.experiment_record._next_join_job_id is None
+ )
num_retries = 0
max_retries = 100
while not succeeded_state:
@@ -1326,23 +1319,31 @@ def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table joining status to be updated...")
time.sleep(10 * max_retries)
- succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
- and self.experiment_record._last_joined_job_id == next_join_job_id \
- and self.experiment_record._next_join_job_id is None
+ succeeded_state = (
+ self.experiment_record._joining_state == JoiningState.SUCCEEDED
+ and self.experiment_record._last_joined_job_id == next_join_job_id
+ and self.experiment_record._next_join_job_id is None
+ )
num_retries += 1
if num_retries >= max_retries:
- raise UnhandledWorkflowException(f"Joining job '{self.experiment_record._next_join_job_id}' "
- f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states.")
- if self.experiment_record._joining_state == JoiningState.FAILED or \
- self.experiment_record._joining_state == JoiningState.CANCELLED:
- raise WorkflowJoiningJobException(f"Joining job '{self.experiment_record._next_join_job_id}' "
- f"ended with state '{self.experiment_record._joining_state}'. Please check if provided "
- "joined_data_buffer was in correct data format.")
-
+ raise UnhandledWorkflowException(
+ f"Joining job '{self.experiment_record._next_join_job_id}' "
+ f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states."
+ )
+ if (
+ self.experiment_record._joining_state == JoiningState.FAILED
+ or self.experiment_record._joining_state == JoiningState.CANCELLED
+ ):
+ raise WorkflowJoiningJobException(
+ f"Joining job '{self.experiment_record._next_join_job_id}' "
+ f"ended with state '{self.experiment_record._joining_state}'. Please check if provided "
+ "joined_data_buffer was in correct data format."
+ )
+
def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True):
"""Start a joining job given rewards data path and observation
data time window
-
+
Args:
rewards_s3_path (str): S3 data path containing the rewards data
obs_time_window (int): Define a time window of past X hours to
@@ -1355,25 +1356,24 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True):
self._sync_experiment_state_with_ddb()
if obs_time_window is None:
- logger.warning(f"Start a join job to join reward data "
- f"under '{rewards_s3_path}' with all the observation data")
+ logger.warning(
+                f"Start a join job to join reward data under '{rewards_s3_path}' with all the observation data"
+ )
obs_end_time = None
obs_start_time = None
else:
- logger.info(f"Start a join job to join reward data "
- f"under '{rewards_s3_path}' with observation "
- f"data in the past {obs_time_window} hours")
+ logger.info(
+ f"Start a join job to join reward data "
+ f"under '{rewards_s3_path}' with observation "
+ f"data in the past {obs_time_window} hours"
+ )
obs_end_time = datetime.utcnow()
obs_start_time = obs_end_time - timedelta(hours=obs_time_window)
# update next_join_job_id and joining state
next_join_job_id = JoinManager.name_next_join_job(experiment_id=self.experiment_id)
- self.exp_db_client.update_experiment_next_join_job_id(
- self.experiment_id,
- next_join_job_id)
- self.exp_db_client.update_experiment_joining_state(
- self.experiment_id,
- JoiningState.PENDING)
+ self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, next_join_job_id)
+ self.exp_db_client.update_experiment_joining_state(self.experiment_id, JoiningState.PENDING)
input_obs_data_s3_path = f"s3://{self.resource_manager.firehose_bucket}/{self.experiment_id}"
input_obs_data_s3_path = f"{input_obs_data_s3_path}/inference_data"
@@ -1381,14 +1381,16 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True):
logger.info("Creating resource for joining job...")
try:
- self.next_join_job = JoinManager(join_db_client=self.join_db_client,
- experiment_id=self.experiment_id,
- join_job_id=next_join_job_id,
- input_obs_data_s3_path=input_obs_data_s3_path,
- obs_start_time=obs_start_time,
- obs_end_time=obs_end_time,
- input_reward_data_s3_path=rewards_s3_path,
- boto_session=self.boto_session)
+ self.next_join_job = JoinManager(
+ join_db_client=self.join_db_client,
+ experiment_id=self.experiment_id,
+ join_job_id=next_join_job_id,
+ input_obs_data_s3_path=input_obs_data_s3_path,
+ obs_start_time=obs_start_time,
+ obs_end_time=obs_end_time,
+ input_reward_data_s3_path=rewards_s3_path,
+ boto_session=self.boto_session,
+ )
logger.info("Started joining job...")
self.next_join_job.start_join(ratio=ratio, wait=wait)
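# The observation window is computed backwards from "now" in UTC: a window of
# N hours joins rewards against observations stamped within [now - N, now],
# while obs_time_window=None joins against everything. A minimal illustration:
from datetime import datetime, timedelta

obs_time_window = 12  # hours
obs_end_time = datetime.utcnow()
obs_start_time = obs_end_time - timedelta(hours=obs_time_window)
assert obs_start_time < obs_end_time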
@@ -1398,35 +1400,45 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True):
# wait until exp ddb table updated
if self.local_mode or wait:
- succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
- and self.experiment_record._last_joined_job_id == next_join_job_id \
- and self.experiment_record._next_join_job_id is None
- num_retries = 0
+ succeeded_state = (
+ self.experiment_record._joining_state == JoiningState.SUCCEEDED
+ and self.experiment_record._last_joined_job_id == next_join_job_id
+ and self.experiment_record._next_join_job_id is None
+ )
+ num_retries = 0
max_retries = 100
-
+
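+            # poll the experiment table with a linearly growing sleep until the join reaches a terminal state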
while not succeeded_state:
# Sync experiment state if required
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table joining status to be updated...")
time.sleep(10 * num_retries)
- succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
- and self.experiment_record._last_joined_job_id == next_join_job_id \
- and self.experiment_record._next_join_job_id is None
+ succeeded_state = (
+ self.experiment_record._joining_state == JoiningState.SUCCEEDED
+ and self.experiment_record._last_joined_job_id == next_join_job_id
+ and self.experiment_record._next_join_job_id is None
+ )
num_retries += 1
if num_retries > max_retries:
- raise UnhandledWorkflowException(f"Joining job '{self.experiment_record._next_join_job_id}' "
- f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states.")
+ raise UnhandledWorkflowException(
+ f"Joining job '{self.experiment_record._next_join_job_id}' "
+ f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states."
+ )
- if self.experiment_record._joining_state == JoiningState.FAILED or \
- self.experiment_record._joining_state == JoiningState.CANCELLED:
- raise WorkflowJoiningJobException(f"Joining job '{self.experiment_record._next_join_job_id}' "
- f"ended with state '{self.experiment_record._joining_state}'. Please check Athena queries logs "
- "for more information.")
+ if (
+ self.experiment_record._joining_state == JoiningState.FAILED
+ or self.experiment_record._joining_state == JoiningState.CANCELLED
+ ):
+ raise WorkflowJoiningJobException(
+ f"Joining job '{self.experiment_record._next_join_job_id}' "
+                        f"ended with state '{self.experiment_record._joining_state}'. Please check the Athena query logs "
+ "for more information."
+ )
def initialize_first_model(self, wait=True, input_data_s3_prefix=None):
"""
Initializes the first Model training for an Experiment
-
+
Args:
wait (bool): Whether to wait until the training job finishes
input_data_s3_prefix (str): S3 data path containing data
@@ -1437,25 +1449,23 @@ def initialize_first_model(self, wait=True, input_data_s3_prefix=None):
# experiment only allow one training job at a time,
# validate no other training request is in progress
- if self.experiment_record._training_state is not None \
- and self.experiment_record._training_state.endswith("ING"):
- logger.error(f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
- f"was in the state of '{self.experiment_record._training_state}'. "
- "Wait until the training job finished or canceled the request.")
+ if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith(
+ "ING"
+ ):
+ logger.error(
+ f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
+ f"was in the state of '{self.experiment_record._training_state}'. "
+                "Wait until the training job finishes or cancel the request."
+ )
raise InvalidUsageException("Please wait for old Training Job to Complete before requesting a new one!")
else:
# update next_model_to_train_id and training state
next_model_to_train_id = ModelManager.name_next_model(experiment_id=self.experiment_id)
logger.info(f"Next Model name would be {next_model_to_train_id}")
- self.exp_db_client.update_experiment_next_model_to_train_id(
- self.experiment_id,
- next_model_to_train_id)
- self.exp_db_client.update_experiment_training_state(
- self.experiment_id,
- TrainingState.PENDING)
+ self.exp_db_client.update_experiment_next_model_to_train_id(self.experiment_id, next_model_to_train_id)
+ self.exp_db_client.update_experiment_training_state(self.experiment_id, TrainingState.PENDING)
logger.info(f"Start training job for model '{next_model_to_train_id}''")
-
# generate manifest file if input is a list
manifest_file_path = None
if isinstance(input_data_s3_prefix, list):
@@ -1472,53 +1482,62 @@ def initialize_first_model(self, wait=True, input_data_s3_prefix=None):
role=self.resource_manager.iam_role_arn,
instance_config=self.resource_manager.training_fleet_config,
boto_session=self.boto_session,
- algor_config=self.algor_config
- )
+ algor_config=self.algor_config,
+ )
self.next_model_to_train.fit(
wait=wait,
input_model_id=None,
input_data_s3_prefix=input_data_s3_prefix,
manifest_file_path=manifest_file_path,
- logs=wait
- )
+ logs=wait,
+ )
except Exception as e:
- logger.error(f"Failed to start new Model Training job for"
- " ModelId {next_model_to_train_id}")
+            logger.error(f"Failed to start new Model Training job for ModelId {next_model_to_train_id}")
logger.error(e)
pass
# wait until ExperimentDb state is updated
if self.local_mode or wait:
- trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
- and self.experiment_record._last_trained_model_id == next_model_to_train_id \
- and self.experiment_record._next_model_to_train_id is None
+ trained_state = (
+ self.experiment_record._training_state == TrainingState.TRAINED
+ and self.experiment_record._last_trained_model_id == next_model_to_train_id
+ and self.experiment_record._next_model_to_train_id is None
+ )
num_retries = 0
max_retries = 100
-
+
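+            # same linear-backoff polling as join(): wait for the experiment table to report TRAINED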
while not trained_state:
# Sync experiment state if required
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table training status to be updated...")
time.sleep(10 * num_retries)
- trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
- and self.experiment_record._last_trained_model_id == next_model_to_train_id \
- and self.experiment_record._next_model_to_train_id is None
+ trained_state = (
+ self.experiment_record._training_state == TrainingState.TRAINED
+ and self.experiment_record._last_trained_model_id == next_model_to_train_id
+ and self.experiment_record._next_model_to_train_id is None
+ )
num_retries += 1
if num_retries >= max_retries:
- raise UnhandledWorkflowException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
- f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED.")
- if self.experiment_record._training_state == TrainingState.FAILED \
- or self.experiment_record._training_state == TrainingState.STOPPED:
- raise SageMakerTrainingJobException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
- f"ended in state of '{self.experiment_record._training_state}'. Please check Sagemaker logs for "
- "more information.")
+ raise UnhandledWorkflowException(
+ f"Training job '{self.experiment_record._next_model_to_train_id}' "
+ f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED."
+ )
+ if (
+ self.experiment_record._training_state == TrainingState.FAILED
+ or self.experiment_record._training_state == TrainingState.STOPPED
+ ):
+ raise SageMakerTrainingJobException(
+ f"Training job '{self.experiment_record._next_model_to_train_id}' "
+                        f"ended in state of '{self.experiment_record._training_state}'. Please check SageMaker logs for "
+ "more information."
+ )
def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=None):
"""
Train a new model given the training data and a pretrained model
-
+
Args:
wait (bool): Whether to wait until the training finish
input_data_s3_prefix (str): S3 data path containing data
@@ -1531,8 +1550,10 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=
# use 'last_trained_model_id' by default as input model for next training
if input_model_id is None and self.experiment_record._last_trained_model_id is not None:
- logger.info(f"Use last trained model {self.experiment_record._last_trained_model_id} "
- "as pre-trained model for training")
+ logger.info(
+ f"Use last trained model {self.experiment_record._last_trained_model_id} "
+ "as pre-trained model for training"
+ )
input_model_id = self.experiment_record._last_trained_model_id
@@ -1543,11 +1564,14 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=
# experiment only allows one training job at a time,
# validate no other training request is in progress
- if self.experiment_record._training_state is not None and \
- self.experiment_record._training_state.endswith("ING"):
- logger.error(f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
- f"was in the state of '{self.experiment_record._training_state}'. "
- "Please wait until the training job is finished.")
+ if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith(
+ "ING"
+ ):
+ logger.error(
+ f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
+ f"was in the state of '{self.experiment_record._training_state}'. "
+ "Please wait until the training job is finished."
+ )
raise InvalidUsageException("Please wait for old Training Job to Complete before requesting a new one!")
else:
# update next_model_to_train_id and training state
@@ -1555,12 +1579,8 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=
logger.info(f"Starting training job for ModelId '{next_model_to_train_id}''")
- self.exp_db_client.update_experiment_next_model_to_train_id(
- self.experiment_id,
- next_model_to_train_id)
- self.exp_db_client.update_experiment_training_state(
- self.experiment_id,
- TrainingState.PENDING)
+ self.exp_db_client.update_experiment_next_model_to_train_id(self.experiment_id, next_model_to_train_id)
+ self.exp_db_client.update_experiment_training_state(self.experiment_id, TrainingState.PENDING)
manifest_file_path = None
if isinstance(input_data_s3_prefix, list):
@@ -1576,22 +1596,26 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=
role=self.resource_manager.iam_role_arn,
instance_config=self.resource_manager.training_fleet_config,
boto_session=self.boto_session,
- algor_config=self.algor_config
- )
- self.next_model_to_train.fit(wait=wait,
- input_model_id=input_model_id,
- input_data_s3_prefix=input_data_s3_prefix,
- manifest_file_path=manifest_file_path,
- logs=wait)
+ algor_config=self.algor_config,
+ )
+ self.next_model_to_train.fit(
+ wait=wait,
+ input_model_id=input_model_id,
+ input_data_s3_prefix=input_data_s3_prefix,
+ manifest_file_path=manifest_file_path,
+ logs=wait,
+ )
except Exception as e:
logger.error(e)
pass
# wait until exp ddb table updated
if self.local_mode or wait:
- trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
- and self.experiment_record._last_trained_model_id == next_model_to_train_id \
- and self.experiment_record._next_model_to_train_id is None
+ trained_state = (
+ self.experiment_record._training_state == TrainingState.TRAINED
+ and self.experiment_record._last_trained_model_id == next_model_to_train_id
+ and self.experiment_record._next_model_to_train_id is None
+ )
num_retries = 0
max_retries = 100
while not trained_state:
@@ -1599,23 +1623,31 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table training status to be updated...")
time.sleep(10 * num_retries)
- trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
- and self.experiment_record._last_trained_model_id == next_model_to_train_id \
- and self.experiment_record._next_model_to_train_id is None
+ trained_state = (
+ self.experiment_record._training_state == TrainingState.TRAINED
+ and self.experiment_record._last_trained_model_id == next_model_to_train_id
+ and self.experiment_record._next_model_to_train_id is None
+ )
num_retries += 1
if num_retries >= max_retries:
- raise UnhandledWorkflowException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
- f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED.")
- if self.experiment_record._training_state == TrainingState.FAILED \
- or self.experiment_record._training_state == TrainingState.STOPPED:
- raise SageMakerTrainingJobException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
- f"ended in state of '{self.experiment_record._training_state}'. Please check Sagemaker logs for "
- "more information.")
+ raise UnhandledWorkflowException(
+ f"Training job '{self.experiment_record._next_model_to_train_id}' "
+ f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED."
+ )
+ if (
+ self.experiment_record._training_state == TrainingState.FAILED
+ or self.experiment_record._training_state == TrainingState.STOPPED
+ ):
+ raise SageMakerTrainingJobException(
+ f"Training job '{self.experiment_record._next_model_to_train_id}' "
+                        f"ended in state of '{self.experiment_record._training_state}'. Please check SageMaker logs for "
+ "more information."
+ )
def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait=True):
"""
Start an evaluation job to evaluate a model
-
+
Args:
input_data_s3_prefix (str): S3 data path containing data used
for evaluation
@@ -1629,8 +1661,9 @@ def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait
if evaluate_model_id is None:
if self.experiment_record._last_trained_model_id:
# use 'last_trained_model_id' by default as input model for evaluation
- logger.info(f"Using last trained model {self.experiment_record._last_trained_model_id}"
- "for evaluation")
+ logger.info(
+                    f"Using last trained model {self.experiment_record._last_trained_model_id} for evaluation"
+ )
evaluate_model_id = self.experiment_record._last_trained_model_id
else:
logger.error("Evaluation ModelId in None!")
@@ -1650,24 +1683,23 @@ def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait
# evaluate_model_id is still None. Raise an exception...
raise InvalidUsageException("Please provide a valid ModelId to be evaluated")
- if self.experiment_record._evaluation_state is not None \
- and self.experiment_record._evaluation_state.endswith("ING"):
- logger.warning(f"A evaluation request with job id '{self.experiment_record._next_evaluation_job_id}' "
+ if self.experiment_record._evaluation_state is not None and self.experiment_record._evaluation_state.endswith(
+ "ING"
+ ):
+ logger.warning(
+                f"An evaluation request with job id '{self.experiment_record._next_evaluation_job_id}' "
f"was in the state of '{self.experiment_record._evaluation_state}'. "
- "Wait until the evaluation job finished or canceled the request.")
+                "Wait until the evaluation job finishes or cancel the request."
+ )
raise InvalidUsageException("Please wait for old Evaluation Job to Complete before requesting a new one!")
else:
next_evaluation_job_id = f"{evaluate_model_id}-eval-{str(int(time.time()))}"
logger.info(f"Evaluating model '{evaluate_model_id}' with evaluation job id '{next_evaluation_job_id}'")
- self.exp_db_client.update_experiment_next_evaluation_job_id(
- self.experiment_id,
- next_evaluation_job_id)
+ self.exp_db_client.update_experiment_next_evaluation_job_id(self.experiment_id, next_evaluation_job_id)
- self.exp_db_client.update_experiment_evaluation_state(
- self.experiment_id,
- EvaluationState.PENDING)
+ self.exp_db_client.update_experiment_evaluation_state(self.experiment_id, EvaluationState.PENDING)
manifest_file_path = None
if isinstance(input_data_s3_prefix, list):
@@ -1686,55 +1718,65 @@ def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait
role=self.resource_manager.iam_role_arn,
instance_config=self.resource_manager.evaluation_fleet_config,
boto_session=self.boto_session,
- algor_config=self.algor_config
- )
+ algor_config=self.algor_config,
+ )
self.next_model_to_evaluate.evaluate(
input_data_s3_prefix=input_data_s3_prefix,
manifest_file_path=manifest_file_path,
evaluation_job_name=next_evaluation_job_id,
- local_mode = self.local_mode,
+ local_mode=self.local_mode,
wait=wait,
- logs=True
- )
+ logs=True,
+ )
except Exception as e:
logger.error(e)
pass
# wait until exp ddb table updated
if self.local_mode or wait:
- evaluated_state = self.experiment_record._evaluation_state == EvaluationState.EVALUATED \
- and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id \
- and self.experiment_record._next_evaluation_job_id is None
+ evaluated_state = (
+ self.experiment_record._evaluation_state == EvaluationState.EVALUATED
+ and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id
+ and self.experiment_record._next_evaluation_job_id is None
+ )
num_retries = 0
- max_retries = 100
+ max_retries = 100
while not evaluated_state:
# Sync experiment state if required
self._sync_experiment_state_with_ddb()
logger.debug("Waiting for experiment table evaluation status to be updated...")
time.sleep(10 * num_retries)
- evaluated_state = self.experiment_record._evaluation_state == EvaluationState.EVALUATED \
- and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id \
- and self.experiment_record._next_evaluation_job_id is None
+ evaluated_state = (
+ self.experiment_record._evaluation_state == EvaluationState.EVALUATED
+ and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id
+ and self.experiment_record._next_evaluation_job_id is None
+ )
num_retries += 1
if num_retries >= max_retries:
- raise UnhandledWorkflowException(f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' "
- f"was in state of '{self.experiment_record._evaluation_state}'. Failed to sync table states.")
- if self.experiment_record._evaluation_state == EvaluationState.FAILED \
- or self.experiment_record._evaluation_state == EvaluationState.STOPPED:
- raise SageMakerTrainingJobException(f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' "
- f"ended in state of '{self.experiment_record._evaluation_state}'. Please check Sagemaker logs for "
- "more information.")
+ raise UnhandledWorkflowException(
+ f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' "
+ f"was in state of '{self.experiment_record._evaluation_state}'. Failed to sync table states."
+ )
+ if (
+ self.experiment_record._evaluation_state == EvaluationState.FAILED
+ or self.experiment_record._evaluation_state == EvaluationState.STOPPED
+ ):
+ raise SageMakerTrainingJobException(
+ f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' "
+                        f"ended in state of '{self.experiment_record._evaluation_state}'. Please check SageMaker logs for "
+ "more information."
+ )
def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
"""
Return evaluation score given model id and evaluation data path
-
+
Args:
evaluate_model_id (str): Model id used for evaluation
eval_data_path (str): S3 data path of evaluation data
-
+
Returns:
float: evaluation score of given model and evaluation data
"""
@@ -1745,13 +1787,12 @@ def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
if evaluate_model_id != self.experiment_record._last_trained_model_id:
if not self._check_if_model_ready(evaluate_model_id):
return
-
+
# use last joined job's eval data by default
if eval_data_path is None:
eval_data_path = self.last_joined_job_eval_data
- logger.info(f"Getting eval scores for model '{evaluate_model_id}'"
- f" on eval data set '{eval_data_path}'")
+        logger.info(f"Getting eval scores for model '{evaluate_model_id}' on eval data set '{eval_data_path}'")
eval_score = "n.a."
if not evaluate_model_id or not eval_data_path:
@@ -1760,41 +1801,44 @@ def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
else:
model_record = self.model_db_client.get_model_record(self.experiment_id, evaluate_model_id)
if model_record:
- eval_scores_map = model_record.get('eval_scores', {})
+ eval_scores_map = model_record.get("eval_scores", {})
eval_score = eval_scores_map.get(eval_data_path, eval_score)
else:
logger.warn(f"Model Record not found with ModelId: {evaluate_model_id}")
pass
if eval_score == "n.a.":
- raise EvalScoreNotAvailableException(f"Evaluation score is not available for model '{evaluate_model_id}'"
- f"with data '{eval_data_path}'.'")
+ raise EvalScoreNotAvailableException(
+                f"Evaluation score is not available for model '{evaluate_model_id}' with data '{eval_data_path}'."
+ )
else:
eval_score = float(eval_score)
- logger.info(f"Evaluation score for model '{evaluate_model_id}'"
- f"with data '{eval_data_path}' is {eval_score}.")
+ logger.info(
+                f"Evaluation score for model '{evaluate_model_id}' with data '{eval_data_path}' is {eval_score}."
+ )
return eval_score
-
+
def get_cloudwatch_dashboard_details(self):
return self.cw_logger.get_cloudwatch_dashboard_details(self.experiment_id)
-
+
def clean_resource(self, experiment_id):
"""Clean up resource of the given experiment,
including hosting endpoint and firehose stream
"""
if not self.local_mode:
self.resource_manager.delete_firehose_stream(experiment_id)
-
+
# clean athena tables
logger.info(f"Deleting athena tables for '{experiment_id}'...")
last_join_job = JoinManager(
join_db_client=self.join_db_client,
experiment_id=self.experiment_id,
- join_job_id=self.last_joined_job_id)
+ join_job_id=self.last_joined_job_id,
+ )
last_join_job._delete_obs_table_if_exist()
last_join_job._delete_rewards_table_if_exist()
-
+
logger.info(f"Deleting hosting endpoint '{experiment_id}'...")
self.sagemaker_session.delete_endpoint_config(experiment_id)
self.sagemaker_session.delete_endpoint(experiment_id)
@@ -1807,26 +1851,18 @@ def clean_table_records(self, experiment_id):
to be cleaned up
"""
# delete join job records from table
- join_job_records = self.join_db_client.get_all_join_job_records_of_experiment(
- experiment_id
- )
+ join_job_records = self.join_db_client.get_all_join_job_records_of_experiment(experiment_id)
if join_job_records:
self.join_db_client.batch_delete_items(
- experiment_id,
- [record["join_job_id"] for record in join_job_records]
+ experiment_id, [record["join_job_id"] for record in join_job_records]
)
# delete model records from table
- model_records = self.model_db_client.get_all_model_records_of_experiment(
- experiment_id
- )
+ model_records = self.model_db_client.get_all_model_records_of_experiment(experiment_id)
if model_records:
- self.model_db_client.batch_delete_items(
- experiment_id,
- [record["model_id"] for record in model_records]
- )
+ self.model_db_client.batch_delete_items(experiment_id, [record["model_id"] for record in model_records])
# # exit sync thread
self.sync_thread.thread_running.clear()
@@ -1838,7 +1874,7 @@ def clean_table_records(self, experiment_id):
def _close_existing_containers(self):
"""closing local running containers if exist
-
+
Returns:
(bool, bool): Whether a running container exist,
Whether successfully close the container
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py
index d64dab86..09ca9f6e 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py
@@ -11,8 +11,7 @@
from orchestrator.clients.ddb.join_db_client import JoinDbClient
from orchestrator.workflow.datatypes.join_job_record import JoinJobRecord
from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException
-from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, \
- JoinQueryIdsNotAvailableException
+from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, JoinQueryIdsNotAvailableException
logger = logging.getLogger("orchestrator")
@@ -22,20 +21,22 @@ class JoinManager:
will handle the joining job creation and joining job metadata
management.
"""
+
def __init__(
- self,
- join_db_client: JoinDbClient,
- experiment_id,
- join_job_id,
- current_state=None,
- input_obs_data_s3_path=None,
- obs_start_time=None,
- obs_end_time=None,
- input_reward_data_s3_path=None,
- output_joined_train_data_s3_path=None,
- output_joined_eval_data_s3_path=None,
- join_query_ids=[],
- boto_session=None):
+ self,
+ join_db_client: JoinDbClient,
+ experiment_id,
+ join_job_id,
+ current_state=None,
+ input_obs_data_s3_path=None,
+ obs_start_time=None,
+ obs_end_time=None,
+ input_reward_data_s3_path=None,
+ output_joined_train_data_s3_path=None,
+ output_joined_eval_data_s3_path=None,
+ join_query_ids=[],
+ boto_session=None,
+ ):
"""Initialize a joining job entity in the current experiment
Args:
@@ -80,7 +81,7 @@ def __init__(
self.query_s3_output_bucket = self._create_athena_s3_bucket_if_not_exist()
self.athena_client = self.boto_session.client("athena")
- # create a local JoinJobRecord object.
+ # create a local JoinJobRecord object.
self.join_job_record = JoinJobRecord(
experiment_id,
join_job_id,
@@ -91,8 +92,8 @@ def __init__(
input_reward_data_s3_path,
output_joined_train_data_s3_path,
output_joined_eval_data_s3_path,
- join_query_ids
- )
+ join_query_ids,
+ )
# create obs partitioned/non-partitioned table if not exists
if input_obs_data_s3_path and input_obs_data_s3_path != "local-join-does-not-apply":
@@ -104,28 +105,22 @@ def __init__(
if obs_start_time and obs_end_time:
self._add_time_partitions(obs_start_time, obs_end_time)
- # try to save this record file. if it throws RecordAlreadyExistsException
+ # try to save this record file. if it throws RecordAlreadyExistsException
# reload the record from JoinJobDb, and recreate
try:
- self.join_db_client.create_new_join_job_record(
- self.join_job_record.to_ddb_record()
- )
+ self.join_db_client.create_new_join_job_record(self.join_job_record.to_ddb_record())
except RecordAlreadyExistsException:
logger.debug("Join job already exists. Reloading from join job record.")
- join_job_record = self.join_db_client.get_join_job_record(
- experiment_id,
- join_job_id
- )
+ join_job_record = self.join_db_client.get_join_job_record(experiment_id, join_job_id)
self.join_job_record = JoinJobRecord.load_from_ddb_record(join_job_record)
except Exception as e:
logger.error("Unhandled Exception! " + str(e))
raise UnhandledWorkflowException("Something went wrong while creating a new join job")
def _jsonify(self):
- """Return a jsonify dict with metadata of the 'JoinJob' object
- """
+        """Return a JSON-serializable dict with metadata of the 'JoinJob' object"""
return self.join_job_record.to_ddb_record()
-
+
@classmethod
def name_next_join_job(cls, experiment_id):
"""Generate unique join job id of a new joining job in the experiment
@@ -149,11 +144,11 @@ def _formatted_table_name(self, table_name_string):
"""
# athena does not allow special characters other than '_'
# replace all special characters with '_'
- return re.sub('[^A-Za-z0-9]+', '_', table_name_string)
+ return re.sub("[^A-Za-z0-9]+", "_", table_name_string)
def _create_athena_s3_bucket_if_not_exist(self):
"""Create s3 bucket for athena data if not exists
- Use sagemaker-{region}-{account_id} bucket to store data
+ Use sagemaker-{region}-{account_id} bucket to store data
Returns:
str: s3 bucket name for athena
@@ -171,9 +166,7 @@ def _create_athena_s3_bucket_if_not_exist(self):
if region == "us-east-1":
s3.create_bucket(Bucket=s3_bucket_name)
else:
- s3.create_bucket(
- Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region}
- )
+ s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region})
logger.info("Successfully create S3 bucket '{}' for athena queries".format(s3_bucket_name))
except ClientError as e:
error_code = e.response["Error"]["Code"]
@@ -181,9 +174,7 @@ def _create_athena_s3_bucket_if_not_exist(self):
if error_code == "BucketAlreadyOwnedByYou":
pass
- elif (
- error_code == "OperationAborted" and "conflicting conditional operation" in message
- ):
+ elif error_code == "OperationAborted" and "conflicting conditional operation" in message:
# If this bucket is already being concurrently created, we don't need to create it again.
pass
elif error_code == "TooManyBuckets":
@@ -191,18 +182,17 @@ def _create_athena_s3_bucket_if_not_exist(self):
s3.meta.client.head_bucket(Bucket=s3_bucket_name)
else:
raise
-
- s3_waiter = s3_client.get_waiter('bucket_exists')
+
+ s3_waiter = s3_client.get_waiter("bucket_exists")
s3_waiter.wait(Bucket=s3_bucket_name)
return s3_bucket_name
def _create_obs_table_if_not_exist(self):
- """Create athena table for observation data if not exists
- """
+ """Create athena table for observation data if not exists"""
# create both partitioned and non-partitioned table for obs data
# ensure input path ending with '/'
input_obs_data_s3_path = self.join_job_record.get_input_obs_data_s3_path()
- input_obs_data_s3_path = input_obs_data_s3_path.strip('/')+'/'
+ input_obs_data_s3_path = input_obs_data_s3_path.strip("/") + "/"
query_string = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {self.obs_table_partitioned} (
@@ -238,8 +228,10 @@ def _create_obs_table_if_not_exist(self):
query_id = self._start_query(query_string, s3_output_path)
self.wait_query_to_finish(query_id)
- logger.debug(f"Successfully create observation table "
- f"'{self.obs_table_non_partitioned}' and '{self.obs_table_partitioned}' for query")
+ logger.debug(
+            f"Successfully created observation tables "
+            f"'{self.obs_table_non_partitioned}' and '{self.obs_table_partitioned}' for query"
+ )
def _delete_obs_table_if_exist(self):
query_string = f"""
@@ -257,12 +249,11 @@ def _delete_obs_table_if_exist(self):
self.wait_query_to_finish(query_id)
def _create_rewards_table_if_not_exist(self):
- """Create athena table for rewards data if not exists
- """
+ """Create athena table for rewards data if not exists"""
# create table if not exists
# ensure input path ending with '/'
input_reward_data_s3_path = self.join_job_record.get_input_reward_data_s3_path()
- input_reward_data_s3_path = input_reward_data_s3_path.strip('/')+'/'
+ input_reward_data_s3_path = input_reward_data_s3_path.strip("/") + "/"
query_string = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {self.rewards_table} (
@@ -288,7 +279,7 @@ def _create_rewards_table_if_not_exist(self):
self.wait_query_to_finish(query_id)
logger.debug(f"Successfully update s3 location of rewards table '{self.rewards_table}'")
-
+
def _delete_rewards_table_if_exist(self):
query_string = f"""
DROP TABLE IF EXISTS {self.rewards_table}
@@ -309,20 +300,20 @@ def _add_time_partitions(self, start_time, end_time):
input_obs_data_s3_path = self.join_job_record.get_input_obs_data_s3_path()
# Adding partitions for each hour
- partition_string_list = []
+ partition_string_list = []
time_delta = end_time - start_time
days = time_delta.days
seconds = time_delta.seconds
- hours = int(days*24 + seconds/3600)
+ hours = int(days * 24 + seconds / 3600)
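+        # total whole hours in the window; one Athena partition is registered per hour below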
for i in range(hours + 1):
- dt = start_time + timedelta(hours=i)
+ dt = start_time + timedelta(hours=i)
dt_str = dt.strftime("%Y-%m-%d-%H")
bucket_dt_str = dt.strftime("%Y/%m/%d/%H")
partition_string = f"PARTITION (dt = '{dt_str}') LOCATION '{input_obs_data_s3_path}/{bucket_dt_str}/'"
partition_string_list.append(partition_string)
query_string = f"ALTER TABLE {self.obs_table_partitioned} ADD IF NOT EXISTS"
-
+
for partition_string in partition_string_list:
query_string = f"""
{query_string}\n{partition_string}"""
@@ -389,13 +380,13 @@ def _get_join_query_string(self, ratio=0.8, train_data=True, start_time=None, en
query_sample_string = f"SELECT * FROM joined_table WHERE joined_table.sample_prob <= {ratio}"
else:
query_sample_string = f"SELECT * FROM joined_table WHERE joined_table.sample_prob > {ratio}"
-
+
query_string = f"""
{query_string_prefix}
{query_sample_string}"""
-
+
return query_string
-
+
def _start_query(self, query_string, s3_output_path):
"""Start query with given query string and output path
@@ -411,16 +402,14 @@ def _start_query(self, query_string, s3_output_path):
response = self.athena_client.start_query_execution(
QueryString=query_string,
ResultConfiguration={
- 'OutputLocation': s3_output_path,
- }
- )
- query_id = response['QueryExecutionId']
+ "OutputLocation": s3_output_path,
+ },
+ )
+ query_id = response["QueryExecutionId"]
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError("Failed to submit athena query with error {}: {}".format(
- error_code, message
- ))
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError("Failed to submit athena query with error {}: {}".format(error_code, message))
return query_id
def wait_query_to_finish(self, query_id):
@@ -429,28 +418,28 @@ def wait_query_to_finish(self, query_id):
Args:
query_id (str): query id of Athena query
"""
- status = 'QUEUED'
- while status == 'RUNNING' or status == 'QUEUED':
+ status = "QUEUED"
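+        # poll Athena every 5 seconds until the query leaves the QUEUED/RUNNING states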
+ while status == "RUNNING" or status == "QUEUED":
try:
- response = self.athena_client.get_query_execution(
- QueryExecutionId=query_id
- )
- status = response['QueryExecution']['Status']['State']
+ response = self.athena_client.get_query_execution(QueryExecutionId=query_id)
+ status = response["QueryExecution"]["Status"]["State"]
logger.debug(f"Waiting query to finish...")
time.sleep(5)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format(
- error_code, message
- ))
-
- if status == 'FAILED':
- raise RuntimeError(f"Query failed with reason: {response['QueryExecution']['Status']['StateChangeReason']}")
- elif status == 'CANCELLED':
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError(
+ "Failed to retrieve athena query status with error {}: {}".format(error_code, message)
+ )
+
+ if status == "FAILED":
+ raise RuntimeError(
+ f"Query failed with reason: {response['QueryExecution']['Status']['StateChangeReason']}"
+ )
+ elif status == "CANCELLED":
logger.warning("Query was cancelled...")
- elif status == 'SUCCEEDED':
- logger.debug("Query finished successfully")
+ elif status == "SUCCEEDED":
+ logger.debug("Query finished successfully")
def get_query_status(self, query_id):
"""Return query status given query ID
@@ -462,18 +451,14 @@ def get_query_status(self, query_id):
str: Status of the query
"""
try:
- response = self.athena_client.get_query_execution(
- QueryExecutionId=query_id
- )
- status = response['QueryExecution']['Status']['State']
+ response = self.athena_client.get_query_execution(QueryExecutionId=query_id)
+ status = response["QueryExecution"]["Status"]["State"]
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format(
- error_code, message
- ))
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format(error_code, message))
return status
-
+
def start_join(self, ratio=0.8, wait=True):
"""Start Athena queries for the joining
@@ -486,23 +471,21 @@ def start_join(self, ratio=0.8, wait=True):
obs_start_time, obs_end_time = self.join_job_record.get_obs_start_end_time()
- join_query_for_train_data = self._get_join_query_string(ratio=ratio,
- train_data=True, start_time=obs_start_time, end_time=obs_end_time)
- join_query_for_eval_data = self._get_join_query_string(ratio=ratio,
- train_data=False, start_time=obs_start_time, end_time=obs_end_time)
+ join_query_for_train_data = self._get_join_query_string(
+ ratio=ratio, train_data=True, start_time=obs_start_time, end_time=obs_end_time
+ )
+ join_query_for_eval_data = self._get_join_query_string(
+ ratio=ratio, train_data=False, start_time=obs_start_time, end_time=obs_end_time
+ )
- s3_output_path = f"s3://{self.query_s3_output_bucket}/" \
- f"{self.experiment_id}/joined_data/{self.join_job_id}"
+        s3_output_path = f"s3://{self.query_s3_output_bucket}/{self.experiment_id}/joined_data/{self.join_job_id}"
logger.info(f"Joined data will be stored under {s3_output_path}")
-
join_query_id_for_train = self._start_query(join_query_for_train_data, f"{s3_output_path}/train")
join_query_id_for_eval = self._start_query(join_query_for_eval_data, f"{s3_output_path}/eval")
# updates join table states vid ddb client
- self.join_db_client.update_join_job_current_state(
- self.experiment_id, self.join_job_id, 'PENDING'
- )
+ self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, "PENDING")
self.join_db_client.update_join_job_output_joined_train_data_s3_path(
self.experiment_id, self.join_job_id, f"{s3_output_path}/train"
)
@@ -526,8 +509,8 @@ def _val_list_to_csv_byte_string(self, val_list):
Return:
str: A string in csv format, concatenated by ','
"""
- val_str_list = list(map(lambda x: f"\"{x}\"", val_list))
- return str(','.join(val_str_list) + '\n').encode('utf_8')
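+        # quote each value so embedded commas don't split the CSV row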
+ val_str_list = list(map(lambda x: f'"{x}"', val_list))
+ return str(",".join(val_str_list) + "\n").encode("utf_8")
def _upload_data_buffer_as_joined_data_format(self, data_buffer, s3_bucket, s3_prefix):
"""Upload joined data buffer to s3 bucket
@@ -553,24 +536,20 @@ def _upload_data_buffer_as_joined_data_format(self, data_buffer, s3_bucket, s3_p
s3_client = self.boto_session.client("s3")
try:
- logger.info("_upload_data_buffer_as_joined_data_format put s3://{}/{}".format(
- s3_bucket, joined_data_s3_file_key
- ))
- s3_client.put_object(Body=body,
- Bucket=s3_bucket,
- Key=joined_data_s3_file_key)
+ logger.info(
+ "_upload_data_buffer_as_joined_data_format put s3://{}/{}".format(s3_bucket, joined_data_s3_file_key)
+ )
+ s3_client.put_object(Body=body, Bucket=s3_bucket, Key=joined_data_s3_file_key)
except ClientError as e:
- error_code = e.response['Error']['Code']
- message = e.response['Error']['Message']
- logger.error("Failed to upload local joined data with error {}: {}".format(
- error_code, message
- ))
+ error_code = e.response["Error"]["Code"]
+ message = e.response["Error"]["Message"]
+ logger.error("Failed to upload local joined data with error {}: {}".format(error_code, message))
return None
joined_data_file_path = f"s3://{s3_bucket}/{joined_data_s3_file_key}"
logger.debug("Waiting for local joined data to be uploaded.")
- waiter = s3_client.get_waiter('object_exists')
+ waiter = s3_client.get_waiter("object_exists")
waiter.wait(Bucket=s3_bucket, Key=joined_data_s3_file_key)
logger.debug(f"Successfully upload local joined data files to s3 bucket path {joined_data_file_path}")
@@ -596,14 +575,11 @@ def start_dummy_join(self, joined_data_buffer, ratio=0.8):
else:
joined_eval_data_buffer.append(record)
- s3_output_path = f"s3://{self.query_s3_output_bucket}/" \
- f"{self.experiment_id}/joined_data/{self.join_job_id}"
+        s3_output_path = f"s3://{self.query_s3_output_bucket}/{self.experiment_id}/joined_data/{self.join_job_id}"
logger.info(f"Joined data will be stored under {s3_output_path}")
# updates join table states vid ddb client
- self.join_db_client.update_join_job_current_state(
- self.experiment_id, self.join_job_id, 'PENDING'
- )
+ self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, "PENDING")
self.join_db_client.update_join_job_output_joined_train_data_s3_path(
self.experiment_id, self.join_job_id, f"{s3_output_path}/train"
)
@@ -615,12 +591,14 @@ def start_dummy_join(self, joined_data_buffer, ratio=0.8):
joined_train_data_path = self._upload_data_buffer_as_joined_data_format(
joined_train_data_buffer,
self.query_s3_output_bucket,
- f"{self.experiment_id}/joined_data/{self.join_job_id}/train")
+ f"{self.experiment_id}/joined_data/{self.join_job_id}/train",
+ )
joined_eval_data_path = self._upload_data_buffer_as_joined_data_format(
joined_eval_data_buffer,
self.query_s3_output_bucket,
- f"{self.experiment_id}/joined_data/{self.join_job_id}/eval")
+ f"{self.experiment_id}/joined_data/{self.join_job_id}/eval",
+ )
# dummy join finished, update joining job state
if joined_train_data_path and joined_eval_data_path:
@@ -628,28 +606,26 @@ def start_dummy_join(self, joined_data_buffer, ratio=0.8):
else:
current_state = "FAILED"
- self.join_db_client.update_join_job_current_state(
- self.experiment_id, self.join_job_id, current_state
- )
-
+ self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, current_state)
+
def update_join_job_state(self):
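+        # try the state refresh up to three times; after the final failure the join job is marked FAILED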
for num_retries in range(3):
try:
- join_job_record = self.join_db_client.get_join_job_record(
- self.experiment_id, self.join_job_id
- )
+ join_job_record = self.join_db_client.get_join_job_record(self.experiment_id, self.join_job_id)
self._update_join_table_states(join_job_record)
except Exception as e:
if num_retries >= 2:
- current_state = 'FAILED'
+ current_state = "FAILED"
self.join_db_client.update_join_job_current_state(
self.experiment_id, self.join_job_id, current_state
)
logger.error(f"Failing join job '{self.join_job_id}'...")
return
else:
- logger.warn(f"Received exception '{e}' while updating join "
- "job status. This exception will be ignored, and retried.")
+ logger.warn(
+ f"Received exception '{e}' while updating join "
+                    "job status. This exception will be ignored and retried."
+ )
time.sleep(5)
continue
@@ -664,7 +640,7 @@ def _update_join_table_states(self, join_job_record):
"""
if join_job_record is None:
return
-
+
current_state = join_job_record.get("current_state", None)
join_query_ids = join_job_record.get("join_query_ids", [])
@@ -673,8 +649,9 @@ def _update_join_table_states(self, join_job_record):
return
if not join_query_ids:
- raise JoinQueryIdsNotAvailableException(f"Query ids for Joining job "
- f"'{self.join_job_id}' cannot be found.")
+ raise JoinQueryIdsNotAvailableException(
+                f"Query ids for Joining job '{self.join_job_id}' cannot be found."
+ )
query_states = []
@@ -682,22 +659,14 @@ def _update_join_table_states(self, join_job_record):
query_states.append(self.get_query_status(query_id))
# only 'SUCCEEDED' if both queries are 'SUCCEEDED'
- if query_states[0] == 'SUCCEEDED' and query_states[1] == 'SUCCEEDED':
- current_state = 'SUCCEEDED'
- elif 'FAILED' in query_states:
- current_state = 'FAILED'
- elif 'CANCELLED' in query_states:
- current_state = 'CANCELLED'
+ if query_states[0] == "SUCCEEDED" and query_states[1] == "SUCCEEDED":
+ current_state = "SUCCEEDED"
+ elif "FAILED" in query_states:
+ current_state = "FAILED"
+ elif "CANCELLED" in query_states:
+ current_state = "CANCELLED"
else:
- current_state = 'RUNNING'
+ current_state = "RUNNING"
# update table states via ddb client
- self.join_db_client.update_join_job_current_state(
- self.experiment_id, self.join_job_id, current_state
- )
-
-
-
-
-
-
+ self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, current_state)
diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py
index dc92a68a..122c2e3c 100644
--- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py
+++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py
@@ -22,50 +22,52 @@
from src.vw_utils import EVAL_CHANNEL
logger = logging.getLogger("orchestrator")
-
-
+
+
class CaptureStdout(list):
def __enter__(self):
self._stdout = sys.stdout
sys.stdout = self._stringio = StringIO()
return self
+
def __exit__(self, type, value, traceback):
self.extend(self._stringio.getvalue().splitlines())
- del self._stringio # free up some memory
+ del self._stringio # free up some memory
sys.stdout = self._stdout
-
+
# Capture the exception and don't throw it back for graceful exit.
return True
-class ModelManager():
+class ModelManager:
"""A model entity with the given experiment. This class will handle
the model creation, model training, model evaluation and model metadata
management.
"""
def __init__(
- self,
- model_db_client: ModelDbClient,
- experiment_id,
- model_id,
- image=None,
- role=None,
- instance_config={},
- boto_session=None,
- algor_config={},
- train_state=None,
- evaluation_job_name=None,
- eval_state=None,
- eval_scores={},
- input_model_id=None,
- rl_estimator=None,
- input_data_s3_prefix=None,
- manifest_file_path=None,
- eval_data_s3_path=None,
- s3_model_output_path=None,
- training_start_time=None,
- training_end_time=None):
+ self,
+ model_db_client: ModelDbClient,
+ experiment_id,
+ model_id,
+ image=None,
+ role=None,
+ instance_config={},
+ boto_session=None,
+ algor_config={},
+ train_state=None,
+ evaluation_job_name=None,
+ eval_state=None,
+ eval_scores={},
+ input_model_id=None,
+ rl_estimator=None,
+ input_data_s3_prefix=None,
+ manifest_file_path=None,
+ eval_data_s3_path=None,
+ s3_model_output_path=None,
+ training_start_time=None,
+ training_end_time=None,
+ ):
"""Initialize a model entity in the current experiment
Args:
@@ -83,7 +85,7 @@ def __init__(
configuration for the model training/evaluation job.
boto_session (boto3.session.Session): A session stores configuration
state and allows you to create service clients and resources.
- algor_config (dict): A dictionary that specify the algorithm type
+            algor_config (dict): A dictionary that specifies the algorithm type
and hyper parameters of the training/evaluation job.
train_state (str): State of the model training job.
evaluation_job_name (str): Job name for Latest Evaluation Job for this model
@@ -95,7 +97,7 @@ def __init__(
a SageMaker Training Job.
input_data_s3_prefix (str): Input data path for the data source of the
model training job.
- s3_model_output_path (str): Output data path of model artifact for the
+ s3_model_output_path (str): Output data path of model artifact for the
model training job.
training_start_time (str): Starting timestamp of the model training job.
training_end_time (str): Finished timestamp of the model training job.
@@ -120,7 +122,7 @@ def __init__(
self.instance_count = self.instance_config.get("instance_count", 1)
self.algor_params = self.algor_config.get("algorithms_parameters", {})
- # create a local ModelRecord object.
+ # create a local ModelRecord object.
self.model_record = ModelRecord(
experiment_id,
model_id,
@@ -134,21 +136,16 @@ def __init__(
eval_data_s3_path,
s3_model_output_path,
training_start_time,
- training_end_time
- )
-
- # try to save this record file. if it throws RecordAlreadyExistsException
+ training_end_time,
+ )
+
+ # try to save this record file. if it throws RecordAlreadyExistsException
# reload the record from ModelDb, and recreate
try:
- self.model_db_client.create_new_model_record(
- self.model_record.to_ddb_record()
- )
+ self.model_db_client.create_new_model_record(self.model_record.to_ddb_record())
except RecordAlreadyExistsException:
logger.debug("Model already exists. Reloading from model record.")
- model_record = self.model_db_client.get_model_record(
- experiment_id,
- model_id
- )
+ model_record = self.model_db_client.get_model_record(experiment_id, model_id)
self.model_record = ModelRecord.load_from_ddb_record(model_record)
except Exception as e:
logger.error("Unhandled Exception! " + str(e))
@@ -158,7 +155,7 @@ def __init__(
boto_session = boto3.Session()
self.boto_session = boto_session
- if self.instance_type == 'local':
+ if self.instance_type == "local":
self.sagemaker_session = LocalSession()
else:
self.sagemaker_session = sagemaker.session.Session(self.boto_session)
@@ -198,30 +195,28 @@ def _get_rl_estimator_args(self, eval=False):
job_types = "evaluation_jobs" if eval else "training_jobs"
sagemaker_bucket = self.sagemaker_session.default_bucket()
- output_path = f"s3://{sagemaker_bucket}/{self.experiment_id}/{job_types}/"
+ output_path = f"s3://{sagemaker_bucket}/{self.experiment_id}/{job_types}/"
metric_definitions = [
- {
- 'Name': 'average_loss',
- 'Regex': 'average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$'
- }
- ]
-
- args = dict(entry_point=entry_point,
- source_dir='src',
- dependencies=["common/sagemaker_rl"],
- image_uri=self.image,
- role=self.role,
- sagemaker_session=self.sagemaker_session,
- instance_type=self.instance_type,
- instance_count=self.instance_count,
- metric_definitions=metric_definitions,
- hyperparameters=self.algor_params,
- output_path=output_path,
- code_location=output_path.strip('/')
- )
-
- if self.instance_type == 'local':
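+            # surface VW's "average loss" log line as a SageMaker job metric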
+ {"Name": "average_loss", "Regex": "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$"}
+ ]
+
+ args = dict(
+ entry_point=entry_point,
+ source_dir="src",
+ dependencies=["common/sagemaker_rl"],
+ image_uri=self.image,
+ role=self.role,
+ sagemaker_session=self.sagemaker_session,
+ instance_type=self.instance_type,
+ instance_count=self.instance_count,
+ metric_definitions=metric_definitions,
+ hyperparameters=self.algor_params,
+ output_path=output_path,
+ code_location=output_path.strip("/"),
+ )
+
+ if self.instance_type == "local":
logger.info(f"{estimator_type} job will be executed in 'local' mode")
else:
logger.info(f"{estimator_type} job will be executed in 'SageMaker' mode")
@@ -231,29 +226,19 @@ def _fit_first_model(self, input_data_s3_prefix=None, manifest_file_path=None, w
"""
A Estimator fit() call to initiate the first model of the experiment
"""
-
-
+
rl_estimator_args = self._get_rl_estimator_args()
self.rl_estimator = RLEstimator(**rl_estimator_args)
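+        # a ManifestFile channel hands the job an explicit list of S3 objects instead of a prefix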
if manifest_file_path:
input_data = sagemaker.session.s3_input(
- s3_data=manifest_file_path,
- input_mode='File',
- s3_data_type='ManifestFile'
- )
+ s3_data=manifest_file_path, input_mode="File", s3_data_type="ManifestFile"
+ )
self.rl_estimator.fit(job_name=self.model_id, inputs=input_data, wait=wait, logs=logs)
else:
- self.rl_estimator.fit(job_name=self.model_id, inputs=input_data_s3_prefix, wait=wait,logs=logs)
-
- def fit(
- self,
- input_model_id=None,
- input_data_s3_prefix=None,
- manifest_file_path=None,
- wait=False,
- logs=True
- ):
+ self.rl_estimator.fit(job_name=self.model_id, inputs=input_data_s3_prefix, wait=wait, logs=logs)
+
+ def fit(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True):
"""A Estimator fit() call to start a model training job.
Args:
@@ -269,52 +254,39 @@ def fit(
self.model_record.add_new_training_job_info(
input_model_id=input_model_id,
input_data_s3_prefix=input_data_s3_prefix,
- manifest_file_path=manifest_file_path
+ manifest_file_path=manifest_file_path,
)
self.model_db_client.update_model_record(self._jsonify())
if input_model_id is None:
self._fit_first_model(
- input_data_s3_prefix=input_data_s3_prefix,
- manifest_file_path=manifest_file_path,
- wait=wait,
- logs=logs)
+ input_data_s3_prefix=input_data_s3_prefix, manifest_file_path=manifest_file_path, wait=wait, logs=logs
+ )
else:
# use 'input_model_id' as pretrained model for training
- input_model_record = self.model_db_client.get_model_record(
- self.experiment_id,
- input_model_id
- )
+ input_model_record = self.model_db_client.get_model_record(self.experiment_id, input_model_id)
model_artifact_path = input_model_record.get("s3_model_output_path")
rl_estimator_args = self._get_rl_estimator_args()
- rl_estimator_args['model_channel_name'] = 'pretrained_model'
- rl_estimator_args['model_uri'] = model_artifact_path
+ rl_estimator_args["model_channel_name"] = "pretrained_model"
+ rl_estimator_args["model_uri"] = model_artifact_path
self.rl_estimator = RLEstimator(**rl_estimator_args)
if manifest_file_path:
- inputs = sagemaker.session.s3_input(
- s3_data=manifest_file_path,
- s3_data_type='ManifestFile'
- )
+ inputs = sagemaker.session.s3_input(s3_data=manifest_file_path, s3_data_type="ManifestFile")
else:
inputs = input_data_s3_prefix
- self.rl_estimator.fit(
- job_name=self.model_id,
- inputs=inputs,
- wait=wait,
- logs=logs
- )
+ self.rl_estimator.fit(job_name=self.model_id, inputs=inputs, wait=wait, logs=logs)
def evaluate(
- self,
- input_data_s3_prefix=None,
- manifest_file_path=None,
- evaluation_job_name=None,
- local_mode=True,
- wait=False,
- logs=True
- ):
+ self,
+ input_data_s3_prefix=None,
+ manifest_file_path=None,
+ evaluation_job_name=None,
+ local_mode=True,
+ wait=False,
+ logs=True,
+ ):
"""A Estimator fit() call to start a model evaluation job.
Args:
@@ -331,33 +303,29 @@ def evaluate(
# Model object has already been initialized with up-to-date DDb record.
model_artifact_path = self.model_record.get_model_artifact_path()
rl_estimator_args = self._get_rl_estimator_args(eval=True)
- rl_estimator_args['model_channel_name'] = 'pretrained_model'
- rl_estimator_args['model_uri'] = model_artifact_path
+ rl_estimator_args["model_channel_name"] = "pretrained_model"
+ rl_estimator_args["model_uri"] = model_artifact_path
if manifest_file_path:
- inputs = sagemaker.session.s3_input(
- s3_data=manifest_file_path,
- s3_data_type='ManifestFile'
- )
+ inputs = sagemaker.session.s3_input(s3_data=manifest_file_path, s3_data_type="ManifestFile")
if local_mode:
rl_estimator_args["hyperparameters"].update({"local_mode_manifest": True})
else:
inputs = input_data_s3_prefix
-
+
# (dict[str, str] or dict[str, sagemaker.session.s3_input]) for evaluation channel
eval_channel_inputs = {EVAL_CHANNEL: inputs}
self.rl_estimator = RLEstimator(**rl_estimator_args)
- # update to save eval_data_s3_path in DDb as well, or
+ # update to save eval_data_s3_path in DDb as well, or
# update to read from SM describe call... maybe will not work in local mode but.
eval_data_s3_path = manifest_file_path if (manifest_file_path is not None) else input_data_s3_prefix
# we keep eval job state as pending, before the SM job has been submitted.
# the syncer function should update this state, based on SM job status.
self.model_record.add_new_evaluation_job_info(
- evaluation_job_name=evaluation_job_name,
- eval_data_s3_path=eval_data_s3_path
+ evaluation_job_name=evaluation_job_name, eval_data_s3_path=eval_data_s3_path
)
self.model_db_client.update_model_record(self._jsonify())
@@ -369,26 +337,16 @@ def evaluate(
# Capture eval score by regex expression
# log should contain only one "average loss = some number" pattern
with CaptureStdout() as log_output:
- self.rl_estimator.fit(
- job_name=evaluation_job_name,
- inputs=eval_channel_inputs,
- wait=wait,
- logs=logs
- )
+ self.rl_estimator.fit(job_name=evaluation_job_name, inputs=eval_channel_inputs, wait=wait, logs=logs)
- self.log_output = '\n'.join(log_output)
+ self.log_output = "\n".join(log_output)
logger.debug(self.log_output)
else:
- self.rl_estimator.fit(
- job_name=evaluation_job_name,
- inputs=eval_channel_inputs,
- wait=wait,
- logs=logs
- )
+ self.rl_estimator.fit(job_name=evaluation_job_name, inputs=eval_channel_inputs, wait=wait, logs=logs)
def update_model_training_state(self):
self._update_model_table_training_states()
-
+
def update_model_evaluation_state(self):
self._update_model_table_evaluation_states()
@@ -411,59 +369,57 @@ def _update_model_table_training_states(self):
# need not do anything.
self.model_db_client.update_model_record(self._jsonify())
return self._jsonify()
-
+
# Else, try and fetch updated SageMaker TrainingJob status
sm_job_info = {}
-
- max_describe_retries = 100
+
+ max_describe_retries = 100
sleep_between_describe_retries = 10
-
+
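+        # DescribeTrainingJob can fail transiently right after submission or when throttled; retry first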
for i in range(max_describe_retries):
try:
- sm_job_info = self.sagemaker_client.describe_training_job(
- TrainingJobName=self.model_id)
+ sm_job_info = self.sagemaker_client.describe_training_job(TrainingJobName=self.model_id)
except Exception as e:
if "ValidationException" in str(e):
if i > max_describe_retries:
# max attempts for DescribeTrainingJob. Fail with ValidationException
- logger.warn(f"Looks like SageMaker Job was not submitted successfully."
- f" Failing Training Job with ModelId {self.model_id}"
+ logger.warn(
+                            "Looks like SageMaker Job was not submitted successfully."
+ f" Failing Training Job with ModelId {self.model_id}"
)
self.model_record.update_model_as_failed()
self.model_db_client.update_model_as_failed(self._jsonify())
return
- else:
+ else:
time.sleep(sleep_between_describe_retries)
continue
else:
- # Do not raise exception, most probably throttling.
- logger.warn(f"Failed to check SageMaker Training Job state for ModelId {self.model_id}."
- " This exception will be ignored, and retried."
+ # Do not raise exception, most probably throttling.
+ logger.warn(
+ f"Failed to check SageMaker Training Job state for ModelId {self.model_id}."
+                        " This exception will be ignored and retried."
)
logger.debug(e)
time.sleep(sleep_between_describe_retries)
return self._jsonify()
- train_state = sm_job_info.get('TrainingJobStatus', "Pending")
- training_start_time = sm_job_info.get('TrainingStartTime', None)
+ train_state = sm_job_info.get("TrainingJobStatus", "Pending")
+ training_start_time = sm_job_info.get("TrainingStartTime", None)
training_end_time = sm_job_info.get("TrainingEndTime", None)
if training_start_time is not None:
- training_start_time = training_start_time.strftime("%Y-%m-%d %H:%M:%S")
+ training_start_time = training_start_time.strftime("%Y-%m-%d %H:%M:%S")
if training_end_time is not None:
- training_end_time = training_end_time.strftime("%Y-%m-%d %H:%M:%S")
-
- model_artifacts = sm_job_info.get('ModelArtifacts', None)
+ training_end_time = training_end_time.strftime("%Y-%m-%d %H:%M:%S")
+
+ model_artifacts = sm_job_info.get("ModelArtifacts", None)
if model_artifacts is not None:
s3_model_output_path = model_artifacts.get("S3ModelArtifacts", None)
else:
s3_model_output_path = None
self.model_record.update_model_job_status(
- training_start_time,
- training_end_time,
- train_state,
- s3_model_output_path
+ training_start_time, training_end_time, train_state, s3_model_output_path
)
self.model_db_client.update_model_job_state(self._jsonify())
@@ -481,63 +437,65 @@ def _update_model_table_evaluation_states(self):
"""
if self.model_record.eval_in_terminal_state():
- self.model_db_client.update_model_record(
- self._jsonify()
- )
+ self.model_db_client.update_model_record(self._jsonify())
return self._jsonify()
-
+
# Try and fetch updated SageMaker Training Job Status
sm_eval_job_info = {}
-
- max_describe_retries = 100
+
+ max_describe_retries = 100
sleep_between_describe_retries = 10
for i in range(max_describe_retries):
try:
sm_eval_job_info = self.sagemaker_client.describe_training_job(
- TrainingJobName=self.model_record._evaluation_job_name)
+ TrainingJobName=self.model_record._evaluation_job_name
+ )
except Exception as e:
if "ValidationException" in str(e):
print(e)
if i > max_describe_retries:
# max attempts for DescribeTrainingJob reached with validation failure
- logger.warn("Looks like SageMaker Job was not submitted successfully."
- f" Failing EvaluationJob {self.model_record._evaluation_job_name}"
+ logger.warn(
+ "Looks like SageMaker Job was not submitted successfully."
+ f" Failing EvaluationJob {self.model_record._evaluation_job_name}"
)
self.model_record.update_eval_job_as_failed()
self.model_db_client.update_model_eval_as_failed(self._jsonify())
return
- else:
+ else:
time.sleep(sleep_between_describe_retries)
continue
else:
- # Do not raise exception, most probably throttling.
- logger.warn("Failed to check SageMaker Training Job state for EvaluationJob: "
- f" {self.model_record._evaluation_job_name}. This exception will be ignored,"
- " and retried."
+ # Do not raise exception, most probably throttling.
+ logger.warn(
+ "Failed to check SageMaker Training Job state for EvaluationJob: "
+ f"{self.model_record._evaluation_job_name}. This exception will be ignored,"
+ " and retried."
)
time.sleep(sleep_between_describe_retries)
return self._jsonify()
-
- eval_state = sm_eval_job_info.get('TrainingJobStatus', 'Pending')
- if eval_state == 'Completed':
+ eval_state = sm_eval_job_info.get("TrainingJobStatus", "Pending")
+ if eval_state == "Completed":
eval_score = "n.a."
if self.local_mode:
- rgx = re.compile('average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$', re.M)
+ rgx = re.compile("average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$", re.M)
eval_score_rgx = rgx.findall(self.log_output)
-
+
if len(eval_score_rgx) == 0:
logger.warning("No eval score available from vw job log.")
else:
- eval_score = eval_score_rgx[0][0] # [('eval_score', '')]
+ eval_score = eval_score_rgx[0][0] # [('eval_score', '')]
else:
attempts = 0
- while eval_score == 'n.a.' and attempts < 4:
+ while eval_score == "n.a." and attempts < 4:
try:
- metric_df = TrainingJobAnalytics(self.model_record._evaluation_job_name, ['average_loss']).dataframe()
- eval_score = str(metric_df[metric_df['metric_name'] == 'average_loss']['value'][0])
+ metric_df = TrainingJobAnalytics(
+ self.model_record._evaluation_job_name, ["average_loss"]
+ ).dataframe()
+ eval_score = str(metric_df[metric_df["metric_name"] == "average_loss"]["value"][0])
except Exception:
# to avoid throttling
time.sleep(5)
@@ -549,4 +507,4 @@ def _update_model_table_evaluation_states(self):
else:
# update eval state via ddb client
self.model_record.update_eval_job_state(eval_state)
- self.model_db_client.update_model_eval_job_state(self._jsonify())
\ No newline at end of file
+ self.model_db_client.update_model_eval_job_state(self._jsonify())
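For context on the eval-score scrape above: in local mode the captured log is expected to contain exactly one "average loss = <number>" line, which the regex pulls out. A minimal, self-contained sketch of that extraction (the sample log text is invented for illustration):

import re

# Same pattern as used above; findall returns one tuple of groups per match.
rgx = re.compile(r"average loss = ([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?).*$", re.M)

sample_log = "finished run\naverage loss = 0.2471\ntotal feature number = 42"
matches = rgx.findall(sample_log)  # [('0.2471', '')]
eval_score = matches[0][0] if matches else "n.a."
print(eval_score)  # 0.2471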
diff --git a/09_deploy/common/sagemaker_rl/ray_launcher.py b/09_deploy/common/sagemaker_rl/ray_launcher.py
index f787859d..2a64a670 100644
--- a/09_deploy/common/sagemaker_rl/ray_launcher.py
+++ b/09_deploy/common/sagemaker_rl/ray_launcher.py
@@ -28,13 +28,14 @@ class Cluster(Enum):
for Neural Network training and secondary cluster has CPU instances for rollouts.
For single machine or homogeneous cluster, primary is the default type.
"""
+
Primary = "primary"
Secondary = "secondary"
class SageMakerRayLauncher(object):
"""Base class for SageMaker RL applications using Ray-RLLib.
- Customers should sub-class this, fill in the required methods, and
+ Customers should sub-class this, fill in the required methods, and
call .train_main() to start a training process.
Example::
@@ -47,7 +48,7 @@ def create_environment(env_config):
class MyLauncher(SageMakerRayLauncher):
def register_env_creator(self):
register_env("RoboschoolHumanoid-v1", create_environment)
-
+
def get_experiment_config(self):
return {
"training": {
@@ -81,16 +82,14 @@ def _get_cluster_type(self):
return Cluster.Secondary
def register_env_creator(self):
- """Sub-classes must implement this.
- """
+ """Sub-classes must implement this."""
raise NotImplementedError("Subclasses should implement this to call ray.tune.registry.register_env")
def get_experiment_config(self):
raise NotImplementedError("Subclasses must define the experiment config to pass to ray.tune.run_experiments")
def customize_experiment_config(self, config):
- """Applies command-line hyperparameters to the config.
- """
+ """Applies command-line hyperparameters to the config."""
# TODO: use ConfigList from Coach launcher, and share customization code.
hyperparams_dict = json.loads(os.environ.get("SM_HPS", "{}"))
@@ -98,7 +97,7 @@ def customize_experiment_config(self, config):
# TODO: move this to before customer-specified so they can override
hyperparams_dict["rl.training.local_dir"] = INTERMEDIATE_DIR
hyperparams_dict["rl.training.checkpoint_at_end"] = True
- hyperparams_dict["rl.training.checkpoint_freq"] = config['training'].get('checkpoint_freq', 10)
+ hyperparams_dict["rl.training.checkpoint_freq"] = config["training"].get("checkpoint_freq", 10)
self.hyperparameters = ConfigurationList() # TODO: move to shared
for name, value in hyperparams_dict.items():
@@ -132,9 +131,9 @@ def ray_init_config(self):
return config
master_ip = get_ip_from_host(host_name=self.host_name)
self.start_ray_cluster(master_ip)
- self.sage_cluster_communicator.write_host_config(ip=master_ip,
- host_name="%s:%s" % (
- self.cluster_type.value, self.host_name))
+ self.sage_cluster_communicator.write_host_config(
+ ip=master_ip, host_name="%s:%s" % (self.cluster_type.value, self.host_name)
+ )
self.sage_cluster_communicator.create_s3_signal("%s:%s" % (self.cluster_type.value, self.host_name))
print("Waiting for %s worker nodes to join!" % (len(all_wokers_host_names)))
self.sage_cluster_communicator.wait_for_signals(all_wokers_host_names)
@@ -158,13 +157,17 @@ def ray_init_config(self):
def start_ray_cluster(self, master_ip):
if ray.__version__ >= "0.6.5":
- p = subprocess.Popen("ray start --head --redis-port=6379 --node-ip-address=%s" % master_ip,
- shell=True,
- stderr=subprocess.STDOUT)
+ p = subprocess.Popen(
+ "ray start --head --redis-port=6379 --node-ip-address=%s" % master_ip,
+ shell=True,
+ stderr=subprocess.STDOUT,
+ )
else:
- p = subprocess.Popen("ray start --head --redis-port=6379 --no-ui --node-ip-address=%s" % master_ip,
- shell=True,
- stderr=subprocess.STDOUT)
+ p = subprocess.Popen(
+ "ray start --head --redis-port=6379 --no-ui --node-ip-address=%s" % master_ip,
+ shell=True,
+ stderr=subprocess.STDOUT,
+ )
time.sleep(3)
if p.poll() != 0:
@@ -172,11 +175,18 @@ def start_ray_cluster(self, master_ip):
def join_ray_cluster(self, master_ip, node_ip):
if ray.__version__ >= "0.8.2":
- p = subprocess.Popen("ray start --address=%s:6379" % (master_ip),
- shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+ p = subprocess.Popen(
+ "ray start --address=%s:6379" % (master_ip),
+ shell=True,
+ stderr=subprocess.STDOUT,
+ stdout=subprocess.PIPE,
+ )
else:
- p = subprocess.Popen("ray start --redis-address=%s:6379 --node-ip-address=%s" % (master_ip, node_ip),
- shell=True, stderr=subprocess.STDOUT)
+ p = subprocess.Popen(
+ "ray start --redis-address=%s:6379 --node-ip-address=%s" % (master_ip, node_ip),
+ shell=True,
+ stderr=subprocess.STDOUT,
+ )
time.sleep(3)
if p.poll() != 0:
raise RuntimeError("Could not join Ray server running at %s:6379" % master_ip)
@@ -196,8 +206,9 @@ def copy_checkpoints_to_model_output(self):
checkpoints.sort(key=natural_keys)
latest_checkpoints = checkpoints[-2:]
- validation = sum(1 if x.endswith("tune_metadata") or x.endswith("extra_data") else 0 for x in
- latest_checkpoints)
+ validation = sum(
+ 1 if x.endswith("tune_metadata") or x.endswith("extra_data") else 0 for x in latest_checkpoints
+ )
if ray.__version__ >= "0.6.5":
if validation is not 1:
@@ -254,8 +265,8 @@ def save_checkpoint_and_serving_model(self, algorithm=None, env_string=None):
def set_up_checkpoint(self, config=None):
try:
- checkpoint_dir = config['training']['restore']
- print("Found checkpoint dir %s in user config." %checkpoint_dir)
+ checkpoint_dir = config["training"]["restore"]
+ print("Found checkpoint dir %s in user config." % checkpoint_dir)
return config
except KeyError:
pass
@@ -269,13 +280,15 @@ def set_up_checkpoint(self, config=None):
print("checkpoint_dir is {}".format(checkpoint_dir))
checkpoint_dir_contents = os.listdir(checkpoint_dir)
if len(checkpoint_dir_contents) not in [2, 3]:
- raise RuntimeError(f"Unexpected files {checkpoint_dir_contents} in checkpoint dir. "
- "Please check ray documents for the correct checkpoint format.")
+ raise RuntimeError(
+ f"Unexpected files {checkpoint_dir_contents} in checkpoint dir. "
+ "Please check the Ray documentation for the correct checkpoint format."
+ )
validation = 0
checkpoint_file_in_container = ""
for filename in checkpoint_dir_contents:
- is_tune_metadata= filename.endswith("tune_metadata")
+ is_tune_metadata = filename.endswith("tune_metadata")
is_extra_data = filename.endswith("extra_data")
is_checkpoint_meta = is_tune_metadata + is_extra_data
validation += is_checkpoint_meta
@@ -288,20 +301,21 @@ def set_up_checkpoint(self, config=None):
else:
if validation is not 2:
raise RuntimeError("Failed to find .tune_metadata or .extra_data to restore checkpoint")
-
+
if checkpoint_file_in_container:
- print("Found checkpoint: %s. Setting `restore` path in ray config." %checkpoint_file_in_container)
- config['training']['restore'] = checkpoint_file_in_container
+ print("Found checkpoint: %s. Setting `restore` path in ray config." % checkpoint_file_in_container)
+ config["training"]["restore"] = checkpoint_file_in_container
else:
- print("No valid checkpoint found in %s. Training from scratch." %checkpoint_dir)
+ print("No valid checkpoint found in %s. Training from scratch." % checkpoint_dir)
return config
-
+
def _checkpoint_dir_finder(self, current_dir=None):
current_dir_subfolders = os.walk(current_dir).__next__()[1]
if len(current_dir_subfolders) > 1:
- raise RuntimeError(f"Multiple folders detected: '{current_dir_subfolders}'."
- "Please provide one checkpoint only." )
+ raise RuntimeError(
+ f"Multiple folders detected: '{current_dir_subfolders}'. Please provide one checkpoint only."
+ )
elif not current_dir_subfolders:
return current_dir
return self._checkpoint_dir_finder(os.path.join(current_dir, *current_dir_subfolders))
@@ -322,11 +336,12 @@ def launch(self):
experiment_config = self.get_experiment_config()
experiment_config = self.customize_experiment_config(experiment_config)
experiment_config = self.set_up_checkpoint(experiment_config)
-
- print("Important! Ray with version <=7.2 may report \"Did not find checkpoint file\" even if the",
- "experiment is actually restored successfully. If restoration is expected, please check",
- "\"training_iteration\" in the experiment info to confirm."
- )
+
+ print(
+ 'Important! Ray with version <= 0.7.2 may report "Did not find checkpoint file" even if the',
+ "experiment is actually restored successfully. If restoration is expected, please check",
+ '"training_iteration" in the experiment info to confirm.',
+ )
run_experiments(experiment_config)
all_wokers_host_names = self.get_all_host_names()[1:]
# If distributed job, send TERMINATION_SIGNAL to all workers.
@@ -335,12 +350,10 @@ def launch(self):
algo = experiment_config["training"]["run"]
env_string = experiment_config["training"]["config"]["env"]
- self.save_checkpoint_and_serving_model(algorithm=algo,
- env_string=env_string)
+ self.save_checkpoint_and_serving_model(algorithm=algo, env_string=env_string)
@classmethod
def train_main(cls):
- """main function that kicks things off
- """
+ """main function that kicks things off"""
launcher = cls()
launcher.launch()
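The SageMakerRayLauncher docstring above describes the sub-classing contract; the sketch below follows it under stated assumptions: the import path mirrors this repo's layout, and the Gym environment, algorithm, and config values are illustrative, not from the patch.

from ray.tune.registry import register_env

from sagemaker_rl.ray_launcher import SageMakerRayLauncher  # import path assumed from this repo's layout


def create_environment(env_config):
    import gym

    return gym.make("CartPole-v1")  # illustrative environment, not from the patch


class MyLauncher(SageMakerRayLauncher):
    def register_env_creator(self):
        register_env("CartPole-v1", create_environment)

    def get_experiment_config(self):
        return {
            "training": {
                "run": "PPO",  # illustrative algorithm/config values
                "config": {"env": "CartPole-v1", "num_workers": 2},
            }
        }


if __name__ == "__main__":
    MyLauncher.train_main()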
diff --git a/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py b/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py
index 6a2e3184..cd47d95e 100644
--- a/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py
+++ b/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py
@@ -5,7 +5,7 @@
import time
-class SageClusterCommunicator():
+class SageClusterCommunicator:
def __init__(self):
bucket = os.environ.get("SM_HP_S3_BUCKET", None)
prefix = os.environ.get("SM_HP_S3_PREFIX", None)
@@ -20,7 +20,7 @@ def __init__(self):
def get_client(self):
session = boto3.session.Session()
- return session.client('s3', region_name=self.aws_region)
+ return session.client("s3", region_name=self.aws_region)
def _get_s3_key(self, key):
return os.path.normpath(self.s3_prefix + "/config/" + key)
@@ -39,10 +39,10 @@ def _find_s3_output_path(self):
tuple (bucket, prefix)
"""
module_dir_s3_path = self._required_environment_param("module_dir")
- if not module_dir_s3_path.startswith('s3://'):
+ if not module_dir_s3_path.startswith("s3://"):
raise ValueError('Unexpected format for module_dir_s3_path. Expected "s3://..."')
bucket_prefix = module_dir_s3_path.replace("s3://", "")
- bucket, key = bucket_prefix.split('/', 1)
+ bucket, key = bucket_prefix.split("/", 1)
prefix = "/".join(key.split("/")[:-2])
if prefix == "":
# {bucket}/{job_name}/source/sourcedir.tar.gz structure not present
@@ -51,7 +51,7 @@ def _find_s3_output_path(self):
def create_s3_signal(self, signal):
s3_client = self.get_client()
- s3_client.upload_fileobj(io.BytesIO(b''), self.s3_bucket, self._get_s3_key(signal))
+ s3_client.upload_fileobj(io.BytesIO(b""), self.s3_bucket, self._get_s3_key(signal))
def wait_for_signals(self, signals, timeout=600, sleep_time=5):
if len(signals) == 0:
@@ -69,7 +69,8 @@ def wait_for_signals(self, signals, timeout=600, sleep_time=5):
time_elapsed += sleep_time
if time_elapsed >= timeout:
raise RuntimeError(
- "Could not find all the signals: %s for last %s seconds" % (signals, time_elapsed))
+ "Could not find all the signals: %s for last %s seconds" % (signals, time_elapsed)
+ )
else:
print("Received all signal[s]: %s" % signals)
return
@@ -79,7 +80,7 @@ def write_host_config(self, ip, host_name):
data = {"IP": ip, "HOST_NAME": host_name}
json_blob = json.dumps(data)
file_handle = io.BytesIO(json_blob.encode())
- file_handle_done = io.BytesIO(b'done')
+ file_handle_done = io.BytesIO(b"done")
s3_client.upload_fileobj(file_handle, self.s3_bucket, self._get_s3_key(self.ip_key))
s3_client.upload_fileobj(file_handle_done, self.s3_bucket, self._get_s3_key(self.done_file_key))
@@ -87,7 +88,7 @@ def get_master_config(self):
s3_client = self.get_client()
self._wait_for_ip_upload()
try:
- s3_client.download_file(self.s3_bucket, self._get_s3_key(self.ip_key), 'ip.json')
+ s3_client.download_file(self.s3_bucket, self._get_s3_key(self.ip_key), "ip.json")
with open("ip.json") as f:
json_obj = json.load(f)
ip = json_obj["IP"]
@@ -122,9 +123,7 @@ def download_file(self, s3_key, local_path):
def upload_file(self, s3_key, local_path):
s3_client = self.get_client()
try:
- s3_client.upload_file(Filename=local_path,
- Bucket=self.s3_bucket,
- Key=s3_key)
+ s3_client.upload_file(Filename=local_path, Bucket=self.s3_bucket, Key=s3_key)
return True
except Exception as e:
return False
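SageClusterCommunicator implements a simple S3 rendezvous: the primary uploads its IP under <prefix>/config/ plus an empty signal object, and workers poll until those keys appear. A hedged usage sketch; the env vars, bucket, IP, and host names are assumptions for illustration:

import os

# Normally set by the training job; stubbed here for illustration only.
os.environ.setdefault("SM_HP_S3_BUCKET", "my-bucket")
os.environ.setdefault("SM_HP_S3_PREFIX", "rl-job-demo")

from sagemaker_rl.sage_cluster_communicator import SageClusterCommunicator  # path assumed

comm = SageClusterCommunicator()

# Primary node: publish the head IP and drop the empty signal object.
comm.write_host_config(ip="10.0.0.4", host_name="primary:algo-1")
comm.create_s3_signal("primary:algo-1")

# Worker nodes: block until the primary's signal appears, then read its config.
comm.wait_for_signals(["primary:algo-1"], timeout=600, sleep_time=5)
master = comm.get_master_config()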
diff --git a/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py b/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py
index 26a1b45f..dd258940 100644
--- a/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py
+++ b/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py
@@ -20,7 +20,7 @@ def reward(self, _reward):
return _reward * self.scale
-class SagemakerStableBaselinesLauncher():
+class SagemakerStableBaselinesLauncher:
"""
Sagemaker's Stable Baselines Launcher.
"""
@@ -32,23 +32,22 @@ def __init__(self, env, output_path, model, num_timesteps):
self._num_timesteps = num_timesteps
def _train(self):
- """Train the RL model
- """
+ """Train the RL model"""
self._model.learn(total_timesteps=self._num_timesteps)
def _predict(self, model, video_path):
- """Run predictions on trained RL model.
- """
+ """Run predictions on trained RL model."""
- vr = VideoRecorder(env=self._env, path="{}/rl_out.mp4".format(video_path, str(MPI.COMM_WORLD.Get_rank())),
- enabled=True)
+ vr = VideoRecorder(
+ env=self._env, path="{}/rl_out.mp4".format(video_path), enabled=True
+ )
obs = self._env.reset()
for i in range(1000):
action, _states = model.predict(obs)
obs, rewards, dones, info = self._env.step(action)
if dones:
obs = self._env.reset()
- self._env.render(mode='rgb_array')
+ self._env.render(mode="rgb_array")
vr.capture_frame()
vr.close()
self._env.close()
@@ -66,33 +65,59 @@ class SagemakerStableBaselinesPPO1Launcher(SagemakerStableBaselinesLauncher):
Sagemaker's Stable Baselines PPO1 Launcher.
"""
- def __init__(self, env, output_path, timesteps_per_actorbatch,
- clip_param, entcoeff, optim_epochs,
- optim_stepsize, optim_batchsize,
- gamma, lam, schedule,
- verbose, num_timesteps):
+ def __init__(
+ self,
+ env,
+ output_path,
+ timesteps_per_actorbatch,
+ clip_param,
+ entcoeff,
+ optim_epochs,
+ optim_stepsize,
+ optim_batchsize,
+ gamma,
+ lam,
+ schedule,
+ verbose,
+ num_timesteps,
+ ):
print(
"Initializing PPO with output_path: {} and Hyper Params [timesteps_per_actorbatch: {},clip_param: {}, "
"entcoeff: {}, optim_epochs: {}, optim_stepsize: {}, optim_batchsize: {}, gamma: {}, lam: {}, "
- "schedule: {}, verbose: {}, num_timesteps: {}]".format(output_path, timesteps_per_actorbatch,
- clip_param, entcoeff, optim_epochs,
- optim_stepsize, optim_batchsize,
- gamma, lam, schedule,
- verbose, num_timesteps))
- super().__init__(env, output_path,
- PPO1(policy=MlpPolicy,
- env=env,
- gamma=gamma,
- timesteps_per_actorbatch=timesteps_per_actorbatch,
- clip_param=clip_param,
- entcoeff=entcoeff,
- optim_epochs=optim_epochs,
- optim_stepsize=optim_stepsize,
- optim_batchsize=optim_batchsize,
- lam=lam,
- schedule=schedule,
- verbose=verbose),
- num_timesteps)
+ "schedule: {}, verbose: {}, num_timesteps: {}]".format(
+ output_path,
+ timesteps_per_actorbatch,
+ clip_param,
+ entcoeff,
+ optim_epochs,
+ optim_stepsize,
+ optim_batchsize,
+ gamma,
+ lam,
+ schedule,
+ verbose,
+ num_timesteps,
+ )
+ )
+ super().__init__(
+ env,
+ output_path,
+ PPO1(
+ policy=MlpPolicy,
+ env=env,
+ gamma=gamma,
+ timesteps_per_actorbatch=timesteps_per_actorbatch,
+ clip_param=clip_param,
+ entcoeff=entcoeff,
+ optim_epochs=optim_epochs,
+ optim_stepsize=optim_stepsize,
+ optim_batchsize=optim_batchsize,
+ lam=lam,
+ schedule=schedule,
+ verbose=verbose,
+ ),
+ num_timesteps,
+ )
def create_env(env_id, output_path, seed=0):
diff --git a/09_deploy/common/sagemaker_rl/tf_serving_utils.py b/09_deploy/common/sagemaker_rl/tf_serving_utils.py
index bf867c48..55ceaafd 100644
--- a/09_deploy/common/sagemaker_rl/tf_serving_utils.py
+++ b/09_deploy/common/sagemaker_rl/tf_serving_utils.py
@@ -5,12 +5,13 @@
tf = try_import_tf()
+
def atoi(text):
return int(text) if text.isdigit() else text
def natural_keys(text):
- return [atoi(c) for c in re.split('(\d+)', text)]
+ return [atoi(c) for c in re.split(r"(\d+)", text)]
def change_permissions_recursive(path, mode):
@@ -33,18 +34,16 @@ def export_tf_serving(agent, output_dir):
output_signature["actions"] = tf.saved_model.utils.build_tensor_info(policy.sampler)
output_signature["logits"] = tf.saved_model.utils.build_tensor_info(policy.logits)
- signature_def = (
- tf.saved_model.signature_def_utils.build_signature_def(
- input_signature, output_signature,
- tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
- signature_def_key = (tf.saved_model.signature_constants.
- DEFAULT_SERVING_SIGNATURE_DEF_KEY)
+ signature_def = tf.saved_model.signature_def_utils.build_signature_def(
+ input_signature, output_signature, tf.saved_model.signature_constants.PREDICT_METHOD_NAME
+ )
+ signature_def_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
signature_def_map = {signature_def_key: signature_def}
with policy.sess.graph.as_default():
builder = tf.saved_model.builder.SavedModelBuilder(os.path.join(output_dir, "1"))
builder.add_meta_graph_and_variables(
- policy.sess, [tf.saved_model.tag_constants.SERVING],
- signature_def_map=signature_def_map)
+ policy.sess, [tf.saved_model.tag_constants.SERVING], signature_def_map=signature_def_map
+ )
builder.save()
print("Saved TensorFlow serving model!")
diff --git a/09_deploy/src/eval-cfa-vw.py b/09_deploy/src/eval-cfa-vw.py
index 022671b4..a28d111d 100644
--- a/09_deploy/src/eval-cfa-vw.py
+++ b/09_deploy/src/eval-cfa-vw.py
@@ -17,8 +17,8 @@ def main():
"""
Evaluate a Vowpal Wabbit (VW) model by performing counterfactual analysis (CFA)
"""
- channel_names = json.loads(os.environ['SM_CHANNELS'])
- hyperparameters = json.loads(os.environ['SM_HPS'])
+ channel_names = json.loads(os.environ["SM_CHANNELS"])
+ hyperparameters = json.loads(os.environ["SM_HPS"])
local_mode_manifest = bool(hyperparameters.get("local_mode_manifest", False))
num_arms = int(hyperparameters.get("num_arms", 0))
cfa_type = hyperparameters.get("cfa_type", "dr")
@@ -33,8 +33,7 @@ def main():
model_folder = os.environ[f"SM_CHANNEL_{MODEL_CHANNEL.upper()}"]
_, weights_path = extract_model(model_folder)
vw_load_model_args = f"-i {weights_path}"
- vw_model = VWModel(cli_args=f"{vw_load_model_args}",
- model_path=None, test_only=False, quiet_mode=False)
+ vw_model = VWModel(cli_args=f"{vw_load_model_args}", model_path=None, test_only=False, quiet_mode=False)
vw_model.start()
# Different CFA policies in VW
@@ -42,13 +41,12 @@ def main():
if cfa_type not in cfa_type_candidate:
raise ValueError(f"Customer Error: Counterfactual algorithm must be in {cfa_type_candidate}.")
if cfa_type == "dm":
- logging.warning(f"Direct method can not be used for evaluation -- it is biased."
- "Resetting to dr.")
+ logging.warning("Direct method cannot be used for evaluation -- it is biased. Resetting to dr.")
cfa_type = "dr"
vw_cfa_args = f"--cb {num_arms} --eval --cb_type {cfa_type}"
# Set test_only=False as VW differentiates "test" with "evaluation"
- vw_cfa = VWModel(cli_args=f"{vw_cfa_args}", test_only=False, quiet_mode=False)
+ vw_cfa = VWModel(cli_args=f"{vw_cfa_args}", test_only=False, quiet_mode=False)
vw_cfa.start()
if EVAL_CHANNEL not in channel_names:
@@ -65,16 +63,16 @@ def main():
manifest_file = files[0]
logging.info(f"Trying to download files using manifest file {manifest_file}.")
download_manifest_data(manifest_file, eval_data_dir)
-
+
eval_files = [i for i in eval_data_dir.rglob("*") if i.is_file() and i.suffix == ".csv"]
logging.info("Processing evaluation data: %s" % eval_files)
-
+
data_reader = CSVReader(input_files=eval_files)
data_iterator = data_reader.get_iterator()
-
+
if MODEL_CHANNEL not in channel_names:
raise ValueError("No model to be evaluated. Should at least provide current model.")
-
+
# Perform counterfactual analysis
count = 0
for experience in data_iterator:
@@ -85,20 +83,22 @@ def main():
predicted_action_probs = vw_model.predict(context_vector=experience_context)
n_choices = len(predicted_action_probs)
predicted_action = np.random.choice(n_choices, p=predicted_action_probs) + 1
-
- vw_cfa.evaluate(context_vector=experience_context,
- action=experience["action"],
- cost=1 - experience["reward"],
- probability=experience["action_prob"],
- label=predicted_action)
+
+ vw_cfa.evaluate(
+ context_vector=experience_context,
+ action=experience["action"],
+ cost=1 - experience["reward"],
+ probability=experience["action_prob"],
+ label=predicted_action,
+ )
count += 1
vw_model.close(prediction_only=True)
stdout = vw_cfa.close()
print(stdout.decode())
-
+
logging.info(f"Model evaluated using {count} data instances.")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
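For reference, the evaluator drives `vw --cb <num_arms> --eval --cb_type <ips|dm|dr>` with one text line per logged experience, in the form "<label> <action>:<cost>:<probability> <features>" (see VWModel.evaluate further below). A sketch of assembling such a line; the "| i:v" feature syntax is an assumption, since parse_example's body is not shown in this patch:

# One logged bandit event, with invented values.
experience = {"action": 2, "reward": 1.0, "action_prob": 0.25}
predicted_action = 3              # label proposed by the policy under evaluation
cost = 1 - experience["reward"]   # VW minimizes cost, so reward is flipped
features = " ".join(f"{i}:{v}" for i, v in enumerate([0.1, 0.7]))  # assumed feature syntax
line = f'{predicted_action} {experience["action"]}:{cost}:{experience["action_prob"]} | {features}'
print(line)  # 3 2:0.0:0.25 | 0:0.1 1:0.7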
diff --git a/09_deploy/src/io_utils.py b/09_deploy/src/io_utils.py
index 0db9bd48..3009eae6 100644
--- a/09_deploy/src/io_utils.py
+++ b/09_deploy/src/io_utils.py
@@ -21,7 +21,7 @@ def validate_experience(experience):
return True
-class CSVReader():
+class CSVReader:
"""Reader object that loads experiences from CSV file chunks.
The input files will be read in a random order."""
@@ -38,7 +38,7 @@ def get_iterator(self):
yield line_dict
-class JsonLinesReader():
+class JsonLinesReader:
"""Reader object that loads experiences from JSON file chunks.
The input files will be read in a random order."""
@@ -58,7 +58,7 @@ def get_experience(self):
return experience
def _try_parse(self, line):
- if line is None or line.strip() == '':
+ if line is None or line.strip() == "":
return None
try:
line_json = json.loads(line.strip())
@@ -68,8 +68,7 @@ def _try_parse(self, line):
assert "prob" in line_json, "prob not found in record"
return line_json
except Exception:
- logger.exception("Ignoring corrupt json record in {}: {}".format(
- self.cur_file, line))
+ logger.exception("Ignoring corrupt json record in {}: {}".format(self.cur_file, line))
return None
def _next_line(self):
@@ -89,8 +88,7 @@ def _next_line(self):
if not line:
logger.debug("Ignoring empty file {}".format(self.cur_file))
if not line:
- raise ValueError("Failed to read next line from files: {}".format(
- self.files))
+ raise ValueError("Failed to read next line from files: {}".format(self.files))
return line
def _next_file(self):
@@ -143,7 +141,7 @@ def download_manifest_data(manifest_file_path, output_dir):
with open(manifest_file_path.as_posix()) as f:
manifest = json.load(f)
s3_prefix = manifest[0]["prefix"]
- s3 = boto3.client('s3')
+ s3 = boto3.client("s3")
for file in manifest[1:]:
s3_uri = os.path.join(s3_prefix, file)
bucket, key, file_name = parse_s3_uri(s3_uri)
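download_manifest_data above assumes the standard SageMaker ManifestFile layout: the first array element carries the common S3 prefix, and the remaining elements are keys relative to it. An illustrative sketch with invented names:

import os

# Invented bucket/keys; the structure is what the parser above expects.
manifest = [
    {"prefix": "s3://my-bucket/bandits/eval/"},
    "part-0000.csv",
    "part-0001.csv",
]

s3_prefix = manifest[0]["prefix"]
for file in manifest[1:]:
    print(os.path.join(s3_prefix, file))
# s3://my-bucket/bandits/eval/part-0000.csv
# s3://my-bucket/bandits/eval/part-0001.csv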
diff --git a/09_deploy/src/train-vw.py b/09_deploy/src/train-vw.py
index 0808364f..7ce5d3c4 100644
--- a/09_deploy/src/train-vw.py
+++ b/09_deploy/src/train-vw.py
@@ -15,9 +15,9 @@
def main():
""" Train a Vowpal Wabbit (VW) model through C++ process. """
-
- channel_names = json.loads(os.environ['SM_CHANNELS'])
- hyperparameters = json.loads(os.environ['SM_HPS'])
+
+ channel_names = json.loads(os.environ["SM_CHANNELS"])
+ hyperparameters = json.loads(os.environ["SM_HPS"])
num_arms = int(hyperparameters.get("num_arms", 0))
num_policies = int(hyperparameters.get("num_policies", 3))
exploration_policy = hyperparameters.get("exploration_policy", "egreedy").lower()
@@ -33,7 +33,7 @@ def main():
valid_policies = ["egreedy", "bag", "cover"]
if exploration_policy not in valid_policies:
raise ValueError(f"Customer Error: exploration_policy must be one of {valid_policies}.")
-
+
if exploration_policy == "egreedy":
vw_args_base = f"--cb_explore {num_arms} --epsilon {epsilon}"
else:
@@ -42,28 +42,35 @@ def main():
# No training data. Initialize and save a random model
if TRAIN_CHANNEL not in channel_names:
logging.info("No training data found. Saving a randomly initialized model!")
- vw_model = VWModel(cli_args=f"{vw_args_base} -f {MODEL_OUTPUT_PATH}",
- model_path=None, test_only=False, quiet_mode=False)
+ vw_model = VWModel(
+ cli_args=f"{vw_args_base} -f {MODEL_OUTPUT_PATH}", model_path=None, test_only=False, quiet_mode=False
+ )
vw_model.start()
vw_model.close()
save_vw_metadata(meta=vw_args_base)
-
+
# If training data is present
else:
if MODEL_CHANNEL not in channel_names:
- logging.info(f"No pre-trained model has been specified in channel {MODEL_CHANNEL}."
- f"Training will start from scratch.")
+ logging.info(
+ f"No pre-trained model has been specified in channel {MODEL_CHANNEL}. "
+ "Training will start from scratch."
+ )
vw_args = f"{vw_args_base}"
else:
# Load the pre-trained model for training.
- model_folder = os.environ[f'SM_CHANNEL_{MODEL_CHANNEL.upper()}']
+ model_folder = os.environ[f"SM_CHANNEL_{MODEL_CHANNEL.upper()}"]
_, weights_path = extract_model(model_folder)
logging.info(f"Loading model from {weights_path}")
vw_args = f"{vw_args_base} -i {weights_path}"
-
+
# Init a class that communicates with C++ VW process using pipes
- vw_model = VWModel(cli_args=f"{vw_args} -f {MODEL_OUTPUT_PATH} --save_resume",
- model_path=None, test_only=False, quiet_mode=False)
+ vw_model = VWModel(
+ cli_args=f"{vw_args} -f {MODEL_OUTPUT_PATH} --save_resume",
+ model_path=None,
+ test_only=False,
+ quiet_mode=False,
+ )
vw_model.start()
# Load training data
@@ -79,17 +86,19 @@ def main():
is_valid = validate_experience(experience)
if not is_valid:
continue
- vw_model.learn(context_vector=json.loads(experience["observation"]),
- action=experience["action"],
- cost=1 - experience["reward"],
- probability=experience["action_prob"])
+ vw_model.learn(
+ context_vector=json.loads(experience["observation"]),
+ action=experience["action"],
+ cost=1 - experience["reward"],
+ probability=experience["action_prob"],
+ )
count += 1
-
+
stdout = vw_model.close()
print(stdout.decode())
save_vw_metadata(meta=vw_args_base)
logging.info(f"Model learned using {count} training experiences.")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
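The learn() call above implies the shape of each logged training row. A hedged example record, inferred from the fields the trainer reads (all values invented):

experience = {
    "observation": "[0.12, 0.48, 0.31]",  # JSON-encoded context vector
    "action": 1,                          # 1-based arm chosen by the logging policy
    "reward": 1.0,                        # converted to cost = 1 - reward for VW
    "action_prob": 0.33,                  # propensity of the logged action
}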
diff --git a/09_deploy/src/vw_model.py b/09_deploy/src/vw_model.py
index 3294f82f..39b688e3 100644
--- a/09_deploy/src/vw_model.py
+++ b/09_deploy/src/vw_model.py
@@ -24,7 +24,7 @@ def __init__(self):
class VWModel:
def __init__(self, model_path=None, cli_args="", test_only=True, quiet_mode=True):
- """ VWModel object starts a VW CLI process and communicates with it using pipes
+ """VWModel object starts a VW CLI process and communicates with it using pipes
Args:
model_path (str): location of the model weights
cli_args (str): additional args to pass to VW
@@ -72,11 +72,14 @@ def start(self):
# note bufsize=1 will make sure we immediately flush each output
# line so that we can keep scoring the model.
# bufsize=1 means line buffered.
- self.current_proc = subprocess.Popen(self.cmd, bufsize=1,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- universal_newlines=False)
+ self.current_proc = subprocess.Popen(
+ self.cmd,
+ bufsize=1,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=False,
+ )
self.logger.info("Started VW process!")
@@ -89,7 +92,7 @@ def start(self):
raise VWError("Cannot load the model with the provided arguments: %s" % e)
def learn(self, context_vector, action, cost, probability):
- """ Learn on a given experience
+ """Learn on a given experience
Args:
context_vector (list or np.array): A vector of context features
action (int): The action ID that was taken (starts with 1)
@@ -101,7 +104,7 @@ def learn(self, context_vector, action, cost, probability):
parsed_example = self.parse_example(context_vector) + "\n"
parsed_example = f"{action}:{cost}:{probability} {parsed_example}"
-
+
if self.current_proc is None:
raise VWError("trying to learn model when current_proc is None")
@@ -134,11 +137,11 @@ def predict(self, context_vector):
self.current_proc.stdout.flush()
scores = np.array(list(map(float, self.current_proc.stdout.readline().split())))
- scores = (scores / scores.sum())
+ scores = scores / scores.sum()
return scores
-
+
def evaluate(self, context_vector, action, cost, probability, label):
- """ Used when evaluating a policy offline using logged bandits dataset
+ """Used when evaluating a policy offline using logged bandits dataset
Args:
context_vector (list or np.array): A vector of context features
action (int): The action ID that was taken (starts with 1) by the old policy
@@ -148,7 +151,7 @@ def evaluate(self, context_vector, action, cost, probability, label):
"""
parsed_example = self.parse_example(context_vector) + "\n"
parsed_example = f"{label} {action}:{cost}:{probability} {parsed_example}"
-
+
# TODO: Error handling in parsing the given example
if self.current_proc is None:
raise VWError("trying to score model when current_proc is None")
@@ -157,7 +160,7 @@ def evaluate(self, context_vector, action, cost, probability, label):
raise VWModelDown()
self.current_proc.stdin.write(parsed_example.encode())
-
+
# we need to flush to score & collect the score
# otherwise one needs to wait for the process to end
self.current_proc.stdin.flush()
@@ -166,10 +169,10 @@ def evaluate(self, context_vector, action, cost, probability, label):
# VW will make a prediction on each eval instance.
# To avoid PIPE overflow
self.current_proc.stdout.readline()
-
+
@staticmethod
def parse_example(context_vector):
- """ Parses the list of context features
+ """Parses the list of context features
Args:
context_vector (list or np.array): A vector of context features
Returns:
@@ -181,15 +184,13 @@ def parse_example(context_vector):
@staticmethod
def load_vw_model(metadata_loc, weights_loc, test_only=True, quiet_mode=True):
- """ Initialize vw model with given metadata and weights locations
- """
+ """Initialize vw model with given metadata and weights locations"""
with open(metadata_loc) as f:
metadata = f.read().strip()
return VWModel(model_path=weights_loc, cli_args=metadata, test_only=test_only, quiet_mode=quiet_mode)
def close(self, prediction_only=False):
- """ Close the VW process
- """
+ """Close the VW process"""
training_info = ""
if self.current_proc is not None:
self.current_proc.stdin.close()
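VWModel's pipe protocol reduces to: spawn the CLI, write one example per line, flush, read one line back. A minimal stand-in, using cat as a placeholder echo process and text mode (the real code launches the vw binary with bytes I/O):

import subprocess

proc = subprocess.Popen(
    ["cat"],    # placeholder echo process; the real code launches `vw ...`
    bufsize=1,  # line-buffered (effective here because we use text mode)
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    universal_newlines=True,
)
proc.stdin.write("1:0.5:0.25 | 0:0.1 1:0.7\n")
proc.stdin.flush()  # flush so the reply can be read back immediately
print(proc.stdout.readline().strip())
proc.stdin.close()
proc.wait()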
diff --git a/09_deploy/src/vw_utils.py b/09_deploy/src/vw_utils.py
index e41b11a8..ffe1a2d0 100644
--- a/09_deploy/src/vw_utils.py
+++ b/09_deploy/src/vw_utils.py
@@ -4,7 +4,7 @@
TRAIN_CHANNEL = "training"
EVAL_CHANNEL = "evaluation"
MODEL_CHANNEL = "pretrained_model"
-MODEL_OUTPUT_DIR = os.environ.get('SM_MODEL_DIR', "/opt/ml/model")
+MODEL_OUTPUT_DIR = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
MODEL_OUTPUT_PATH = os.path.join(MODEL_OUTPUT_DIR, "vw.model")
diff --git a/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb b/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
index 13462c1b..912e9bd2 100644
--- a/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
+++ b/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb
@@ -32,12 +32,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -54,9 +54,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "pipeline_name = 'BERT-pipeline-{}'.format(timestamp)"
+ "pipeline_name = \"BERT-pipeline-{}\".format(timestamp)"
]
},
{
@@ -78,12 +79,13 @@
"from smexperiments.experiment import Experiment\n",
"\n",
"pipeline_experiment = Experiment.create(\n",
- " experiment_name=pipeline_name,\n",
- " description='Amazon Customer Reviews BERT Pipeline Experiment', \n",
- " sagemaker_boto_client=sm)\n",
+ " experiment_name=pipeline_name,\n",
+ " description=\"Amazon Customer Reviews BERT Pipeline Experiment\",\n",
+ " sagemaker_boto_client=sm,\n",
+ ")\n",
"\n",
"pipeline_experiment_name = pipeline_experiment.experiment_name\n",
- "print('Pipeline experiment name: {}'.format(pipeline_experiment_name))"
+ "print(\"Pipeline experiment name: {}\".format(pipeline_experiment_name))"
]
},
{
@@ -111,12 +113,12 @@
"import time\n",
"from smexperiments.trial import Trial\n",
"\n",
- "pipeline_trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n",
- " experiment_name=pipeline_experiment_name,\n",
- " sagemaker_boto_client=sm)\n",
+ "pipeline_trial = Trial.create(\n",
+ " trial_name=\"trial-{}\".format(timestamp), experiment_name=pipeline_experiment_name, sagemaker_boto_client=sm\n",
+ ")\n",
"\n",
"pipeline_trial_name = pipeline_trial.trial_name\n",
- "print('Trial name: {}'.format(pipeline_trial_name))"
+ "print(\"Trial name: {}\".format(pipeline_trial_name))"
]
},
{
@@ -221,7 +223,7 @@
"metadata": {},
"outputs": [],
"source": [
- "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n",
+ "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n",
"print(raw_input_data_s3_uri)"
]
},
@@ -241,6 +243,7 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
"input_data = ParameterString(\n",
@@ -248,15 +251,9 @@
" default_value=raw_input_data_s3_uri,\n",
")\n",
"\n",
- "processing_instance_count = ParameterInteger(\n",
- " name=\"ProcessingInstanceCount\",\n",
- " default_value=1\n",
- ")\n",
+ "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n",
"\n",
- "processing_instance_type = ParameterString(\n",
- " name=\"ProcessingInstanceType\",\n",
- " default_value=\"ml.c5.2xlarge\"\n",
- ")\n",
+ "processing_instance_type = ParameterString(name=\"ProcessingInstanceType\", default_value=\"ml.c5.2xlarge\")\n",
"\n",
"max_seq_length = ParameterInteger(\n",
" name=\"MaxSeqLength\",\n",
@@ -267,7 +264,7 @@
" name=\"BalanceDataset\",\n",
" default_value=\"True\",\n",
")\n",
- " \n",
+ "\n",
"train_split_percentage = ParameterFloat(\n",
" name=\"TrainSplitPercentage\",\n",
" default_value=0.90,\n",
@@ -288,10 +285,7 @@
" default_value=\"reviews-feature-store-\" + str(timestamp),\n",
")\n",
"\n",
- "feature_group_name = ParameterString(\n",
- " name=\"FeatureGroupName\",\n",
- " default_value=\"reviews-feature-group-\" + str(timestamp)\n",
- ")"
+ "feature_group_name = ParameterString(name=\"FeatureGroupName\", default_value=\"reviews-feature-group-\" + str(timestamp))"
]
},
{
@@ -324,12 +318,13 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " env={'AWS_DEFAULT_REGION': region}, \n",
- " )"
+ "processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " env={\"AWS_DEFAULT_REGION\": region},\n",
+ ")"
]
},
{
@@ -341,45 +336,56 @@
"from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
"from sagemaker.workflow.steps import ProcessingStep\n",
"\n",
- "processing_inputs=[\n",
- " ProcessingInput(\n",
- " input_name='raw-input-data',\n",
- " source=input_data,\n",
- " destination='/opt/ml/processing/input/data/',\n",
- " s3_data_distribution_type='ShardedByS3Key'\n",
- " )\n",
+ "processing_inputs = [\n",
+ " ProcessingInput(\n",
+ " input_name=\"raw-input-data\",\n",
+ " source=input_data,\n",
+ " destination=\"/opt/ml/processing/input/data/\",\n",
+ " s3_data_distribution_type=\"ShardedByS3Key\",\n",
+ " )\n",
"]\n",
"\n",
- "processing_outputs=[\n",
- " ProcessingOutput(output_name='bert-train',\n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/bert/train',\n",
- " ),\n",
- " ProcessingOutput(output_name='bert-validation',\n",
- " s3_upload_mode='EndOfJob', \n",
- " source='/opt/ml/processing/output/bert/validation',\n",
- " ),\n",
- " ProcessingOutput(output_name='bert-test',\n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/bert/test',\n",
- " ),\n",
- "] \n",
+ "processing_outputs = [\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-train\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/train\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-validation\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/validation\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"bert-test\",\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
+ " source=\"/opt/ml/processing/output/bert/test\",\n",
+ " ),\n",
+ "]\n",
"\n",
"processing_step = ProcessingStep(\n",
- " name='Processing', \n",
- " code='preprocess-scikit-text-to-bert-feature-store.py',\n",
+ " name=\"Processing\",\n",
+ " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n",
" processor=processor,\n",
" inputs=processing_inputs,\n",
" outputs=processing_outputs,\n",
- " job_arguments=['--train-split-percentage', str(train_split_percentage.default_value), \n",
- " '--validation-split-percentage', str(validation_split_percentage.default_value),\n",
- " '--test-split-percentage', str(test_split_percentage.default_value),\n",
- " '--max-seq-length', str(max_seq_length.default_value),\n",
- " '--balance-dataset', str(balance_dataset.default_value),\n",
- " '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),\n",
- " '--feature-group-name', str(feature_group_name.default_value)\n",
- " ]\n",
- ") \n",
+ " job_arguments=[\n",
+ " \"--train-split-percentage\",\n",
+ " str(train_split_percentage.default_value),\n",
+ " \"--validation-split-percentage\",\n",
+ " str(validation_split_percentage.default_value),\n",
+ " \"--test-split-percentage\",\n",
+ " str(test_split_percentage.default_value),\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length.default_value),\n",
+ " \"--balance-dataset\",\n",
+ " str(balance_dataset.default_value),\n",
+ " \"--feature-store-offline-prefix\",\n",
+ " str(feature_store_offline_prefix.default_value),\n",
+ " \"--feature-group-name\",\n",
+ " str(feature_group_name.default_value),\n",
+ " ],\n",
+ ")\n",
"\n",
"print(processing_step)"
]
@@ -422,15 +428,9 @@
"metadata": {},
"outputs": [],
"source": [
- "train_instance_type = ParameterString(\n",
- " name=\"TrainInstanceType\",\n",
- " default_value=\"ml.c5.9xlarge\"\n",
- ")\n",
+ "train_instance_type = ParameterString(name=\"TrainInstanceType\", default_value=\"ml.c5.9xlarge\")\n",
"\n",
- "train_instance_count = ParameterInteger(\n",
- " name=\"TrainInstanceCount\",\n",
- " default_value=1\n",
- ")"
+ "train_instance_count = ParameterInteger(name=\"TrainInstanceCount\", default_value=1)"
]
},
{
@@ -447,56 +447,26 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs = ParameterInteger(\n",
- " name=\"Epochs\",\n",
- " default_value=1\n",
- ")\n",
- " \n",
- "learning_rate = ParameterFloat(\n",
- " name=\"LearningRate\",\n",
- " default_value=0.00001\n",
- ") \n",
- " \n",
- "epsilon = ParameterFloat(\n",
- " name=\"Epsilon\",\n",
- " default_value=0.00000001\n",
- ")\n",
- " \n",
- "train_batch_size = ParameterInteger(\n",
- " name=\"TrainBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "validation_batch_size = ParameterInteger(\n",
- " name=\"ValidationBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "test_batch_size = ParameterInteger(\n",
- " name=\"TestBatchSize\",\n",
- " default_value=128\n",
- ")\n",
- " \n",
- "train_steps_per_epoch = ParameterInteger(\n",
- " name=\"TrainStepsPerEpoch\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "validation_steps = ParameterInteger(\n",
- " name=\"ValidationSteps\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "test_steps = ParameterInteger(\n",
- " name=\"TestSteps\",\n",
- " default_value=50\n",
- ")\n",
- " \n",
- "train_volume_size = ParameterInteger(\n",
- " name=\"TrainVolumeSize\",\n",
- " default_value=1024\n",
- ") \n",
- " \n",
+ "epochs = ParameterInteger(name=\"Epochs\", default_value=1)\n",
+ "\n",
+ "learning_rate = ParameterFloat(name=\"LearningRate\", default_value=0.00001)\n",
+ "\n",
+ "epsilon = ParameterFloat(name=\"Epsilon\", default_value=0.00000001)\n",
+ "\n",
+ "train_batch_size = ParameterInteger(name=\"TrainBatchSize\", default_value=128)\n",
+ "\n",
+ "validation_batch_size = ParameterInteger(name=\"ValidationBatchSize\", default_value=128)\n",
+ "\n",
+ "test_batch_size = ParameterInteger(name=\"TestBatchSize\", default_value=128)\n",
+ "\n",
+ "train_steps_per_epoch = ParameterInteger(name=\"TrainStepsPerEpoch\", default_value=50)\n",
+ "\n",
+ "validation_steps = ParameterInteger(name=\"ValidationSteps\", default_value=50)\n",
+ "\n",
+ "test_steps = ParameterInteger(name=\"TestSteps\", default_value=50)\n",
+ "\n",
+ "train_volume_size = ParameterInteger(name=\"TrainVolumeSize\", default_value=1024)\n",
+ "\n",
"use_xla = ParameterString(\n",
" name=\"UseXLA\",\n",
" default_value=\"True\",\n",
@@ -506,7 +476,7 @@
" name=\"UseAMP\",\n",
" default_value=\"True\",\n",
")\n",
- " \n",
+ "\n",
"freeze_bert_layer = ParameterString(\n",
" name=\"FreezeBERTLayer\",\n",
" default_value=\"False\",\n",
@@ -516,7 +486,7 @@
" name=\"EnableSageMakerDebugger\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"enable_checkpointing = ParameterString(\n",
" name=\"EnableCheckpointing\",\n",
" default_value=\"False\",\n",
@@ -526,7 +496,7 @@
" name=\"EnableTensorboard\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"input_mode = ParameterString(\n",
" name=\"InputMode\",\n",
" default_value=\"File\",\n",
@@ -541,7 +511,7 @@
" name=\"RunTest\",\n",
" default_value=\"False\",\n",
")\n",
- " \n",
+ "\n",
"run_sample_predictions = ParameterString(\n",
" name=\"RunSamplePredictions\",\n",
" default_value=\"False\",\n",
@@ -562,10 +532,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -601,36 +571,39 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size, \n",
- " py_version='py37',\n",
- " framework_version='2.3.1',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " metric_definitions=metrics_definitions,\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py37\",\n",
+ " framework_version=\"2.3.1\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " metric_definitions=metrics_definitions,\n",
+ ")"
]
},
{
@@ -652,27 +625,21 @@
"from sagemaker.workflow.steps import TrainingStep\n",
"\n",
"training_step = TrainingStep(\n",
- " name='Train',\n",
+ " name=\"Train\",\n",
" estimator=estimator,\n",
" inputs={\n",
- " 'train': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-train'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
+ " \"train\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-train\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " \"validation\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-validation\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
" ),\n",
- " 'validation': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-validation'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
+ " \"test\": TrainingInput(\n",
+ " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-test\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
" ),\n",
- " 'test': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n",
- " 'bert-test'\n",
- " ].S3Output.S3Uri,\n",
- " content_type='text/csv'\n",
- " ) \n",
" },\n",
")\n",
"\n",
@@ -726,12 +693,14 @@
"source": [
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"\n",
- "evaluation_processor = SKLearnProcessor(framework_version='0.23-1',\n",
- " role=role,\n",
- " instance_type=processing_instance_type,\n",
- " instance_count=processing_instance_count,\n",
- " env={'AWS_DEFAULT_REGION': region},\n",
- " max_runtime_in_seconds=7200)"
+ "evaluation_processor = SKLearnProcessor(\n",
+ " framework_version=\"0.23-1\",\n",
+ " role=role,\n",
+ " instance_type=processing_instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " env={\"AWS_DEFAULT_REGION\": region},\n",
+ " max_runtime_in_seconds=7200,\n",
+ ")"
]
},
{
@@ -742,7 +711,7 @@
},
"outputs": [],
"source": [
- "!pygmentize evaluate_model_metrics.py\n"
+ "!pygmentize evaluate_model_metrics.py"
]
},
{
@@ -762,11 +731,7 @@
"source": [
"from sagemaker.workflow.properties import PropertyFile\n",
"\n",
- "evaluation_report = PropertyFile(\n",
- " name='EvaluationReport',\n",
- " output_name='metrics',\n",
- " path='evaluation.json'\n",
- ")"
+ "evaluation_report = PropertyFile(name=\"EvaluationReport\", output_name=\"metrics\", path=\"evaluation.json\")"
]
},
{
@@ -776,27 +741,28 @@
"outputs": [],
"source": [
"evaluation_step = ProcessingStep(\n",
- " name='EvaluateModel',\n",
+ " name=\"EvaluateModel\",\n",
" processor=evaluation_processor,\n",
- " code='evaluate_model_metrics.py',\n",
+ " code=\"evaluate_model_metrics.py\",\n",
" inputs=[\n",
" ProcessingInput(\n",
" source=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
- " destination='/opt/ml/processing/input/model'\n",
+ " destination=\"/opt/ml/processing/input/model\",\n",
" ),\n",
" ProcessingInput(\n",
- " source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri,\n",
- " destination='/opt/ml/processing/input/data'\n",
- " )\n",
+ " source=processing_step.properties.ProcessingInputs[\"raw-input-data\"].S3Input.S3Uri,\n",
+ " destination=\"/opt/ml/processing/input/data\",\n",
+ " ),\n",
" ],\n",
" outputs=[\n",
- " ProcessingOutput(output_name='metrics', \n",
- " s3_upload_mode='EndOfJob',\n",
- " source='/opt/ml/processing/output/metrics/'),\n",
+ " ProcessingOutput(\n",
+ " output_name=\"metrics\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/metrics/\"\n",
+ " ),\n",
" ],\n",
" job_arguments=[\n",
- " '--max-seq-length', str(max_seq_length.default_value),\n",
- " ],\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length.default_value),\n",
+ " ],\n",
" property_files=[evaluation_report],\n",
")"
]
@@ -814,14 +780,14 @@
"metadata": {},
"outputs": [],
"source": [
- "from sagemaker.model_metrics import MetricsSource, ModelMetrics \n",
+ "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n",
"\n",
"model_metrics = ModelMetrics(\n",
" model_statistics=MetricsSource(\n",
" s3_uri=\"{}/evaluation.json\".format(\n",
" evaluation_step.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n",
" ),\n",
- " content_type=\"application/json\"\n",
+ " content_type=\"application/json\",\n",
" )\n",
")\n",
"\n",
@@ -853,20 +819,11 @@
"metadata": {},
"outputs": [],
"source": [
- "model_approval_status = ParameterString(\n",
- " name=\"ModelApprovalStatus\",\n",
- " default_value=\"PendingManualApproval\"\n",
- ")\n",
+ "model_approval_status = ParameterString(name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\")\n",
"\n",
- "deploy_instance_type = ParameterString(\n",
- " name=\"DeployInstanceType\",\n",
- " default_value=\"ml.m5.4xlarge\"\n",
- ")\n",
+ "deploy_instance_type = ParameterString(name=\"DeployInstanceType\", default_value=\"ml.m5.4xlarge\")\n",
"\n",
- "deploy_instance_count = ParameterInteger(\n",
- " name=\"DeployInstanceCount\",\n",
- " default_value=1\n",
- ")"
+ "deploy_instance_count = ParameterInteger(name=\"DeployInstanceCount\", default_value=1)"
]
},
{
@@ -892,7 +849,7 @@
" version=\"2.3.1\",\n",
" py_version=\"py37\",\n",
" instance_type=deploy_instance_type,\n",
- " image_scope=\"inference\"\n",
+ " image_scope=\"inference\",\n",
")\n",
"print(inference_image_uri)"
]
@@ -907,10 +864,10 @@
"\n",
"register_step = RegisterModel(\n",
" name=\"RegisterModel\",\n",
- "# entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n",
- "# source_dir='src',\n",
+ " # entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n",
+ " # source_dir='src',\n",
" estimator=estimator,\n",
- " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n",
+ " image_uri=inference_image_uri, # must be specified; by default the training image is used\n",
" model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
" content_types=[\"application/jsonlines\"],\n",
" response_types=[\"application/jsonlines\"],\n",
@@ -918,7 +875,7 @@
" transform_instances=[\"ml.c5.18xlarge\"],\n",
" model_package_group_name=model_package_group_name,\n",
" approval_status=model_approval_status,\n",
- " model_metrics=model_metrics\n",
+ " model_metrics=model_metrics,\n",
")"
]
},
@@ -939,7 +896,7 @@
"source": [
"from sagemaker.model import Model\n",
"\n",
- "model_name = 'bert-model-{}'.format(timestamp)\n",
+ "model_name = \"bert-model-{}\".format(timestamp)\n",
"\n",
"model = Model(\n",
" name=model_name,\n",
@@ -959,7 +916,7 @@
"from sagemaker.inputs import CreateModelInput\n",
"\n",
"create_inputs = CreateModelInput(\n",
- " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n",
+ " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n",
")"
]
},
@@ -1001,10 +958,7 @@
"metadata": {},
"outputs": [],
"source": [
- "min_accuracy_value = ParameterFloat(\n",
- " name=\"MinAccuracyValue\",\n",
- " default_value=0.01\n",
- ")"
+ "min_accuracy_value = ParameterFloat(name=\"MinAccuracyValue\", default_value=0.01)"
]
},
{
@@ -1025,14 +979,14 @@
" property_file=evaluation_report,\n",
" json_path=\"metrics.accuracy.value\",\n",
" ),\n",
- " right=min_accuracy_value # accuracy\n",
+ " right=min_accuracy_value, # accuracy\n",
")\n",
"\n",
"minimum_accuracy_condition_step = ConditionStep(\n",
" name=\"AccuracyCondition\",\n",
" conditions=[minimum_accuracy_condition],\n",
- " if_steps=[register_step, create_step], # success, continue with model registration\n",
- " else_steps=[], # fail, end the pipeline\n",
+ " if_steps=[register_step, create_step], # success, continue with model registration\n",
+ " else_steps=[], # fail, end the pipeline\n",
")"
]
},
@@ -1108,7 +1062,7 @@
" min_accuracy_value,\n",
" model_approval_status,\n",
" deploy_instance_type,\n",
- " deploy_instance_count\n",
+ " deploy_instance_count,\n",
" ],\n",
" steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],\n",
" sagemaker_session=sess,\n",
@@ -1196,16 +1150,16 @@
" parameters=dict(\n",
" InputData=raw_input_data_s3_uri,\n",
" ProcessingInstanceCount=1,\n",
- " ProcessingInstanceType='ml.c5.2xlarge',\n",
+ " ProcessingInstanceType=\"ml.c5.2xlarge\",\n",
" MaxSeqLength=64,\n",
- " BalanceDataset='True',\n",
+ " BalanceDataset=\"True\",\n",
" TrainSplitPercentage=0.9,\n",
" ValidationSplitPercentage=0.05,\n",
" TestSplitPercentage=0.05,\n",
- " FeatureStoreOfflinePrefix='reviews-feature-store-'+str(timestamp),\n",
- " FeatureGroupName='reviews-feature-group-'+str(timestamp),\n",
+ " FeatureStoreOfflinePrefix=\"reviews-feature-store-\" + str(timestamp),\n",
+ " FeatureGroupName=\"reviews-feature-group-\" + str(timestamp),\n",
" LearningRate=0.000012,\n",
- " TrainInstanceType='ml.c5.9xlarge',\n",
+ " TrainInstanceType=\"ml.c5.9xlarge\",\n",
" TrainInstanceCount=1,\n",
" Epochs=1,\n",
" Epsilon=0.00000001,\n",
@@ -1216,20 +1170,20 @@
" ValidationSteps=50,\n",
" TestSteps=50,\n",
" TrainVolumeSize=1024,\n",
- " UseXLA='True',\n",
- " UseAMP='True',\n",
- " FreezeBERTLayer='False',\n",
- " EnableSageMakerDebugger='False',\n",
- " EnableCheckpointing='False',\n",
- " EnableTensorboard='False',\n",
- " InputMode='File',\n",
- " RunValidation='True',\n",
- " RunTest='False',\n",
- " RunSamplePredictions='False', \n",
+ " UseXLA=\"True\",\n",
+ " UseAMP=\"True\",\n",
+ " FreezeBERTLayer=\"False\",\n",
+ " EnableSageMakerDebugger=\"False\",\n",
+ " EnableCheckpointing=\"False\",\n",
+ " EnableTensorboard=\"False\",\n",
+ " InputMode=\"File\",\n",
+ " RunValidation=\"True\",\n",
+ " RunTest=\"False\",\n",
+ " RunSamplePredictions=\"False\",\n",
" MinAccuracyValue=0.01,\n",
- " ModelApprovalStatus='PendingManualApproval', \n",
- " DeployInstanceType='ml.m5.4xlarge',\n",
- " DeployInstanceCount=1 \n",
+ " ModelApprovalStatus=\"PendingManualApproval\",\n",
+ " DeployInstanceType=\"ml.m5.4xlarge\",\n",
+ " DeployInstanceCount=1,\n",
" )\n",
")\n",
"\n",
@@ -1270,7 +1224,7 @@
"metadata": {},
"outputs": [],
"source": [
- "execution_run_name = execution_run['PipelineExecutionDisplayName']\n",
+ "execution_run_name = execution_run[\"PipelineExecutionDisplayName\"]\n",
"print(execution_run_name)"
]
},
@@ -1280,7 +1234,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = execution_run['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = execution_run[\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -1345,20 +1299,20 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
- "# print('Executions for our pipeline...')\n",
- "# print(pipeline_execution_status)\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
+ " # print('Executions for our pipeline...')\n",
+ " # print(pipeline_execution_status)\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -1377,7 +1331,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -1387,7 +1341,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -1411,7 +1365,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -1441,8 +1395,8 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_name=None\n",
- "training_job_name=None"
+ "processing_job_name = None\n",
+ "training_job_name = None"
]
},
{
@@ -1456,15 +1410,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -1486,7 +1440,7 @@
"outputs": [],
"source": [
"# -aws-processing-job is the default name assigned by ProcessingJob\n",
- "processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)\n",
+ "processing_job_tc = \"{}-aws-processing-job\".format(processing_job_name)\n",
"print(processing_job_tc)"
]
},
@@ -1514,10 +1468,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = sm.associate_trial_component(\n",
- " TrialComponentName=processing_job_tc,\n",
- " TrialName=pipeline_trial_name\n",
- ")"
+ "response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=pipeline_trial_name)"
]
},
{
@@ -1527,7 +1478,7 @@
"outputs": [],
"source": [
"# -aws-training-job is the default name assigned by TrainingJob\n",
- "training_job_tc = '{}-aws-training-job'.format(training_job_name)\n",
+ "training_job_tc = \"{}-aws-training-job\".format(training_job_name)\n",
"print(training_job_tc)"
]
},
@@ -1537,10 +1488,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = sm.associate_trial_component(\n",
- " TrialComponentName=training_job_tc,\n",
- " TrialName=pipeline_trial_name\n",
- ")"
+ "response = sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=pipeline_trial_name)"
]
},
{
@@ -1560,9 +1508,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"balance_dataset\": str(balance_dataset), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"balance_dataset\": str(balance_dataset),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1574,9 +1524,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"train_split_percentage\": str(train_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"train_split_percentage\": str(train_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1588,9 +1540,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"validation_split_percentage\": str(validation_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"validation_split_percentage\": str(validation_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1602,9 +1556,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"test_split_percentage\": str(test_split_percentage), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"test_split_percentage\": str(test_split_percentage),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1616,9 +1572,11 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_tracker.log_parameters({\n",
- " \"max_seq_length\": str(max_seq_length), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"max_seq_length\": str(max_seq_length),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1630,11 +1588,13 @@
"metadata": {},
"outputs": [],
"source": [
- "time.sleep(5) # avoid throttling exception \n",
+ "time.sleep(5) # avoid throttling exception\n",
"\n",
- "processing_job_tracker.log_parameters({\n",
- " \"feature_store_offline_prefix\": str(feature_store_offline_prefix), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"feature_store_offline_prefix\": str(feature_store_offline_prefix),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1646,11 +1606,13 @@
"metadata": {},
"outputs": [],
"source": [
- "time.sleep(5) # avoid throttling exception \n",
+ "time.sleep(5) # avoid throttling exception\n",
"\n",
- "processing_job_tracker.log_parameters({\n",
- " \"feature_group_name\": str(feature_group_name), \n",
- "})\n",
+ "processing_job_tracker.log_parameters(\n",
+ " {\n",
+ " \"feature_group_name\": str(feature_group_name),\n",
+ " }\n",
+ ")\n",
"\n",
"# must save after logging\n",
"processing_job_tracker.trial_component.save()"
@@ -1671,9 +1633,10 @@
"source": [
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
- "time.sleep(30) # avoid throttling exception\n",
+ "time.sleep(30) # avoid throttling exception\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
diff --git a/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb b/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb
index 2225889e..8293053f 100644
--- a/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb
+++ b/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb
@@ -24,12 +24,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -79,20 +79,20 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
- "# print('Executions for our pipeline...')\n",
- "# print(pipeline_execution_status)\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
+ " # print('Executions for our pipeline...')\n",
+ " # print(pipeline_execution_status)\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -109,7 +109,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -119,7 +119,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -149,14 +149,16 @@
"metadata": {},
"outputs": [],
"source": [
- "#for execution_step in reversed(execution.list_steps()):\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
- " if execution_step['StepName'] == 'EvaluateModel':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ "# for execution_step in reversed(execution.list_steps()):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
+ " if execution_step[\"StepName\"] == \"EvaluateModel\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
"\n",
"describe_evaluation_processing_job_response = sm.describe_processing_job(ProcessingJobName=processing_job_name)\n",
"\n",
- "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n",
+ "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response[\"ProcessingOutputConfig\"][\"Outputs\"][0][\n",
+ " \"S3Output\"\n",
+ "][\"S3Uri\"]\n",
"evaluation_metrics_s3_uri"
]
},
@@ -169,9 +171,7 @@
"import json\n",
"from pprint import pprint\n",
"\n",
- "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(\n",
- " evaluation_metrics_s3_uri\n",
- "))\n",
+ "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(evaluation_metrics_s3_uri))\n",
"\n",
"pprint(json.loads(evaluation_json))"
]
@@ -189,15 +189,15 @@
"metadata": {},
"outputs": [],
"source": [
- "training_job_arn=None\n",
+ "training_job_arn = None\n",
"\n",
- "for execution_step in steps['PipelineExecutionSteps']:\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
" if execution_step[\"StepName\"] == \"Train\":\n",
" training_job_arn = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"]\n",
- " \n",
+ "\n",
" break\n",
- " \n",
- "training_job_name = training_job_arn.split('/')[-1]\n",
+ "\n",
+ "training_job_name = training_job_arn.split(\"/\")[-1]\n",
"print(training_job_name)"
]
},
@@ -207,7 +207,7 @@
"metadata": {},
"outputs": [],
"source": [
- "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)['ModelArtifacts']['S3ModelArtifacts']"
+ "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)[\"ModelArtifacts\"][\"S3ModelArtifacts\"]"
]
},
{
@@ -225,8 +225,8 @@
"metadata": {},
"outputs": [],
"source": [
- "!mkdir -p ./model \n",
- "!tar -zxvf model.tar.gz -C ./model "
+ "!mkdir -p ./model\n",
+ "!tar -zxvf model.tar.gz -C ./model"
]
},
{
@@ -263,8 +263,8 @@
"metadata": {},
"outputs": [],
"source": [
- "processing_job_name=None\n",
- "training_job_name=None"
+ "processing_job_name = None\n",
+ "training_job_name = None"
]
},
{
@@ -278,15 +278,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -309,9 +309,10 @@
"source": [
"from sagemaker.analytics import ExperimentAnalytics\n",
"\n",
- "time.sleep(30) # avoid throttling exception\n",
+ "time.sleep(30) # avoid throttling exception\n",
"\n",
"import pandas as pd\n",
+ "\n",
"pd.set_option(\"max_colwidth\", 500)\n",
"\n",
"experiment_analytics = ExperimentAnalytics(\n",
diff --git a/10_pipeline/03_Register_Deploy_Model.ipynb b/10_pipeline/03_Register_Deploy_Model.ipynb
index 66cb0842..fe59ccdc 100644
--- a/10_pipeline/03_Register_Deploy_Model.ipynb
+++ b/10_pipeline/03_Register_Deploy_Model.ipynb
@@ -28,12 +28,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -65,20 +65,20 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
- "# print('Executions for our pipeline...')\n",
- "# print(pipeline_execution_status)\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
+ " # print('Executions for our pipeline...')\n",
+ " # print(pipeline_execution_status)\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -95,7 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -105,7 +105,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -135,9 +135,9 @@
"metadata": {},
"outputs": [],
"source": [
- "for execution_step in steps['PipelineExecutionSteps']:\n",
- " if execution_step['StepName'] == 'RegisterModel':\n",
- " model_package_arn = execution_step['Metadata']['RegisterModel']['Arn']\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
+ " if execution_step[\"StepName\"] == \"RegisterModel\":\n",
+ " model_package_arn = execution_step[\"Metadata\"][\"RegisterModel\"][\"Arn\"]\n",
" break\n",
"print(model_package_arn)"
]
@@ -150,7 +150,7 @@
"source": [
"model_package_update_response = sm.update_model_package(\n",
" ModelPackageArn=model_package_arn,\n",
- " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n",
+ " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n",
")"
]
},
@@ -167,13 +167,13 @@
"metadata": {},
"outputs": [],
"source": [
- "for execution_step in steps['PipelineExecutionSteps']:\n",
- " if execution_step['StepName'] == 'CreateModel':\n",
- " model_arn = execution_step['Metadata']['Model']['Arn']\n",
+ "for execution_step in steps[\"PipelineExecutionSteps\"]:\n",
+ " if execution_step[\"StepName\"] == \"CreateModel\":\n",
+ " model_arn = execution_step[\"Metadata\"][\"Model\"][\"Arn\"]\n",
" break\n",
"print(model_arn)\n",
"\n",
- "model_name = model_arn.split('/')[-1]\n",
+ "model_name = model_arn.split(\"/\")[-1]\n",
"print(model_name)"
]
},
@@ -192,13 +192,14 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "model_from_registry_name = 'bert-model-from-registry-{}'.format(timestamp)\n",
+ "model_from_registry_name = \"bert-model-from-registry-{}\".format(timestamp)\n",
"print(\"Model from registry name : {}\".format(model_from_registry_name))\n",
"\n",
"model_registry_package_container = {\n",
- " 'ModelPackageName': model_package_arn,\n",
+ " \"ModelPackageName\": model_package_arn,\n",
"}"
]
},
@@ -208,12 +209,10 @@
"metadata": {},
"outputs": [],
"source": [
- "from pprint import pprint \n",
+ "from pprint import pprint\n",
"\n",
"create_model_from_registry_respose = sm.create_model(\n",
- " ModelName = model_from_registry_name,\n",
- " ExecutionRoleArn = role,\n",
- " PrimaryContainer = model_registry_package_container\n",
+ " ModelName=model_from_registry_name, ExecutionRoleArn=role, PrimaryContainer=model_registry_package_container\n",
")\n",
"pprint(create_model_from_registry_respose)"
]
@@ -224,7 +223,7 @@
"metadata": {},
"outputs": [],
"source": [
- "model_from_registry_arn = create_model_from_registry_respose['ModelArn']\n",
+ "model_from_registry_arn = create_model_from_registry_respose[\"ModelArn\"]\n",
"model_from_registry_arn"
]
},
@@ -234,17 +233,21 @@
"metadata": {},
"outputs": [],
"source": [
- "endpoint_config_name = 'bert-model-from-registry-epc-{}'.format(timestamp)\n",
+ "endpoint_config_name = \"bert-model-from-registry-epc-{}\".format(timestamp)\n",
"print(endpoint_config_name)\n",
"\n",
"create_endpoint_config_response = sm.create_endpoint_config(\n",
- " EndpointConfigName = endpoint_config_name,\n",
- " ProductionVariants=[{\n",
- " 'InstanceType':'ml.m5.4xlarge',\n",
- " 'InitialVariantWeight':1,\n",
- " 'InitialInstanceCount':1,\n",
- " 'ModelName': model_name,\n",
- " 'VariantName':'AllTraffic'}])"
+ " EndpointConfigName=endpoint_config_name,\n",
+ " ProductionVariants=[\n",
+ " {\n",
+ " \"InstanceType\": \"ml.m5.4xlarge\",\n",
+ " \"InitialVariantWeight\": 1,\n",
+ " \"InitialInstanceCount\": 1,\n",
+ " \"ModelName\": model_name,\n",
+ " \"VariantName\": \"AllTraffic\",\n",
+ " }\n",
+ " ],\n",
+ ")"
]
},
{
@@ -253,13 +256,13 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_endpoint_name = 'bert-model-from-registry-ep-{}'.format(timestamp)\n",
+ "pipeline_endpoint_name = \"bert-model-from-registry-ep-{}\".format(timestamp)\n",
"print(\"EndpointName={}\".format(pipeline_endpoint_name))\n",
"\n",
"create_endpoint_response = sm.create_endpoint(\n",
- " EndpointName=pipeline_endpoint_name,\n",
- " EndpointConfigName=endpoint_config_name)\n",
- "print(create_endpoint_response['EndpointArn'])"
+ " EndpointName=pipeline_endpoint_name, EndpointConfigName=endpoint_config_name\n",
+ ")\n",
+ "print(create_endpoint_response[\"EndpointArn\"])"
]
},
{
@@ -270,7 +273,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker REST Endpoint'.format(region, pipeline_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker REST Endpoint'.format(\n",
+ " region, pipeline_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -288,7 +297,7 @@
"source": [
"%%time\n",
"\n",
- "waiter = sm.get_waiter('endpoint_in_service')\n",
+ "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=pipeline_endpoint_name)"
]
},
@@ -317,20 +326,20 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
" display(viz.show(pipeline_execution_step=execution_step))\n",
- " time.sleep(5)\n"
+ " time.sleep(5)"
]
},
{
@@ -351,14 +360,16 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=pipeline_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer()) "
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=pipeline_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
@@ -374,15 +385,12 @@
"metadata": {},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -393,12 +401,14 @@
"source": [
"import csv\n",
"\n",
- "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df_reviews = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"\n",
- "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=50)\n",
+ "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n",
"df_sample_reviews = df_sample_reviews.reset_index(drop=True)\n",
"df_sample_reviews.shape"
]
@@ -420,14 +430,14 @@
"source": [
"import pandas as pd\n",
"\n",
+ "\n",
"def predict(review_body):\n",
- " inputs = [\n",
- " {\"features\": [review_body]}\n",
- " ]\n",
+ " inputs = [{\"features\": [review_body]}]\n",
" predicted_classes = predictor.predict(inputs)\n",
- " return predicted_classes[0]['predicted_label']\n",
- " \n",
- "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+ " return predicted_classes[0][\"predicted_label\"]\n",
+ "\n",
+ "\n",
+ "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
"df_sample_reviews.head(5)"
]
},
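The notebook above approves a specific `model_package_arn` pulled from the pipeline steps. An equivalent, hedged sketch that approves the newest pending package in a package group — the group name `bert-reviews` is an assumption, not taken from this patch:

import boto3

sm = boto3.client("sagemaker")

# Placeholder group name; the notebook reads model_package_arn from the pipeline steps.
packages = sm.list_model_packages(
    ModelPackageGroupName="bert-reviews",
    ModelApprovalStatus="PendingManualApproval",
    SortBy="CreationTime",
    SortOrder="Descending",
)["ModelPackageSummaryList"]

if packages:
    sm.update_model_package(
        ModelPackageArn=packages[0]["ModelPackageArn"],
        ModelApprovalStatus="Approved",
    )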
diff --git a/10_pipeline/airflow/00_Create_S3_Bucket.ipynb b/10_pipeline/airflow/00_Create_S3_Bucket.ipynb
index 0c8274b9..96981663 100644
--- a/10_pipeline/airflow/00_Create_S3_Bucket.ipynb
+++ b/10_pipeline/airflow/00_Create_S3_Bucket.ipynb
@@ -17,21 +17,21 @@
"\n",
"session = boto3.session.Session()\n",
"region = session.region_name\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
- "airflow_bucket_name = 'airflow-'+region+'-'+account_id\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
+ "airflow_bucket_name = \"airflow-\" + region + \"-\" + account_id\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n",
"s3.create_bucket(Bucket=airflow_bucket_name)\n",
"\n",
"response = s3.put_public_access_block(\n",
- " Bucket = airflow_bucket_name,\n",
+ " Bucket=airflow_bucket_name,\n",
" PublicAccessBlockConfiguration={\n",
- " 'BlockPublicAcls': True,\n",
- " 'IgnorePublicAcls': True,\n",
- " 'BlockPublicPolicy': True,\n",
- " 'RestrictPublicBuckets': True\n",
- " }\n",
- ")\n"
+ " \"BlockPublicAcls\": True,\n",
+ " \"IgnorePublicAcls\": True,\n",
+ " \"BlockPublicPolicy\": True,\n",
+ " \"RestrictPublicBuckets\": True,\n",
+ " },\n",
+ ")"
]
},
{
@@ -40,9 +40,9 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_mwaa_private_path = 's3://{}'.format(airflow_bucket_name)\n",
- "s3_mwaa_dags_private_path = 's3://{}/dags'.format(airflow_bucket_name)\n",
- "s3_mwaa_pipeline_private_path = 's3://{}/dags/pipeline'.format(airflow_bucket_name)\n",
+ "s3_mwaa_private_path = \"s3://{}\".format(airflow_bucket_name)\n",
+ "s3_mwaa_dags_private_path = \"s3://{}/dags\".format(airflow_bucket_name)\n",
+ "s3_mwaa_pipeline_private_path = \"s3://{}/dags/pipeline\".format(airflow_bucket_name)\n",
"print(s3_mwaa_private_path)"
]
},
@@ -52,7 +52,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_s3_bucket_passed=False"
+ "setup_s3_bucket_passed = False"
]
},
{
@@ -61,7 +61,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Default bucket: {}'.format(airflow_bucket_name))"
+ "print(\"Default bucket: {}\".format(airflow_bucket_name))"
]
},
{
@@ -105,9 +105,9 @@
"try:\n",
" response = s3.head_bucket(Bucket=airflow_bucket_name)\n",
" print(response)\n",
- " setup_s3_bucket_passed=True\n",
+ " setup_s3_bucket_passed = True\n",
"except ClientError as e:\n",
- " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))"
+ " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))"
]
},
{
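The check above prints the raw `head_bucket` response. `head_bucket` distinguishes a missing bucket from a permissions problem through the error code, so the probe can be made reusable. A sketch under an assumed bucket name:

import boto3
from botocore.exceptions import ClientError

s3 = boto3.client("s3")


def bucket_exists(bucket_name):
    """Return True if the bucket exists and is reachable with current credentials."""
    try:
        s3.head_bucket(Bucket=bucket_name)
        return True
    except ClientError as e:
        # head_bucket reports "404" for a missing bucket and "403" for access denied.
        if e.response["Error"]["Code"] in ("404", "403"):
            return False
        raise


print(bucket_exists("airflow-us-east-1-111122223333"))  # placeholder bucket name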
diff --git a/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb b/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb
index 9a07c5f3..74af3409 100644
--- a/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb
+++ b/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb
@@ -20,17 +20,17 @@
"session = boto3.session.Session()\n",
"region = session.region_name\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
"\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n",
"\n",
- "airflow_env_name = 'mwaa-'+region+'-'+account_id\n",
- "airflow_vpc_name = 'mwaa-vpc'+region+'-'+account_id"
+ "airflow_env_name = \"mwaa-\" + region + \"-\" + account_id\n",
+ "airflow_vpc_name = \"mwaa-vpc\" + region + \"-\" + account_id"
]
},
{
@@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_s3_bucket_passed=False\n",
+ "setup_s3_bucket_passed = False\n",
"%store -r airflow_bucket_name\n",
"%store airflow_env_name\n",
"%store airflow_vpc_name"
@@ -58,9 +58,9 @@
"try:\n",
" response = s3.head_bucket(Bucket=airflow_bucket_name)\n",
" print(response)\n",
- " setup_s3_bucket_passed=True\n",
+ " setup_s3_bucket_passed = True\n",
"except ClientError as e:\n",
- " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))"
+ " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))"
]
},
{
@@ -85,13 +85,13 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('./dags/config.py', 'r') as f:\n",
+ "with open(\"./dags/config.py\", \"r\") as f:\n",
" lines = f.readlines()\n",
"\n",
- "with open('./dags/config.py', 'w') as f:\n",
+ "with open(\"./dags/config.py\", \"w\") as f:\n",
" for line in lines:\n",
- " line = line.replace('{0}', region)\n",
- " line = line.replace('{1}', bucket)\n",
+ " line = line.replace(\"{0}\", region)\n",
+ " line = line.replace(\"{1}\", bucket)\n",
" f.write(line)"
]
},
@@ -148,20 +148,24 @@
"metadata": {},
"outputs": [],
"source": [
- "#Check number of policies attached to TeamRole, we need to have nine\n",
- "iam = boto3.resource('iam')\n",
- "iam_client = boto3.client('iam')\n",
- "team_role_arn = iam.Role('TeamRole').arn\n",
+ "# Check number of policies attached to TeamRole, we need to have nine\n",
+ "iam = boto3.resource(\"iam\")\n",
+ "iam_client = boto3.client(\"iam\")\n",
+ "team_role_arn = iam.Role(\"TeamRole\").arn\n",
"\n",
- "team_role = iam.Role('TeamRole')\n",
+ "team_role = iam.Role(\"TeamRole\")\n",
"\n",
- "aws_managed_policies = [p for p in team_role.attached_policies.all() ] \n",
+ "aws_managed_policies = [p for p in team_role.attached_policies.all()]\n",
"\n",
- "if(len(aws_managed_policies) >= 10): \n",
- " print('You have: {} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA VPC Creation Policy.'.format(len(aws_managed_policies)))\n",
+ "if len(aws_managed_policies) >= 10:\n",
+ " print(\n",
+ " \"You have: {} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA VPC Creation Policy.\".format(\n",
+ " len(aws_managed_policies)\n",
+ " )\n",
+ " )\n",
" print(\"Please do NOT continue unless until you run this and get a Success message\")\n",
"else:\n",
- " print(\"Success! Please Continue...\") "
+ " print(\"Success! Please Continue...\")"
]
},
{
@@ -170,9 +174,9 @@
"metadata": {},
"outputs": [],
"source": [
- "mwaa_vpc_policy_json = open('./src/mwaa_vpc_policy.json', 'r').read()\n",
- "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace('{0}',region)\n",
- "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace('{1}',account_id)"
+ "mwaa_vpc_policy_json = open(\"./src/mwaa_vpc_policy.json\", \"r\").read()\n",
+ "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace(\"{0}\", region)\n",
+ "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace(\"{1}\", account_id)"
]
},
{
@@ -190,10 +194,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam_client.create_policy(\n",
- " PolicyName='mwaa_vpc_policy',\n",
- " PolicyDocument=mwaa_vpc_policy_json\n",
- ")\n",
+ "response = iam_client.create_policy(PolicyName=\"mwaa_vpc_policy\", PolicyDocument=mwaa_vpc_policy_json)\n",
"\n",
"mwaa_vpc_policy_arn = response[\"Policy\"][\"Arn\"]"
]
@@ -211,36 +212,33 @@
"metadata": {},
"outputs": [],
"source": [
- "cloudformation = boto3.resource('cloudformation')\n",
+ "cloudformation = boto3.resource(\"cloudformation\")\n",
"\n",
- "mwaa_vpc_template_yaml = open('./cfn/mwaa_vpc_template.yaml', 'r').read()\n",
+ "mwaa_vpc_template_yaml = open(\"./cfn/mwaa_vpc_template.yaml\", \"r\").read()\n",
"\n",
"response = cloudformation.create_stack(\n",
- " StackName='mwaa-vpc-stack',\n",
+ " StackName=\"mwaa-vpc-stack\",\n",
" TemplateBody=mwaa_vpc_template_yaml,\n",
" Parameters=[\n",
- " {\n",
- " 'ParameterKey': 'EnvironmentName',\n",
- " 'ParameterValue': airflow_vpc_name\n",
- " },\n",
+ " {\"ParameterKey\": \"EnvironmentName\", \"ParameterValue\": airflow_vpc_name},\n",
" ],\n",
" ResourceTypes=[\n",
- " 'AWS::EC2::VPC',\n",
+ " \"AWS::EC2::VPC\",\n",
" ],\n",
- " OnFailure='ROLLBACK',\n",
- " EnableTerminationProtection=False\n",
+ " OnFailure=\"ROLLBACK\",\n",
+ " EnableTerminationProtection=False,\n",
")\n",
"\n",
- "stack_status = 'IN_PROGRESS'\n",
+ "stack_status = \"IN_PROGRESS\"\n",
"\n",
- "print ('Starting deployment of VPC {}. \\n'.format(airflow_vpc_name))\n",
+ "print(\"Starting deployment of VPC {}. \\n\".format(airflow_vpc_name))\n",
"\n",
- "while stack_status != 'CREATE_COMPLETE':\n",
- " stack_status = cloudformation.Stack('mwaa-vpc-stack').stack_status\n",
+ "while stack_status != \"CREATE_COMPLETE\":\n",
+ " stack_status = cloudformation.Stack(\"mwaa-vpc-stack\").stack_status\n",
" time.sleep(30)\n",
" print(\"Still waiting....\")\n",
"\n",
- "print ('\\n Sucess! VPC {} has been deployed sucessfully.'.format(airflow_vpc_name))"
+ "print(\"\\n Sucess! VPC {} has been deployed sucessfully.\".format(airflow_vpc_name))"
]
},
{
@@ -249,21 +247,21 @@
"metadata": {},
"outputs": [],
"source": [
- "vpc_outputs = cloudformation.Stack('mwaa-vpc-stack').outputs\n",
+ "vpc_outputs = cloudformation.Stack(\"mwaa-vpc-stack\").outputs\n",
"\n",
"airflow_sg_id = None\n",
- "for output in vpc_outputs: \n",
- " if output['OutputKey'] == 'IngressSecurityGroup': \n",
- " airflow_sg_id = output['OutputValue'] \n",
+ "for output in vpc_outputs:\n",
+ " if output[\"OutputKey\"] == \"IngressSecurityGroup\":\n",
+ " airflow_sg_id = output[\"OutputValue\"]\n",
" break\n",
- " \n",
- "subnet_index_list = ['PrivateSubnet1', 'PrivateSubnet2']\n",
+ "\n",
+ "subnet_index_list = [\"PrivateSubnet1\", \"PrivateSubnet2\"]\n",
"airflow_subnet_ids = []\n",
"\n",
"for output in vpc_outputs:\n",
" for subnet_index in subnet_index_list:\n",
- " if output['OutputKey'] == subnet_index: \n",
- " airflow_subnet_ids.append(output['OutputValue']) \n"
+ " if output[\"OutputKey\"] == subnet_index:\n",
+ " airflow_subnet_ids.append(output[\"OutputValue\"])"
]
},
{
@@ -289,11 +287,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#Remove MWAA VPC Policy only needed for VPC Creation\n",
- "response = iam_client.detach_role_policy(\n",
- " RoleName=\"TeamRole\",\n",
- " PolicyArn=mwaa_vpc_policy_arn\n",
- ")"
+ "# Remove MWAA VPC Policy only needed for VPC Creation\n",
+ "response = iam_client.detach_role_policy(RoleName=\"TeamRole\", PolicyArn=mwaa_vpc_policy_arn)"
]
},
{
@@ -302,13 +297,17 @@
"metadata": {},
"outputs": [],
"source": [
- "#Check number of policies attached to TeamRole, we need to have nine\n",
- "team_role = iam.Role('TeamRole')\n",
+ "# Check number of policies attached to TeamRole, we need to have nine\n",
+ "team_role = iam.Role(\"TeamRole\")\n",
"\n",
- "aws_managed_policies = [p for p in team_role.attached_policies.all() ] \n",
+ "aws_managed_policies = [p for p in team_role.attached_policies.all()]\n",
"\n",
- "if(len(aws_managed_policies) >= 10): \n",
- " print('You have: {0} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA Policy.'.format(len(aws_managed_policies)))\n",
+ "if len(aws_managed_policies) >= 10:\n",
+ " print(\n",
+ " \"You have: {0} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA Policy.\".format(\n",
+ " len(aws_managed_policies)\n",
+ " )\n",
+ " )\n",
" print(\"Please do NOT continue unless until you run this and get a Success message\")\n",
"else:\n",
" print(\"Success! Please Continue...\")"
@@ -320,13 +319,13 @@
"metadata": {},
"outputs": [],
"source": [
- "mwaa_policy_json = open('./src/mwaa_policy.json', 'r').read()\n",
- "mwaa_policy_json = mwaa_policy_json.replace('{0}',region)\n",
- "mwaa_policy_json = mwaa_policy_json.replace('{1}',account_id)\n",
- "mwaa_policy_json = mwaa_policy_json.replace('{2}',airflow_env_name)\n",
- "mwaa_policy_json = mwaa_policy_json.replace('{3}',airflow_bucket_name)\n",
+ "mwaa_policy_json = open(\"./src/mwaa_policy.json\", \"r\").read()\n",
+ "mwaa_policy_json = mwaa_policy_json.replace(\"{0}\", region)\n",
+ "mwaa_policy_json = mwaa_policy_json.replace(\"{1}\", account_id)\n",
+ "mwaa_policy_json = mwaa_policy_json.replace(\"{2}\", airflow_env_name)\n",
+ "mwaa_policy_json = mwaa_policy_json.replace(\"{3}\", airflow_bucket_name)\n",
"\n",
- "mwaa_assume_policy_json = open('./src/mwaa_assume_policy.json', 'r').read()"
+ "mwaa_assume_policy_json = open(\"./src/mwaa_assume_policy.json\", \"r\").read()"
]
},
{
@@ -335,20 +334,11 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam_client.create_policy(\n",
- " PolicyName='mwaa_policy',\n",
- " PolicyDocument=mwaa_policy_json\n",
- ")\n",
+ "response = iam_client.create_policy(PolicyName=\"mwaa_policy\", PolicyDocument=mwaa_policy_json)\n",
"\n",
- "response = iam_client.attach_role_policy(\n",
- " RoleName=\"TeamRole\",\n",
- " PolicyArn=response[\"Policy\"][\"Arn\"]\n",
- ")\n",
+ "response = iam_client.attach_role_policy(RoleName=\"TeamRole\", PolicyArn=response[\"Policy\"][\"Arn\"])\n",
"\n",
- "response = iam_client.update_assume_role_policy(\n",
- " RoleName=\"TeamRole\",\n",
- " PolicyDocument=mwaa_assume_policy_json\n",
- ")"
+ "response = iam_client.update_assume_role_policy(RoleName=\"TeamRole\", PolicyDocument=mwaa_assume_policy_json)"
]
},
{
@@ -420,4 +410,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
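The VPC stack creation above polls `stack_status` manually every 30 seconds. boto3 also ships a built-in CloudFormation waiter that does the same polling and raises a `WaiterError` on rollback or timeout; a minimal alternative sketch:

import boto3

cfn = boto3.client("cloudformation")

# Polls the stack roughly every 30 seconds; raises WaiterError if creation fails.
waiter = cfn.get_waiter("stack_create_complete")
waiter.wait(StackName="mwaa-vpc-stack")
print("Stack created")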
diff --git a/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb b/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb
index 58ac8cce..8253620f 100644
--- a/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb
+++ b/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb
@@ -18,9 +18,9 @@
"\n",
"session = boto3.session.Session()\n",
"region = session.region_name\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)"
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)"
]
},
{
@@ -29,7 +29,7 @@
"metadata": {},
"outputs": [],
"source": [
- "setup_s3_bucket_passed=False\n",
+ "setup_s3_bucket_passed = False\n",
"%store -r airflow_bucket_name\n",
"%store -r s3_mwaa_private_path\n",
"%store -r s3_mwaa_dags_private_path\n",
@@ -62,9 +62,9 @@
"try:\n",
" response = s3.head_bucket(Bucket=airflow_bucket_name)\n",
" print(response)\n",
- " setup_s3_bucket_passed=True\n",
+ " setup_s3_bucket_passed = True\n",
"except ClientError as e:\n",
- " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))"
+ " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))"
]
},
{
@@ -89,49 +89,34 @@
"metadata": {},
"outputs": [],
"source": [
- "mwaa = boto3.client('mwaa')\n",
+ "mwaa = boto3.client(\"mwaa\")\n",
"\n",
- "s3_mwaa_bucket_arn= 'arn:aws:s3:::{}'.format(airflow_bucket_name)\n",
+ "s3_mwaa_bucket_arn = \"arn:aws:s3:::{}\".format(airflow_bucket_name)\n",
"\n",
"airflow_env_arn = mwaa.create_environment(\n",
- " DagS3Path='dags',\n",
+ " DagS3Path=\"dags\",\n",
" ExecutionRoleArn=team_role_arn,\n",
- " AirflowVersion='1.10.12',\n",
- " WebserverAccessMode='PUBLIC_ONLY',\n",
+ " AirflowVersion=\"1.10.12\",\n",
+ " WebserverAccessMode=\"PUBLIC_ONLY\",\n",
" LoggingConfiguration={\n",
- " 'DagProcessingLogs': {\n",
- " 'Enabled': True,\n",
- " 'LogLevel': 'ERROR'\n",
- " },\n",
- " 'SchedulerLogs': {\n",
- " 'Enabled': True,\n",
- " 'LogLevel': 'ERROR'\n",
- " },\n",
- " 'TaskLogs': {\n",
- " 'Enabled': True,\n",
- " 'LogLevel': 'INFO'\n",
- " },\n",
- " 'WebserverLogs': {\n",
- " 'Enabled': True,\n",
- " 'LogLevel': 'ERROR'\n",
- " },\n",
- " 'WorkerLogs': {\n",
- " 'Enabled': True,\n",
- " 'LogLevel': 'ERROR'\n",
- " }\n",
+ " \"DagProcessingLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n",
+ " \"SchedulerLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n",
+ " \"TaskLogs\": {\"Enabled\": True, \"LogLevel\": \"INFO\"},\n",
+ " \"WebserverLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n",
+ " \"WorkerLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n",
" },\n",
" MaxWorkers=3,\n",
" Name=airflow_env_name,\n",
" NetworkConfiguration={\n",
- " 'SecurityGroupIds': [\n",
+ " \"SecurityGroupIds\": [\n",
" airflow_sg_id,\n",
" ],\n",
- " 'SubnetIds': airflow_subnet_ids\n",
+ " \"SubnetIds\": airflow_subnet_ids,\n",
" },\n",
- " RequirementsS3ObjectVersion='latest',\n",
- " RequirementsS3Path='requirements.txt',\n",
+ " RequirementsS3ObjectVersion=\"latest\",\n",
+ " RequirementsS3Path=\"requirements.txt\",\n",
" SourceBucketArn=s3_mwaa_bucket_arn,\n",
- " EnvironmentClass='mw1.small'\n",
+ " EnvironmentClass=\"mw1.small\",\n",
")\n",
"\n",
"%store airflow_env_arn"
@@ -151,30 +136,29 @@
"outputs": [],
"source": [
"def get_airflow_check():\n",
- " response = mwaa.get_environment(\n",
- " Name=airflow_env_name\n",
- " )\n",
+ " response = mwaa.get_environment(Name=airflow_env_name)\n",
" mwaa_status = response[\"Environment\"][\"Status\"]\n",
" return mwaa_status\n",
"\n",
+ "\n",
"mwaa_status = \"CREATING\"\n",
"\n",
- "print('Checking to see if MWAA Env: {} is ready.'.format(airflow_env_name))\n",
+ "print(\"Checking to see if MWAA Env: {} is ready.\".format(airflow_env_name))\n",
"\n",
- "while (get_airflow_check() != 'AVAILABLE'):\n",
+ "while get_airflow_check() != \"AVAILABLE\":\n",
" mwaa_status\n",
" time.sleep(60)\n",
" print(\"Still waiting for MWAA Environment...\")\n",
"\n",
- "print('Sucess! MWAA Env: {} is ready!'.format(airflow_env_name)) \n"
+ "print(\"Sucess! MWAA Env: {} is ready!\".format(airflow_env_name))"
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
"source": [
"# PLEASE MAKE SURE THAT THE ABOVE COMMAND RAN SUCESSFULLY BEFORE CONTINUING"
- ],
- "cell_type": "markdown",
- "metadata": {}
+ ]
},
{
"cell_type": "code",
@@ -262,4 +246,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
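The notebook above busy-waits on `get_environment` until the status leaves CREATING. Wrapped as a helper, with a placeholder environment name, the same readiness check looks like this sketch:

import time
import boto3

mwaa = boto3.client("mwaa")


def wait_for_mwaa(env_name, poll_seconds=60):
    """Poll until the MWAA environment leaves CREATING; return the final status."""
    while True:
        status = mwaa.get_environment(Name=env_name)["Environment"]["Status"]
        if status != "CREATING":
            return status  # AVAILABLE on success, CREATE_FAILED otherwise
        time.sleep(poll_seconds)


print(wait_for_mwaa("mwaa-us-east-1-111122223333"))  # placeholder environment name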
diff --git a/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb b/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb
index 3d922e5c..8625352b 100644
--- a/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb
+++ b/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb
@@ -18,9 +18,9 @@
"\n",
"session = boto3.session.Session()\n",
"region = session.region_name\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "dag_name = 'bert_reviews'"
+ "dag_name = \"bert_reviews\""
]
},
{
@@ -40,30 +40,29 @@
"metadata": {},
"outputs": [],
"source": [
- "mwaa = boto3.client('mwaa')\n",
+ "mwaa = boto3.client(\"mwaa\")\n",
"mwaa_status = \"\"\n",
"\n",
+ "\n",
"def get_airflow_check():\n",
- " response = mwaa.get_environment(\n",
- " Name=airflow_env_name\n",
- " )\n",
+ " response = mwaa.get_environment(Name=airflow_env_name)\n",
" mwaa_status = response[\"Environment\"][\"Status\"]\n",
" return mwaa_status\n",
"\n",
"\n",
"mwaa_status = get_airflow_check()\n",
- "if(mwaa_status != 'AVAILABLE'):\n",
- " print('[ERROR] Cannot find MWAA {}.'.format(airflow_env_name))\n",
- "else: \n",
- " print('Sucess! {} is ready!'.format(airflow_env_name))"
+ "if mwaa_status != \"AVAILABLE\":\n",
+ " print(\"[ERROR] Cannot find MWAA {}.\".format(airflow_env_name))\n",
+ "else:\n",
+ " print(\"Sucess! {} is ready!\".format(airflow_env_name))"
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
"source": [
"# PLEASE MAKE SURE THAT THE ABOVE COMMAND RAN SUCESSFULLY BEFORE CONTINUING"
- ],
- "cell_type": "markdown",
- "metadata": {}
+ ]
},
{
"cell_type": "markdown",
@@ -78,28 +77,21 @@
"metadata": {},
"outputs": [],
"source": [
- "mwaa_cli_token = mwaa.create_cli_token(\n",
- " Name=airflow_env_name\n",
- ")\n",
+ "mwaa_cli_token = mwaa.create_cli_token(Name=airflow_env_name)\n",
"\n",
- "cli_token = 'Bearer ' + mwaa_cli_token['CliToken']\n",
- "mwaa_web_server_hostname = 'https://' + mwaa_cli_token['WebServerHostname'] + '/aws_mwaa/cli'\n",
+ "cli_token = \"Bearer \" + mwaa_cli_token[\"CliToken\"]\n",
+ "mwaa_web_server_hostname = \"https://\" + mwaa_cli_token[\"WebServerHostname\"] + \"/aws_mwaa/cli\"\n",
"\n",
- "raw_data = 'trigger_dag {}'.format(dag_name)\n",
+ "raw_data = \"trigger_dag {}\".format(dag_name)\n",
"\n",
"response = requests.post(\n",
- " mwaa_web_server_hostname,\n",
- " headers={\n",
- " 'Authorization': cli_token,\n",
- " 'Content-Type': 'text/plain'\n",
- " },\n",
- " data=raw_data\n",
- " )\n",
+ " mwaa_web_server_hostname, headers={\"Authorization\": cli_token, \"Content-Type\": \"text/plain\"}, data=raw_data\n",
+ ")\n",
"\n",
- "if (response.status_code != 200):\n",
- " print('ERROR: DAG: {} failed to get triggered!'.format(dag_name))\n",
+ "if response.status_code != 200:\n",
+ " print(\"ERROR: DAG: {} failed to get triggered!\".format(dag_name))\n",
"else:\n",
- " print('Sucess! DAG: {} was triggered successfuly'.format(dag_name))"
+ " print(\"Sucess! DAG: {} was triggered successfuly\".format(dag_name))"
]
},
{
@@ -171,4 +163,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
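The trigger notebook above only checks the HTTP status code. The MWAA CLI endpoint also returns the Airflow CLI's output as base64-encoded stdout/stderr fields, which is useful for confirming the run actually started. A sketch, assuming a placeholder environment name and the Airflow 1.10 CLI syntax used by this environment:

import base64

import boto3
import requests

mwaa = boto3.client("mwaa")
token = mwaa.create_cli_token(Name="mwaa-us-east-1-111122223333")  # placeholder env name

response = requests.post(
    "https://{}/aws_mwaa/cli".format(token["WebServerHostname"]),
    headers={"Authorization": "Bearer " + token["CliToken"], "Content-Type": "text/plain"},
    data="list_dag_runs bert_reviews",  # Airflow 1.10 CLI syntax, matching the DAG above
)

# The endpoint wraps the CLI output as base64-encoded stdout/stderr fields.
print(base64.b64decode(response.json()["stdout"]).decode("utf-8"))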
diff --git a/10_pipeline/airflow/dags/bert_reviews.py b/10_pipeline/airflow/dags/bert_reviews.py
index 81e1f680..c597eca1 100644
--- a/10_pipeline/airflow/dags/bert_reviews.py
+++ b/10_pipeline/airflow/dags/bert_reviews.py
@@ -5,7 +5,7 @@
import sys
-sys.path.append('./airflow/dags/')
+sys.path.append("./airflow/dags/")
# airflow operators
import airflow
@@ -16,12 +16,9 @@
from airflow.operators.python_operator import PythonOperator
# airflow sagemaker operators
-from airflow.contrib.operators.sagemaker_training_operator \
- import SageMakerTrainingOperator
-from airflow.contrib.operators.sagemaker_tuning_operator \
- import SageMakerTuningOperator
-from airflow.contrib.operators.sagemaker_transform_operator \
- import SageMakerTransformOperator
+from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
+from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator
+from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
from airflow.contrib.hooks.aws_hook import AwsHook
# sagemaker sdk
@@ -46,11 +43,9 @@
def is_hpo_enabled():
- """check if hyper-parameter optimization is enabled in the config
- """
+ """check if hyper-parameter optimization is enabled in the config"""
hpo_enabled = False
- if "job_level" in config and \
- "run_hyperparameter_opt" in config["job_level"]:
+ if "job_level" in config and "run_hyperparameter_opt" in config["job_level"]:
run_hpo_config = config["job_level"]["run_hyperparameter_opt"]
if run_hpo_config.lower() == "yes":
hpo_enabled = True
@@ -58,10 +53,11 @@ def is_hpo_enabled():
def get_sagemaker_role_arn(role_name, region_name):
- iam = boto3.client('iam', region_name=region_name)
+ iam = boto3.client("iam", region_name=region_name)
response = iam.get_role(RoleName=role_name)
return response["Role"]["Arn"]
+
# =============================================================================
# setting up training, tuning and transform configuration
# =============================================================================
@@ -71,13 +67,11 @@ def get_sagemaker_role_arn(role_name, region_name):
config = cfg.config
# set configuration for tasks
-hook = AwsHook(aws_conn_id='airflow-sagemaker')
+hook = AwsHook(aws_conn_id="airflow-sagemaker")
region = config["job_level"]["region_name"]
sess = hook.get_session(region_name=region)
-role = get_sagemaker_role_arn(
- config["train_model"]["sagemaker_role"],
- sess.region_name)
-container = get_image_uri(sess.region_name, 'factorization-machines')
+role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"], sess.region_name)
+container = get_image_uri(sess.region_name, "factorization-machines")
hpo_enabled = is_hpo_enabled()
# create estimator
@@ -89,20 +83,13 @@ def get_sagemaker_role_arn(role_name, region_name):
)
# train_config specifies SageMaker training configuration
-train_config = training_config(
- estimator=fm_estimator,
- inputs=config["train_model"]["inputs"])
+train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"])
# create tuner
-fm_tuner = HyperparameterTuner(
- estimator=fm_estimator,
- **config["tune_model"]["tuner_config"]
-)
+fm_tuner = HyperparameterTuner(estimator=fm_estimator, **config["tune_model"]["tuner_config"])
# create tuning config
-tuner_config = tuning_config(
- tuner=fm_tuner,
- inputs=config["tune_model"]["inputs"])
+tuner_config = tuning_config(tuner=fm_tuner, inputs=config["tune_model"]["inputs"])
# create transform config
transform_config = transform_config_from_estimator(
@@ -118,56 +105,54 @@ def get_sagemaker_role_arn(role_name, region_name):
# define airflow DAG
-args = {
- 'owner': 'airflow',
- 'start_date': airflow.utils.dates.days_ago(2)
-}
+args = {"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)}
dag = DAG(
- dag_id='bert_reviews',
+ dag_id="bert_reviews",
default_args=args,
schedule_interval=None,
concurrency=1,
max_active_runs=1,
- user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)}
+ user_defined_filters={"tojson": lambda s: json.JSONEncoder().encode(s)},
)
# set the tasks in the DAG
# dummy operator
-init = DummyOperator(
- task_id='start',
- dag=dag
-)
+init = DummyOperator(task_id="start", dag=dag)
# preprocess the data
-process_task= PythonOperator(
- task_id='process',
+process_task = PythonOperator(
+ task_id="process",
dag=dag,
provide_context=False,
python_callable=preprocess.preprocess,
- op_kwargs=config["preprocess_data"])
+ op_kwargs=config["preprocess_data"],
+)
-train_task= PythonOperator(
- task_id='train',
+train_task = PythonOperator(
+ task_id="train",
dag=dag,
provide_context=False,
python_callable=preprocess.preprocess,
- op_kwargs=config["preprocess_data"])
+ op_kwargs=config["preprocess_data"],
+)
-model_task= PythonOperator(
- task_id='model',
+model_task = PythonOperator(
+ task_id="model",
dag=dag,
provide_context=False,
python_callable=preprocess.preprocess,
- op_kwargs=config["preprocess_data"])
+ op_kwargs=config["preprocess_data"],
+)
-deploy_task= PythonOperator(
- task_id='deploy',
+deploy_task = PythonOperator(
+ task_id="deploy",
dag=dag,
provide_context=False,
python_callable=preprocess.preprocess,
- op_kwargs=config["preprocess_data"])
+ op_kwargs=config["preprocess_data"],
+)
# set the dependencies between tasks
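Editor's note: the hunk ends before the dependency chain itself, and all four PythonOperators above call preprocess.preprocess with the same op_kwargs, which reads as placeholder wiring in this demo DAG. A minimal sketch of how the chain would typically be expressed with Airflow's >> operator, assuming the ordering implied by the task ids:

# Assumed ordering based on the task ids; the real chain sits below this hunk.
init >> process_task >> train_task >> model_task >> deploy_task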
diff --git a/10_pipeline/airflow/dags/config.py b/10_pipeline/airflow/dags/config.py
index 9a2f38aa..0b726bdf 100644
--- a/10_pipeline/airflow/dags/config.py
+++ b/10_pipeline/airflow/dags/config.py
@@ -3,24 +3,21 @@
config = {}
-config["job_level"] = {
- "region_name": "us-east-1",
- "run_hyperparameter_opt": "no"
-}
+config["job_level"] = {"region_name": "us-east-1", "run_hyperparameter_opt": "no"}
config["preprocess_data"] = {
"s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
- "s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace
+ "s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace
"s3_out_prefix": "preprocess/",
- "delimiter": "\t"
+ "delimiter": "\t",
}
config["prepare_data"] = {
- "s3_in_bucket": "sagemaker-us-east-1-835319576252", # replace
+ "s3_in_bucket": "sagemaker-us-east-1-835319576252", # replace
"s3_in_prefix": "preprocess/",
"s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace
"s3_out_prefix": "prepare/",
- "delimiter": "\t"
+ "delimiter": "\t",
}
config["train_model"] = {
@@ -37,12 +34,12 @@
"epochs": "10",
"mini_batch_size": "200",
"num_factors": "64",
- "predictor_type": 'regressor'
- }
+ "predictor_type": "regressor",
+ },
},
"inputs": {
"train": "s3://sagemaker-us-east-1-835319576252/prepare/train/train.protobuf", # replace
- }
+ },
}
config["tune_model"] = {
@@ -51,16 +48,16 @@
"objective_type": "Minimize",
"hyperparameter_ranges": {
"factors_lr": ContinuousParameter(0.0001, 0.2),
- "factors_init_sigma": ContinuousParameter(0.0001, 1)
+ "factors_init_sigma": ContinuousParameter(0.0001, 1),
},
"max_jobs": 20,
"max_parallel_jobs": 2,
- "base_tuning_job_name": "hpo-recommender"
+ "base_tuning_job_name": "hpo-recommender",
},
"inputs": {
"train": "s3://sagemaker-us-east-1-835319576252/prepare/train/train.protobuf", # replace
- "test": "s3://sagemaker-us-east-1-835319576252/prepare/validate/validate.protobuf" # replace
- }
+ "test": "s3://sagemaker-us-east-1-835319576252/prepare/validate/validate.protobuf", # replace
+ },
}
config["batch_transform"] = {
@@ -71,6 +68,6 @@
"data_type": "S3Prefix",
"content_type": "application/x-recordio-protobuf",
"strategy": "MultiRecord",
- "output_path": "s3://sagemaker-us-east-1-835319576252/transform/"
+ "output_path": "s3://sagemaker-us-east-1-835319576252/transform/",
}
}
diff --git a/10_pipeline/airflow/dags/pipeline/prepare.py b/10_pipeline/airflow/dags/pipeline/prepare.py
index ad82fed5..2f2a243a 100644
--- a/10_pipeline/airflow/dags/pipeline/prepare.py
+++ b/10_pipeline/airflow/dags/pipeline/prepare.py
@@ -20,20 +20,20 @@ def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):
# extract customers and ratings
df_X = df_val[:, 0:2]
# Features are one-hot encoded in a sparse matrix
- X = lil_matrix((nb_rows, nb_cols)).astype('float32')
+ X = lil_matrix((nb_rows, nb_cols)).astype("float32")
df_X[:, 1] = nb_customer + df_X[:, 1]
coords = df_X[:, 0:2]
X[np.arange(nb_rows), coords[:, 0]] = 1
X[np.arange(nb_rows), coords[:, 1]] = 1
# create label with ratings
- Y = df_val[:, 2].astype('float32')
+ Y = df_val[:, 2].astype("float32")
# validate size and shape
print(X.shape)
print(Y.shape)
assert X.shape == (nb_rows, nb_cols)
- assert Y.shape == (nb_rows, )
+ assert Y.shape == (nb_rows,)
return X, Y
@@ -60,24 +60,19 @@ def save_as_protobuf(X, Y, bucket, key):
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
buf.seek(0)
- obj = '{}'.format(key)
- boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
- return 's3://{}/{}'.format(bucket, obj)
+ obj = "{}".format(key)
+ boto3.resource("s3").Bucket(bucket).Object(obj).upload_fileobj(buf)
+ return "s3://{}/{}".format(bucket, obj)
def chunk(x, batch_size):
- """split array into chunks of batch_size
- """
+ """split array into chunks of batch_size"""
chunk_range = range(0, x.shape[0], batch_size)
- chunks = [x[p: p + batch_size] for p in chunk_range]
+ chunks = [x[p : p + batch_size] for p in chunk_range]
return chunks
-def prepare(s3_in_bucket,
- s3_in_prefix,
- s3_out_bucket,
- s3_out_prefix,
- delimiter=","):
+def prepare(s3_in_bucket, s3_in_prefix, s3_out_bucket, s3_out_prefix, delimiter=","):
"""Prepare data for training with Sagemaker algorithms
- Read preprocessed data and converts to ProtoBuf format to prepare for
@@ -114,47 +109,36 @@ def prepare(s3_in_bucket,
# prepare training data set
if s3_in_prefix[-1] == "/":
s3_in_prefix = s3_in_prefix[:-1]
- s3_train_url = "s3://{}/{}/{}".format(
- s3_in_bucket, s3_in_prefix, 'train/train.csv')
- train_df = pd.read_csv(s3_train_url,
- sep=str(','), error_bad_lines=False)
+ s3_train_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "train/train.csv")
+ train_df = pd.read_csv(s3_train_url, sep=str(","), error_bad_lines=False)
# prepare validation dataset
- s3_validate_url = "s3://{}/{}/{}".format(
- s3_in_bucket, s3_in_prefix, 'validate/validate.csv')
- validate_df = pd.read_csv(s3_validate_url,
- sep=str(','), error_bad_lines=False)
+ s3_validate_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "validate/validate.csv")
+ validate_df = pd.read_csv(s3_validate_url, sep=str(","), error_bad_lines=False)
# prepare test dataset
- s3_test_url = "s3://{}/{}/{}".format(
- s3_in_bucket, s3_in_prefix, 'test/test.csv')
- test_df = pd.read_csv(s3_test_url,
- sep=str(','), error_bad_lines=False)
+ s3_test_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "test/test.csv")
+ test_df = pd.read_csv(s3_test_url, sep=str(","), error_bad_lines=False)
# get feature dimension
all_df = pd.concat([train_df, validate_df, test_df])
- nb_customer = np.unique(all_df['customer'].values).shape[0]
- nb_products = np.unique(all_df['product'].values).shape[0]
+ nb_customer = np.unique(all_df["customer"].values).shape[0]
+ nb_products = np.unique(all_df["product"].values).shape[0]
feature_dim = nb_customer + nb_products
print(nb_customer, nb_products, feature_dim)
- train_X, train_Y = convert_sparse_matrix(
- train_df, train_df.shape[0], nb_customer, nb_products)
- validate_X, validate_Y = convert_sparse_matrix(
- validate_df, validate_df.shape[0], nb_customer, nb_products)
- test_X, test_Y = convert_sparse_matrix(
- test_df, test_df.shape[0], nb_customer, nb_products)
+ train_X, train_Y = convert_sparse_matrix(train_df, train_df.shape[0], nb_customer, nb_products)
+ validate_X, validate_Y = convert_sparse_matrix(validate_df, validate_df.shape[0], nb_customer, nb_products)
+ test_X, test_Y = convert_sparse_matrix(test_df, test_df.shape[0], nb_customer, nb_products)
# write train and test in protobuf format to s3
if s3_out_prefix[-1] == "/":
s3_out_prefix = s3_out_prefix[:-1]
- train_data = save_as_protobuf(
- train_X, train_Y, s3_out_bucket,
- s3_out_prefix + "/" + "train/train.protobuf")
+ train_data = save_as_protobuf(train_X, train_Y, s3_out_bucket, s3_out_prefix + "/" + "train/train.protobuf")
print(train_data)
validate_data = save_as_protobuf(
- validate_X, validate_Y, s3_out_bucket,
- s3_out_prefix + "/" + "validate/validate.protobuf")
+ validate_X, validate_Y, s3_out_bucket, s3_out_prefix + "/" + "validate/validate.protobuf"
+ )
print(validate_data)
# chunk test data to avoid payload size issues when batch transforming
@@ -166,7 +150,8 @@ def prepare(s3_in_bucket,
test_x_chunks[i],
test_y_chunks[i],
s3_out_bucket,
- s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf")
+ s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf",
+ )
print(test_data)
return "SUCCESS"
diff --git a/10_pipeline/airflow/dags/pipeline/preprocess.py b/10_pipeline/airflow/dags/pipeline/preprocess.py
index 5a30bfab..07256487 100644
--- a/10_pipeline/airflow/dags/pipeline/preprocess.py
+++ b/10_pipeline/airflow/dags/pipeline/preprocess.py
@@ -3,10 +3,7 @@
import s3fs
-def preprocess(s3_in_url,
- s3_out_bucket,
- s3_out_prefix,
- delimiter=","):
+def preprocess(s3_in_url, s3_out_bucket, s3_out_prefix, delimiter=","):
"""Preprocesses data based on business logic
- Reads the delimited file passed as s3_url and preprocesses data by filtering
@@ -42,64 +39,52 @@ def preprocess(s3_in_url,
# limit dataframe to customer_id, product_id, and star_rating
# `product_title` will be useful for validating recommendations
- df = df[['customer_id', 'product_id', 'star_rating', 'product_title']]
+ df = df[["customer_id", "product_id", "star_rating", "product_title"]]
# clean out the long tail because most people haven't seen most videos,
# and people rate fewer videos than they actually watch
- customers = df['customer_id'].value_counts()
- products = df['product_id'].value_counts()
+ customers = df["customer_id"].value_counts()
+ products = df["product_id"].value_counts()
# based on data exploration only about 5% of customers have rated 5 or
# more videos, and only 25% of videos have been rated by 9+ customers
customers = customers[customers >= 5]
products = products[products >= 10]
print("# of rows before the long tail = {:10d}".format(df.shape[0]))
- reduced_df = df \
- .merge(pd.DataFrame({'customer_id': customers.index})) \
- .merge(pd.DataFrame({'product_id': products.index}))
- print("# of rows after the long tail = {:10d}".format(
- reduced_df.shape[0]))
- reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id'])
- print("# of rows after removing duplicates = {:10d}".format(
- reduced_df.shape[0]))
+ reduced_df = df.merge(pd.DataFrame({"customer_id": customers.index})).merge(
+ pd.DataFrame({"product_id": products.index})
+ )
+ print("# of rows after the long tail = {:10d}".format(reduced_df.shape[0]))
+ reduced_df = reduced_df.drop_duplicates(["customer_id", "product_id"])
+ print("# of rows after removing duplicates = {:10d}".format(reduced_df.shape[0]))
# recreate customer and product lists since there are customers with
# more than 5 reviews, but all of their reviews are on products with
# fewer than 10 ratings (and vice versa)
- customers = reduced_df['customer_id'].value_counts()
- products = reduced_df['product_id'].value_counts()
+ customers = reduced_df["customer_id"].value_counts()
+ products = reduced_df["product_id"].value_counts()
# sequentially index each user and item to hold the sparse format where
# the indices indicate the row and column in our ratings matrix
- customer_index = pd.DataFrame({
- 'customer_id': customers.index,
- 'customer': np.arange(customers.shape[0])})
- product_index = pd.DataFrame({
- 'product_id': products.index,
- 'product': np.arange(products.shape[0])})
- reduced_df = reduced_df \
- .merge(customer_index) \
- .merge(product_index)
-
- nb_customer = reduced_df['customer'].max() + 1
- nb_products = reduced_df['product'].max() + 1
+ customer_index = pd.DataFrame({"customer_id": customers.index, "customer": np.arange(customers.shape[0])})
+ product_index = pd.DataFrame({"product_id": products.index, "product": np.arange(products.shape[0])})
+ reduced_df = reduced_df.merge(customer_index).merge(product_index)
+
+ nb_customer = reduced_df["customer"].max() + 1
+ nb_products = reduced_df["product"].max() + 1
feature_dim = nb_customer + nb_products
print(nb_customer, nb_products, feature_dim)
- product_df = reduced_df[['customer', 'product', 'star_rating']]
+ product_df = reduced_df[["customer", "product", "star_rating"]]
# split into train, validation and test data sets
train_df, validate_df, test_df = np.split(
- product_df.sample(frac=1),
- [int(.6*len(product_df)), int(.8*len(product_df))]
+ product_df.sample(frac=1), [int(0.6 * len(product_df)), int(0.8 * len(product_df))]
)
- print("# of rows train data set = {:10d}".format(
- train_df.shape[0]))
- print("# of rows validation data set = {:10d}".format(
- validate_df.shape[0]))
- print("# of rows test data set = {:10d}".format(
- test_df.shape[0]))
+ print("# of rows train data set = {:10d}".format(train_df.shape[0]))
+ print("# of rows validation data set = {:10d}".format(validate_df.shape[0]))
+ print("# of rows test data set = {:10d}".format(test_df.shape[0]))
# select columns required for training the model
# excluding columns "customer_id", "product_id", "product_title" to
@@ -111,25 +96,21 @@ def preprocess(s3_in_url,
# write output to s3 as delimited file
fs = s3fs.S3FileSystem(anon=False)
- s3_out_prefix = s3_out_prefix[:-1] \
- if s3_out_prefix[-1] == "/" else s3_out_prefix
- s3_out_train = "s3://{}/{}/{}".format(
- s3_out_bucket, s3_out_prefix, "train/train.csv")
+ s3_out_prefix = s3_out_prefix[:-1] if s3_out_prefix[-1] == "/" else s3_out_prefix
+ s3_out_train = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "train/train.csv")
print("writing training data to {}".format(s3_out_train))
with fs.open(s3_out_train, "w") as f:
- train_df.to_csv(f, sep=str(','), index=False)
+ train_df.to_csv(f, sep=str(","), index=False)
- s3_out_validate = "s3://{}/{}/{}".format(
- s3_out_bucket, s3_out_prefix, "validate/validate.csv")
+ s3_out_validate = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "validate/validate.csv")
print("writing test data to {}".format(s3_out_validate))
with fs.open(s3_out_validate, "w") as f:
- validate_df.to_csv(f, sep=str(','), index=False)
+ validate_df.to_csv(f, sep=str(","), index=False)
- s3_out_test = "s3://{}/{}/{}".format(
- s3_out_bucket, s3_out_prefix, "test/test.csv")
+ s3_out_test = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "test/test.csv")
print("writing test data to {}".format(s3_out_test))
with fs.open(s3_out_test, "w") as f:
- test_df.to_csv(f, sep=str(','), index=False)
+ test_df.to_csv(f, sep=str(","), index=False)
print("preprocessing completed")
return "SUCCESS"
diff --git a/10_pipeline/airflow/src/config.py b/10_pipeline/airflow/src/config.py
index 5aeb92c5..77ae4a8c 100644
--- a/10_pipeline/airflow/src/config.py
+++ b/10_pipeline/airflow/src/config.py
@@ -3,24 +3,21 @@
config = {}
-config["job_level"] = {
- "region_name": "{0}",
- "run_hyperparameter_opt": "no"
-}
+config["job_level"] = {"region_name": "{0}", "run_hyperparameter_opt": "no"}
config["preprocess_data"] = {
"s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz",
- "s3_out_bucket": "{1}", # replace
+ "s3_out_bucket": "{1}", # replace
"s3_out_prefix": "preprocess/",
- "delimiter": "\t"
+ "delimiter": "\t",
}
config["prepare_data"] = {
- "s3_in_bucket": "{1}", # replace
+ "s3_in_bucket": "{1}", # replace
"s3_in_prefix": "preprocess/",
"s3_out_bucket": "{1}", # replace
"s3_out_prefix": "prepare/",
- "delimiter": "\t"
+ "delimiter": "\t",
}
config["train_model"] = {
@@ -37,12 +34,12 @@
"epochs": "10",
"mini_batch_size": "200",
"num_factors": "64",
- "predictor_type": 'regressor'
- }
+ "predictor_type": "regressor",
+ },
},
"inputs": {
"train": "s3://{1}/prepare/train/train.protobuf", # replace
- }
+ },
}
config["tune_model"] = {
@@ -51,16 +48,16 @@
"objective_type": "Minimize",
"hyperparameter_ranges": {
"factors_lr": ContinuousParameter(0.0001, 0.2),
- "factors_init_sigma": ContinuousParameter(0.0001, 1)
+ "factors_init_sigma": ContinuousParameter(0.0001, 1),
},
"max_jobs": 20,
"max_parallel_jobs": 2,
- "base_tuning_job_name": "hpo-recommender"
+ "base_tuning_job_name": "hpo-recommender",
},
"inputs": {
"train": "s3://{1}/prepare/train/train.protobuf", # replace
- "test": "s3://{1}/prepare/validate/validate.protobuf" # replace
- }
+ "test": "s3://{1}/prepare/validate/validate.protobuf", # replace
+ },
}
config["batch_transform"] = {
@@ -71,6 +68,6 @@
"data_type": "S3Prefix",
"content_type": "application/x-recordio-protobuf",
"strategy": "MultiRecord",
- "output_path": "s3://{1}/transform/"
+ "output_path": "s3://{1}/transform/",
}
}
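Editor's note: unlike dags/config.py, this src/ copy keeps "{0}" and "{1}" placeholders for the region and bucket. The patch does not show how they are filled in; a hypothetical substitution step might look like the following, using str.replace rather than str.format since the file also contains literal braces:

# Hypothetical placeholder substitution; paths and values are assumptions,
# not part of this patch.
region = "us-east-1"
bucket = "sagemaker-us-east-1-123456789012"

with open("10_pipeline/airflow/src/config.py") as f:
    template = f.read()

# str.format() would choke on the literal {} dict braces in the file,
# so replace the positional markers directly.
filled = template.replace("{0}", region).replace("{1}", bucket)

with open("10_pipeline/airflow/dags/config.py", "w") as f:
    f.write(filled)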
diff --git a/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py b/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py
index c0265d09..0324b855 100644
--- a/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py
+++ b/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py
@@ -5,7 +5,7 @@
import sys
-sys.path.append('/Users/cfregly/airflow/dags/')
+sys.path.append("/Users/cfregly/airflow/dags/")
# airflow operators
import airflow
@@ -16,12 +16,9 @@
from airflow.operators.python_operator import PythonOperator
# airflow sagemaker operators
-from airflow.contrib.operators.sagemaker_training_operator \
- import SageMakerTrainingOperator
-from airflow.contrib.operators.sagemaker_tuning_operator \
- import SageMakerTuningOperator
-from airflow.contrib.operators.sagemaker_transform_operator \
- import SageMakerTransformOperator
+from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
+from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator
+from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
from airflow.contrib.hooks.aws_hook import AwsHook
# sagemaker sdk
@@ -46,11 +43,9 @@
def is_hpo_enabled():
- """check if hyper-parameter optimization is enabled in the config
- """
+ """check if hyper-parameter optimization is enabled in the config"""
hpo_enabled = False
- if "job_level" in config and \
- "run_hyperparameter_opt" in config["job_level"]:
+ if "job_level" in config and "run_hyperparameter_opt" in config["job_level"]:
run_hpo_config = config["job_level"]["run_hyperparameter_opt"]
if run_hpo_config.lower() == "yes":
hpo_enabled = True
@@ -58,10 +53,11 @@ def is_hpo_enabled():
def get_sagemaker_role_arn(role_name, region_name):
- iam = boto3.client('iam', region_name=region_name)
+ iam = boto3.client("iam", region_name=region_name)
response = iam.get_role(RoleName=role_name)
return response["Role"]["Arn"]
+
# =============================================================================
# setting up training, tuning and transform configuration
# =============================================================================
@@ -71,13 +67,11 @@ def get_sagemaker_role_arn(role_name, region_name):
config = cfg.config
# set configuration for tasks
-hook = AwsHook(aws_conn_id='airflow-sagemaker')
+hook = AwsHook(aws_conn_id="airflow-sagemaker")
region = config["job_level"]["region_name"]
sess = hook.get_session(region_name=region)
-role = get_sagemaker_role_arn(
- config["train_model"]["sagemaker_role"],
- sess.region_name)
-container = get_image_uri(sess.region_name, 'factorization-machines')
+role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"], sess.region_name)
+container = get_image_uri(sess.region_name, "factorization-machines")
hpo_enabled = is_hpo_enabled()
# create estimator
@@ -89,20 +83,13 @@ def get_sagemaker_role_arn(role_name, region_name):
)
# train_config specifies SageMaker training configuration
-train_config = training_config(
- estimator=fm_estimator,
- inputs=config["train_model"]["inputs"])
+train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"])
# create tuner
-fm_tuner = HyperparameterTuner(
- estimator=fm_estimator,
- **config["tune_model"]["tuner_config"]
-)
+fm_tuner = HyperparameterTuner(estimator=fm_estimator, **config["tune_model"]["tuner_config"])
# create tuning config
-tuner_config = tuning_config(
- tuner=fm_tuner,
- inputs=config["tune_model"]["inputs"])
+tuner_config = tuning_config(tuner=fm_tuner, inputs=config["tune_model"]["inputs"])
# create transform config
transform_config = transform_config_from_estimator(
@@ -118,84 +105,76 @@ def get_sagemaker_role_arn(role_name, region_name):
# define airflow DAG
-args = {
- 'owner': 'airflow',
- 'start_date': airflow.utils.dates.days_ago(2)
-}
+args = {"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)}
dag = DAG(
- dag_id='sagemaker-ml-pipeline',
+ dag_id="sagemaker-ml-pipeline",
default_args=args,
schedule_interval=None,
concurrency=1,
max_active_runs=1,
- user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)}
+ user_defined_filters={"tojson": lambda s: json.JSONEncoder().encode(s)},
)
# set the tasks in the DAG
# dummy operator
-init = DummyOperator(
- task_id='start',
- dag=dag
-)
+init = DummyOperator(task_id="start", dag=dag)
# preprocess the data
preprocess_task = PythonOperator(
- task_id='preprocessing',
+ task_id="preprocessing",
dag=dag,
provide_context=False,
python_callable=preprocess.preprocess,
- op_kwargs=config["preprocess_data"])
+ op_kwargs=config["preprocess_data"],
+)
# prepare the data for training
prepare_task = PythonOperator(
- task_id='preparing',
+ task_id="preparing",
dag=dag,
provide_context=False,
python_callable=prepare.prepare,
- op_kwargs=config["prepare_data"]
+ op_kwargs=config["prepare_data"],
)
branching = BranchPythonOperator(
- task_id='branching',
- dag=dag,
- python_callable=lambda: "model_tuning" if hpo_enabled else "model_training")
+ task_id="branching", dag=dag, python_callable=lambda: "model_tuning" if hpo_enabled else "model_training"
+)
# launch sagemaker training job and wait until it completes
train_model_task = SageMakerTrainingOperator(
- task_id='model_training',
+ task_id="model_training",
dag=dag,
config=train_config,
- aws_conn_id='airflow-sagemaker',
+ aws_conn_id="airflow-sagemaker",
wait_for_completion=True,
- check_interval=30
+ check_interval=30,
)
# launch sagemaker hyperparameter job and wait until it completes
tune_model_task = SageMakerTuningOperator(
- task_id='model_tuning',
+ task_id="model_tuning",
dag=dag,
config=tuner_config,
- aws_conn_id='airflow-sagemaker',
+ aws_conn_id="airflow-sagemaker",
wait_for_completion=True,
- check_interval=30
+ check_interval=30,
)
# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
- task_id='predicting',
+ task_id="predicting",
dag=dag,
config=transform_config,
- aws_conn_id='airflow-sagemaker',
+ aws_conn_id="airflow-sagemaker",
wait_for_completion=True,
check_interval=30,
- trigger_rule=TriggerRule.ONE_SUCCESS
+ trigger_rule=TriggerRule.ONE_SUCCESS,
)
-cleanup_task = DummyOperator(
- task_id='cleaning_up',
- dag=dag)
+cleanup_task = DummyOperator(task_id="cleaning_up", dag=dag)
# set the dependencies between tasks
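Editor's note: again the hunk stops at the dependency comment. Given the BranchPythonOperator returning "model_tuning" or "model_training" and the ONE_SUCCESS trigger rule on the transform task, the wiring below is the natural reading, though the actual lines sit outside this diff:

# Assumed wiring, inferred from the task ids and the ONE_SUCCESS trigger rule.
init >> preprocess_task >> prepare_task >> branching
branching >> train_model_task >> batch_transform_task
branching >> tune_model_task >> batch_transform_task
batch_transform_task >> cleanup_task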
diff --git a/10_pipeline/evaluate_model_metrics.py b/10_pipeline/evaluate_model_metrics.py
index 024afdec..f3523174 100644
--- a/10_pipeline/evaluate_model_metrics.py
+++ b/10_pipeline/evaluate_model_metrics.py
@@ -4,13 +4,16 @@
from datetime import datetime
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
import pandas as pd
import os
import re
@@ -33,99 +36,99 @@
from sklearn.utils import resample
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
CLASSES = [1, 2, 3, 4, 5]
-config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
+config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+)
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--input-model', type=str,
- default='/opt/ml/processing/input/model',
+ parser.add_argument(
+ "--input-model",
+ type=str,
+ default="/opt/ml/processing/input/model",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
-
+ )
+
return parser.parse_args()
-
+
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- print('input_data: {}'.format(args.input_data))
- print('input_model: {}'.format(args.input_model))
-
- print('Listing contents of input model dir: {}'.format(args.input_model))
+ print("Current host: {}".format(args.current_host))
+
+ print("input_data: {}".format(args.input_data))
+ print("input_model: {}".format(args.input_model))
+
+ print("Listing contents of input model dir: {}".format(args.input_model))
input_files = os.listdir(args.input_model)
for file in input_files:
print(file)
- model_tar_path = '{}/model.tar.gz'.format(args.input_model)
+ model_tar_path = "{}/model.tar.gz".format(args.input_model)
model_tar = tarfile.open(model_tar_path)
model_tar.extractall(args.input_model)
- model_tar.close()
+ model_tar.close()
- model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model))
+ model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model))
print(model)
-
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=args.max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -133,81 +136,86 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
-
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
###########################################################################################
# TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
- ###########################################################################################
-# evaluation_data_path = '/opt/ml/processing/input/data/'
-
- print('Listing contents of input data dir: {}'.format(args.input_data))
+ ###########################################################################################
+ # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+ print("Listing contents of input data dir: {}".format(args.input_data))
input_files = os.listdir(args.input_data)
- test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
- print('Using only {} to evaluate.'.format(test_data_path))
- df_test_reviews = pd.read_csv(test_data_path,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+ print("Using only {} to evaluate.".format(test_data_path))
+ df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+ ["review_body", "star_rating"]
+ ]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
- y_test = df_test_reviews['review_body'].map(predict)
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
- y_actual = df_test_reviews['star_rating']
+ y_actual = df_test_reviews["star_rating"]
y_actual
    print(classification_report(y_true=y_actual, y_pred=y_test))
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
def plot_conf_mat(cm, classes, title, cmap):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+            color="white" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=CLASSES,
- title='Confusion Matrix',
- cmap=plt.cm.Greens)
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
- # Model Output
- metrics_path = os.path.join(args.output_data, 'metrics/')
+ # Model Output
+ metrics_path = os.path.join(args.output_data, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
report_dict = {
"metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
evaluation_path = "{}/evaluation.json".format(metrics_path)
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))
-
- print('Listing contents of output dir: {}'.format(args.output_data))
+
+ print("Listing contents of output dir: {}".format(args.output_data))
output_files = os.listdir(args.output_data)
for file in output_files:
print(file)
- print('Listing contents of output/metrics dir: {}'.format(metrics_path))
- output_files = os.listdir('{}'.format(metrics_path))
+ print("Listing contents of output/metrics dir: {}".format(metrics_path))
+ output_files = os.listdir("{}".format(metrics_path))
for file in output_files:
print(file)
- print('Complete')
-
-
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
- process(args)
+ process(args)
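Editor's note: the report_dict written to evaluation.json is truncated by the hunk. Based on the accuracy computed above, a minimal sketch of the payload shape; the nesting beyond "metrics" -> "accuracy" -> "value" is an assumption from the common SageMaker model-quality layout:

import json

# Placeholder value; in the script this is the accuracy computed above.
report_dict = {"metrics": {"accuracy": {"value": 0.74}}}
print(json.dumps(report_dict, indent=2))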
diff --git a/10_pipeline/human/00_Overview.ipynb b/10_pipeline/human/00_Overview.ipynb
index cf576e4a..fa851c30 100644
--- a/10_pipeline/human/00_Overview.ipynb
+++ b/10_pipeline/human/00_Overview.ipynb
@@ -66,7 +66,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
},
diff --git a/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb b/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb
index 5c195013..fd0e09da 100644
--- a/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb
+++ b/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb
@@ -34,7 +34,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -54,10 +54,10 @@
"import botocore\n",
"\n",
"# Amazon Python SDK clients\n",
- "sagemaker = boto3.client('sagemaker', region)\n",
- "comprehend = boto3.client('comprehend', region)\n",
- "a2i = boto3.client('sagemaker-a2i-runtime')\n",
- "s3 = boto3.client('s3', region)"
+ "sagemaker = boto3.client(\"sagemaker\", region)\n",
+ "comprehend = boto3.client(\"comprehend\", region)\n",
+ "a2i = boto3.client(\"sagemaker-a2i-runtime\")\n",
+ "s3 = boto3.client(\"s3\", region)"
]
},
{
@@ -73,7 +73,7 @@
"metadata": {},
"outputs": [],
"source": [
- "output_path = f's3://{bucket}/a2i-comprehend-star-rating-results'\n",
+ "output_path = f\"s3://{bucket}/a2i-comprehend-star-rating-results\"\n",
"print(output_path)"
]
},
@@ -94,7 +94,11 @@
"metadata": {},
"outputs": [],
"source": [
- "print('https://{}.console.aws.amazon.com/sagemaker/groundtruth?region={}#/labeling-workforces/create'.format(region, region))"
+ "print(\n",
+ " \"https://{}.console.aws.amazon.com/sagemaker/groundtruth?region={}#/labeling-workforces/create\".format(\n",
+ " region, region\n",
+ " )\n",
+ ")"
]
},
{
@@ -147,9 +151,9 @@
"source": [
"import boto3\n",
"\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
"\n",
- "augmented_ai_workteam_arn = 'arn:aws:sagemaker:{}:{}:workteam/private-crowd/dsoaws'.format(region, account_id)\n",
+ "augmented_ai_workteam_arn = \"arn:aws:sagemaker:{}:{}:workteam/private-crowd/dsoaws\".format(region, account_id)\n",
"\n",
"print(augmented_ai_workteam_arn)"
]
@@ -218,13 +222,11 @@
"outputs": [],
"source": [
"# Task UI name - this value is unique per account and region. You can also provide your own value here.\n",
- "task_ui_name = 'ui-comprehend-' + str(uuid.uuid4()) \n",
+ "task_ui_name = \"ui-comprehend-\" + str(uuid.uuid4())\n",
"\n",
"# Create a Human Task UI resource.\n",
- "human_task_ui_response = sagemaker.create_human_task_ui(\n",
- " HumanTaskUiName=task_ui_name,\n",
- " UiTemplate={'Content': template})\n",
- "human_task_ui_arn = human_task_ui_response['HumanTaskUiArn']\n",
+ "human_task_ui_response = sagemaker.create_human_task_ui(HumanTaskUiName=task_ui_name, UiTemplate={\"Content\": template})\n",
+ "human_task_ui_arn = human_task_ui_response[\"HumanTaskUiArn\"]\n",
"print(human_task_ui_arn)"
]
},
@@ -260,24 +262,22 @@
"import uuid\n",
"\n",
"# Flow definition name - this value is unique per account and region. You can also provide your own value here.\n",
- "flow_definition_name = 'fd-dsoaws-comprehend-' + str(uuid.uuid4()) \n",
+ "flow_definition_name = \"fd-dsoaws-comprehend-\" + str(uuid.uuid4())\n",
"\n",
"create_workflow_definition_response = sagemaker.create_flow_definition(\n",
- " FlowDefinitionName=flow_definition_name,\n",
- " RoleArn=role,\n",
- " HumanLoopConfig={\n",
- " 'WorkteamArn': augmented_ai_workteam_arn,\n",
- " 'HumanTaskUiArn': human_task_ui_arn,\n",
- " 'TaskCount': 1,\n",
- " 'TaskDescription': 'Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)',\n",
- " 'TaskTitle': 'Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)'\n",
- " },\n",
- " OutputConfig={\n",
- " 'S3OutputPath' : output_path\n",
- " }\n",
- " )\n",
+ " FlowDefinitionName=flow_definition_name,\n",
+ " RoleArn=role,\n",
+ " HumanLoopConfig={\n",
+ " \"WorkteamArn\": augmented_ai_workteam_arn,\n",
+ " \"HumanTaskUiArn\": human_task_ui_arn,\n",
+ " \"TaskCount\": 1,\n",
+ " \"TaskDescription\": \"Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)\",\n",
+ " \"TaskTitle\": \"Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)\",\n",
+ " },\n",
+ " OutputConfig={\"S3OutputPath\": output_path},\n",
+ ")\n",
"\n",
- "augmented_ai_flow_definition_arn = create_workflow_definition_response['FlowDefinitionArn']"
+ "augmented_ai_flow_definition_arn = create_workflow_definition_response[\"FlowDefinitionArn\"]"
]
},
{
@@ -298,8 +298,8 @@
"# Describe flow definition - status should turn to \"active\"\n",
"for x in range(60):\n",
" describeFlowDefinitionResponse = sagemaker.describe_flow_definition(FlowDefinitionName=flow_definition_name)\n",
- " print(describeFlowDefinitionResponse['FlowDefinitionStatus'])\n",
- " if (describeFlowDefinitionResponse['FlowDefinitionStatus'] == 'Active'):\n",
+ " print(describeFlowDefinitionResponse[\"FlowDefinitionStatus\"])\n",
+ " if describeFlowDefinitionResponse[\"FlowDefinitionStatus\"] == \"Active\":\n",
" print(\"Flow Definition is active\")\n",
" break\n",
" time.sleep(2)"
@@ -330,7 +330,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
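Editor's note: create_human_task_ui above consumes a `template` variable defined in a cell outside this diff. For orientation, a hypothetical minimal crowd-form template of the kind A2I expects; the field names here are illustrative, not the notebook's actual template:

# Hypothetical UI template; the notebook's real `template` is not in this patch.
template = """
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
<crowd-form>
  <crowd-classifier
    name="star_rating"
    categories="['1', '2', '3', '4', '5']"
    header="Classify the review into a star rating"
  >
    <classification-target>{{ task.input.taskObject }}</classification-target>
  </crowd-classifier>
</crowd-form>
"""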
diff --git a/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb b/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
index 51c5a1ff..69e0a4f6 100644
--- a/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
+++ b/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
@@ -17,7 +17,7 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name"
@@ -37,10 +37,10 @@
"import botocore\n",
"\n",
"# Amazon Python SDK clients\n",
- "sagemaker = boto3.client('sagemaker', region)\n",
- "comprehend = boto3.client('comprehend', region)\n",
- "a2i = boto3.client('sagemaker-a2i-runtime')\n",
- "s3 = boto3.client('s3', region)"
+ "sagemaker = boto3.client(\"sagemaker\", region)\n",
+ "comprehend = boto3.client(\"comprehend\", region)\n",
+ "a2i = boto3.client(\"sagemaker-a2i-runtime\")\n",
+ "s3 = boto3.client(\"s3\", region)"
]
},
{
@@ -99,11 +99,11 @@
},
"outputs": [],
"source": [
- "try: \n",
+ "try:\n",
" comprehend_endpoint_arn\n",
"except NameError:\n",
- " print('*** PLEASE WAIT FOR THE Comprehend JOB TO FINISH IN THE PREVIOUS SECTION BEFORE CONTINUING ***')\n",
- " print('*** YOU WILL NEED TO RESTART THIS NOTEBOOK ONCE THE JOB FINISHES ***')"
+ " print(\"*** PLEASE WAIT FOR THE Comprehend JOB TO FINISH IN THE PREVIOUS SECTION BEFORE CONTINUING ***\")\n",
+ " print(\"*** YOU WILL NEED TO RESTART THIS NOTEBOOK ONCE THE JOB FINISHES ***\")"
]
},
{
@@ -136,12 +136,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sample_reviews = [\n",
- " 'I enjoy this product', \n",
- " 'I am unhappy with this product', \n",
- " 'It is okay', \n",
- " 'sometimes it works'\n",
- " ]"
+ "sample_reviews = [\"I enjoy this product\", \"I am unhappy with this product\", \"It is okay\", \"sometimes it works\"]"
]
},
{
@@ -166,37 +161,35 @@
"\n",
"for sample_review in sample_reviews:\n",
" # Call the Comprehend Custom model that we trained earlier\n",
- " response = comprehend.classify_document(Text=sample_review, \n",
- " EndpointArn=comprehend_endpoint_arn)\n",
+ " response = comprehend.classify_document(Text=sample_review, EndpointArn=comprehend_endpoint_arn)\n",
+ "\n",
+ " star_rating = response[\"Classes\"][0][\"Name\"]\n",
+ " confidence_score = response[\"Classes\"][0][\"Score\"]\n",
+ "\n",
+ " print(f'Processing sample_review: \"{sample_review}\"')\n",
"\n",
- " star_rating = response['Classes'][0]['Name']\n",
- " confidence_score = response['Classes'][0]['Score']\n",
- " \n",
- " print(f'Processing sample_review: \\\"{sample_review}\\\"')\n",
- " \n",
" # Our condition for when we want to engage a human for review\n",
- " if (confidence_score < CONFIDENCE_SCORE_THRESHOLD):\n",
- " \n",
+ " if confidence_score < CONFIDENCE_SCORE_THRESHOLD:\n",
+ "\n",
" humanLoopName = str(uuid.uuid4())\n",
- " inputContent = {\n",
- " 'initialValue': star_rating,\n",
- " 'taskObject': sample_review\n",
- " }\n",
+ " inputContent = {\"initialValue\": star_rating, \"taskObject\": sample_review}\n",
" start_loop_response = a2i.start_human_loop(\n",
" HumanLoopName=humanLoopName,\n",
" FlowDefinitionArn=augmented_ai_flow_definition_arn,\n",
- " HumanLoopInput={\n",
- " 'InputContent': json.dumps(inputContent)\n",
- " }\n",
+ " HumanLoopInput={\"InputContent\": json.dumps(inputContent)},\n",
" )\n",
"\n",
" human_loops_started.append(humanLoopName)\n",
"\n",
- " print(f'Confidence score of {confidence_score} for star rating of {star_rating} is less than the threshold of {CONFIDENCE_SCORE_THRESHOLD}')\n",
- " print(f'*** ==> Starting human loop with name: {humanLoopName} \\n')\n",
+ " print(\n",
+ " f\"Confidence score of {confidence_score} for star rating of {star_rating} is less than the threshold of {CONFIDENCE_SCORE_THRESHOLD}\"\n",
+ " )\n",
+ " print(f\"*** ==> Starting human loop with name: {humanLoopName} \\n\")\n",
" else:\n",
- " print(f'Confidence score of {confidence_score} for star rating of {star_rating} is above threshold of {CONFIDENCE_SCORE_THRESHOLD}')\n",
- " print('No human loop created. \\n')"
+ " print(\n",
+ " f\"Confidence score of {confidence_score} for star rating of {star_rating} is above threshold of {CONFIDENCE_SCORE_THRESHOLD}\"\n",
+ " )\n",
+ " print(\"No human loop created. \\n\")"
]
},
{
@@ -215,12 +208,12 @@
"completed_human_loops = []\n",
"for human_loop_name in human_loops_started:\n",
" resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
- " print(f'HumanLoop Name: {human_loop_name}')\n",
+ " print(f\"HumanLoop Name: {human_loop_name}\")\n",
" print(f'HumanLoop Status: {resp[\"HumanLoopStatus\"]}')\n",
" print(f'HumanLoop Output Destination: {resp[\"HumanLoopOutput\"]}')\n",
- " print('')\n",
- " \n",
- " if resp['HumanLoopStatus'] == 'Completed':\n",
+ " print(\"\")\n",
+ "\n",
+ " if resp[\"HumanLoopStatus\"] == \"Completed\":\n",
" completed_human_loops.append(resp)"
]
},
@@ -257,13 +250,13 @@
"metadata": {},
"outputs": [],
"source": [
- "workteam_name = augmented_ai_workteam_arn[augmented_ai_workteam_arn.rfind('/') + 1:]\n",
+ "workteam_name = augmented_ai_workteam_arn[augmented_ai_workteam_arn.rfind(\"/\") + 1 :]\n",
"print(workteam_name)\n",
- "print('Navigate to the private worker portal and complete the human loop.')\n",
- "print('Make sure you have invited yourself to the workteam and received the signup email.')\n",
- "print('Note: Check your spam filter if you have not received the email.')\n",
- "print('')\n",
- "print('https://' + sagemaker.describe_workteam(WorkteamName=workteam_name)['Workteam']['SubDomain'])"
+ "print(\"Navigate to the private worker portal and complete the human loop.\")\n",
+ "print(\"Make sure you have invited yourself to the workteam and received the signup email.\")\n",
+ "print(\"Note: Check your spam filter if you have not received the email.\")\n",
+ "print(\"\")\n",
+ "print(\"https://\" + sagemaker.describe_workteam(WorkteamName=workteam_name)[\"Workteam\"][\"SubDomain\"])"
]
},
{
@@ -311,18 +304,18 @@
"completed_human_loops = []\n",
"for human_loop_name in human_loops_started:\n",
" resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
- " print(f'HumanLoop Name: {human_loop_name}')\n",
+ " print(f\"HumanLoop Name: {human_loop_name}\")\n",
" print(f'HumanLoop Status: {resp[\"HumanLoopStatus\"]}')\n",
" print(f'HumanLoop Output Destination: {resp[\"HumanLoopOutput\"]}')\n",
- " print('')\n",
+ " print(\"\")\n",
" while resp[\"HumanLoopStatus\"] != \"Completed\":\n",
- " print(f'Waiting for HumanLoop to complete.') \n",
+ " print(f\"Waiting for HumanLoop to complete.\")\n",
" time.sleep(10)\n",
" resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
" if resp[\"HumanLoopStatus\"] == \"Completed\":\n",
" completed_human_loops.append(resp)\n",
- " print(f'Completed!')\n",
- " print('')"
+ " print(f\"Completed!\")\n",
+ " print(\"\")"
]
},
{
@@ -355,17 +348,17 @@
"fixed_items = []\n",
"\n",
"for resp in completed_human_loops:\n",
- " split_string = re.split('s3://' + bucket + '/', resp['HumanLoopOutput']['OutputS3Uri'])\n",
+ " split_string = re.split(\"s3://\" + bucket + \"/\", resp[\"HumanLoopOutput\"][\"OutputS3Uri\"])\n",
" output_bucket_key = split_string[1]\n",
"\n",
" response = s3.get_object(Bucket=bucket, Key=output_bucket_key)\n",
- " content = response['Body'].read().decode('utf-8')\n",
+ " content = response[\"Body\"].read().decode(\"utf-8\")\n",
" json_output = json.loads(content)\n",
" print(json_output)\n",
"\n",
- " input_content = json_output['inputContent']\n",
- " human_answer = json_output['humanAnswers'][0]['answerContent']\n",
- " fixed_item = {'input_content': input_content, 'human_answer': human_answer}\n",
+ " input_content = json_output[\"inputContent\"]\n",
+ " human_answer = json_output[\"humanAnswers\"][0][\"answerContent\"]\n",
+ " fixed_item = {\"input_content\": input_content, \"human_answer\": human_answer}\n",
" fixed_items.append(fixed_item)"
]
},
@@ -417,7 +410,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
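Editor's note: the fixed_items collected above pair each low-confidence review with its human answer. A hedged sketch of flattening them for retraining; the "star_rating" answer key is an assumption tied to the hypothetical UI field name, since the real template is not in this patch:

# Sketch: flatten human corrections into a frame for later retraining.
import pandas as pd

fixed_items = [
    {
        "input_content": {"initialValue": "4", "taskObject": "It is okay"},
        "human_answer": {"star_rating": {"label": "3"}},  # assumed answer key
    },
]
df_fixed = pd.DataFrame(
    {
        "review_body": [item["input_content"]["taskObject"] for item in fixed_items],
        "star_rating": [item["human_answer"]["star_rating"]["label"] for item in fixed_items],
    }
)
print(df_fixed)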
diff --git a/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb b/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
index 8d0eba29..38f6617d 100644
--- a/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
+++ b/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
@@ -27,9 +27,9 @@
"outputs": [],
"source": [
"!wget https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz\n",
- " \n",
+ "\n",
"!tar -xzvf eksctl_$(uname -s)_amd64.tar.gz -C /tmp\n",
- " \n",
+ "\n",
"!mv /tmp/eksctl /usr/local/bin\n",
"\n",
"!eksctl version"
@@ -41,8 +41,8 @@
"metadata": {},
"outputs": [],
"source": [
- "!wget https://amazon-eks.s3.us-west-2.amazonaws.com/1.15.10/2020-02-22/bin/linux/amd64/kubectl \n",
- " \n",
+ "!wget https://amazon-eks.s3.us-west-2.amazonaws.com/1.15.10/2020-02-22/bin/linux/amd64/kubectl\n",
+ "\n",
"!chmod +x ./kubectl\n",
"\n",
"!mv ./kubectl /usr/local/bin\n",
diff --git a/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb b/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
index 41a1bfca..97bd7912 100644
--- a/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
+++ b/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
@@ -252,7 +252,7 @@
"\n",
"# Shutting down your kernel for this notebook to release resources.
\n",
"# \n",
- " \n",
+ "\n",
"# "
]
},
diff --git a/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb b/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
index 49a44db3..a739d211 100644
--- a/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
+++ b/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
@@ -70,6 +70,7 @@
"source": [
"# Restart the kernel to pick up pip installed libraries\n",
"from IPython.core.display import HTML\n",
+ "\n",
"HTML(\"\")"
]
},
@@ -129,23 +130,22 @@
"import kfp\n",
"from kfp import dsl\n",
"\n",
+ "\n",
"def add_two_numbers(a, b):\n",
" return dsl.ContainerOp(\n",
- " name='calculate_sum',\n",
- " image='python:3.6.8',\n",
- " command=['python', '-c'],\n",
+ " name=\"calculate_sum\",\n",
+ " image=\"python:3.6.8\",\n",
+ " command=[\"python\", \"-c\"],\n",
" arguments=['with open(\"/tmp/results.txt\", \"a\") as file: file.write(str({} + {}))'.format(a, b)],\n",
" file_outputs={\n",
- " 'data': '/tmp/results.txt',\n",
- " }\n",
+ " \"data\": \"/tmp/results.txt\",\n",
+ " },\n",
" )\n",
"\n",
+ "\n",
"def echo_op(text):\n",
" return dsl.ContainerOp(\n",
- " name='echo',\n",
- " image='library/bash:4.4.23',\n",
- " command=['sh', '-c'],\n",
- " arguments=['echo \"Result: {}\"'.format(text)]\n",
+ " name=\"echo\", image=\"library/bash:4.4.23\", command=[\"sh\", \"-c\"], arguments=['echo \"Result: {}\"'.format(text)]\n",
" )"
]
},
@@ -164,16 +164,8 @@
"metadata": {},
"outputs": [],
"source": [
- "@dsl.pipeline(\n",
- " name='Calculate sum pipeline',\n",
- " description='Calculate sum of numbers and prints the result.'\n",
- ")\n",
- "def calculate_sum(\n",
- " a=7,\n",
- " b=10,\n",
- " c=4,\n",
- " d=7\n",
- "):\n",
+ "@dsl.pipeline(name=\"Calculate sum pipeline\", description=\"Calculate sum of numbers and prints the result.\")\n",
+ "def calculate_sum(a=7, b=10, c=4, d=7):\n",
" \"\"\"A four-step pipeline with first two running in parallel.\"\"\"\n",
"\n",
" sum1 = add_two_numbers(a, b)\n",
@@ -198,7 +190,7 @@
"metadata": {},
"outputs": [],
"source": [
- "kfp.compiler.Compiler().compile(calculate_sum, 'calculate-sum-pipeline.zip')"
+ "kfp.compiler.Compiler().compile(calculate_sum, \"calculate-sum-pipeline.zip\")"
]
},
{
@@ -247,11 +239,9 @@
"source": [
"client = kfp.Client()\n",
"\n",
- "experiment = client.create_experiment(name='kubeflow')\n",
+ "experiment = client.create_experiment(name=\"kubeflow\")\n",
"\n",
- "my_run = client.run_pipeline(experiment.id, \n",
- " 'calculate-sum-pipeline', \n",
- " 'calculate-sum-pipeline.zip')"
+ "my_run = client.run_pipeline(experiment.id, \"calculate-sum-pipeline\", \"calculate-sum-pipeline.zip\")"
]
},
{
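Editor's note: the calculate_sum body is cut off after the first step. Given file_outputs={"data": "/tmp/results.txt"} on add_two_numbers, each ContainerOp exposes that file as .output, so the remaining steps presumably chain as below (assumed continuation, mirroring the stock KFP sample):

# Assumed continuation of calculate_sum; first two sums run in parallel.
sum1 = add_two_numbers(a, b)
sum2 = add_two_numbers(c, d)
sum_result = add_two_numbers(sum1.output, sum2.output)
echo_task = echo_op(sum_result.output)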
diff --git a/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb b/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
index e3b32b6e..9fe4a191 100644
--- a/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
+++ b/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
@@ -50,6 +50,7 @@
"source": [
"# Restart the kernel to pick up pip installed libraries\n",
"from IPython.core.display import HTML\n",
+ "\n",
"HTML(\"\")"
]
},
@@ -85,13 +86,13 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_roles = boto3.client('iam').list_roles()['Roles']\n",
+ "iam_roles = boto3.client(\"iam\").list_roles()[\"Roles\"]\n",
"\n",
"for iam_role in iam_roles:\n",
- " if 'SageMakerExecutionRole' in iam_role['RoleName']:\n",
- " role = iam_role['Arn']\n",
+ " if \"SageMakerExecutionRole\" in iam_role[\"RoleName\"]:\n",
+ " role = iam_role[\"Arn\"]\n",
" break\n",
- "print('Role: {}'.format(role))"
+ "print(\"Role: {}\".format(role))"
]
},
{
@@ -107,7 +108,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'"
+ "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\""
]
},
{
@@ -116,7 +117,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n",
+ "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n",
"print(s3_private_path_tsv)"
]
},
@@ -137,7 +138,7 @@
"metadata": {},
"outputs": [],
"source": [
- "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)"
+ "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)"
]
},
{
@@ -165,7 +166,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_process_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml')"
+ "sagemaker_process_op = components.load_component_from_url(\n",
+ " \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml\"\n",
+ ")"
]
},
{
@@ -174,7 +177,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml')"
+ "sagemaker_train_op = components.load_component_from_url(\n",
+ " \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml\"\n",
+ ")"
]
},
{
@@ -183,7 +188,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml')"
+ "sagemaker_model_op = components.load_component_from_url(\n",
+ " \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml\"\n",
+ ")"
]
},
{
@@ -192,7 +199,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml')"
+ "sagemaker_deploy_op = components.load_component_from_url(\n",
+ " \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml\"\n",
+ ")"
]
},
{
@@ -210,7 +219,7 @@
},
"outputs": [],
"source": [
- "processing_code_s3_uri = 's3://{}/processing_code/preprocess-scikit-text-to-bert-feature-store.py'.format(bucket)\n",
+ "processing_code_s3_uri = \"s3://{}/processing_code/preprocess-scikit-text-to-bert-feature-store.py\".format(bucket)\n",
"print(processing_code_s3_uri)\n",
"\n",
"!aws s3 cp ./preprocess-scikit-text-to-bert-feature-store.py $processing_code_s3_uri"
@@ -238,7 +247,7 @@
"metadata": {},
"outputs": [],
"source": [
- "training_code_s3_uri = 's3://{}/training_code/sourcedir.tar.gz'.format(bucket)\n",
+ "training_code_s3_uri = \"s3://{}/training_code/sourcedir.tar.gz\".format(bucket)\n",
"print(training_code_s3_uri)\n",
"\n",
"!aws s3 cp sourcedir.tar.gz $training_code_s3_uri"
@@ -262,14 +271,11 @@
" },\n",
" }\n",
"\n",
+ "\n",
"def processing_output(output_name, s3_uri, local_path, s3_upload_mode):\n",
" return {\n",
" \"OutputName\": output_name,\n",
- " \"S3Output\": {\n",
- " \"LocalPath\": local_path, \n",
- " \"S3Uri\": s3_uri,\n",
- " \"S3UploadMode\": s3_upload_mode\n",
- " },\n",
+ " \"S3Output\": {\"LocalPath\": local_path, \"S3Uri\": s3_uri, \"S3UploadMode\": s3_upload_mode},\n",
" }"
]
},
@@ -284,9 +290,9 @@
" \"ChannelName\": input_name,\n",
" \"DataSource\": {\n",
" \"S3DataSource\": {\n",
- " \"S3Uri\": s3_uri, \n",
+ " \"S3Uri\": s3_uri,\n",
" \"S3DataType\": \"S3Prefix\",\n",
- " \"S3DataDistributionType\": s3_data_distribution_type \n",
+ " \"S3DataDistributionType\": s3_data_distribution_type,\n",
" }\n",
" },\n",
" }"
@@ -309,48 +315,45 @@
" name=\"BERT Pipeline\",\n",
" description=\"BERT Pipeline\",\n",
")\n",
- "def bert_pipeline(role=role, \n",
- " bucket=bucket, \n",
- " region=region,\n",
- " raw_input_data_s3_uri=raw_input_data_s3_uri):\n",
+ "def bert_pipeline(role=role, bucket=bucket, region=region, raw_input_data_s3_uri=raw_input_data_s3_uri):\n",
"\n",
" import time\n",
" import json\n",
- " \n",
- " pipeline_name = 'kubeflow-pipeline-sagemaker-{}'.format(int(time.time()))\n",
"\n",
- " network_isolation=False\n",
- " \n",
+ " pipeline_name = \"kubeflow-pipeline-sagemaker-{}\".format(int(time.time()))\n",
+ "\n",
+ " network_isolation = False\n",
+ "\n",
" ########################\n",
" # FEATURE ENGINEERING\n",
- " ######################## \n",
- " \n",
- " max_seq_length=64\n",
- " train_split_percentage=0.90\n",
- " validation_split_percentage=0.05\n",
- " test_split_percentage=0.05\n",
- " balance_dataset=True\n",
- "\n",
- " processed_train_data_s3_uri = 's3://{}/{}/processing/output/bert-train'.format(bucket, pipeline_name)\n",
- " processed_validation_data_s3_uri = 's3://{}/{}/processing/output/bert-validation'.format(bucket, pipeline_name)\n",
- " processed_test_data_s3_uri = 's3://{}/{}/processing/output/bert-test'.format(bucket, pipeline_name)\n",
- "\n",
- " processing_instance_type = 'ml.c5.2xlarge'\n",
+ " ########################\n",
+ "\n",
+ " max_seq_length = 64\n",
+ " train_split_percentage = 0.90\n",
+ " validation_split_percentage = 0.05\n",
+ " test_split_percentage = 0.05\n",
+ " balance_dataset = True\n",
+ "\n",
+ " processed_train_data_s3_uri = \"s3://{}/{}/processing/output/bert-train\".format(bucket, pipeline_name)\n",
+ " processed_validation_data_s3_uri = \"s3://{}/{}/processing/output/bert-validation\".format(bucket, pipeline_name)\n",
+ " processed_test_data_s3_uri = \"s3://{}/{}/processing/output/bert-test\".format(bucket, pipeline_name)\n",
+ "\n",
+ " processing_instance_type = \"ml.c5.2xlarge\"\n",
" processing_instance_count = 2\n",
- " \n",
+ "\n",
" timestamp = int(time.time())\n",
"\n",
- " feature_store_offline_prefix = 'reviews-feature-store-' + str(timestamp)\n",
- " feature_group_name = 'reviews-feature-group-' + str(timestamp)\n",
+ " feature_store_offline_prefix = \"reviews-feature-store-\" + str(timestamp)\n",
+ " feature_group_name = \"reviews-feature-group-\" + str(timestamp)\n",
"\n",
" # hard-coding to avoid the wrong ECR account id with create_image_uri()\n",
- " processing_image = '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3'\n",
- "# import sagemaker\n",
- "# processing_image = sagemaker.fw_utils.create_image_uri(framework='scikit-learn',\n",
- "# framework_version='0.23-1',\n",
- "# py_version='py3',\n",
- "# instance_type='ml.c5.9xlarge',\n",
- "# region='us-east-1') # hard-coding to avoid serialization issue\n",
+ " processing_image = \"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\"\n",
+ " # import sagemaker\n",
+ " # processing_image = sagemaker.fw_utils.create_image_uri(framework='scikit-learn',\n",
+ " # framework_version='0.23-1',\n",
+ " # py_version='py3',\n",
+ " # instance_type='ml.c5.9xlarge',\n",
+ " # region='us-east-1') # hard-coding to avoid serialization issue\n",
"\n",
" process = sagemaker_process_op(\n",
" role=role,\n",
@@ -359,15 +362,23 @@
" network_isolation=network_isolation,\n",
" instance_type=processing_instance_type,\n",
" instance_count=processing_instance_count,\n",
- " container_arguments=['--train-split-percentage', str(train_split_percentage),\n",
- " '--validation-split-percentage', str(validation_split_percentage),\n",
- " '--test-split-percentage', str(test_split_percentage),\n",
- " '--max-seq-length', str(max_seq_length),\n",
- " '--balance-dataset', str(balance_dataset),\n",
- " '--feature-store-offline-prefix', str(feature_store_offline_prefix),\n",
- " '--feature-group-name', str(feature_group_name)\n",
- " ], \n",
- " environment={'AWS_DEFAULT_REGION': 'us-east-1'}, # hard-coding to avoid serialization issue\n",
+ " container_arguments=[\n",
+ " \"--train-split-percentage\",\n",
+ " str(train_split_percentage),\n",
+ " \"--validation-split-percentage\",\n",
+ " str(validation_split_percentage),\n",
+ " \"--test-split-percentage\",\n",
+ " str(test_split_percentage),\n",
+ " \"--max-seq-length\",\n",
+ " str(max_seq_length),\n",
+ " \"--balance-dataset\",\n",
+ " str(balance_dataset),\n",
+ " \"--feature-store-offline-prefix\",\n",
+ " str(feature_store_offline_prefix),\n",
+ " \"--feature-group-name\",\n",
+ " str(feature_group_name),\n",
+ " ],\n",
+ " environment={\"AWS_DEFAULT_REGION\": \"us-east-1\"}, # hard-coding to avoid serialization issue\n",
" container_entrypoint=[\n",
" \"python3\",\n",
" \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert-feature-store.py\",\n",
@@ -377,13 +388,13 @@
" input_name=\"raw-input-data\",\n",
" s3_uri=\"{}\".format(raw_input_data_s3_uri),\n",
" local_path=\"/opt/ml/processing/input/data/\",\n",
- " s3_data_distribution_type=\"ShardedByS3Key\"\n",
+ " s3_data_distribution_type=\"ShardedByS3Key\",\n",
" ),\n",
" processing_input(\n",
" input_name=\"code\",\n",
" s3_uri=\"{}\".format(processing_code_s3_uri),\n",
" local_path=\"/opt/ml/processing/input/code\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
" ),\n",
" ],\n",
" output_config=[\n",
@@ -391,150 +402,146 @@
" output_name=\"bert-train\",\n",
" s3_uri=\"{}\".format(processed_train_data_s3_uri),\n",
" local_path=\"/opt/ml/processing/output/bert/train\",\n",
- " s3_upload_mode=\"EndOfJob\"\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
" ),\n",
" processing_output(\n",
" output_name=\"bert-validation\",\n",
" s3_uri=\"{}\".format(processed_validation_data_s3_uri),\n",
" local_path=\"/opt/ml/processing/output/bert/validation\",\n",
- " s3_upload_mode=\"EndOfJob\"\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
" ),\n",
" processing_output(\n",
" output_name=\"bert-test\",\n",
" s3_uri=\"{}\".format(processed_test_data_s3_uri),\n",
" local_path=\"/opt/ml/processing/output/bert/test\",\n",
- " s3_upload_mode=\"EndOfJob\"\n",
+ " s3_upload_mode=\"EndOfJob\",\n",
" ),\n",
" ],\n",
" )\n",
"\n",
- "\n",
" ########################\n",
" # TRAIN\n",
" ########################\n",
- " \n",
+ "\n",
" train_channels = [\n",
- " training_input(input_name=\"train\", \n",
- " s3_uri=processed_train_data_s3_uri,\n",
- " s3_data_distribution_type=\"ShardedByS3Key\"\n",
+ " training_input(\n",
+ " input_name=\"train\", s3_uri=processed_train_data_s3_uri, s3_data_distribution_type=\"ShardedByS3Key\"\n",
+ " ),\n",
+ " training_input(\n",
+ " input_name=\"validation\",\n",
+ " s3_uri=processed_validation_data_s3_uri,\n",
+ " s3_data_distribution_type=\"ShardedByS3Key\",\n",
+ " ),\n",
+ " training_input(\n",
+ " input_name=\"test\", s3_uri=processed_test_data_s3_uri, s3_data_distribution_type=\"ShardedByS3Key\"\n",
" ),\n",
- " training_input(input_name=\"validation\", \n",
- " s3_uri=processed_validation_data_s3_uri,\n",
- " s3_data_distribution_type=\"ShardedByS3Key\"\n",
- " ), \n",
- " training_input(input_name=\"test\", \n",
- " s3_uri=processed_test_data_s3_uri,\n",
- " s3_data_distribution_type=\"ShardedByS3Key\"\n",
- " )\n",
" ]\n",
"\n",
- " epochs=1\n",
- " learning_rate=0.00001\n",
- " epsilon=0.00000001\n",
- " train_batch_size=128\n",
- " validation_batch_size=128\n",
- " test_batch_size=128\n",
- " train_steps_per_epoch=100\n",
- " validation_steps=100\n",
- " test_steps=100\n",
- " train_volume_size=1024\n",
- " use_xla=True\n",
- " use_amp=True\n",
- " freeze_bert_layer=False\n",
- " enable_sagemaker_debugger=False\n",
- " enable_checkpointing=False\n",
- " enable_tensorboard=False\n",
- " input_mode='File'\n",
- " run_validation=True\n",
- " run_test=True\n",
- " run_sample_predictions=True\n",
- "\n",
- " train_instance_type='ml.c5.9xlarge' \n",
- " train_instance_count=1\n",
+ " epochs = 1\n",
+ " learning_rate = 0.00001\n",
+ " epsilon = 0.00000001\n",
+ " train_batch_size = 128\n",
+ " validation_batch_size = 128\n",
+ " test_batch_size = 128\n",
+ " train_steps_per_epoch = 100\n",
+ " validation_steps = 100\n",
+ " test_steps = 100\n",
+ " train_volume_size = 1024\n",
+ " use_xla = True\n",
+ " use_amp = True\n",
+ " freeze_bert_layer = False\n",
+ " enable_sagemaker_debugger = False\n",
+ " enable_checkpointing = False\n",
+ " enable_tensorboard = False\n",
+ " input_mode = \"File\"\n",
+ " run_validation = True\n",
+ " run_test = True\n",
+ " run_sample_predictions = True\n",
+ "\n",
+ " train_instance_type = \"ml.c5.9xlarge\"\n",
+ " train_instance_count = 1\n",
"\n",
" train_output_location = \"s3://{}/{}/output\".format(bucket, pipeline_name)\n",
- " \n",
- " hyperparameters={\n",
- " 'epochs': '{}'.format(epochs),\n",
- " 'learning_rate': '{}'.format(learning_rate),\n",
- " 'epsilon': '{}'.format(epsilon),\n",
- " 'train_batch_size': '{}'.format(train_batch_size),\n",
- " 'validation_batch_size': '{}'.format(validation_batch_size),\n",
- " 'test_batch_size': '{}'.format(test_batch_size), \n",
- " 'train_steps_per_epoch': '{}'.format(train_steps_per_epoch),\n",
- " 'validation_steps': '{}'.format(validation_steps),\n",
- " 'test_steps': '{}'.format(test_steps),\n",
- " 'use_xla': '{}'.format(use_xla),\n",
- " 'use_amp': '{}'.format(use_amp), \n",
- " 'max_seq_length': '{}'.format(max_seq_length),\n",
- " 'freeze_bert_layer': '{}'.format(freeze_bert_layer),\n",
- " 'enable_sagemaker_debugger': '{}'.format(enable_sagemaker_debugger),\n",
- " 'enable_checkpointing': '{}'.format(enable_checkpointing),\n",
- " 'enable_tensorboard': '{}'.format(enable_tensorboard), \n",
- " 'run_validation': '{}'.format(run_validation),\n",
- " 'run_test': '{}'.format(run_test),\n",
- " 'run_sample_predictions': '{}'.format(run_sample_predictions),\n",
- " 'model_dir': '{}'.format(train_output_location),\n",
- " 'sagemaker_program': 'tf_bert_reviews.py',\n",
- " 'sagemaker_region': '{}'.format(region),\n",
- " 'sagemaker_submit_directory': training_code_s3_uri\n",
+ "\n",
+ " hyperparameters = {\n",
+ " \"epochs\": \"{}\".format(epochs),\n",
+ " \"learning_rate\": \"{}\".format(learning_rate),\n",
+ " \"epsilon\": \"{}\".format(epsilon),\n",
+ " \"train_batch_size\": \"{}\".format(train_batch_size),\n",
+ " \"validation_batch_size\": \"{}\".format(validation_batch_size),\n",
+ " \"test_batch_size\": \"{}\".format(test_batch_size),\n",
+ " \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n",
+ " \"validation_steps\": \"{}\".format(validation_steps),\n",
+ " \"test_steps\": \"{}\".format(test_steps),\n",
+ " \"use_xla\": \"{}\".format(use_xla),\n",
+ " \"use_amp\": \"{}\".format(use_amp),\n",
+ " \"max_seq_length\": \"{}\".format(max_seq_length),\n",
+ " \"freeze_bert_layer\": \"{}\".format(freeze_bert_layer),\n",
+ " \"enable_sagemaker_debugger\": \"{}\".format(enable_sagemaker_debugger),\n",
+ " \"enable_checkpointing\": \"{}\".format(enable_checkpointing),\n",
+ " \"enable_tensorboard\": \"{}\".format(enable_tensorboard),\n",
+ " \"run_validation\": \"{}\".format(run_validation),\n",
+ " \"run_test\": \"{}\".format(run_test),\n",
+ " \"run_sample_predictions\": \"{}\".format(run_sample_predictions),\n",
+ " \"model_dir\": \"{}\".format(train_output_location),\n",
+ " \"sagemaker_program\": \"tf_bert_reviews.py\",\n",
+ " \"sagemaker_region\": \"{}\".format(region),\n",
+ " \"sagemaker_submit_directory\": training_code_s3_uri,\n",
" }\n",
" hyperparameters_json = json.dumps(hyperparameters)\n",
- " \n",
+ "\n",
" # metric_definitions='{\"val_acc\": \"val_accuracy: ([0-9\\\\\\\\.]+)\"}',\n",
" metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
" ]\n",
" metrics_definitions_json = json.dumps(metrics_definitions)\n",
" print(metrics_definitions_json)\n",
"\n",
- "\n",
" # .after(process) is explicitly appended below\n",
- " train_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04'.format(region) \n",
+ " train_image = \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04\".format(region)\n",
" training = sagemaker_train_op(\n",
" region=region,\n",
" image=train_image,\n",
- " network_isolation=network_isolation, \n",
+ " network_isolation=network_isolation,\n",
" instance_type=train_instance_type,\n",
" instance_count=train_instance_count,\n",
" hyperparameters=hyperparameters_json,\n",
- " training_input_mode=input_mode, \n",
- " channels=train_channels, \n",
+ " training_input_mode=input_mode,\n",
+ " channels=train_channels,\n",
" model_artifact_path=train_output_location,\n",
" # metric_definitions=metrics_definitions_json,\n",
" # TODO: Add rules\n",
- " role=role \n",
+ " role=role,\n",
" ).after(process)\n",
"\n",
- "\n",
" ########################\n",
" # DEPLOY\n",
" ########################\n",
- " \n",
+ "\n",
" # .after(training) is implied because we depend on training.outputs[]\n",
- " serve_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.3.1-cpu'.format(region)\n",
+ " serve_image = \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.3.1-cpu\".format(region)\n",
" create_model = sagemaker_model_op(\n",
" region=region,\n",
" model_name=training.outputs[\"job_name\"],\n",
" image=serve_image,\n",
- " network_isolation=network_isolation, \n",
+ " network_isolation=network_isolation,\n",
" model_artifact_url=training.outputs[\"model_artifact_url\"],\n",
- " role=role\n",
+ " role=role,\n",
" )\n",
"\n",
- " deploy_instance_type='ml.c5.9xlarge'\n",
- " deploy_instance_count=1\n",
+ " deploy_instance_type = \"ml.c5.9xlarge\"\n",
+ " deploy_instance_count = 1\n",
"\n",
" # .after(create_model) is implied because we depend on create_model.outputs\n",
" deploy_model = sagemaker_deploy_op(\n",
" region=region,\n",
- " variant_name_1='AllTraffic',\n",
+ " variant_name_1=\"AllTraffic\",\n",
" model_name_1=create_model.output,\n",
" instance_type_1=deploy_instance_type,\n",
- " initial_instance_count_1=deploy_instance_count \n",
+ " initial_instance_count_1=deploy_instance_count,\n",
" )"
]
},
@@ -551,7 +558,7 @@
"metadata": {},
"outputs": [],
"source": [
- "kfp.compiler.Compiler().compile(bert_pipeline, 'bert-pipeline.zip')"
+ "kfp.compiler.Compiler().compile(bert_pipeline, \"bert-pipeline.zip\")"
]
},
{
@@ -598,11 +605,9 @@
"source": [
"client = kfp.Client()\n",
"\n",
- "experiment = client.create_experiment(name='kubeflow')\n",
+ "experiment = client.create_experiment(name=\"kubeflow\")\n",
"\n",
- "my_run = client.run_pipeline(experiment.id, \n",
- " 'bert-pipeline', \n",
- " 'bert-pipeline.zip')"
+ "my_run = client.run_pipeline(experiment.id, \"bert-pipeline\", \"bert-pipeline.zip\")"
]
},
{
@@ -660,9 +665,9 @@
"source": [
"import boto3\n",
"\n",
- "sm_runtime = boto3.Session(region_name=region).client('sagemaker-runtime')\n",
+ "sm_runtime = boto3.Session(region_name=region).client(\"sagemaker-runtime\")\n",
"\n",
- "endpoint_name = ''"
+ "endpoint_name = \"\""
]
},
{
@@ -673,28 +678,25 @@
"source": [
"import json\n",
"\n",
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "] \n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"response = sm_runtime.invoke_endpoint(\n",
- " EndpointName=endpoint_name,\n",
- " ContentType='application/jsonlines', \n",
- " Accept='application/jsonlines', \n",
- " Body=json.dumps(inputs).encode('utf-8')\n",
+ " EndpointName=endpoint_name,\n",
+ " ContentType=\"application/jsonlines\",\n",
+ " Accept=\"application/jsonlines\",\n",
+ " Body=json.dumps(inputs).encode(\"utf-8\"),\n",
")\n",
- "print('response: {}'.format(response))\n",
+ "print(\"response: {}\".format(response))\n",
"\n",
- "predicted_classes_str = response['Body'].read().decode()\n",
+ "predicted_classes_str = response[\"Body\"].read().decode()\n",
"predicted_classes_json = json.loads(predicted_classes_str)\n",
"\n",
"predicted_classes = predicted_classes_json.splitlines()\n",
- "print('predicted_classes: {}'.format(predicted_classes))\n",
+ "print(\"predicted_classes: {}\".format(predicted_classes))\n",
"\n",
"for predicted_class_json, input_data in zip(predicted_classes, inputs):\n",
- " predicted_class = json.loads(predicted_class_json)['predicted_label']\n",
- " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0])) "
+ " predicted_class = json.loads(predicted_class_json)[\"predicted_label\"]\n",
+ " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0]))"
]
},
{
diff --git a/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb b/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
index 86a70c1e..8e6b432c 100644
--- a/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
+++ b/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
@@ -42,7 +42,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
diff --git a/10_pipeline/kubeflow/code/inference.py b/10_pipeline/kubeflow/code/inference.py
index 2975dc2d..53196737 100644
--- a/10_pipeline/kubeflow/code/inference.py
+++ b/10_pipeline/kubeflow/code/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
-max_seq_length=64
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e., 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
diff --git a/10_pipeline/kubeflow/code/tf_bert_reviews.py b/10_pipeline/kubeflow/code/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/10_pipeline/kubeflow/code/tf_bert_reviews.py
+++ b/10_pipeline/kubeflow/code/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
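+    # (the 100x over-repeat above keeps the dataset iterator from exhausting; steps_per_epoch in model.fit bounds what is actually consumed)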
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
-    dataset = dataset.shuffle(buffer_size=1000,
-                              reshuffle_each_iteration=True)
+    # dataset.cache()
+    dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
-    print('loaded_model {}'.format(loaded_model))
-    print('initial_epoch_number {}'.format(initial_epoch_number))
-
+    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+    print("loaded_model {}".format(loaded_model))
+    print("initial_epoch_number {}".format(initial_epoch_number))
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
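+    # (SM_INPUT_DATA_CONFIG embeds each channel's TrainingInputMode, so a substring check for "Pipe" is sufficient here)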
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to incompatibility with transformers library (possibly)
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
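+            # (bare except: any failure while downloading or building the model triggers another retry with a random backoff)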
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
    # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(inference_path))
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
-        return prediction[0]['label']
-    print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
+        return prediction[0]["label"]
+
+    print(
+        """I loved it! I will recommend this to everyone.""",
+        predict("""I loved it! I will recommend this to everyone."""),
+    )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
print(classification_report(y_true=y_test, y_pred=y_actual))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+ accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/10_pipeline/kubeflow/evaluate_model_metrics.py b/10_pipeline/kubeflow/evaluate_model_metrics.py
index 024afdec..f3523174 100644
--- a/10_pipeline/kubeflow/evaluate_model_metrics.py
+++ b/10_pipeline/kubeflow/evaluate_model_metrics.py
@@ -4,13 +4,16 @@
from datetime import datetime
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
import pandas as pd
import os
import re
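These `check_call` lines install pinned dependencies into the running container before the matching imports execute. If the pattern keeps growing, a small helper keeps it readable; a sketch, not part of this patch:

import subprocess
import sys

def pip_install(package):
    # Install a pinned package into the current interpreter's environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

pip_install("matplotlib==3.2.1")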
@@ -33,99 +36,99 @@
from sklearn.utils import resample
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
CLASSES = [1, 2, 3, 4, 5]
-config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
+config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+)
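With this config, `id2label` translates the model's argmax class index (0-4) back into a star rating (1-5), which is exactly what `predict()` relies on below. A toy illustration:

import numpy as np

id2label = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}  # class index -> star rating
logits = np.array([0.1, 0.2, 0.1, 0.1, 0.5])  # toy scores for one review
print(id2label[logits.argmax()])  # -> 5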
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--input-model', type=str,
- default='/opt/ml/processing/input/model',
+ parser.add_argument(
+ "--input-model",
+ type=str,
+ default="/opt/ml/processing/input/model",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
-
+ )
+
return parser.parse_args()
-
+
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- print('input_data: {}'.format(args.input_data))
- print('input_model: {}'.format(args.input_model))
-
- print('Listing contents of input model dir: {}'.format(args.input_model))
+ print("Current host: {}".format(args.current_host))
+
+ print("input_data: {}".format(args.input_data))
+ print("input_model: {}".format(args.input_model))
+
+ print("Listing contents of input model dir: {}".format(args.input_model))
input_files = os.listdir(args.input_model)
for file in input_files:
print(file)
- model_tar_path = '{}/model.tar.gz'.format(args.input_model)
+ model_tar_path = "{}/model.tar.gz".format(args.input_model)
model_tar = tarfile.open(model_tar_path)
model_tar.extractall(args.input_model)
- model_tar.close()
+ model_tar.close()
- model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model))
+ model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model))
print(model)
-
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=args.max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -133,81 +136,86 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
-
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
###########################################################################################
# TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
- ###########################################################################################
-# evaluation_data_path = '/opt/ml/processing/input/data/'
-
- print('Listing contents of input data dir: {}'.format(args.input_data))
+ ###########################################################################################
+ # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+ print("Listing contents of input data dir: {}".format(args.input_data))
input_files = os.listdir(args.input_data)
- test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
- print('Using only {} to evaluate.'.format(test_data_path))
- df_test_reviews = pd.read_csv(test_data_path,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+ print("Using only {} to evaluate.".format(test_data_path))
+ df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+ ["review_body", "star_rating"]
+ ]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
- y_test = df_test_reviews['review_body'].map(predict)
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
- y_actual = df_test_reviews['star_rating']
+ y_actual = df_test_reviews["star_rating"]
y_actual
    print(classification_report(y_true=y_actual, y_pred=y_test))
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
def plot_conf_mat(cm, classes, title, cmap):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=CLASSES,
- title='Confusion Matrix',
- cmap=plt.cm.Greens)
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
- # Model Output
- metrics_path = os.path.join(args.output_data, 'metrics/')
+ # Model Output
+ metrics_path = os.path.join(args.output_data, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
report_dict = {
"metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
evaluation_path = "{}/evaluation.json".format(metrics_path)
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))
-
- print('Listing contents of output dir: {}'.format(args.output_data))
+
+ print("Listing contents of output dir: {}".format(args.output_data))
output_files = os.listdir(args.output_data)
for file in output_files:
print(file)
- print('Listing contents of output/metrics dir: {}'.format(metrics_path))
- output_files = os.listdir('{}'.format(metrics_path))
+ print("Listing contents of output/metrics dir: {}".format(metrics_path))
+ output_files = os.listdir("{}".format(metrics_path))
for file in output_files:
print(file)
- print('Complete')
-
-
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
- process(args)
+ process(args)
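For context, a script like this is typically launched as a SageMaker Processing job whose container paths match the argparse defaults above. A hedged sketch; the image URI, S3 paths, and instance type are placeholders, not values from this patch:

from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

processor = ScriptProcessor(
    role=role,  # assumes an execution role is already in scope
    image_uri="<processing-image-uri>",  # placeholder
    command=["python3"],
    instance_count=1,
    instance_type="ml.c5.2xlarge",
)
processor.run(
    code="evaluate_model_metrics.py",
    inputs=[
        ProcessingInput(source="s3://<bucket>/test-data", destination="/opt/ml/processing/input/data"),
        ProcessingInput(source="s3://<bucket>/model", destination="/opt/ml/processing/input/model"),
    ],
    outputs=[ProcessingOutput(source="/opt/ml/processing/output")],
    arguments=["--max-seq-length", "64"],
)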
diff --git a/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py
+++ b/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess
## PIP INSTALLS ##
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
+# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
# use Anaconda, and Anaconda only supports 2.3.0 at this time
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"])
import pandas as pd
import re
import sagemaker
@@ -40,51 +42,55 @@
FeatureTypeEnum,
)
-region = os.environ['AWS_DEFAULT_REGION']
-print('Region: {}'.format(region))
+region = os.environ["AWS_DEFAULT_REGION"]
+print("Region: {}".format(region))
#############################
## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc.
## Role and Bucket are malformed if we do this later.
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region)
+sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region)
caller_identity = sts.get_caller_identity()
-print('caller_identity: {}'.format(caller_identity))
+print("caller_identity: {}".format(caller_identity))
-assumed_role_arn = caller_identity['Arn']
-print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn))
+assumed_role_arn = caller_identity["Arn"]
+print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn))
-assumed_role_name = assumed_role_arn.split('/')[-2]
+assumed_role_name = assumed_role_arn.split("/")[-2]
-iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region)
-get_role_response = iam.get_role(RoleName=assumed_role_name)
-print('get_role_response {}'.format(get_role_response))
-role = get_role_response['Role']['Arn']
-print('role {}'.format(role))
+iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region)
+get_role_response = iam.get_role(RoleName=assumed_role_name)
+print("get_role_response {}".format(get_role_response))
+role = get_role_response["Role"]["Arn"]
+print("role {}".format(role))
bucket = sagemaker.Session().default_bucket()
-print('The DEFAULT BUCKET is {}'.format(bucket))
+print("The DEFAULT BUCKET is {}".format(bucket))
#############################
-sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region)
+sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region)
-featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region)
+featurestore_runtime = boto3.Session(region_name=region).client(
+ service_name="sagemaker-featurestore-runtime", region_name=region
+)
-s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region)
+s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region)
-sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region),
- sagemaker_client=sm,
- sagemaker_featurestore_runtime_client=featurestore_runtime)
+sagemaker_session = sagemaker.Session(
+ boto_session=boto3.Session(region_name=region),
+ sagemaker_client=sm,
+ sagemaker_featurestore_runtime_client=featurestore_runtime,
+)
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-REVIEW_BODY_COLUMN = 'review_body'
-REVIEW_ID_COLUMN = 'review_id'
+REVIEW_BODY_COLUMN = "review_body"
+REVIEW_ID_COLUMN = "review_id"
# DATE_COLUMN = 'date'
-LABEL_COLUMN = 'star_rating'
+LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]
-
+
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
label_map[label] = i
@@ -92,94 +98,88 @@
def cast_object_to_string(data_frame):
for label in data_frame.columns:
- if data_frame.dtypes[label] == 'object':
+ if data_frame.dtypes[label] == "object":
data_frame[label] = data_frame[label].astype("str").astype("string")
return data_frame
-
+
def wait_for_feature_group_creation_complete(feature_group):
try:
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
while status == "Creating":
print("Waiting for Feature Group Creation")
time.sleep(5)
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
if status != "Created":
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
raise RuntimeError(f"Failed to create feature group {feature_group.name}")
print(f"FeatureGroup {feature_group.name} successfully created.")
except:
- print('No feature group created yet.')
-
-
+ print("No feature group created yet.")
+
+
def create_or_load_feature_group(prefix, feature_group_name):
# Feature Definitions for our records
- feature_definitions= [
- FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),
- FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),
-# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING)
+ feature_definitions = [
+ FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
+ FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
+ # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]
-
+
feature_group = FeatureGroup(
- name=feature_group_name,
- feature_definitions=feature_definitions,
- sagemaker_session=sagemaker_session)
-
- print('Feature Group: {}'.format(feature_group))
-
- try:
- print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+ name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+ )
+
+ print("Feature Group: {}".format(feature_group))
+
+ try:
+ print(
+ "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+ )
wait_for_feature_group_creation_complete(feature_group)
except Exception as e:
- print('Before CREATE FG wait exeption: {}'.format(e))
-# pass
-
+ print("Before CREATE FG wait exeption: {}".format(e))
+ # pass
+
try:
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"
-
- print('Creating Feature Group with role {}...'.format(role))
+
+ print("Creating Feature Group with role {}...".format(role))
feature_group.create(
s3_uri=f"s3://{bucket}/{prefix}",
record_identifier_name=record_identifier_feature_name,
event_time_feature_name=event_time_feature_name,
role_arn=role,
- enable_online_store=True
+ enable_online_store=True,
)
- print('Creating Feature Group. Completed.')
-
- print('Waiting for new Feature Group to become available...')
+ print("Creating Feature Group. Completed.")
+
+ print("Waiting for new Feature Group to become available...")
wait_for_feature_group_creation_complete(feature_group)
- print('Feature Group available.')
+ print("Feature Group available.")
feature_group.describe()
-
+
except Exception as e:
- print('Exception: {}'.format(e))
-
+ print("Exception: {}".format(e))
+
return feature_group
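A minimal usage sketch of the helper above; the prefix and feature group name here are hypothetical:

feature_group = create_or_load_feature_group(
    prefix="reviews-feature-store",  # hypothetical offline-store prefix
    feature_group_name="reviews-feature-group",  # hypothetical group name
)
print(feature_group.describe().get("FeatureGroupStatus"))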
-
+
class InputFeatures(object):
- """BERT feature vectors."""
-
- def __init__(self,
- input_ids,
- input_mask,
- segment_ids,
- label_id,
- review_id,
- date,
- label):
-# review_body):
+ """BERT feature vectors."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+ # review_body):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
self.review_id = review_id
self.date = date
self.label = label
+
+
# self.review_body = review_body
-
-
+
+
class Input(object):
- """A single training/test input for sequence classification."""
-
- def __init__(self, text, review_id, date, label=None):
- """Constructs an Input.
- Args:
- text: string. The untokenized text of the first sequence. For single
- sequence tasks, only this sequence must be specified.
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.text = text
- self.review_id = review_id
- self.date = date
- self.label = label
-
-
+ """A single training/test input for sequence classification."""
+
+ def __init__(self, text, review_id, date, label=None):
+ """Constructs an Input.
+ Args:
+ text: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.text = text
+ self.review_id = review_id
+ self.date = date
+ self.label = label
+
+
def convert_input(the_input, max_seq_length):
# First, we need to preprocess our data so that it matches the data BERT was trained on:
#
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- #
+ #
# Fortunately, the Transformers tokenizer does this for us!
#
- tokens = tokenizer.tokenize(the_input.text)
+ tokens = tokenizer.tokenize(the_input.text)
# Next, we need to do the following:
#
@@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length):
#
# Again, the Transformers tokenizer does this for us!
#
- encode_plus_tokens = tokenizer.encode_plus(the_input.text,
- pad_to_max_length=True,
- max_length=max_seq_length,
-# truncation=True
- )
+ encode_plus_tokens = tokenizer.encode_plus(
+ the_input.text,
+ pad_to_max_length=True,
+ max_length=max_seq_length,
+ # truncation=True
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
-
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ input_ids = encode_plus_tokens["input_ids"]
+
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
# Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
segment_ids = [0] * max_seq_length
@@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length):
label_id=label_id,
review_id=the_input.review_id,
date=the_input.date,
- label=the_input.label)
-# review_body=the_input.text)
-
-# print('**input_ids**\n{}\n'.format(features.input_ids))
-# print('**input_mask**\n{}\n'.format(features.input_mask))
-# print('**segment_ids**\n{}\n'.format(features.segment_ids))
-# print('**label_id**\n{}\n'.format(features.label_id))
-# print('**review_id**\n{}\n'.format(features.review_id))
-# print('**date**\n{}\n'.format(features.date))
-# print('**label**\n{}\n'.format(features.label))
-# print('**review_body**\n{}\n'.format(features.review_body))
+ label=the_input.label,
+ )
+ # review_body=the_input.text)
+
+ # print('**input_ids**\n{}\n'.format(features.input_ids))
+ # print('**input_mask**\n{}\n'.format(features.input_mask))
+ # print('**segment_ids**\n{}\n'.format(features.segment_ids))
+ # print('**label_id**\n{}\n'.format(features.label_id))
+ # print('**review_id**\n{}\n'.format(features.review_id))
+ # print('**date**\n{}\n'.format(features.date))
+ # print('**label**\n{}\n'.format(features.label))
+ # print('**review_body**\n{}\n'.format(features.review_body))
return features
-def transform_inputs_to_tfrecord(inputs,
- output_file,
- max_seq_length):
+def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
"""Convert a set of `Input`s to a TFRecord file."""
records = []
tf_record_writer = tf.io.TFRecordWriter(output_file)
-
+
for (input_idx, the_input) in enumerate(inputs):
if input_idx % 10000 == 0:
- print('Writing input {} of {}\n'.format(input_idx, len(inputs)))
+ print("Writing input {} of {}\n".format(input_idx, len(inputs)))
features = convert_input(the_input, max_seq_length)
all_features = collections.OrderedDict()
- all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
- all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
- all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
- all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
+ all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
+ all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
+ all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
+ all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
tf_record_writer.write(tf_record.SerializeToString())
- records.append({#'tf_record': tf_record.SerializeToString(),
- 'input_ids': features.input_ids,
- 'input_mask': features.input_mask,
- 'segment_ids': features.segment_ids,
- 'label_id': features.label_id,
- 'review_id': the_input.review_id,
- 'date': the_input.date,
- 'label': features.label,
-# 'review_body': features.review_body
- })
+ records.append(
+ { #'tf_record': tf_record.SerializeToString(),
+ "input_ids": features.input_ids,
+ "input_mask": features.input_mask,
+ "segment_ids": features.segment_ids,
+ "label_id": features.label_id,
+ "review_id": the_input.review_id,
+ "date": the_input.date,
+ "label": features.label,
+ # 'review_body': features.review_body
+ }
+ )
#####################################
####### TODO: REMOVE THIS BREAK #######
- #####################################
+ #####################################
# break
-
+
tf_record_writer.close()
-
+
return records
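The TFRecords written above can be read back with `tf.data` using a feature spec that mirrors the four features serialized in `transform_inputs_to_tfrecord`. A sketch, assuming `max_seq_length=64` and a hypothetical file name:

import tensorflow as tf

feature_spec = {
    "input_ids": tf.io.FixedLenFeature([64], tf.int64),
    "input_mask": tf.io.FixedLenFeature([64], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([64], tf.int64),
    "label_ids": tf.io.FixedLenFeature([], tf.int64),  # one label per record
}

def parse_record(record):
    return tf.io.parse_single_example(record, feature_spec)

dataset = tf.data.TFRecordDataset(["part-host-1-example.tfrecord"]).map(parse_record)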
-
+
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--train-split-percentage', type=float,
+ parser.add_argument(
+ "--train-split-percentage",
+ type=float,
default=0.90,
)
- parser.add_argument('--validation-split-percentage', type=float,
- default=0.05,
- )
- parser.add_argument('--test-split-percentage', type=float,
+ parser.add_argument(
+ "--validation-split-percentage",
+ type=float,
default=0.05,
)
- parser.add_argument('--balance-dataset', type=eval,
- default=True
+ parser.add_argument(
+ "--test-split-percentage",
+ type=float,
+ default=0.05,
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument("--balance-dataset", type=eval, default=True)
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
- parser.add_argument('--feature-store-offline-prefix', type=str,
+ )
+ parser.add_argument(
+ "--feature-store-offline-prefix",
+ type=str,
default=None,
- )
- parser.add_argument('--feature-group-name', type=str,
+ )
+ parser.add_argument(
+ "--feature-group-name",
+ type=str,
default=None,
- )
-
+ )
+
return parser.parse_args()
-
-def _transform_tsv_to_tfrecord(file,
- max_seq_length,
- balance_dataset,
- prefix,
- feature_group_name):
- print('file {}'.format(file))
- print('max_seq_length {}'.format(max_seq_length))
- print('balance_dataset {}'.format(balance_dataset))
- print('prefix {}'.format(prefix))
- print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+ print("file {}".format(file))
+ print("max_seq_length {}".format(max_seq_length))
+ print("balance_dataset {}".format(balance_dataset))
+ print("prefix {}".format(prefix))
+ print("feature_group_name {}".format(feature_group_name))
    # Need to re-load here since the feature_group object can't be passed through functools.partial for some reason
feature_group = create_or_load_feature_group(prefix, feature_group_name)
-
+
filename_without_extension = Path(Path(file).stem).stem
- df = pd.read_csv(file,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')
+ df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")
df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)
- print('Shape of dataframe {}'.format(df.shape))
+ print("Shape of dataframe {}".format(df.shape))
- if balance_dataset:
+ if balance_dataset:
# Balance the dataset down to the minority class
from sklearn.utils import resample
- five_star_df = df.query('star_rating == 5')
- four_star_df = df.query('star_rating == 4')
- three_star_df = df.query('star_rating == 3')
- two_star_df = df.query('star_rating == 2')
- one_star_df = df.query('star_rating == 1')
-
- minority_count = min(five_star_df.shape[0],
- four_star_df.shape[0],
- three_star_df.shape[0],
- two_star_df.shape[0],
- one_star_df.shape[0])
-
- five_star_df = resample(five_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- four_star_df = resample(four_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- three_star_df = resample(three_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- two_star_df = resample(two_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- one_star_df = resample(one_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
+ five_star_df = df.query("star_rating == 5")
+ four_star_df = df.query("star_rating == 4")
+ three_star_df = df.query("star_rating == 3")
+ two_star_df = df.query("star_rating == 2")
+ one_star_df = df.query("star_rating == 1")
+
+ minority_count = min(
+ five_star_df.shape[0],
+ four_star_df.shape[0],
+ three_star_df.shape[0],
+ two_star_df.shape[0],
+ one_star_df.shape[0],
+ )
+
+ five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)
df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])
- df_balanced = df_balanced.reset_index(drop=True)
- print('Shape of balanced dataframe {}'.format(df_balanced.shape))
- print(df_balanced['star_rating'].head(100))
+ df_balanced = df_balanced.reset_index(drop=True)
+ print("Shape of balanced dataframe {}".format(df_balanced.shape))
+ print(df_balanced["star_rating"].head(100))
df = df_balanced
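After the concat above, every star rating should occur exactly `minority_count` times; a quick sanity check one could add at this point:

print(df_balanced["star_rating"].value_counts())
assert df_balanced["star_rating"].value_counts().nunique() == 1  # all five classes equal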
-
- print('Shape of dataframe before splitting {}'.format(df.shape))
-
- print('train split percentage {}'.format(args.train_split_percentage))
- print('validation split percentage {}'.format(args.validation_split_percentage))
- print('test split percentage {}'.format(args.test_split_percentage))
-
+
+ print("Shape of dataframe before splitting {}".format(df.shape))
+
+ print("train split percentage {}".format(args.train_split_percentage))
+ print("validation split percentage {}".format(args.validation_split_percentage))
+ print("test split percentage {}".format(args.test_split_percentage))
+
holdout_percentage = 1.00 - args.train_split_percentage
- print('holdout percentage {}'.format(holdout_percentage))
- df_train, df_holdout = train_test_split(df,
- test_size=holdout_percentage,
- stratify=df['star_rating'])
+ print("holdout percentage {}".format(holdout_percentage))
+ df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])
test_holdout_percentage = args.test_split_percentage / holdout_percentage
- print('test holdout percentage {}'.format(test_holdout_percentage))
- df_validation, df_test = train_test_split(df_holdout,
- test_size=test_holdout_percentage,
- stratify=df_holdout['star_rating'])
-
+ print("test holdout percentage {}".format(test_holdout_percentage))
+ df_validation, df_test = train_test_split(
+ df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
+ )
+
df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
- print('Shape of train dataframe {}'.format(df_train.shape))
- print('Shape of validation dataframe {}'.format(df_validation.shape))
- print('Shape of test dataframe {}'.format(df_test.shape))
+ print("Shape of train dataframe {}".format(df_train.shape))
+ print("Shape of validation dataframe {}".format(df_validation.shape))
+ print("Shape of test dataframe {}".format(df_test.shape))
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)
- train_inputs = df_train.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- validation_inputs = df_validation.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- test_inputs = df_test.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
+ train_inputs = df_train.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ validation_inputs = df_validation.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ test_inputs = df_test.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
# Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
- #
- #
+ #
+ #
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
- #
+ #
# We don't have to worry about these details. The Transformers tokenizer does this for us.
- #
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
+ #
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
# Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow.
- train_records = transform_inputs_to_tfrecord(train_inputs,
- '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- validation_records = transform_inputs_to_tfrecord(validation_inputs,
- '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- test_records = transform_inputs_to_tfrecord(test_inputs,
- '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension),
- max_seq_length)
-
+ train_records = transform_inputs_to_tfrecord(
+ train_inputs,
+ "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ validation_records = transform_inputs_to_tfrecord(
+ validation_inputs,
+ "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ test_records = transform_inputs_to_tfrecord(
+ test_inputs,
+ "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
df_train_records = pd.DataFrame.from_dict(train_records)
- df_train_records['split_type'] = 'train'
- df_train_records.head()
-
+ df_train_records["split_type"] = "train"
+ df_train_records.head()
+
df_validation_records = pd.DataFrame.from_dict(validation_records)
- df_validation_records['split_type'] = 'validation'
- df_validation_records.head()
+ df_validation_records["split_type"] = "validation"
+ df_validation_records.head()
df_test_records = pd.DataFrame.from_dict(test_records)
- df_test_records['split_type'] = 'test'
- df_test_records.head()
-
- # Add record to feature store
+ df_test_records["split_type"] = "test"
+ df_test_records.head()
+
+ # Add record to feature store
df_fs_train_records = cast_object_to_string(df_train_records)
df_fs_validation_records = cast_object_to_string(df_validation_records)
df_fs_test_records = cast_object_to_string(df_test_records)
- print('Ingesting Features...')
- feature_group.ingest(
- data_frame=df_fs_train_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_validation_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_test_records, max_workers=3, wait=True
- )
- print('Feature ingest completed.')
+ print("Ingesting Features...")
+ feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True)
+ print("Feature ingest completed.")
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
+ print("Current host: {}".format(args.current_host))
+
+ feature_group = create_or_load_feature_group(
+ prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name
+ )
feature_group.describe()
-
+
print(feature_group.as_hive_ddl())
-
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
-
- transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord,
- max_seq_length=args.max_seq_length,
- balance_dataset=args.balance_dataset,
- prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
-
- input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data))
+
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
+
+ transform_tsv_to_tfrecord = functools.partial(
+ _transform_tsv_to_tfrecord,
+ max_seq_length=args.max_seq_length,
+ balance_dataset=args.balance_dataset,
+ prefix=args.feature_store_offline_prefix,
+ feature_group_name=args.feature_group_name,
+ )
+
+ input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
num_cpus = multiprocessing.cpu_count()
- print('num_cpus {}'.format(num_cpus))
+ print("num_cpus {}".format(num_cpus))
p = multiprocessing.Pool(num_cpus)
p.map(transform_tsv_to_tfrecord, input_files)
- print('Listing contents of {}'.format(args.output_data))
+ print("Listing contents of {}".format(args.output_data))
dirs_output = os.listdir(args.output_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(train_data))
+ print("Listing contents of {}".format(train_data))
dirs_output = os.listdir(train_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(validation_data))
+ print("Listing contents of {}".format(validation_data))
dirs_output = os.listdir(validation_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(test_data))
+ print("Listing contents of {}".format(test_data))
dirs_output = os.listdir(test_data)
for file in dirs_output:
print(file)
-
+
offline_store_contents = None
- while (offline_store_contents is None):
- objects_in_bucket = s3.list_objects(Bucket=bucket,
- Prefix=args.feature_store_offline_prefix)
- if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
- offline_store_contents = objects_in_bucket['Contents']
+ while offline_store_contents is None:
+ objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix)
+ if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
+ offline_store_contents = objects_in_bucket["Contents"]
else:
- print('Waiting for data in offline store...\n')
+ print("Waiting for data in offline store...\n")
sleep(60)
- print('Data available.')
-
- print('Complete')
-
-
+ print("Data available.")
+
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
process(args)
diff --git a/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb b/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb
index cfef442d..1914e62f 100644
--- a/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb
+++ b/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb
@@ -26,16 +26,16 @@
"import pandas as pd\n",
"from pprint import pprint\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "sc = boto3.Session().client(service_name='servicecatalog', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "codepipeline = boto3.Session().client('codepipeline', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "sc = boto3.Session().client(service_name=\"servicecatalog\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "codepipeline = boto3.Session().client(\"codepipeline\", region_name=region)"
]
},
{
@@ -45,15 +45,10 @@
"outputs": [],
"source": [
"search_response = sc.search_products(\n",
- " Filters={\n",
- " 'FullTextSearch': \n",
- " [\n",
- " 'MLOps template for model building, training, and deployment'\n",
- " ]\n",
- " }\n",
+ " Filters={\"FullTextSearch\": [\"MLOps template for model building, training, and deployment\"]}\n",
")\n",
"\n",
- "sagemaker_pipeline_product_id = search_response['ProductViewSummaries'][0]['ProductId']\n",
+ "sagemaker_pipeline_product_id = search_response[\"ProductViewSummaries\"][0][\"ProductId\"]\n",
"print(sagemaker_pipeline_product_id)"
]
},
@@ -65,7 +60,7 @@
"source": [
"describe_response = sc.describe_product(Id=sagemaker_pipeline_product_id)\n",
"\n",
- "sagemaker_pipeline_product_provisioning_artifact_id = describe_response['ProvisioningArtifacts'][0]['Id']"
+ "sagemaker_pipeline_product_provisioning_artifact_id = describe_response[\"ProvisioningArtifacts\"][0][\"Id\"]"
]
},
{
@@ -101,22 +96,22 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_project_name = 'dsoaws-{}'.format(timestamp)\n",
+ "sagemaker_project_name = \"dsoaws-{}\".format(timestamp)\n",
"\n",
"create_response = sm.create_project(\n",
" ProjectName=sagemaker_project_name,\n",
- " ProjectDescription='dsoaws-{}'.format(timestamp),\n",
+ " ProjectDescription=\"dsoaws-{}\".format(timestamp),\n",
" ServiceCatalogProvisioningDetails={\n",
- " 'ProductId': sagemaker_pipeline_product_id,\n",
- " 'ProvisioningArtifactId': sagemaker_pipeline_product_provisioning_artifact_id\n",
- " }\n",
+ " \"ProductId\": sagemaker_pipeline_product_id,\n",
+ " \"ProvisioningArtifactId\": sagemaker_pipeline_product_provisioning_artifact_id,\n",
+ " },\n",
")\n",
"\n",
- "sagemaker_project_id = create_response['ProjectId']\n",
- "sagemaker_project_arn = create_response['ProjectArn']\n",
+ "sagemaker_project_id = create_response[\"ProjectId\"]\n",
+ "sagemaker_project_arn = create_response[\"ProjectArn\"]\n",
"\n",
- "print('Project ID {}'.format(sagemaker_project_id))\n",
- "print('Project ARN {}'.format(sagemaker_project_arn))"
+ "print(\"Project ID {}\".format(sagemaker_project_id))\n",
+ "print(\"Project ARN {}\".format(sagemaker_project_arn))"
]
},
{
@@ -125,9 +120,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_project_name_and_id = '{}-{}'.format(sagemaker_project_name, sagemaker_project_id)\n",
+ "sagemaker_project_name_and_id = \"{}-{}\".format(sagemaker_project_name, sagemaker_project_id)\n",
"\n",
- "print('Combined Project ID and ARN combined: {}'.format(sagemaker_project_name_and_id))"
+ "print(\"Combined Project ID and ARN combined: {}\".format(sagemaker_project_name_and_id))"
]
},
{
@@ -149,26 +144,26 @@
"\n",
"try:\n",
" describe_project_response = sm.describe_project(ProjectName=sagemaker_project_name)\n",
- " project_status = describe_project_response['ProjectStatus']\n",
- " print('Creating Project...')\n",
+ " project_status = describe_project_response[\"ProjectStatus\"]\n",
+ " print(\"Creating Project...\")\n",
"\n",
- " while project_status in ['Pending', 'CreateInProgress']:\n",
- " print('Please wait...')\n",
+ " while project_status in [\"Pending\", \"CreateInProgress\"]:\n",
+ " print(\"Please wait...\")\n",
" time.sleep(30)\n",
" describe_project_response = sm.describe_project(ProjectName=sagemaker_project_name)\n",
- " project_status = describe_project_response['ProjectStatus']\n",
- " print('Project status: {}'.format(project_status))\n",
+ " project_status = describe_project_response[\"ProjectStatus\"]\n",
+ " print(\"Project status: {}\".format(project_status))\n",
"\n",
- " if project_status == 'CreateCompleted': \n",
- " print('Project {}'.format(project_status))\n",
+ " if project_status == \"CreateCompleted\":\n",
+ " print(\"Project {}\".format(project_status))\n",
"\n",
" else:\n",
- " print('Project status: {}'.format(project_status))\n",
- " raise Exception('Project not created.')\n",
- " \n",
+ " print(\"Project status: {}\".format(project_status))\n",
+ " raise Exception(\"Project not created.\")\n",
+ "\n",
"except Exception as e:\n",
" print(e)\n",
- " \n",
+ "\n",
"print(describe_project_response)"
]
},
@@ -193,7 +188,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sc_role_name='AmazonSageMakerServiceCatalogProductsUseRole'"
+ "sc_role_name = \"AmazonSageMakerServiceCatalogProductsUseRole\""
]
},
{
@@ -202,7 +197,7 @@
"metadata": {},
"outputs": [],
"source": [
- "account_id = sts.get_caller_identity()['Account']\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"print(account_id)"
]
},
@@ -212,10 +207,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.attach_role_policy(\n",
- " RoleName=sc_role_name,\n",
- " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n",
- ")\n",
+ "response = iam.attach_role_policy(RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/AmazonSageMakerFullAccess\")\n",
"\n",
"print(response)"
]
@@ -229,8 +221,7 @@
"outputs": [],
"source": [
"response = iam.attach_role_policy(\n",
- " RoleName=sc_role_name,\n",
- " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFeatureStoreAccess'\n",
+ " RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/AmazonSageMakerFeatureStoreAccess\"\n",
")\n",
"\n",
"print(response)"
@@ -242,10 +233,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.attach_role_policy(\n",
- " RoleName=sc_role_name,\n",
- " PolicyArn='arn:aws:iam::aws:policy/IAMFullAccess'\n",
- ")\n",
+ "response = iam.attach_role_policy(RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/IAMFullAccess\")\n",
"\n",
"print(response)"
]
@@ -264,7 +252,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sample_abalone_pipeline_execution_arn = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries'][0]['PipelineExecutionArn']\n",
+ "sample_abalone_pipeline_execution_arn = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n",
+ " \"PipelineExecutionSummaries\"\n",
+ "][0][\"PipelineExecutionArn\"]\n",
"\n",
"print(sample_abalone_pipeline_execution_arn)"
]
@@ -287,25 +277,29 @@
"%%time\n",
"\n",
"try:\n",
- " describe_pipeline_execution_response = sm.describe_pipeline_execution(PipelineExecutionArn=sample_abalone_pipeline_execution_arn)\n",
- " pipeline_execution_status = describe_pipeline_execution_response['PipelineExecutionStatus']\n",
+ " describe_pipeline_execution_response = sm.describe_pipeline_execution(\n",
+ " PipelineExecutionArn=sample_abalone_pipeline_execution_arn\n",
+ " )\n",
+ " pipeline_execution_status = describe_pipeline_execution_response[\"PipelineExecutionStatus\"]\n",
"\n",
- " while pipeline_execution_status not in ['Stopped', 'Failed']:\n",
- " print('Please wait...')\n",
+ " while pipeline_execution_status not in [\"Stopped\", \"Failed\"]:\n",
+ " print(\"Please wait...\")\n",
" time.sleep(30)\n",
- " describe_pipeline_execution_response = sm.describe_pipeline_execution(PipelineExecutionArn=sample_abalone_pipeline_execution_arn)\n",
- " pipeline_execution_status = describe_pipeline_execution_response['PipelineExecutionStatus']\n",
- " print('Pipeline execution status: {}'.format(pipeline_execution_status))\n",
- "\n",
- " if pipeline_execution_status in ['Stopped', 'Failed']: \n",
- " print('Pipeline execution status {}'.format(pipeline_execution_status))\n",
+ " describe_pipeline_execution_response = sm.describe_pipeline_execution(\n",
+ " PipelineExecutionArn=sample_abalone_pipeline_execution_arn\n",
+ " )\n",
+ " pipeline_execution_status = describe_pipeline_execution_response[\"PipelineExecutionStatus\"]\n",
+ " print(\"Pipeline execution status: {}\".format(pipeline_execution_status))\n",
+ "\n",
+ " if pipeline_execution_status in [\"Stopped\", \"Failed\"]:\n",
+ " print(\"Pipeline execution status {}\".format(pipeline_execution_status))\n",
" else:\n",
- " print('Pipeline execution status: {}'.format(pipeline_execution_status))\n",
- " raise Exception('Pipeline execution not deleted.')\n",
- " \n",
+ " print(\"Pipeline execution status: {}\".format(pipeline_execution_status))\n",
+ " raise Exception(\"Pipeline execution not deleted.\")\n",
+ "\n",
"except Exception as e:\n",
" print(e)\n",
- " \n",
+ "\n",
"print(describe_pipeline_execution_response)"
]
},
@@ -333,8 +327,8 @@
"source": [
"import os\n",
"\n",
- "sm_studio_root_path='/root/' \n",
- "sm_notebooks_root_path='/home/ec2-user/SageMaker/'\n",
+ "sm_studio_root_path = \"/root/\"\n",
+ "sm_notebooks_root_path = \"/home/ec2-user/SageMaker/\"\n",
"\n",
"root_path = sm_notebooks_root_path if os.path.isdir(sm_notebooks_root_path) else sm_studio_root_path\n",
"\n",
@@ -356,7 +350,9 @@
"metadata": {},
"outputs": [],
"source": [
- "code_commit_repo1 = 'https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modelbuild'.format(region, sagemaker_project_name_and_id)\n",
+ "code_commit_repo1 = \"https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modelbuild\".format(\n",
+ " region, sagemaker_project_name_and_id\n",
+ ")\n",
"print(code_commit_repo1)"
]
},
@@ -366,7 +362,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_mlops_build_code = '{}{}/sagemaker-{}-modelbuild'.format(root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id)\n",
+ "sagemaker_mlops_build_code = \"{}{}/sagemaker-{}-modelbuild\".format(\n",
+ " root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id\n",
+ ")\n",
"print(sagemaker_mlops_build_code)"
]
},
@@ -376,7 +374,9 @@
"metadata": {},
"outputs": [],
"source": [
- "code_commit_repo2 = 'https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modeldeploy'.format(region, sagemaker_project_name_and_id)\n",
+ "code_commit_repo2 = \"https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modeldeploy\".format(\n",
+ " region, sagemaker_project_name_and_id\n",
+ ")\n",
"print(code_commit_repo2)"
]
},
@@ -386,7 +386,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_mlops_deploy_code = '{}{}/sagemaker-{}-modeldeploy'.format(root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id)\n",
+ "sagemaker_mlops_deploy_code = \"{}{}/sagemaker-{}-modeldeploy\".format(\n",
+ " root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id\n",
+ ")\n",
"print(sagemaker_mlops_deploy_code)"
]
},
@@ -447,7 +449,7 @@
"metadata": {},
"outputs": [],
"source": [
- "workshop_project_build_code='{}workshop/10_pipeline/mlops/sagemaker-project-modelbuild'.format(root_path)\n",
+ "workshop_project_build_code = \"{}workshop/10_pipeline/mlops/sagemaker-project-modelbuild\".format(root_path)\n",
"print(workshop_project_build_code)"
]
},
@@ -457,7 +459,7 @@
"metadata": {},
"outputs": [],
"source": [
- "workshop_project_deploy_code='{}workshop/10_pipeline/mlops/sagemaker-project-modeldeploy'.format(root_path)\n",
+ "workshop_project_deploy_code = \"{}workshop/10_pipeline/mlops/sagemaker-project-modeldeploy\".format(root_path)\n",
"print(workshop_project_deploy_code)"
]
},
@@ -579,13 +581,15 @@
"\n",
"while True:\n",
" try:\n",
- " print('Listing executions for our pipeline...')\n",
- " list_executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n",
- " break;\n",
+ " print(\"Listing executions for our pipeline...\")\n",
+ " list_executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n",
+ " \"PipelineExecutionSummaries\"\n",
+ " ]\n",
+ " break\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(list_executions_response)"
]
},
@@ -595,7 +599,7 @@
"metadata": {},
"outputs": [],
"source": [
- "build_pipeline_name = 'sagemaker-{}-modelbuild'.format(sagemaker_project_name_and_id)"
+ "build_pipeline_name = \"sagemaker-{}-modelbuild\".format(sagemaker_project_name_and_id)"
]
},
{
@@ -606,7 +610,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Check ModelBuild Pipeline'.format(build_pipeline_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Check ModelBuild Pipeline'.format(\n",
+ " build_pipeline_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
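Side note: the display(HTML(...)) cells in this notebook render deep links into the AWS console. A minimal sketch of the pattern, with the CodePipeline console URL assumed here rather than taken from this patch, so verify it for your region and partition:

from IPython.core.display import display, HTML

# Assumed CodePipeline console URL pattern (not part of this patch).
url = "https://console.aws.amazon.com/codesuite/codepipeline/pipelines/{}/view?region={}".format(
    build_pipeline_name, region
)
display(HTML('<a target="_blank" href="{}">Check ModelBuild Pipeline</a>'.format(url)))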
@@ -629,20 +639,24 @@
"import time\n",
"from pprint import pprint\n",
"\n",
- "executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n",
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n",
+ " \"PipelineExecutionSummaries\"\n",
+ "]\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)\n",
"\n",
- "while pipeline_execution_status=='Executing':\n",
+ "while pipeline_execution_status == \"Executing\":\n",
" try:\n",
- " executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n",
- " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
- "# print('Executions for our pipeline...')\n",
- "# print(pipeline_execution_status)\n",
+ " executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n",
+ " \"PipelineExecutionSummaries\"\n",
+ " ]\n",
+ " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
+ " # print('Executions for our pipeline...')\n",
+ " # print(pipeline_execution_status)\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(executions_response)"
]
},
@@ -666,7 +680,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n",
+ "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n",
"print(pipeline_execution_status)"
]
},
@@ -676,7 +690,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n",
+ "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n",
"print(pipeline_execution_arn)"
]
},
@@ -713,15 +727,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -754,13 +768,13 @@
"\n",
"while True:\n",
" try:\n",
- " print('Executions for our pipeline...')\n",
+ " print(\"Executions for our pipeline...\")\n",
" list_model_packages_response = sm.list_model_packages(ModelPackageGroupName=sagemaker_project_name_and_id)\n",
- " break;\n",
+ " break\n",
" except Exception as e:\n",
- " print('Please wait...')\n",
- " time.sleep(30) \n",
- " \n",
+ " print(\"Please wait...\")\n",
+ " time.sleep(30)\n",
+ "\n",
"pprint(list_model_packages_response)"
]
},
@@ -772,7 +786,7 @@
"source": [
"time.sleep(30)\n",
"\n",
- "model_package_arn = list_model_packages_response['ModelPackageSummaryList'][0]['ModelPackageArn']\n",
+ "model_package_arn = list_model_packages_response[\"ModelPackageSummaryList\"][0][\"ModelPackageArn\"]\n",
"print(model_package_arn)"
]
},
@@ -798,7 +812,7 @@
"source": [
"time.sleep(30)\n",
"\n",
- "model_name = sm.list_models()['Models'][0]['ModelName']\n",
+ "model_name = sm.list_models()[\"Models\"][0][\"ModelName\"]\n",
"print(model_name)"
]
},
@@ -810,7 +824,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Model'.format(region, model_name)))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Model'.format(\n",
+ " region, model_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -819,7 +839,7 @@
"metadata": {},
"outputs": [],
"source": [
- "deploy_pipeline_name = 'sagemaker-{}-modeldeploy'.format(sagemaker_project_name_and_id)"
+ "deploy_pipeline_name = \"sagemaker-{}-modeldeploy\".format(sagemaker_project_name_and_id)"
]
},
{
@@ -830,7 +850,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Check ModelDeploy Pipeline'.format(deploy_pipeline_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Check ModelDeploy Pipeline'.format(\n",
+ " deploy_pipeline_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -839,7 +865,7 @@
"metadata": {},
"outputs": [],
"source": [
- "staging_endpoint_name='{}-staging'.format(sagemaker_project_name)"
+ "staging_endpoint_name = \"{}-staging\".format(sagemaker_project_name)"
]
},
{
@@ -850,7 +876,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker Staging REST Endpoint'.format(region, staging_endpoint_name)))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker Staging REST Endpoint'.format(\n",
+ " region, staging_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -869,16 +901,16 @@
"%%time\n",
"\n",
"while True:\n",
- " try: \n",
- " waiter = sm.get_waiter('endpoint_in_service')\n",
- " print('Waiting for staging endpoint to be in `InService`...')\n",
+ " try:\n",
+ " waiter = sm.get_waiter(\"endpoint_in_service\")\n",
+ " print(\"Waiting for staging endpoint to be in `InService`...\")\n",
" waiter.wait(EndpointName=staging_endpoint_name)\n",
- " break;\n",
+ " break\n",
" except:\n",
- " print('Waiting for staging endpoint to be in `Creating`...')\n",
+ " print(\"Waiting for staging endpoint to be in `Creating`...\")\n",
" time.sleep(30)\n",
- " \n",
- "print('Staging endpoint deployed.')"
+ "\n",
+ "print(\"Staging endpoint deployed.\")"
]
},
{
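The endpoint_in_service waiter above polls DescribeEndpoint under the hood and raises while the endpoint does not yet exist, which is why the cell wraps it in a retry loop. A sketch of tuning the polling cadence, assuming the same boto3 `sm` client:

waiter = sm.get_waiter("endpoint_in_service")
# Poll every 30 seconds, for up to 60 attempts (~30 minutes), instead of the botocore defaults.
waiter.wait(
    EndpointName=staging_endpoint_name,
    WaiterConfig={"Delay": 30, "MaxAttempts": 60},
)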
@@ -917,15 +949,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
@@ -951,14 +983,16 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=staging_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer()) "
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=staging_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
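The predictor above speaks application/jsonlines: each input dict is serialized as one JSON object per line on the wire. A small sketch of what the serializer emits, assuming sagemaker>=2.x:

from sagemaker.serializers import JSONLinesSerializer

payload = JSONLinesSerializer().serialize([{"features": ["This is great!"]}, {"features": ["This is bad."]}])
print(payload)
# {"features": ["This is great!"]}
# {"features": ["This is bad."]}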
@@ -967,15 +1001,12 @@
"metadata": {},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -993,7 +1024,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Deploy to Production Pipeline '.format(sagemaker_project_name_and_id, region)))"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Deploy to Production Pipeline '.format(\n",
+ " sagemaker_project_name_and_id, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1002,8 +1039,8 @@
"metadata": {},
"outputs": [],
"source": [
- "stage_name = 'DeployStaging'\n",
- "action_name = 'ApproveDeployment'"
+ "stage_name = \"DeployStaging\"\n",
+ "action_name = \"ApproveDeployment\""
]
},
{
@@ -1014,15 +1051,15 @@
"source": [
"time.sleep(30)\n",
"\n",
- "stage_states = codepipeline.get_pipeline_state(name=deploy_pipeline_name)['stageStates'] \n",
+ "stage_states = codepipeline.get_pipeline_state(name=deploy_pipeline_name)[\"stageStates\"]\n",
"\n",
"for stage_state in stage_states:\n",
"\n",
- " if stage_state['stageName'] == stage_name:\n",
- " for action_state in stage_state['actionStates']:\n",
- " if action_state['actionName'] == action_name:\n",
- " token = action_state['latestExecution']['token']\n",
- " \n",
+ " if stage_state[\"stageName\"] == stage_name:\n",
+ " for action_state in stage_state[\"actionStates\"]:\n",
+ " if action_state[\"actionName\"] == action_name:\n",
+ " token = action_state[\"latestExecution\"][\"token\"]\n",
+ "\n",
"print(token)"
]
},
@@ -1036,11 +1073,8 @@
" pipelineName=deploy_pipeline_name,\n",
" stageName=stage_name,\n",
" actionName=action_name,\n",
- " result={\n",
- " 'summary': 'Approve from Staging to Production',\n",
- " 'status': 'Approved'\n",
- " },\n",
- " token=token\n",
+ " result={\"summary\": \"Approve from Staging to Production\", \"status\": \"Approved\"},\n",
+ " token=token,\n",
")"
]
},
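For the manual-approval gate above, the token returned by get_pipeline_state is single-use and scoped to the latest execution of that action. Rejecting is the same call with a different status; a sketch under the same variable names as the cell above:

codepipeline.put_approval_result(
    pipelineName=deploy_pipeline_name,
    stageName=stage_name,
    actionName=action_name,
    result={"summary": "Rejecting deployment to production", "status": "Rejected"},
    token=token,
)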
@@ -1059,7 +1093,7 @@
"source": [
"time.sleep(30)\n",
"\n",
- "production_endpoint_name='{}-prod'.format(sagemaker_project_name)"
+ "production_endpoint_name = \"{}-prod\".format(sagemaker_project_name)"
]
},
{
@@ -1070,7 +1104,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review SageMaker Production REST Endpoint'.format(region, production_endpoint_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review SageMaker Production REST Endpoint'.format(\n",
+ " region, production_endpoint_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1089,16 +1129,16 @@
"%%time\n",
"\n",
"while True:\n",
- " try: \n",
- " waiter = sm.get_waiter('endpoint_in_service')\n",
- " print('Waiting for production endpoint to be in `InService`...')\n",
+ " try:\n",
+ " waiter = sm.get_waiter(\"endpoint_in_service\")\n",
+ " print(\"Waiting for production endpoint to be in `InService`...\")\n",
" waiter.wait(EndpointName=production_endpoint_name)\n",
- " break;\n",
+ " break\n",
" except:\n",
- " print('Waiting for production endpoint to be in `Creating`...')\n",
+ " print(\"Waiting for production endpoint to be in `Creating`...\")\n",
" time.sleep(30)\n",
- " \n",
- "print('Production endpoint deployed.')"
+ "\n",
+ "print(\"Production endpoint deployed.\")"
]
},
{
@@ -1119,14 +1159,16 @@
"from sagemaker.serializers import JSONLinesSerializer\n",
"from sagemaker.deserializers import JSONLinesDeserializer\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=production_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0,\n",
- " content_type='application/jsonlines',\n",
- " accept_type='application/jsonlines',\n",
- " serializer=JSONLinesSerializer(),\n",
- " deserializer=JSONLinesDeserializer()) "
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=production_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ " content_type=\"application/jsonlines\",\n",
+ " accept_type=\"application/jsonlines\",\n",
+ " serializer=JSONLinesSerializer(),\n",
+ " deserializer=JSONLinesDeserializer(),\n",
+ ")"
]
},
{
@@ -1135,15 +1177,12 @@
"metadata": {},
"outputs": [],
"source": [
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "]\n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"predicted_classes = predictor.predict(inputs)\n",
"\n",
"for predicted_class in predicted_classes:\n",
- " print('Predicted star_rating: {}'.format(predicted_class))"
+ " print(\"Predicted star_rating: {}\".format(predicted_class))"
]
},
{
@@ -1175,15 +1214,15 @@
"\n",
"viz = LineageTableVisualizer(sagemaker.session.Session())\n",
"\n",
- "for execution_step in reversed(steps['PipelineExecutionSteps']):\n",
+ "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n",
" print(execution_step)\n",
" # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n",
- " if execution_step['StepName'] == 'Processing':\n",
- " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n",
+ " if execution_step[\"StepName\"] == \"Processing\":\n",
+ " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(processing_job_name)\n",
" display(viz.show(processing_job_name=processing_job_name))\n",
- " elif execution_step['StepName'] == 'Train':\n",
- " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n",
+ " elif execution_step[\"StepName\"] == \"Train\":\n",
+ " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n",
" print(training_job_name)\n",
" display(viz.show(training_job_name=training_job_name))\n",
" else:\n",
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py
index 024afdec..f3523174 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py
@@ -4,13 +4,16 @@
from datetime import datetime
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
import pandas as pd
import os
import re
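Installing pinned dependencies via subprocess.check_call at import time, as above, re-runs on every container start. One possible refinement, sketched here: guard the install so repeated runs are cheap (importlib.util.find_spec is standard library):

import importlib.util
import subprocess
import sys

# Only install if the package is missing from the current environment.
if importlib.util.find_spec("matplotlib") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])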
@@ -33,99 +36,99 @@
from sklearn.utils import resample
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
CLASSES = [1, 2, 3, 4, 5]
-config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
+config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+)
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--input-model', type=str,
- default='/opt/ml/processing/input/model',
+ parser.add_argument(
+ "--input-model",
+ type=str,
+ default="/opt/ml/processing/input/model",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
-
+ )
+
return parser.parse_args()
-
+
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- print('input_data: {}'.format(args.input_data))
- print('input_model: {}'.format(args.input_model))
-
- print('Listing contents of input model dir: {}'.format(args.input_model))
+ print("Current host: {}".format(args.current_host))
+
+ print("input_data: {}".format(args.input_data))
+ print("input_model: {}".format(args.input_model))
+
+ print("Listing contents of input model dir: {}".format(args.input_model))
input_files = os.listdir(args.input_model)
for file in input_files:
print(file)
- model_tar_path = '{}/model.tar.gz'.format(args.input_model)
+ model_tar_path = "{}/model.tar.gz".format(args.input_model)
model_tar = tarfile.open(model_tar_path)
model_tar.extractall(args.input_model)
- model_tar.close()
+ model_tar.close()
- model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model))
+ model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model))
print(model)
-
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=args.max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
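One caveat on encode_plus as used above: pad_to_max_length=True is deprecated in newer transformers releases in favor of padding="max_length". An equivalent call under the newer API, shown as a sketch with the same `tokenizer`, `text`, and `args` as above:

encode_plus_tokens = tokenizer.encode_plus(
    text,
    padding="max_length",  # replaces the deprecated pad_to_max_length=True
    max_length=args.max_seq_length,
    truncation=True,
    return_tensors="tf",
)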
@@ -133,81 +136,86 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
-
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
###########################################################################################
# TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
- ###########################################################################################
-# evaluation_data_path = '/opt/ml/processing/input/data/'
-
- print('Listing contents of input data dir: {}'.format(args.input_data))
+ ###########################################################################################
+ # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+ print("Listing contents of input data dir: {}".format(args.input_data))
input_files = os.listdir(args.input_data)
- test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
- print('Using only {} to evaluate.'.format(test_data_path))
- df_test_reviews = pd.read_csv(test_data_path,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+ print("Using only {} to evaluate.".format(test_data_path))
+ df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+ ["review_body", "star_rating"]
+ ]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
- y_test = df_test_reviews['review_body'].map(predict)
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
- y_actual = df_test_reviews['star_rating']
+ y_actual = df_test_reviews["star_rating"]
y_actual
print(classification_report(y_true=y_test, y_pred=y_actual))
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
+ accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
+ print("Test accuracy: ", accuracy)
def plot_conf_mat(cm, classes, title, cmap):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
cm = confusion_matrix(y_true=y_test, y_pred=y_actual)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=CLASSES,
- title='Confusion Matrix',
- cmap=plt.cm.Greens)
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
- # Model Output
- metrics_path = os.path.join(args.output_data, 'metrics/')
+ # Model Output
+ metrics_path = os.path.join(args.output_data, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
report_dict = {
"metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
evaluation_path = "{}/evaluation.json".format(metrics_path)
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))
-
- print('Listing contents of output dir: {}'.format(args.output_data))
+
+ print("Listing contents of output dir: {}".format(args.output_data))
output_files = os.listdir(args.output_data)
for file in output_files:
print(file)
- print('Listing contents of output/metrics dir: {}'.format(metrics_path))
- output_files = os.listdir('{}'.format(metrics_path))
+ print("Listing contents of output/metrics dir: {}".format(metrics_path))
+ output_files = os.listdir("{}".format(metrics_path))
for file in output_files:
print(file)
- print('Complete')
-
-
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
- process(args)
+ process(args)
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py
index 2975dc2d..53196737 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e., 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
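input_handler and output_handler above follow the SageMaker TensorFlow Serving container contract: the container calls input_handler(data, context) to build the TF Serving REST payload and output_handler(response, context) to post-process the predictions. A tiny local smoke test, with a hypothetical stub standing in for the container-provided context:

import io

class FakeContext:  # hypothetical stand-in for the container's context object
    accept_header = "application/jsonlines"

raw = io.BytesIO(b'{"features": ["This is great!"]}')
print(input_handler(raw, FakeContext()))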
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py
index 20561f85..07b2b0ef 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py
@@ -35,20 +35,12 @@
ScriptProcessor,
)
-from sagemaker.workflow.parameters import (
- ParameterInteger,
- ParameterString,
- ParameterFloat
-)
+from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat
from sagemaker.workflow.pipeline import Pipeline
-from sagemaker.workflow.steps import (
- ProcessingStep,
- TrainingStep,
- CreateModelStep
-)
+from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
-from sagemaker.model_metrics import MetricsSource, ModelMetrics
+from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import (
ConditionStep,
@@ -62,22 +54,15 @@
from sagemaker.inputs import CreateModelInput
-sess = sagemaker.Session()
+sess = sagemaker.Session()
bucket = sess.default_bucket()
timestamp = int(time.time())
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
-print('BASE_DIR: {}'.format(BASE_DIR))
+print("BASE_DIR: {}".format(BASE_DIR))
-def get_pipeline(
- region,
- role,
- default_bucket,
- pipeline_name,
- model_package_group_name,
- base_job_prefix
-):
+def get_pipeline(region, role, default_bucket, pipeline_name, model_package_group_name, base_job_prefix):
"""Gets a SageMaker ML Pipeline instance working with BERT.
Args:
@@ -91,23 +76,17 @@ def get_pipeline(
Returns:
an instance of a pipeline
"""
-
- sm = boto3.Session().client(service_name='sagemaker', region_name=region)
-
+
+ sm = boto3.Session().client(service_name="sagemaker", region_name=region)
+
input_data = ParameterString(
name="InputDataUrl",
default_value="s3://{}/amazon-reviews-pds/tsv/".format(bucket),
)
-
- processing_instance_count = ParameterInteger(
- name="ProcessingInstanceCount",
- default_value=1
- )
- processing_instance_type = ParameterString(
- name="ProcessingInstanceType",
- default_value="ml.c5.2xlarge"
- )
+ processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
+
+ processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.c5.2xlarge")
max_seq_length = ParameterInteger(
name="MaxSeqLength",
@@ -140,58 +119,53 @@ def get_pipeline(
)
feature_group_name = ParameterString(
- name="FeatureGroupName",
- default_value="reviews-feature-group-" + str(timestamp)
- )
-
- train_instance_type = ParameterString(
- name="TrainingInstanceType",
- default_value="ml.c5.9xlarge"
+ name="FeatureGroupName", default_value="reviews-feature-group-" + str(timestamp)
)
- train_instance_count = ParameterInteger(
- name="TrainingInstanceCount",
- default_value=1
- )
-
-
-
+ train_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.c5.9xlarge")
+
+ train_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)
+
#########################
# PROCESSING STEP
#########################
-
+
processor = SKLearnProcessor(
- framework_version='0.23-1',
+ framework_version="0.23-1",
role=role,
instance_type=processing_instance_type,
instance_count=processing_instance_count,
- env={'AWS_DEFAULT_REGION': region},
- max_runtime_in_seconds=7200)
-
- processing_inputs=[
+ env={"AWS_DEFAULT_REGION": region},
+ max_runtime_in_seconds=7200,
+ )
+
+ processing_inputs = [
ProcessingInput(
- input_name='raw-input-data',
+ input_name="raw-input-data",
source=input_data,
- destination='/opt/ml/processing/input/data/',
- s3_data_distribution_type='ShardedByS3Key'
+ destination="/opt/ml/processing/input/data/",
+ s3_data_distribution_type="ShardedByS3Key",
)
]
-
- processing_outputs=[
- ProcessingOutput(output_name='bert-train',
- s3_upload_mode='EndOfJob',
- source='/opt/ml/processing/output/bert/train',
- ),
- ProcessingOutput(output_name='bert-validation',
- s3_upload_mode='EndOfJob',
- source='/opt/ml/processing/output/bert/validation',
- ),
- ProcessingOutput(output_name='bert-test',
- s3_upload_mode='EndOfJob',
- source='/opt/ml/processing/output/bert/test',
- ),
+
+ processing_outputs = [
+ ProcessingOutput(
+ output_name="bert-train",
+ s3_upload_mode="EndOfJob",
+ source="/opt/ml/processing/output/bert/train",
+ ),
+ ProcessingOutput(
+ output_name="bert-validation",
+ s3_upload_mode="EndOfJob",
+ source="/opt/ml/processing/output/bert/validation",
+ ),
+ ProcessingOutput(
+ output_name="bert-test",
+ s3_upload_mode="EndOfJob",
+ source="/opt/ml/processing/output/bert/test",
+ ),
]
-
+
    # TODO: Figure out why the Parameters are not resolving properly to their native type when used here.
# We shouldn't be using `default_value`
processing_step = ProcessingStep(
@@ -200,72 +174,48 @@ def get_pipeline(
inputs=processing_inputs,
outputs=processing_outputs,
job_arguments=[
- '--train-split-percentage', str(train_split_percentage.default_value),
- '--validation-split-percentage', str(validation_split_percentage.default_value),
- '--test-split-percentage', str(test_split_percentage.default_value),
- '--max-seq-length', str(max_seq_length.default_value),
- '--balance-dataset', str(balance_dataset.default_value),
- '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),
- '--feature-group-name', str(feature_group_name.default_value)
+ "--train-split-percentage",
+ str(train_split_percentage.default_value),
+ "--validation-split-percentage",
+ str(validation_split_percentage.default_value),
+ "--test-split-percentage",
+ str(test_split_percentage.default_value),
+ "--max-seq-length",
+ str(max_seq_length.default_value),
+ "--balance-dataset",
+ str(balance_dataset.default_value),
+ "--feature-store-offline-prefix",
+ str(feature_store_offline_prefix.default_value),
+ "--feature-group-name",
+ str(feature_group_name.default_value),
],
- code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py")
+ code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py"),
)
-
-
+
#########################
# TRAINING STEP
#########################
-
- epochs = ParameterInteger(
- name="Epochs",
- default_value=1
- )
-
- learning_rate = ParameterFloat(
- name="LearningRate",
- default_value=0.00001
- )
-
- epsilon = ParameterFloat(
- name="Epsilon",
- default_value=0.00000001
- )
-
- train_batch_size = ParameterInteger(
- name="TrainBatchSize",
- default_value=128
- )
-
- validation_batch_size = ParameterInteger(
- name="ValidationBatchSize",
- default_value=128
- )
-
- test_batch_size = ParameterInteger(
- name="TestBatchSize",
- default_value=128
- )
-
- train_steps_per_epoch = ParameterInteger(
- name="TrainStepsPerEpoch",
- default_value=50
- )
-
- validation_steps = ParameterInteger(
- name="ValidationSteps",
- default_value=50
- )
-
- test_steps = ParameterInteger(
- name="TestSteps",
- default_value=50
- )
-
- train_volume_size = ParameterInteger(
- name="TrainVolumeSize",
- default_value=1024
- )
-
+
+ epochs = ParameterInteger(name="Epochs", default_value=1)
+
+ learning_rate = ParameterFloat(name="LearningRate", default_value=0.00001)
+
+ epsilon = ParameterFloat(name="Epsilon", default_value=0.00000001)
+
+ train_batch_size = ParameterInteger(name="TrainBatchSize", default_value=128)
+
+ validation_batch_size = ParameterInteger(name="ValidationBatchSize", default_value=128)
+
+ test_batch_size = ParameterInteger(name="TestBatchSize", default_value=128)
+
+ train_steps_per_epoch = ParameterInteger(name="TrainStepsPerEpoch", default_value=50)
+
+ validation_steps = ParameterInteger(name="ValidationSteps", default_value=50)
+
+ test_steps = ParameterInteger(name="TestSteps", default_value=50)
+
+ train_volume_size = ParameterInteger(name="TrainVolumeSize", default_value=1024)
+
use_xla = ParameterString(
name="UseXLA",
default_value="True",
@@ -275,7 +225,7 @@ def get_pipeline(
name="UseAMP",
default_value="True",
)
-
+
freeze_bert_layer = ParameterString(
name="FreezeBERTLayer",
default_value="False",
@@ -285,7 +235,7 @@ def get_pipeline(
name="EnableSageMakerDebugger",
default_value="False",
)
-
+
enable_checkpointing = ParameterString(
name="EnableCheckpointing",
default_value="False",
@@ -295,7 +245,7 @@ def get_pipeline(
name="EnableTensorboard",
default_value="False",
)
-
+
input_mode = ParameterString(
name="InputMode",
default_value="File",
@@ -305,188 +255,171 @@ def get_pipeline(
name="RunValidation",
default_value="True",
)
-
+
run_test = ParameterString(
name="RunTest",
default_value="False",
)
-
+
run_sample_predictions = ParameterString(
name="RunSamplePredictions",
default_value="False",
)
-
-
+
metrics_definitions = [
- {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
- {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
- {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
- {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'}
+ {"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"},
+ {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"},
+ {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"},
+ {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"},
]
-
- train_src=os.path.join(BASE_DIR, "src")
+
+ train_src = os.path.join(BASE_DIR, "src")
model_path = f"s3://{default_bucket}/{base_job_prefix}/output/model"
-
+
estimator = TensorFlow(
- entry_point='tf_bert_reviews.py',
+ entry_point="tf_bert_reviews.py",
source_dir=BASE_DIR,
role=role,
output_path=model_path,
instance_count=train_instance_count,
instance_type=train_instance_type,
volume_size=train_volume_size,
- py_version='py37',
- framework_version='2.3.1',
+ py_version="py37",
+ framework_version="2.3.1",
hyperparameters={
- 'epochs': epochs,
- 'learning_rate': learning_rate,
- 'epsilon': epsilon,
- 'train_batch_size': train_batch_size,
- 'validation_batch_size': validation_batch_size,
- 'test_batch_size': test_batch_size,
- 'train_steps_per_epoch': train_steps_per_epoch,
- 'validation_steps': validation_steps,
- 'test_steps': test_steps,
- 'use_xla': use_xla,
- 'use_amp': use_amp,
- 'max_seq_length': max_seq_length,
- 'freeze_bert_layer': freeze_bert_layer,
- 'enable_sagemaker_debugger': enable_sagemaker_debugger,
- 'enable_checkpointing': enable_checkpointing,
- 'enable_tensorboard': enable_tensorboard,
- 'run_validation': run_validation,
- 'run_test': run_test,
- 'run_sample_predictions': run_sample_predictions},
+ "epochs": epochs,
+ "learning_rate": learning_rate,
+ "epsilon": epsilon,
+ "train_batch_size": train_batch_size,
+ "validation_batch_size": validation_batch_size,
+ "test_batch_size": test_batch_size,
+ "train_steps_per_epoch": train_steps_per_epoch,
+ "validation_steps": validation_steps,
+ "test_steps": test_steps,
+ "use_xla": use_xla,
+ "use_amp": use_amp,
+ "max_seq_length": max_seq_length,
+ "freeze_bert_layer": freeze_bert_layer,
+ "enable_sagemaker_debugger": enable_sagemaker_debugger,
+ "enable_checkpointing": enable_checkpointing,
+ "enable_tensorboard": enable_tensorboard,
+ "run_validation": run_validation,
+ "run_test": run_test,
+ "run_sample_predictions": run_sample_predictions,
+ },
input_mode=input_mode,
metric_definitions=metrics_definitions,
-# max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
-    )
+        # max_run=7200  # max: 2 hours * 60 minutes per hour * 60 seconds per minute
+    )
training_step = TrainingStep(
- name='Train',
+ name="Train",
estimator=estimator,
inputs={
- 'train': TrainingInput(
- s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
- 'bert-train'
- ].S3Output.S3Uri,
- content_type='text/csv'
+ "train": TrainingInput(
+ s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-train"].S3Output.S3Uri,
+ content_type="text/csv",
+ ),
+ "validation": TrainingInput(
+ s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-validation"].S3Output.S3Uri,
+ content_type="text/csv",
),
- 'validation': TrainingInput(
- s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
- 'bert-validation'
- ].S3Output.S3Uri,
- content_type='text/csv'
+ "test": TrainingInput(
+ s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-test"].S3Output.S3Uri,
+ content_type="text/csv",
),
- 'test': TrainingInput(
- s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
- 'bert-test'
- ].S3Output.S3Uri,
- content_type='text/csv'
- )
- }
- )
-
-
+ },
+ )
+
#########################
# EVALUATION STEP
#########################
-
- evaluation_processor = SKLearnProcessor(framework_version='0.23-1',
- role=role,
- instance_type=processing_instance_type,
- instance_count=processing_instance_count,
- env={'AWS_DEFAULT_REGION': region},
- max_runtime_in_seconds=7200)
-
- evaluation_report = PropertyFile(
- name='EvaluationReport',
- output_name='metrics',
- path='evaluation.json'
- )
-
+
+ evaluation_processor = SKLearnProcessor(
+ framework_version="0.23-1",
+ role=role,
+ instance_type=processing_instance_type,
+ instance_count=processing_instance_count,
+ env={"AWS_DEFAULT_REGION": region},
+ max_runtime_in_seconds=7200,
+ )
+
+ evaluation_report = PropertyFile(name="EvaluationReport", output_name="metrics", path="evaluation.json")
+
evaluation_step = ProcessingStep(
- name='EvaluateModel',
+ name="EvaluateModel",
processor=evaluation_processor,
code=os.path.join(BASE_DIR, "evaluate_model_metrics.py"),
inputs=[
ProcessingInput(
source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
- destination='/opt/ml/processing/input/model'
+ destination="/opt/ml/processing/input/model",
),
ProcessingInput(
- source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri,
- destination='/opt/ml/processing/input/data'
- )
+ source=processing_step.properties.ProcessingInputs["raw-input-data"].S3Input.S3Uri,
+ destination="/opt/ml/processing/input/data",
+ ),
],
outputs=[
- ProcessingOutput(output_name='metrics',
- s3_upload_mode='EndOfJob',
- source='/opt/ml/processing/output/metrics/'),
+ ProcessingOutput(
+ output_name="metrics", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/metrics/"
+ ),
],
job_arguments=[
- '--max-seq-length', str(max_seq_length.default_value),
- ],
+ "--max-seq-length",
+ str(max_seq_length.default_value),
+ ],
property_files=[evaluation_report], # these cause deserialization issues
- )
-
+ )
+
model_metrics = ModelMetrics(
model_statistics=MetricsSource(
s3_uri="{}/evaluation.json".format(
evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
),
- content_type="application/json"
+ content_type="application/json",
)
- )
-
-
- #########################
- ## REGISTER TRAINED MODEL STEP
+ )
+
+ #########################
+ ## REGISTER TRAINED MODEL STEP
#########################
-
- model_approval_status = ParameterString(
- name="ModelApprovalStatus",
- default_value="PendingManualApproval"
- )
-
- deploy_instance_type = ParameterString(
- name="DeployInstanceType",
- default_value="ml.m5.4xlarge"
- )
-
- deploy_instance_count = ParameterInteger(
- name="DeployInstanceCount",
- default_value=1
- )
-
+
+ model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
+
+ deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.4xlarge")
+
+ deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1)
+
inference_image_uri = sagemaker.image_uris.retrieve(
framework="tensorflow",
region=region,
version="2.3.1",
py_version="py37",
instance_type=deploy_instance_type,
- image_scope="inference"
+ image_scope="inference",
)
print(inference_image_uri)
register_step = RegisterModel(
name="RegisterModel",
estimator=estimator,
- image_uri=inference_image_uri, # we have to specify, by default it's using training image
+ image_uri=inference_image_uri, # we have to specify, by default it's using training image
model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
content_types=["text/csv"],
response_types=["text/csv"],
- inference_instances=[deploy_instance_type], # The JSON spec must be within these instance types or we will see "Instance Type Not Allowed" Exception
+ inference_instances=[
+ deploy_instance_type
+ ], # The JSON spec must be within these instance types or we will see "Instance Type Not Allowed" Exception
transform_instances=[deploy_instance_type],
model_package_group_name=model_package_group_name,
approval_status=model_approval_status,
)
-
-
+
#########################
## CREATE MODEL FOR DEPLOYMENT STEP
#########################
-
+
model = Model(
image_uri=inference_image_uri,
model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
@@ -503,38 +436,33 @@ def get_pipeline(
model=model,
inputs=create_inputs,
)
-
#########################
## CONDITION STEP: EVALUATE THE MODEL
#########################
-
- min_accuracy_value = ParameterFloat(
- name="MinAccuracyValue",
- default_value=0.01
- )
-
+
+ min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.01)
+
minimum_accuracy_condition = ConditionGreaterThanOrEqualTo(
left=JsonGet(
step=evaluation_step,
property_file=evaluation_report,
json_path="metrics.accuracy.value",
),
- right=min_accuracy_value # accuracy
+ right=min_accuracy_value, # accuracy
)
minimum_accuracy_condition_step = ConditionStep(
name="AccuracyCondition",
conditions=[minimum_accuracy_condition],
- if_steps=[register_step, create_step], # success, continue with model registration
- else_steps=[], # fail, end the pipeline
+ if_steps=[register_step, create_step], # success, continue with model registration
+ else_steps=[], # fail, end the pipeline
)
-
#########################
## CREATE PIPELINE
#########################
-
+
pipeline = Pipeline(
name=pipeline_name,
parameters=[
@@ -569,20 +497,18 @@ def get_pipeline(
input_mode,
run_validation,
run_test,
- run_sample_predictions,
+ run_sample_predictions,
min_accuracy_value,
model_approval_status,
deploy_instance_type,
- deploy_instance_count
+ deploy_instance_count,
],
- steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],
- sagemaker_session=sess
+ steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],
+ sagemaker_session=sess,
)
-
-
- #########################
+
+ #########################
## RETURN PIPELINE
#########################
-
- return pipeline
+ return pipeline
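Once get_pipeline returns, the caller (in this template, the CodeBuild step) typically upserts and starts the pipeline. A hypothetical driver, with illustrative names rather than values from this patch:

pipeline = get_pipeline(
    region=region,
    role=role,
    default_bucket=bucket,
    pipeline_name="bert-pipeline",  # illustrative
    model_package_group_name="bert-models",  # illustrative
    base_job_prefix="bert",  # illustrative
)
pipeline.upsert(role_arn=role)
execution = pipeline.start()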
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess
## PIP INSTALLS ##
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
+# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
# use anaconda and anaconda only supports 2.3.0 at this time
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"])
import pandas as pd
import re
import sagemaker
@@ -40,51 +42,55 @@
FeatureTypeEnum,
)
-region = os.environ['AWS_DEFAULT_REGION']
-print('Region: {}'.format(region))
+region = os.environ["AWS_DEFAULT_REGION"]
+print("Region: {}".format(region))
#############################
## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc.
## Role and Bucket are malformed if we do this later.
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region)
+sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region)
caller_identity = sts.get_caller_identity()
-print('caller_identity: {}'.format(caller_identity))
+print("caller_identity: {}".format(caller_identity))
-assumed_role_arn = caller_identity['Arn']
-print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn))
+assumed_role_arn = caller_identity["Arn"]
+print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn))
-assumed_role_name = assumed_role_arn.split('/')[-2]
+assumed_role_name = assumed_role_arn.split("/")[-2]
-iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region)
-get_role_response = iam.get_role(RoleName=assumed_role_name)
-print('get_role_response {}'.format(get_role_response))
-role = get_role_response['Role']['Arn']
-print('role {}'.format(role))
+iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region)
+get_role_response = iam.get_role(RoleName=assumed_role_name)
+print("get_role_response {}".format(get_role_response))
+role = get_role_response["Role"]["Arn"]
+print("role {}".format(role))
bucket = sagemaker.Session().default_bucket()
-print('The DEFAULT BUCKET is {}'.format(bucket))
+print("The DEFAULT BUCKET is {}".format(bucket))
#############################
-sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region)
+sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region)
-featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region)
+featurestore_runtime = boto3.Session(region_name=region).client(
+ service_name="sagemaker-featurestore-runtime", region_name=region
+)
-s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region)
+s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region)
-sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region),
- sagemaker_client=sm,
- sagemaker_featurestore_runtime_client=featurestore_runtime)
+sagemaker_session = sagemaker.Session(
+ boto_session=boto3.Session(region_name=region),
+ sagemaker_client=sm,
+ sagemaker_featurestore_runtime_client=featurestore_runtime,
+)
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-REVIEW_BODY_COLUMN = 'review_body'
-REVIEW_ID_COLUMN = 'review_id'
+REVIEW_BODY_COLUMN = "review_body"
+REVIEW_ID_COLUMN = "review_id"
# DATE_COLUMN = 'date'
-LABEL_COLUMN = 'star_rating'
+LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]
-
+
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
label_map[label] = i
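+
+# For clarity (this follows directly from LABEL_VALUES above): star ratings
+# map to zero-based class indices, i.e.
+#   label_map == {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}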
@@ -92,94 +98,88 @@
def cast_object_to_string(data_frame):
for label in data_frame.columns:
- if data_frame.dtypes[label] == 'object':
+ if data_frame.dtypes[label] == "object":
data_frame[label] = data_frame[label].astype("str").astype("string")
return data_frame
-
+
def wait_for_feature_group_creation_complete(feature_group):
try:
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
while status == "Creating":
print("Waiting for Feature Group Creation")
time.sleep(5)
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
if status != "Created":
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
raise RuntimeError(f"Failed to create feature group {feature_group.name}")
print(f"FeatureGroup {feature_group.name} successfully created.")
except:
- print('No feature group created yet.')
-
-
+ print("No feature group created yet.")
+
+
def create_or_load_feature_group(prefix, feature_group_name):
# Feature Definitions for our records
- feature_definitions= [
- FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),
- FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),
-# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING)
+ feature_definitions = [
+ FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
+ FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
+ # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]
-
+
feature_group = FeatureGroup(
- name=feature_group_name,
- feature_definitions=feature_definitions,
- sagemaker_session=sagemaker_session)
-
- print('Feature Group: {}'.format(feature_group))
-
- try:
- print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+ name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+ )
+
+ print("Feature Group: {}".format(feature_group))
+
+ try:
+ print(
+ "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+ )
wait_for_feature_group_creation_complete(feature_group)
except Exception as e:
- print('Before CREATE FG wait exeption: {}'.format(e))
-# pass
-
+ print("Before CREATE FG wait exeption: {}".format(e))
+ # pass
+
try:
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"
-
- print('Creating Feature Group with role {}...'.format(role))
+
+ print("Creating Feature Group with role {}...".format(role))
feature_group.create(
s3_uri=f"s3://{bucket}/{prefix}",
record_identifier_name=record_identifier_feature_name,
event_time_feature_name=event_time_feature_name,
role_arn=role,
- enable_online_store=True
+ enable_online_store=True,
)
- print('Creating Feature Group. Completed.')
-
- print('Waiting for new Feature Group to become available...')
+ print("Creating Feature Group. Completed.")
+
+ print("Waiting for new Feature Group to become available...")
wait_for_feature_group_creation_complete(feature_group)
- print('Feature Group available.')
+ print("Feature Group available.")
feature_group.describe()
-
+
except Exception as e:
- print('Exception: {}'.format(e))
-
+ print("Exception: {}".format(e))
+
return feature_group
-
+
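+# Hedged usage sketch (the group names here are illustrative, not from this
+# script): callers do roughly
+#   fg = create_or_load_feature_group("reviews-feature-store", "reviews-feature-group")
+#   fg.ingest(data_frame=df, max_workers=3, wait=True)
+# In this job the prefix and name actually come from --feature-store-offline-prefix
+# and --feature-group-name (see parse_args below).
+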
class InputFeatures(object):
- """BERT feature vectors."""
-
- def __init__(self,
- input_ids,
- input_mask,
- segment_ids,
- label_id,
- review_id,
- date,
- label):
-# review_body):
+ """BERT feature vectors."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+ # review_body):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
self.review_id = review_id
self.date = date
self.label = label
+
+
# self.review_body = review_body
-
-
+
+
class Input(object):
- """A single training/test input for sequence classification."""
-
- def __init__(self, text, review_id, date, label=None):
- """Constructs an Input.
- Args:
- text: string. The untokenized text of the first sequence. For single
- sequence tasks, only this sequence must be specified.
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.text = text
- self.review_id = review_id
- self.date = date
- self.label = label
-
-
+ """A single training/test input for sequence classification."""
+
+ def __init__(self, text, review_id, date, label=None):
+ """Constructs an Input.
+ Args:
+ text: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.text = text
+ self.review_id = review_id
+ self.date = date
+ self.label = label
+
+
def convert_input(the_input, max_seq_length):
# First, we need to preprocess our data so that it matches the data BERT was trained on:
#
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- #
+ #
# Fortunately, the Transformers tokenizer does this for us!
#
- tokens = tokenizer.tokenize(the_input.text)
+ tokens = tokenizer.tokenize(the_input.text)
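+    # Illustrative sketch only: for a short review, WordPiece tokenization
+    # looks roughly like
+    #   tokenizer.tokenize("Sally says hi") -> ['sally', 'says', 'hi']
+    # with rarer words split into sub-word pieces ('calling' -> ['call', '##ing']).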
# Next, we need to do the following:
#
@@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length):
#
# Again, the Transformers tokenizer does this for us!
#
- encode_plus_tokens = tokenizer.encode_plus(the_input.text,
- pad_to_max_length=True,
- max_length=max_seq_length,
-# truncation=True
- )
+ encode_plus_tokens = tokenizer.encode_plus(
+ the_input.text,
+ pad_to_max_length=True,
+ max_length=max_seq_length,
+ # truncation=True
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
-
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ input_ids = encode_plus_tokens["input_ids"]
+
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
# Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
segment_ids = [0] * max_seq_length
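+
+    # Hedged sketch of the encode_plus output for max_seq_length=8 (exact ids
+    # depend on the DistilBert vocabulary; [CLS]=101 and [SEP]=102 are added
+    # automatically):
+    #   {'input_ids':      [101, t1, t2, t3, 102, 0, 0, 0],
+    #    'attention_mask': [  1,  1,  1,  1,   1, 0, 0, 0]}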
@@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length):
label_id=label_id,
review_id=the_input.review_id,
date=the_input.date,
- label=the_input.label)
-# review_body=the_input.text)
-
-# print('**input_ids**\n{}\n'.format(features.input_ids))
-# print('**input_mask**\n{}\n'.format(features.input_mask))
-# print('**segment_ids**\n{}\n'.format(features.segment_ids))
-# print('**label_id**\n{}\n'.format(features.label_id))
-# print('**review_id**\n{}\n'.format(features.review_id))
-# print('**date**\n{}\n'.format(features.date))
-# print('**label**\n{}\n'.format(features.label))
-# print('**review_body**\n{}\n'.format(features.review_body))
+ label=the_input.label,
+ )
+ # review_body=the_input.text)
+
+ # print('**input_ids**\n{}\n'.format(features.input_ids))
+ # print('**input_mask**\n{}\n'.format(features.input_mask))
+ # print('**segment_ids**\n{}\n'.format(features.segment_ids))
+ # print('**label_id**\n{}\n'.format(features.label_id))
+ # print('**review_id**\n{}\n'.format(features.review_id))
+ # print('**date**\n{}\n'.format(features.date))
+ # print('**label**\n{}\n'.format(features.label))
+ # print('**review_body**\n{}\n'.format(features.review_body))
return features
-def transform_inputs_to_tfrecord(inputs,
- output_file,
- max_seq_length):
+def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
"""Convert a set of `Input`s to a TFRecord file."""
records = []
tf_record_writer = tf.io.TFRecordWriter(output_file)
-
+
for (input_idx, the_input) in enumerate(inputs):
if input_idx % 10000 == 0:
- print('Writing input {} of {}\n'.format(input_idx, len(inputs)))
+ print("Writing input {} of {}\n".format(input_idx, len(inputs)))
features = convert_input(the_input, max_seq_length)
all_features = collections.OrderedDict()
- all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
- all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
- all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
- all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
+ all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
+ all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
+ all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
+ all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
tf_record_writer.write(tf_record.SerializeToString())
- records.append({#'tf_record': tf_record.SerializeToString(),
- 'input_ids': features.input_ids,
- 'input_mask': features.input_mask,
- 'segment_ids': features.segment_ids,
- 'label_id': features.label_id,
- 'review_id': the_input.review_id,
- 'date': the_input.date,
- 'label': features.label,
-# 'review_body': features.review_body
- })
+ records.append(
+ { #'tf_record': tf_record.SerializeToString(),
+ "input_ids": features.input_ids,
+ "input_mask": features.input_mask,
+ "segment_ids": features.segment_ids,
+ "label_id": features.label_id,
+ "review_id": the_input.review_id,
+ "date": the_input.date,
+ "label": features.label,
+ # 'review_body': features.review_body
+ }
+ )
#####################################
####### TODO: REMOVE THIS BREAK #######
- #####################################
+ #####################################
# break
-
+
tf_record_writer.close()
-
+
return records
-
+
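+# Minimal read-back sketch for spot-checking one of the files written above
+# (assumes the same feature spec used by the writer; not called in this job):
+#
+#   def _spot_check_tfrecord(path, max_seq_length):
+#       spec = {
+#           "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+#           "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+#           "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+#           "label_ids": tf.io.FixedLenFeature([], tf.int64),
+#       }
+#       for raw in tf.data.TFRecordDataset(path).take(1):
+#           print(tf.io.parse_single_example(raw, spec))
+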
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
     # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--train-split-percentage', type=float,
+ parser.add_argument(
+ "--train-split-percentage",
+ type=float,
default=0.90,
)
- parser.add_argument('--validation-split-percentage', type=float,
- default=0.05,
- )
- parser.add_argument('--test-split-percentage', type=float,
+ parser.add_argument(
+ "--validation-split-percentage",
+ type=float,
default=0.05,
)
- parser.add_argument('--balance-dataset', type=eval,
- default=True
+ parser.add_argument(
+ "--test-split-percentage",
+ type=float,
+ default=0.05,
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument("--balance-dataset", type=eval, default=True)
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
- parser.add_argument('--feature-store-offline-prefix', type=str,
+ )
+ parser.add_argument(
+ "--feature-store-offline-prefix",
+ type=str,
default=None,
- )
- parser.add_argument('--feature-group-name', type=str,
+ )
+ parser.add_argument(
+ "--feature-group-name",
+ type=str,
default=None,
- )
-
+ )
+
return parser.parse_args()
-
-def _transform_tsv_to_tfrecord(file,
- max_seq_length,
- balance_dataset,
- prefix,
- feature_group_name):
- print('file {}'.format(file))
- print('max_seq_length {}'.format(max_seq_length))
- print('balance_dataset {}'.format(balance_dataset))
- print('prefix {}'.format(prefix))
- print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+ print("file {}".format(file))
+ print("max_seq_length {}".format(max_seq_length))
+ print("balance_dataset {}".format(balance_dataset))
+ print("prefix {}".format(prefix))
+ print("feature_group_name {}".format(feature_group_name))
     # need to re-load since we can't pass the feature_group object through functools.partial for some reason
feature_group = create_or_load_feature_group(prefix, feature_group_name)
-
+
filename_without_extension = Path(Path(file).stem).stem
- df = pd.read_csv(file,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')
+ df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")
df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)
- print('Shape of dataframe {}'.format(df.shape))
+ print("Shape of dataframe {}".format(df.shape))
- if balance_dataset:
+ if balance_dataset:
# Balance the dataset down to the minority class
from sklearn.utils import resample
- five_star_df = df.query('star_rating == 5')
- four_star_df = df.query('star_rating == 4')
- three_star_df = df.query('star_rating == 3')
- two_star_df = df.query('star_rating == 2')
- one_star_df = df.query('star_rating == 1')
-
- minority_count = min(five_star_df.shape[0],
- four_star_df.shape[0],
- three_star_df.shape[0],
- two_star_df.shape[0],
- one_star_df.shape[0])
-
- five_star_df = resample(five_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- four_star_df = resample(four_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- three_star_df = resample(three_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- two_star_df = resample(two_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- one_star_df = resample(one_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
+ five_star_df = df.query("star_rating == 5")
+ four_star_df = df.query("star_rating == 4")
+ three_star_df = df.query("star_rating == 3")
+ two_star_df = df.query("star_rating == 2")
+ one_star_df = df.query("star_rating == 1")
+
+ minority_count = min(
+ five_star_df.shape[0],
+ four_star_df.shape[0],
+ three_star_df.shape[0],
+ two_star_df.shape[0],
+ one_star_df.shape[0],
+ )
+
+ five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)
df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])
- df_balanced = df_balanced.reset_index(drop=True)
- print('Shape of balanced dataframe {}'.format(df_balanced.shape))
- print(df_balanced['star_rating'].head(100))
+ df_balanced = df_balanced.reset_index(drop=True)
+ print("Shape of balanced dataframe {}".format(df_balanced.shape))
+ print(df_balanced["star_rating"].head(100))
df = df_balanced
-
- print('Shape of dataframe before splitting {}'.format(df.shape))
-
- print('train split percentage {}'.format(args.train_split_percentage))
- print('validation split percentage {}'.format(args.validation_split_percentage))
- print('test split percentage {}'.format(args.test_split_percentage))
-
+
+ print("Shape of dataframe before splitting {}".format(df.shape))
+
+ print("train split percentage {}".format(args.train_split_percentage))
+ print("validation split percentage {}".format(args.validation_split_percentage))
+ print("test split percentage {}".format(args.test_split_percentage))
+
holdout_percentage = 1.00 - args.train_split_percentage
- print('holdout percentage {}'.format(holdout_percentage))
- df_train, df_holdout = train_test_split(df,
- test_size=holdout_percentage,
- stratify=df['star_rating'])
+ print("holdout percentage {}".format(holdout_percentage))
+ df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])
test_holdout_percentage = args.test_split_percentage / holdout_percentage
- print('test holdout percentage {}'.format(test_holdout_percentage))
- df_validation, df_test = train_test_split(df_holdout,
- test_size=test_holdout_percentage,
- stratify=df_holdout['star_rating'])
-
+ print("test holdout percentage {}".format(test_holdout_percentage))
+ df_validation, df_test = train_test_split(
+ df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
+ )
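+    # Worked example with the default splits (0.90 / 0.05 / 0.05):
+    #   holdout_percentage      = 1.00 - 0.90 = 0.10
+    #   test_holdout_percentage = 0.05 / 0.10 = 0.50
+    # so 10% of the rows are held out, then split evenly into validation and test.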
+
df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
- print('Shape of train dataframe {}'.format(df_train.shape))
- print('Shape of validation dataframe {}'.format(df_validation.shape))
- print('Shape of test dataframe {}'.format(df_test.shape))
+ print("Shape of train dataframe {}".format(df_train.shape))
+ print("Shape of validation dataframe {}".format(df_validation.shape))
+ print("Shape of test dataframe {}".format(df_test.shape))
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)
- train_inputs = df_train.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- validation_inputs = df_validation.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- test_inputs = df_test.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
+ train_inputs = df_train.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ validation_inputs = df_validation.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ test_inputs = df_test.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
# Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
- #
- #
+ #
+ #
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
- #
+ #
# We don't have to worry about these details. The Transformers tokenizer does this for us.
- #
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
+ #
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
     # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that work with BERT and TensorFlow.
- train_records = transform_inputs_to_tfrecord(train_inputs,
- '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- validation_records = transform_inputs_to_tfrecord(validation_inputs,
- '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- test_records = transform_inputs_to_tfrecord(test_inputs,
- '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension),
- max_seq_length)
-
+ train_records = transform_inputs_to_tfrecord(
+ train_inputs,
+ "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ validation_records = transform_inputs_to_tfrecord(
+ validation_inputs,
+ "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ test_records = transform_inputs_to_tfrecord(
+ test_inputs,
+ "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
df_train_records = pd.DataFrame.from_dict(train_records)
- df_train_records['split_type'] = 'train'
- df_train_records.head()
-
+ df_train_records["split_type"] = "train"
+ df_train_records.head()
+
df_validation_records = pd.DataFrame.from_dict(validation_records)
- df_validation_records['split_type'] = 'validation'
- df_validation_records.head()
+ df_validation_records["split_type"] = "validation"
+ df_validation_records.head()
df_test_records = pd.DataFrame.from_dict(test_records)
- df_test_records['split_type'] = 'test'
- df_test_records.head()
-
- # Add record to feature store
+ df_test_records["split_type"] = "test"
+ df_test_records.head()
+
+ # Add record to feature store
df_fs_train_records = cast_object_to_string(df_train_records)
df_fs_validation_records = cast_object_to_string(df_validation_records)
df_fs_test_records = cast_object_to_string(df_test_records)
- print('Ingesting Features...')
- feature_group.ingest(
- data_frame=df_fs_train_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_validation_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_test_records, max_workers=3, wait=True
- )
- print('Feature ingest completed.')
+ print("Ingesting Features...")
+ feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True)
+ print("Feature ingest completed.")
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
+ print("Current host: {}".format(args.current_host))
+
+ feature_group = create_or_load_feature_group(
+ prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name
+ )
feature_group.describe()
-
+
print(feature_group.as_hive_ddl())
-
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
-
- transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord,
- max_seq_length=args.max_seq_length,
- balance_dataset=args.balance_dataset,
- prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
-
- input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data))
+
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
+
+ transform_tsv_to_tfrecord = functools.partial(
+ _transform_tsv_to_tfrecord,
+ max_seq_length=args.max_seq_length,
+ balance_dataset=args.balance_dataset,
+ prefix=args.feature_store_offline_prefix,
+ feature_group_name=args.feature_group_name,
+ )
+
+ input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
num_cpus = multiprocessing.cpu_count()
- print('num_cpus {}'.format(num_cpus))
+ print("num_cpus {}".format(num_cpus))
p = multiprocessing.Pool(num_cpus)
p.map(transform_tsv_to_tfrecord, input_files)
- print('Listing contents of {}'.format(args.output_data))
+ print("Listing contents of {}".format(args.output_data))
dirs_output = os.listdir(args.output_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(train_data))
+ print("Listing contents of {}".format(train_data))
dirs_output = os.listdir(train_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(validation_data))
+ print("Listing contents of {}".format(validation_data))
dirs_output = os.listdir(validation_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(test_data))
+ print("Listing contents of {}".format(test_data))
dirs_output = os.listdir(test_data)
for file in dirs_output:
print(file)
-
+
offline_store_contents = None
- while (offline_store_contents is None):
- objects_in_bucket = s3.list_objects(Bucket=bucket,
- Prefix=args.feature_store_offline_prefix)
- if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
- offline_store_contents = objects_in_bucket['Contents']
+ while offline_store_contents is None:
+ objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix)
+ if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
+ offline_store_contents = objects_in_bucket["Contents"]
else:
- print('Waiting for data in offline store...\n')
+ print("Waiting for data in offline store...\n")
sleep(60)
- print('Data available.')
-
- print('Complete')
-
-
+ print("Data available.")
+
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
process(args)
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
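+
+# Sketch of intended use (this mirrors the .map() calls further below):
+#   dataset.map(select_data_and_label_from_record)
+# turns each parsed TFRecord into the (features, label) pair Keras expects,
+# i.e. ({'input_ids': ..., 'input_mask': ..., 'segment_ids': ...}, label_ids).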
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+ print("loaded_model {}".format(loaded_model))
+ print("initial_epoch_number {}".format(initial_epoch_number))
- print('loaded_model {}'.format(loaded_model))
- print('initial_epoch_number {}'.format(initial_epoch_number))
-
return loaded_model, initial_epoch_number
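+
+# Worked example of the epoch parsing above, assuming checkpoints are written
+# by the ModelCheckpoint callback below as "tf_model_{epoch:05d}.h5":
+#   "tf_model_00012.h5".rsplit("_", 1)[-1].split(".h5")[0] -> "00012" -> epoch 12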
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to incompatibility with transformers library (possibly)
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
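+            # For clarity: id2label/label2id translate between the model's
+            # zero-based output indices and the 1-5 star ratings, e.g. an
+            # argmax of 4 over the class outputs corresponds to star_rating 5.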
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
     # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(inference_path))
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
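As a sanity check on the copy above, a hedged sketch of walking local_model_dir to confirm the layout SageMaker expects (inference.py under code/); the exact SavedModel subpath depends on how this script configured tensorflow_saved_model_path:

import os

# Illustrative helper only: print every file under the model directory.
def show_tree(root):
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            print(os.path.join(dirpath, name))

# show_tree(local_model_dir)
# Expected to include something like:
#   <local_model_dir>/code/inference.py
#   <local_model_dir>/.../saved_model.pb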
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
+ return prediction[0]["label"]
+
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
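For reference, a hedged example of what tokenizer.encode_plus() hands the predict() helper above (transformers 3.x behavior; token ids vary with the vocabulary, and max_length is fixed at 64 here only for illustration):

# Illustrative: keys and shapes only.
tokens = tokenizer.encode_plus(
    "I loved it!", pad_to_max_length=True, max_length=64, truncation=True, return_tensors="tf"
)
print(tokens["input_ids"].shape)       # (1, 64) -- zero-padded token ids
print(tokens["attention_mask"].shape)  # (1, 64) -- 1 for real tokens, 0 for padding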
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
    print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+                color="white" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py
index 31951e62..9063ac1e 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py
@@ -26,7 +26,8 @@
from smexperiments import tracker
import boto3
-sm = boto3.Session().client(service_name='sagemaker')
+
+sm = boto3.Session().client(service_name="sagemaker")
import sagemaker
@@ -36,9 +37,7 @@ def main(): # pragma: no cover
Creates or updates the pipeline and runs it.
"""
- parser = argparse.ArgumentParser(
- "Creates or updates and runs the pipeline for the pipeline script."
- )
+ parser = argparse.ArgumentParser("Creates or updates and runs the pipeline for the pipeline script.")
parser.add_argument(
"-n",
@@ -89,9 +88,7 @@ def main(): # pragma: no cover
parsed = json.loads(pipeline.definition())
print(json.dumps(parsed, indent=2, sort_keys=True))
- upsert_response = pipeline.upsert(
- role_arn=args.role_arn, description=args.description, tags=tags
- )
+ upsert_response = pipeline.upsert(role_arn=args.role_arn, description=args.description, tags=tags)
print("\n###### Created/Updated SageMaker Pipeline: Response received:")
print(upsert_response)
@@ -100,125 +97,113 @@ def main(): # pragma: no cover
    # Now we describe the execution instance and list its steps to find out more about the execution.
execution_run = execution.describe()
- print(execution_run)
-
- # Create or Load the 'Experiment'
+ print(execution_run)
+
+ # Create or Load the 'Experiment'
try:
experiment = Experiment.create(
- experiment_name=pipeline.name,
- description='Amazon Customer Reviews BERT Pipeline Experiment'
- )
- except:
- experiment = Experiment.load(
- experiment_name=pipeline.name
+ experiment_name=pipeline.name, description="Amazon Customer Reviews BERT Pipeline Experiment"
)
-
- print('Experiment name: {}'.format(experiment.experiment_name))
-
+ except:
+ experiment = Experiment.load(experiment_name=pipeline.name)
+
+ print("Experiment name: {}".format(experiment.experiment_name))
+
# Add Execution Run as Trial to Experiments
- execution_run_name = execution_run['PipelineExecutionDisplayName']
+ execution_run_name = execution_run["PipelineExecutionDisplayName"]
print(execution_run_name)
-
+
# Create the `Trial`
timestamp = int(time.time())
- trial = Trial.create(trial_name=execution_run_name,
- experiment_name=experiment.experiment_name,
- sagemaker_boto_client=sm)
+ trial = Trial.create(
+ trial_name=execution_run_name, experiment_name=experiment.experiment_name, sagemaker_boto_client=sm
+ )
trial_name = trial.trial_name
- print('Trial name: {}'.format(trial_name))
-
+ print("Trial name: {}".format(trial_name))
+
######################################################
## Parse Pipeline Definition For Processing Job Args
######################################################
-
+
processing_param_dict = {}
-
- for step in parsed['Steps']:
- print('step: {}'.format(step))
- if step['Name']=='Processing':
- print('Step Name is Processing...')
- arg_list = step['Arguments']['AppSpecification']['ContainerArguments']
+
+ for step in parsed["Steps"]:
+ print("step: {}".format(step))
+ if step["Name"] == "Processing":
+ print("Step Name is Processing...")
+ arg_list = step["Arguments"]["AppSpecification"]["ContainerArguments"]
print(arg_list)
num_args = len(arg_list)
print(num_args)
-
- # arguments are (key, value) pairs in this list, so we extract them in pairs
+
+ # arguments are (key, value) pairs in this list, so we extract them in pairs
# using [i] and [i+1] indexes and stepping by 2 through the list
for i in range(0, num_args, 2):
- key = arg_list[i].replace('--', '')
- value = arg_list[i+1]
- print('arg key: {}'.format(key))
- print('arg value: {}'.format(value))
+ key = arg_list[i].replace("--", "")
+ value = arg_list[i + 1]
+ print("arg key: {}".format(key))
+ print("arg value: {}".format(value))
processing_param_dict[key] = value
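To make the stride-2 extraction above concrete, a toy example (argument names invented for illustration):

# ContainerArguments arrive flattened as ["--key1", "val1", "--key2", "val2", ...]
toy_args = ["--train-split-percentage", "0.90", "--max-seq-length", "64"]
toy_dict = {toy_args[i].replace("--", ""): toy_args[i + 1] for i in range(0, len(toy_args), 2)}
print(toy_dict)  # {'train-split-percentage': '0.90', 'max-seq-length': '64'}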
-
##############################
## Wait For Execution To Finish
##############################
-
+
print("Waiting for the execution to finish...")
execution.wait()
- print("\n#####Execution completed. Execution step details:")
-
+ print("\n#####Execution completed. Execution step details:")
+
# List Execution Steps
- print(execution.list_steps())
-
+ print(execution.list_steps())
+
# List All Artifacts Generated By The Pipeline
- processing_job_name=None
- training_job_name=None
-
+ processing_job_name = None
+ training_job_name = None
+
from sagemaker.lineage.visualizer import LineageTableVisualizer
viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
print(execution_step)
        # We are doing this because there appears to be a bug in LineageTableVisualizer's handling of the Processing Step
- if execution_step['StepName'] == 'Processing':
- processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]
+ if execution_step["StepName"] == "Processing":
+ processing_job_name = execution_step["Metadata"]["ProcessingJob"]["Arn"].split("/")[-1]
print(processing_job_name)
- #display(viz.show(processing_job_name=processing_job_name))
- elif execution_step['StepName'] == 'Train':
- training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]
+ # display(viz.show(processing_job_name=processing_job_name))
+ elif execution_step["StepName"] == "Train":
+ training_job_name = execution_step["Metadata"]["TrainingJob"]["Arn"].split("/")[-1]
print(training_job_name)
- #display(viz.show(training_job_name=training_job_name))
+ # display(viz.show(training_job_name=training_job_name))
else:
- #display(viz.show(pipeline_execution_step=execution_step))
+ # display(viz.show(pipeline_execution_step=execution_step))
time.sleep(5)
- # Add Trial Compontents To Experiment Trial
- processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)
+    # Add Trial Components To Experiment Trial
+ processing_job_tc = "{}-aws-processing-job".format(processing_job_name)
print(processing_job_tc)
# -aws-processing-job is the default name assigned by ProcessingJob
- response = sm.associate_trial_component(
- TrialComponentName=processing_job_tc,
- TrialName=trial_name
- )
-
+ response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=trial_name)
+
# -aws-training-job is the default name assigned by TrainingJob
- training_job_tc = '{}-aws-training-job'.format(training_job_name)
+ training_job_tc = "{}-aws-training-job".format(training_job_name)
print(training_job_tc)
- response = sm.associate_trial_component(
- TrialComponentName=training_job_tc,
- TrialName=trial_name
- )
-
+ response = sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=trial_name)
+
##############
# Log Additional Parameters within Trial
##############
- print('Logging Processing Job Parameters within Experiment Trial...')
- processing_job_tracker = tracker.Tracker.load(trial_component_name=processing_job_tc)
-
+ print("Logging Processing Job Parameters within Experiment Trial...")
+ processing_job_tracker = tracker.Tracker.load(trial_component_name=processing_job_tc)
+
for key, value in processing_param_dict.items():
- print('key: {}, value: {}'.format(key, value))
- processing_job_tracker.log_parameters({
- key: str(value)
- })
+ print("key: {}, value: {}".format(key, value))
+ processing_job_tracker.log_parameters({key: str(value)})
# must save after logging
- processing_job_tracker.trial_component.save();
+ processing_job_tracker.trial_component.save()
except Exception as e: # pylint: disable=W0703
print(f"Exception: {e}")
diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py b/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py
index b6b8b179..224153d5 100644
--- a/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py
+++ b/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py
@@ -12,7 +12,13 @@
readme = f.read()
-required_packages = ["sagemaker==2.24.3", "sagemaker-experiments==0.1.26", "pandas==1.0.1", "boto3==1.17.4", "botocore==1.20.4"]
+required_packages = [
+ "sagemaker==2.24.3",
+ "sagemaker-experiments==0.1.26",
+ "pandas==1.0.1",
+ "boto3==1.17.4",
+ "botocore==1.20.4",
+]
extras = {
"test": [
"black",
diff --git a/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py b/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py
index a9c66cf9..7825f488 100644
--- a/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py
+++ b/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py
@@ -61,9 +61,7 @@ def test_endpoint(endpoint_name):
config = json.load(f)
# Get the endpoint name from sagemaker project name
- endpoint_name = "{}-{}".format(
- config["Parameters"]["SageMakerProjectName"], config["Parameters"]["StageName"]
- )
+ endpoint_name = "{}-{}".format(config["Parameters"]["SageMakerProjectName"], config["Parameters"]["StageName"])
results = test_endpoint(endpoint_name)
# Print results and write to file
diff --git a/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py
+++ b/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess
## PIP INSTALLS ##
-# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
+# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to
# use anaconda and anaconda only supports 2.3.0 at this time
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y'])
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"])
import tensorflow as tf
from tensorflow import keras
-subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y'])
+
+subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"])
import pandas as pd
import re
import sagemaker
@@ -40,51 +42,55 @@
FeatureTypeEnum,
)
-region = os.environ['AWS_DEFAULT_REGION']
-print('Region: {}'.format(region))
+region = os.environ["AWS_DEFAULT_REGION"]
+print("Region: {}".format(region))
#############################
## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc.
## Role and Bucket are malformed if we do this later.
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region)
+sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region)
caller_identity = sts.get_caller_identity()
-print('caller_identity: {}'.format(caller_identity))
+print("caller_identity: {}".format(caller_identity))
-assumed_role_arn = caller_identity['Arn']
-print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn))
+assumed_role_arn = caller_identity["Arn"]
+print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn))
-assumed_role_name = assumed_role_arn.split('/')[-2]
+assumed_role_name = assumed_role_arn.split("/")[-2]
-iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region)
-get_role_response = iam.get_role(RoleName=assumed_role_name)
-print('get_role_response {}'.format(get_role_response))
-role = get_role_response['Role']['Arn']
-print('role {}'.format(role))
+iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region)
+get_role_response = iam.get_role(RoleName=assumed_role_name)
+print("get_role_response {}".format(get_role_response))
+role = get_role_response["Role"]["Arn"]
+print("role {}".format(role))
bucket = sagemaker.Session().default_bucket()
-print('The DEFAULT BUCKET is {}'.format(bucket))
+print("The DEFAULT BUCKET is {}".format(bucket))
#############################
-sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region)
+sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region)
-featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region)
+featurestore_runtime = boto3.Session(region_name=region).client(
+ service_name="sagemaker-featurestore-runtime", region_name=region
+)
-s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region)
+s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region)
-sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region),
- sagemaker_client=sm,
- sagemaker_featurestore_runtime_client=featurestore_runtime)
+sagemaker_session = sagemaker.Session(
+ boto_session=boto3.Session(region_name=region),
+ sagemaker_client=sm,
+ sagemaker_featurestore_runtime_client=featurestore_runtime,
+)
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-REVIEW_BODY_COLUMN = 'review_body'
-REVIEW_ID_COLUMN = 'review_id'
+REVIEW_BODY_COLUMN = "review_body"
+REVIEW_ID_COLUMN = "review_id"
# DATE_COLUMN = 'date'
-LABEL_COLUMN = 'star_rating'
+LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]
-
+
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
label_map[label] = i
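Concretely, the loop above maps the 1-5 star ratings onto contiguous 0-based class ids:

# label_map == {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
# so e.g. label_map[5] -> 4, the index later stored as label_id in the TFRecords.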
@@ -92,94 +98,88 @@
def cast_object_to_string(data_frame):
for label in data_frame.columns:
- if data_frame.dtypes[label] == 'object':
+ if data_frame.dtypes[label] == "object":
data_frame[label] = data_frame[label].astype("str").astype("string")
return data_frame
-
+
def wait_for_feature_group_creation_complete(feature_group):
try:
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
while status == "Creating":
print("Waiting for Feature Group Creation")
time.sleep(5)
status = feature_group.describe().get("FeatureGroupStatus")
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
if status != "Created":
- print('Feature Group status: {}'.format(status))
+ print("Feature Group status: {}".format(status))
raise RuntimeError(f"Failed to create feature group {feature_group.name}")
print(f"FeatureGroup {feature_group.name} successfully created.")
except:
- print('No feature group created yet.')
-
-
+ print("No feature group created yet.")
+
+
def create_or_load_feature_group(prefix, feature_group_name):
# Feature Definitions for our records
- feature_definitions= [
- FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),
- FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),
-# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
- FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING)
+ feature_definitions = [
+ FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
+ FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
+ # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
+ FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]
-
+
feature_group = FeatureGroup(
- name=feature_group_name,
- feature_definitions=feature_definitions,
- sagemaker_session=sagemaker_session)
-
- print('Feature Group: {}'.format(feature_group))
-
- try:
- print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+ name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+ )
+
+ print("Feature Group: {}".format(feature_group))
+
+ try:
+ print(
+ "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+ )
wait_for_feature_group_creation_complete(feature_group)
except Exception as e:
- print('Before CREATE FG wait exeption: {}'.format(e))
-# pass
-
+        print("Before CREATE FG wait exception: {}".format(e))
+ # pass
+
try:
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"
-
- print('Creating Feature Group with role {}...'.format(role))
+
+ print("Creating Feature Group with role {}...".format(role))
feature_group.create(
s3_uri=f"s3://{bucket}/{prefix}",
record_identifier_name=record_identifier_feature_name,
event_time_feature_name=event_time_feature_name,
role_arn=role,
- enable_online_store=True
+ enable_online_store=True,
)
- print('Creating Feature Group. Completed.')
-
- print('Waiting for new Feature Group to become available...')
+ print("Creating Feature Group. Completed.")
+
+ print("Waiting for new Feature Group to become available...")
wait_for_feature_group_creation_complete(feature_group)
- print('Feature Group available.')
+ print("Feature Group available.")
feature_group.describe()
-
+
except Exception as e:
- print('Exception: {}'.format(e))
-
+ print("Exception: {}".format(e))
+
return feature_group
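Once the feature group is available, a hedged sketch of reading one record back from the online store via the featurestore_runtime client created earlier (the group name and review_id below are placeholders, not values from this script):

# Illustrative only: look up a single record by its identifier.
record = featurestore_runtime.get_record(
    FeatureGroupName="reviews-feature-group-name",    # placeholder
    RecordIdentifierValueAsString="R1EXAMPLEREVIEW",  # placeholder review_id
)
print(record.get("Record"))  # list of {'FeatureName': ..., 'ValueAsString': ...}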
-
+
class InputFeatures(object):
- """BERT feature vectors."""
-
- def __init__(self,
- input_ids,
- input_mask,
- segment_ids,
- label_id,
- review_id,
- date,
- label):
-# review_body):
+ """BERT feature vectors."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+ # review_body):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
self.review_id = review_id
self.date = date
self.label = label
+
+
# self.review_body = review_body
-
-
+
+
class Input(object):
- """A single training/test input for sequence classification."""
-
- def __init__(self, text, review_id, date, label=None):
- """Constructs an Input.
- Args:
- text: string. The untokenized text of the first sequence. For single
- sequence tasks, only this sequence must be specified.
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.text = text
- self.review_id = review_id
- self.date = date
- self.label = label
-
-
+ """A single training/test input for sequence classification."""
+
+ def __init__(self, text, review_id, date, label=None):
+ """Constructs an Input.
+ Args:
+ text: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.text = text
+ self.review_id = review_id
+ self.date = date
+ self.label = label
+
+
def convert_input(the_input, max_seq_length):
# First, we need to preprocess our data so that it matches the data BERT was trained on:
#
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- #
+ #
# Fortunately, the Transformers tokenizer does this for us!
#
- tokens = tokenizer.tokenize(the_input.text)
+ tokens = tokenizer.tokenize(the_input.text)
# Next, we need to do the following:
#
@@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length):
#
# Again, the Transformers tokenizer does this for us!
#
- encode_plus_tokens = tokenizer.encode_plus(the_input.text,
- pad_to_max_length=True,
- max_length=max_seq_length,
-# truncation=True
- )
+ encode_plus_tokens = tokenizer.encode_plus(
+ the_input.text,
+ pad_to_max_length=True,
+ max_length=max_seq_length,
+ # truncation=True
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
-
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ input_ids = encode_plus_tokens["input_ids"]
+
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
# Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
segment_ids = [0] * max_seq_length
@@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length):
label_id=label_id,
review_id=the_input.review_id,
date=the_input.date,
- label=the_input.label)
-# review_body=the_input.text)
-
-# print('**input_ids**\n{}\n'.format(features.input_ids))
-# print('**input_mask**\n{}\n'.format(features.input_mask))
-# print('**segment_ids**\n{}\n'.format(features.segment_ids))
-# print('**label_id**\n{}\n'.format(features.label_id))
-# print('**review_id**\n{}\n'.format(features.review_id))
-# print('**date**\n{}\n'.format(features.date))
-# print('**label**\n{}\n'.format(features.label))
-# print('**review_body**\n{}\n'.format(features.review_body))
+ label=the_input.label,
+ )
+ # review_body=the_input.text)
+
+ # print('**input_ids**\n{}\n'.format(features.input_ids))
+ # print('**input_mask**\n{}\n'.format(features.input_mask))
+ # print('**segment_ids**\n{}\n'.format(features.segment_ids))
+ # print('**label_id**\n{}\n'.format(features.label_id))
+ # print('**review_id**\n{}\n'.format(features.review_id))
+ # print('**date**\n{}\n'.format(features.date))
+ # print('**label**\n{}\n'.format(features.label))
+ # print('**review_body**\n{}\n'.format(features.review_body))
return features
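A hedged illustration of the vectors convert_input() returns for one short review (token ids invented; a real run pads out to max_seq_length):

# Toy example padded to length 10 for readability:
# input_ids:   [101, 1045, 3866, 2009, 999, 102, 0, 0, 0, 0]  # [CLS] ... [SEP] + padding
# input_mask:  [  1,    1,    1,    1,   1,   1, 0, 0, 0, 0]  # attend to real tokens only
# segment_ids: [  0,    0,    0,    0,   0,   0, 0, 0, 0, 0]  # single-sequence task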
-def transform_inputs_to_tfrecord(inputs,
- output_file,
- max_seq_length):
+def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
"""Convert a set of `Input`s to a TFRecord file."""
records = []
tf_record_writer = tf.io.TFRecordWriter(output_file)
-
+
for (input_idx, the_input) in enumerate(inputs):
if input_idx % 10000 == 0:
- print('Writing input {} of {}\n'.format(input_idx, len(inputs)))
+ print("Writing input {} of {}\n".format(input_idx, len(inputs)))
features = convert_input(the_input, max_seq_length)
all_features = collections.OrderedDict()
- all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
- all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
- all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
- all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
+ all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
+ all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
+ all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
+ all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))
tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
tf_record_writer.write(tf_record.SerializeToString())
- records.append({#'tf_record': tf_record.SerializeToString(),
- 'input_ids': features.input_ids,
- 'input_mask': features.input_mask,
- 'segment_ids': features.segment_ids,
- 'label_id': features.label_id,
- 'review_id': the_input.review_id,
- 'date': the_input.date,
- 'label': features.label,
-# 'review_body': features.review_body
- })
+ records.append(
+ { #'tf_record': tf_record.SerializeToString(),
+ "input_ids": features.input_ids,
+ "input_mask": features.input_mask,
+ "segment_ids": features.segment_ids,
+ "label_id": features.label_id,
+ "review_id": the_input.review_id,
+ "date": the_input.date,
+ "label": features.label,
+ # 'review_body': features.review_body
+ }
+ )
#####################################
####### TODO: REMOVE THIS BREAK #######
- #####################################
+ #####################################
# break
-
+
tf_record_writer.close()
-
+
return records
-
+
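To sanity-check the writer above, a hedged sketch of parsing one of these files back; the feature spec mirrors the keys written into all_features (max_seq_length assumed to be 64 here):

import tensorflow as tf

def parse_example(serialized, max_seq_length=64):
    # Mirrors the int64 features written by transform_inputs_to_tfrecord.
    spec = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(serialized, spec)

# ds = tf.data.TFRecordDataset("part-<host>-<file>.tfrecord").map(parse_example)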
def list_arg(raw_value):
"""argparse type for a list of strings"""
- return str(raw_value).split(',')
+ return str(raw_value).split(",")
def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
resconfig = {}
try:
- with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+ with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
resconfig = json.load(cfgfile)
except FileNotFoundError:
- print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
- pass # Ignore
+ print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+ pass # Ignore
# Local testing with CLI args
- parser = argparse.ArgumentParser(description='Process')
+ parser = argparse.ArgumentParser(description="Process")
- parser.add_argument('--hosts', type=list_arg,
- default=resconfig.get('hosts', ['unknown']),
- help='Comma-separated list of host names running the job'
+ parser.add_argument(
+ "--hosts",
+ type=list_arg,
+ default=resconfig.get("hosts", ["unknown"]),
+ help="Comma-separated list of host names running the job",
)
- parser.add_argument('--current-host', type=str,
- default=resconfig.get('current_host', 'unknown'),
- help='Name of this host running the job'
+ parser.add_argument(
+ "--current-host",
+ type=str,
+ default=resconfig.get("current_host", "unknown"),
+ help="Name of this host running the job",
)
- parser.add_argument('--input-data', type=str,
- default='/opt/ml/processing/input/data',
+ parser.add_argument(
+ "--input-data",
+ type=str,
+ default="/opt/ml/processing/input/data",
)
- parser.add_argument('--output-data', type=str,
- default='/opt/ml/processing/output',
+ parser.add_argument(
+ "--output-data",
+ type=str,
+ default="/opt/ml/processing/output",
)
- parser.add_argument('--train-split-percentage', type=float,
+ parser.add_argument(
+ "--train-split-percentage",
+ type=float,
default=0.90,
)
- parser.add_argument('--validation-split-percentage', type=float,
- default=0.05,
- )
- parser.add_argument('--test-split-percentage', type=float,
+ parser.add_argument(
+ "--validation-split-percentage",
+ type=float,
default=0.05,
)
- parser.add_argument('--balance-dataset', type=eval,
- default=True
+ parser.add_argument(
+ "--test-split-percentage",
+ type=float,
+ default=0.05,
)
- parser.add_argument('--max-seq-length', type=int,
+ parser.add_argument("--balance-dataset", type=eval, default=True)
+ parser.add_argument(
+ "--max-seq-length",
+ type=int,
default=64,
- )
- parser.add_argument('--feature-store-offline-prefix', type=str,
+ )
+ parser.add_argument(
+ "--feature-store-offline-prefix",
+ type=str,
default=None,
- )
- parser.add_argument('--feature-group-name', type=str,
+ )
+ parser.add_argument(
+ "--feature-group-name",
+ type=str,
default=None,
- )
-
+ )
+
return parser.parse_args()
-
-def _transform_tsv_to_tfrecord(file,
- max_seq_length,
- balance_dataset,
- prefix,
- feature_group_name):
- print('file {}'.format(file))
- print('max_seq_length {}'.format(max_seq_length))
- print('balance_dataset {}'.format(balance_dataset))
- print('prefix {}'.format(prefix))
- print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+ print("file {}".format(file))
+ print("max_seq_length {}".format(max_seq_length))
+ print("balance_dataset {}".format(balance_dataset))
+ print("prefix {}".format(prefix))
+ print("feature_group_name {}".format(feature_group_name))
    # need to re-load since we can't pass the feature_group object through functools.partial for some reason
feature_group = create_or_load_feature_group(prefix, feature_group_name)
-
+
filename_without_extension = Path(Path(file).stem).stem
- df = pd.read_csv(file,
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')
+ df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")
df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)
- print('Shape of dataframe {}'.format(df.shape))
+ print("Shape of dataframe {}".format(df.shape))
- if balance_dataset:
+ if balance_dataset:
# Balance the dataset down to the minority class
from sklearn.utils import resample
- five_star_df = df.query('star_rating == 5')
- four_star_df = df.query('star_rating == 4')
- three_star_df = df.query('star_rating == 3')
- two_star_df = df.query('star_rating == 2')
- one_star_df = df.query('star_rating == 1')
-
- minority_count = min(five_star_df.shape[0],
- four_star_df.shape[0],
- three_star_df.shape[0],
- two_star_df.shape[0],
- one_star_df.shape[0])
-
- five_star_df = resample(five_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- four_star_df = resample(four_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- three_star_df = resample(three_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- two_star_df = resample(two_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
-
- one_star_df = resample(one_star_df,
- replace = False,
- n_samples = minority_count,
- random_state = 27)
+ five_star_df = df.query("star_rating == 5")
+ four_star_df = df.query("star_rating == 4")
+ three_star_df = df.query("star_rating == 3")
+ two_star_df = df.query("star_rating == 2")
+ one_star_df = df.query("star_rating == 1")
+
+ minority_count = min(
+ five_star_df.shape[0],
+ four_star_df.shape[0],
+ three_star_df.shape[0],
+ two_star_df.shape[0],
+ one_star_df.shape[0],
+ )
+
+ five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)
+
+ one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)
df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])
- df_balanced = df_balanced.reset_index(drop=True)
- print('Shape of balanced dataframe {}'.format(df_balanced.shape))
- print(df_balanced['star_rating'].head(100))
+ df_balanced = df_balanced.reset_index(drop=True)
+ print("Shape of balanced dataframe {}".format(df_balanced.shape))
+ print(df_balanced["star_rating"].head(100))
df = df_balanced
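A worked (illustrative) instance of the balancing above: with per-star counts of 5000/3000/1200/900/2500, minority_count is 900 and every class is resampled without replacement down to 900 rows, leaving 4500 rows in df_balanced.

# Invented counts, for illustration only.
counts = {5: 5000, 4: 3000, 3: 1200, 2: 900, 1: 2500}
minority_count = min(counts.values())   # 900
print(minority_count * len(counts))     # 4500 balanced rows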
-
- print('Shape of dataframe before splitting {}'.format(df.shape))
-
- print('train split percentage {}'.format(args.train_split_percentage))
- print('validation split percentage {}'.format(args.validation_split_percentage))
- print('test split percentage {}'.format(args.test_split_percentage))
-
+
+ print("Shape of dataframe before splitting {}".format(df.shape))
+
+ print("train split percentage {}".format(args.train_split_percentage))
+ print("validation split percentage {}".format(args.validation_split_percentage))
+ print("test split percentage {}".format(args.test_split_percentage))
+
holdout_percentage = 1.00 - args.train_split_percentage
- print('holdout percentage {}'.format(holdout_percentage))
- df_train, df_holdout = train_test_split(df,
- test_size=holdout_percentage,
- stratify=df['star_rating'])
+ print("holdout percentage {}".format(holdout_percentage))
+ df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])
test_holdout_percentage = args.test_split_percentage / holdout_percentage
- print('test holdout percentage {}'.format(test_holdout_percentage))
- df_validation, df_test = train_test_split(df_holdout,
- test_size=test_holdout_percentage,
- stratify=df_holdout['star_rating'])
-
+ print("test holdout percentage {}".format(test_holdout_percentage))
+ df_validation, df_test = train_test_split(
+ df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
+ )
+
df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
- print('Shape of train dataframe {}'.format(df_train.shape))
- print('Shape of validation dataframe {}'.format(df_validation.shape))
- print('Shape of test dataframe {}'.format(df_test.shape))
+ print("Shape of train dataframe {}".format(df_train.shape))
+ print("Shape of validation dataframe {}".format(df_validation.shape))
+ print("Shape of test dataframe {}".format(df_test.shape))
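With the default 0.90/0.05/0.05 splits, the two-stage split above works out as follows (a worked example, not script output):

train_split, validation_split, test_split = 0.90, 0.05, 0.05

holdout_percentage = 1.00 - train_split                    # 0.10 held out by the first split
test_holdout_percentage = test_split / holdout_percentage  # 0.5
# train_test_split peels off 10% of the rows, then splits that 10% evenly
# into 5% validation and 5% test, stratified by star_rating both times.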
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)
- train_inputs = df_train.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- validation_inputs = df_validation.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
-
- test_inputs = df_test.apply(lambda x: Input(
- label = x[LABEL_COLUMN],
- text = x[REVIEW_BODY_COLUMN],
- review_id = x[REVIEW_ID_COLUMN],
- date = timestamp
- ),
- axis = 1)
+ train_inputs = df_train.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ validation_inputs = df_validation.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
+
+ test_inputs = df_test.apply(
+ lambda x: Input(
+ label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
+ ),
+ axis=1,
+ )
# Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
- #
- #
+ #
+ #
# 1. Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
- #
+ #
# We don't have to worry about these details. The Transformers tokenizer does this for us.
- #
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
+ #
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
# Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow.
- train_records = transform_inputs_to_tfrecord(train_inputs,
- '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- validation_records = transform_inputs_to_tfrecord(validation_inputs,
- '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension),
- max_seq_length)
-
- test_records = transform_inputs_to_tfrecord(test_inputs,
- '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension),
- max_seq_length)
-
+ train_records = transform_inputs_to_tfrecord(
+ train_inputs,
+ "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ validation_records = transform_inputs_to_tfrecord(
+ validation_inputs,
+ "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
+ test_records = transform_inputs_to_tfrecord(
+ test_inputs,
+ "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
+ max_seq_length,
+ )
+
df_train_records = pd.DataFrame.from_dict(train_records)
- df_train_records['split_type'] = 'train'
- df_train_records.head()
-
+ df_train_records["split_type"] = "train"
+ df_train_records.head()
+
df_validation_records = pd.DataFrame.from_dict(validation_records)
- df_validation_records['split_type'] = 'validation'
- df_validation_records.head()
+ df_validation_records["split_type"] = "validation"
+ df_validation_records.head()
df_test_records = pd.DataFrame.from_dict(test_records)
- df_test_records['split_type'] = 'test'
- df_test_records.head()
-
- # Add record to feature store
+ df_test_records["split_type"] = "test"
+ df_test_records.head()
+
+ # Add record to feature store
df_fs_train_records = cast_object_to_string(df_train_records)
df_fs_validation_records = cast_object_to_string(df_validation_records)
df_fs_test_records = cast_object_to_string(df_test_records)
- print('Ingesting Features...')
- feature_group.ingest(
- data_frame=df_fs_train_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_validation_records, max_workers=3, wait=True
- )
- feature_group.ingest(
- data_frame=df_fs_test_records, max_workers=3, wait=True
- )
- print('Feature ingest completed.')
+ print("Ingesting Features...")
+ feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True)
+ feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True)
+ print("Feature ingest completed.")
def process(args):
- print('Current host: {}'.format(args.current_host))
-
- feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
+ print("Current host: {}".format(args.current_host))
+
+ feature_group = create_or_load_feature_group(
+ prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name
+ )
feature_group.describe()
-
+
print(feature_group.as_hive_ddl())
-
- train_data = '{}/bert/train'.format(args.output_data)
- validation_data = '{}/bert/validation'.format(args.output_data)
- test_data = '{}/bert/test'.format(args.output_data)
-
- transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord,
- max_seq_length=args.max_seq_length,
- balance_dataset=args.balance_dataset,
- prefix=args.feature_store_offline_prefix,
- feature_group_name=args.feature_group_name)
-
- input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data))
+
+ train_data = "{}/bert/train".format(args.output_data)
+ validation_data = "{}/bert/validation".format(args.output_data)
+ test_data = "{}/bert/test".format(args.output_data)
+
+ transform_tsv_to_tfrecord = functools.partial(
+ _transform_tsv_to_tfrecord,
+ max_seq_length=args.max_seq_length,
+ balance_dataset=args.balance_dataset,
+ prefix=args.feature_store_offline_prefix,
+ feature_group_name=args.feature_group_name,
+ )
+
+ input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
num_cpus = multiprocessing.cpu_count()
- print('num_cpus {}'.format(num_cpus))
+ print("num_cpus {}".format(num_cpus))
p = multiprocessing.Pool(num_cpus)
p.map(transform_tsv_to_tfrecord, input_files)
- print('Listing contents of {}'.format(args.output_data))
+ print("Listing contents of {}".format(args.output_data))
dirs_output = os.listdir(args.output_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(train_data))
+ print("Listing contents of {}".format(train_data))
dirs_output = os.listdir(train_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(validation_data))
+ print("Listing contents of {}".format(validation_data))
dirs_output = os.listdir(validation_data)
for file in dirs_output:
print(file)
- print('Listing contents of {}'.format(test_data))
+ print("Listing contents of {}".format(test_data))
dirs_output = os.listdir(test_data)
for file in dirs_output:
print(file)
-
+
offline_store_contents = None
- while (offline_store_contents is None):
- objects_in_bucket = s3.list_objects(Bucket=bucket,
- Prefix=args.feature_store_offline_prefix)
- if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
- offline_store_contents = objects_in_bucket['Contents']
+ while offline_store_contents is None:
+ objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix)
+ if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
+ offline_store_contents = objects_in_bucket["Contents"]
else:
- print('Waiting for data in offline store...\n')
+ print("Waiting for data in offline store...\n")
sleep(60)
- print('Data available.')
-
- print('Complete')
-
-
+ print("Data available.")
+
+ print("Complete")
+
+
if __name__ == "__main__":
args = parse_args()
- print('Loaded arguments:')
+ print("Loaded arguments:")
print(args)
-
- print('Environment variables:')
+
+ print("Environment variables:")
print(os.environ)
process(args)
diff --git a/10_pipeline/src/inference.py b/10_pipeline/src/inference.py
index 2975dc2d..53196737 100644
--- a/10_pipeline/src/inference.py
+++ b/10_pipeline/src/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
-max_seq_length=64
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e. 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
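For reference, a hedged sketch of the JSON this input_handler sends to TensorFlow Serving (vectors shortened here; real ones are max_seq_length entries long):

# Illustrative request body for the predict API:
# {
#   "signature_name": "serving_default",
#   "instances": [
#     {"input_ids": [101, 1045, ..., 0], "input_mask": [1, 1, ..., 0]}
#   ]
# }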
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
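To ground the output_handler above, a small worked example of turning one log-probability row into a star rating (numbers invented):

import tensorflow as tf

log_probability = [-1.2, -0.3, 0.1, 1.9, 0.4]  # one row of response_json["predictions"]
softmax = tf.nn.softmax(log_probability)
predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)  # -> 3
print([1, 2, 3, 4, 5][int(predicted_class_idx)])  # predicted star rating: 4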
diff --git a/10_pipeline/src/tf_bert_reviews.py b/10_pipeline/src/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/10_pipeline/src/tf_bert_reviews.py
+++ b/10_pipeline/src/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
+from transformers import TFDistilBertForSequenceClassification  # used by load_checkpoint_model() below
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
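+ # Each serialized example carries three fixed-length int64 vectors (token
+ # ids, attention mask, segment ids) plus a scalar label; _decode_record()
+ # below parses one record against this schema.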
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
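+ # Note: tf.data.experimental.map_and_batch is deprecated in TF 2.x; an
+ # equivalent unfused form (shown for reference only) would be:
+ #   dataset = dataset.map(lambda r: _decode_record(r, name_to_features),
+ #                         num_parallel_calls=tf.data.experimental.AUTOTUNE)
+ #   dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)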
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
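+ # Checkpoints are written as tf_model_{epoch:05d}.h5 (see ModelCheckpoint
+ # below), so the zero-padded names let the lexicographic max() above select
+ # the latest epoch, and the suffix parses cleanly back to an int.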
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
- print('loaded_model {}'.format(loaded_model))
- print('initial_epoch_number {}'.format(initial_epoch_number))
-
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+ print("loaded_model {}".format(loaded_model))
+ print("initial_epoch_number {}".format(initial_epoch_number))
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to a possible incompatibility with the transformers library
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES),
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
- config=config)
-
- input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
- input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32')
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained(
+ "distilbert-base-uncased",
+ num_labels=len(CLASSES),
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
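+ # id2label/label2id translate between the 0-indexed softmax positions and
+ # the 1-5 star ratings in CLASSES, so argmax indices decode directly to
+ # star ratings at prediction time.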
+
+ transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config)
+
+ input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
+ input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
- X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
+ X = tf.keras.layers.Bidirectional(
+ tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
+ )(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
- X = tf.keras.layers.Dense(50, activation='relu')(X)
+ X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
- X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)
+ X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X)
- model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)
+ model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)
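+ # Architecture recap: DistilBERT token embeddings -> BiLSTM(50) ->
+ # GlobalMaxPool1D -> Dense(50, relu) -> Dropout(0.2) -> Dense(5, sigmoid),
+ # one output per star-rating class.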
for layer in model.layers[:3]:
layer.trainable = not freeze_bert_layer
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
-# model.layers[0].trainable = not freeze_bert_layer
+ print("Compiled model {}".format(model))
+ # model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
# Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
transformer_model.save_pretrained(transformer_fine_tuned_model_path)
- print('Model inputs after save_pretrained: {}'.format(model.inputs))
-
+ print("Model inputs after save_pretrained: {}".format(model.inputs))
+
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path,
- include_optimizer=False,
- overwrite=True,
- save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf")
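+ # Exporting under .../tensorflow/saved_model/0 lets TensorFlow Serving pick
+ # up the export as version 0 of the model behind the endpoint.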
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(os.path.join(inference_path, "*")))  # list the copied files; glob on the bare dir returns only the dir itself
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
# Copy test data for the evaluation step
- os.system('cp -R ./test_data/ {}'.format(local_model_dir))
-
+ os.system("cp -R ./test_data/ {}".format(local_model_dir))
+
if run_sample_predictions:
+
def predict(text):
- encode_plus_tokens = tokenizer.encode_plus(text,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True,
- return_tensors='tf')
+ encode_plus_tokens = tokenizer.encode_plus(
+ text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf"
+ )
# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
- input_ids = encode_plus_tokens['input_ids']
+ input_ids = encode_plus_tokens["input_ids"]
- # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
- input_mask = encode_plus_tokens['attention_mask']
+ # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.
+ input_mask = encode_plus_tokens["attention_mask"]
outputs = model.predict(x=(input_ids, input_mask))
@@ -561,59 +499,73 @@ def predict(text):
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]
- return prediction[0]['label']
- print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone."""))
-
+ return prediction[0]["label"]
+
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ predict("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", predict("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ predict("""Really bad. I hope they don't make this anymore."""),
+ )
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
- y_test = df_test_reviews['review_body'].map(predict)
+
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- accuracy = accuracy_score(y_true=y_test, y_pred=y_actual)
- print('Test accuracy: ', accuracy)
-
+
+ accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+ print("Test accuracy: ", accuracy)
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
@@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
-
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
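+ # Anything written under SM_MODEL_DIR (including metrics/confusion_matrix.png)
+ # is packaged into the model.tar.gz artifact that SageMaker uploads to S3.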
+
report_dict = {
"metrics": {
"accuracy": {
diff --git a/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
index ad70633f..1df15a8a 100644
--- a/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
+++ b/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb
@@ -17,12 +17,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -49,11 +49,11 @@
"source": [
"try:\n",
" step_functions_pipeline_endpoint_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -80,8 +80,8 @@
"metadata": {},
"outputs": [],
"source": [
- "client = boto3.client('sagemaker')\n",
- "waiter = client.get_waiter('endpoint_in_service')\n",
+ "client = boto3.client(\"sagemaker\")\n",
+ "waiter = client.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=step_functions_pipeline_endpoint_name)"
]
},
@@ -101,10 +101,12 @@
"import json\n",
"from sagemaker.tensorflow.model import TensorFlowPredictor\n",
"\n",
- "predictor = TensorFlowPredictor(endpoint_name=step_functions_pipeline_endpoint_name,\n",
- " sagemaker_session=sess,\n",
- " model_name='saved_model',\n",
- " model_version=0)"
+ "predictor = TensorFlowPredictor(\n",
+ " endpoint_name=step_functions_pipeline_endpoint_name,\n",
+ " sagemaker_session=sess,\n",
+ " model_name=\"saved_model\",\n",
+ " model_version=0,\n",
+ ")"
]
},
{
@@ -118,7 +120,7 @@
"predicted_classes = predictor.predict(reviews)\n",
"\n",
"for predicted_class, review in zip(predicted_classes, reviews):\n",
- " print('[Predicted Star Rating: {}]'.format(predicted_class), review)"
+ " print(\"[Predicted Star Rating: {}]\".format(predicted_class), review)"
]
},
{
@@ -137,9 +139,7 @@
},
"outputs": [],
"source": [
- "sm.delete_endpoint(\n",
- " EndpointName=step_functions_pipeline_endpoint_name\n",
- ")"
+ "sm.delete_endpoint(EndpointName=step_functions_pipeline_endpoint_name)"
]
},
{
@@ -149,7 +149,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
diff --git a/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb b/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb
index 669539be..f396a696 100644
--- a/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb
+++ b/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb
@@ -38,13 +38,13 @@
"import json\n",
"from botocore.exceptions import ClientError\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "account_id = boto3.client('sts').get_caller_identity().get('Account')"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")"
]
},
{
@@ -71,11 +71,11 @@
"source": [
"try:\n",
" stepfunction_arn\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -104,11 +104,11 @@
"source": [
"try:\n",
" stepfunction_name\n",
- " print('[OK]') \n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -145,7 +145,7 @@
"metadata": {},
"outputs": [],
"source": [
- "watched_bucket = 'dsoaws-test-upload-{}'.format(account_id)\n",
+ "watched_bucket = \"dsoaws-test-upload-{}\".format(account_id)\n",
"print(watched_bucket)"
]
},
@@ -173,7 +173,7 @@
"metadata": {},
"outputs": [],
"source": [
- "cloudtrail_bucket = 'cloudtrail-dsoaws-{}'.format(account_id)\n",
+ "cloudtrail_bucket = \"cloudtrail-dsoaws-{}\".format(account_id)\n",
"print(cloudtrail_bucket)"
]
},
@@ -214,44 +214,30 @@
" {\n",
" \"Sid\": \"AWSCloudTrailAclCheck20150319\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"cloudtrail.amazonaws.com\"\n",
- " },\n",
+ " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n",
" \"Action\": \"s3:GetBucketAcl\",\n",
- " \"Resource\": \"arn:aws:s3:::{}\".format(cloudtrail_bucket)\n",
+ " \"Resource\": \"arn:aws:s3:::{}\".format(cloudtrail_bucket),\n",
" },\n",
" {\n",
" \"Sid\": \"AWSCloudTrailWrite20150319\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"cloudtrail.amazonaws.com\"\n",
- " },\n",
+ " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n",
" \"Action\": \"s3:PutObject\",\n",
" \"Resource\": \"arn:aws:s3:::{}/AWSLogs/{}/*\".format(cloudtrail_bucket, account_id),\n",
- " \"Condition\": {\n",
- " \"StringEquals\": {\n",
- " \"s3:x-amz-acl\": \"bucket-owner-full-control\"\n",
- " }\n",
- " }\n",
+ " \"Condition\": {\"StringEquals\": {\"s3:x-amz-acl\": \"bucket-owner-full-control\"}},\n",
" },\n",
" {\n",
" \"Sid\": \"AWSCloudTrailHTTPSOnly20180329\",\n",
" \"Effect\": \"Deny\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"cloudtrail.amazonaws.com\"\n",
- " },\n",
+ " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n",
" \"Action\": \"s3:*\",\n",
" \"Resource\": [\n",
" \"arn:aws:s3:::{}/AWSLogs/{}/*\".format(cloudtrail_bucket, account_id),\n",
- " \"arn:aws:s3:::{}\".format(cloudtrail_bucket)\n",
+ " \"arn:aws:s3:::{}\".format(cloudtrail_bucket),\n",
" ],\n",
- " \"Condition\": {\n",
- " \"Bool\": {\n",
- " \"aws:SecureTransport\": \"false\"\n",
- " }\n",
- " }\n",
- " }\n",
- " ]\n",
+ " \"Condition\": {\"Bool\": {\"aws:SecureTransport\": \"false\"}},\n",
+ " },\n",
+ " ],\n",
"}\n",
"\n",
"print(policy)"
@@ -272,7 +258,7 @@
"metadata": {},
"outputs": [],
"source": [
- "with open(\"policy.json\", \"w\") as outfile: \n",
+ "with open(\"policy.json\", \"w\") as outfile:\n",
" json.dump(policy, outfile)"
]
},
@@ -307,8 +293,8 @@
"metadata": {},
"outputs": [],
"source": [
- "cloudtrail = boto3.client('cloudtrail')\n",
- "s3 = boto3.client('s3')"
+ "cloudtrail = boto3.client(\"cloudtrail\")\n",
+ "s3 = boto3.client(\"s3\")"
]
},
{
@@ -336,23 +322,23 @@
"outputs": [],
"source": [
"try:\n",
- " t = cloudtrail.create_trail(Name='dsoaws', S3BucketName=cloudtrail_bucket, IsMultiRegionTrail=True)\n",
- " trail_name = t['Name']\n",
- " trail_arn = t['TrailARN']\n",
+ " t = cloudtrail.create_trail(Name=\"dsoaws\", S3BucketName=cloudtrail_bucket, IsMultiRegionTrail=True)\n",
+ " trail_name = t[\"Name\"]\n",
+ " trail_arn = t[\"TrailARN\"]\n",
" cloudtrail.start_logging(Name=trail_arn)\n",
" print(\"Cloud Trail created. Started logging.\")\n",
- " print('--------------------------------------')\n",
- " print('New Trail name: {}'.format(trail_name))\n",
- " print('New Trail arn: {}'.format(trail_arn))\n",
+ " print(\"--------------------------------------\")\n",
+ " print(\"New Trail name: {}\".format(trail_name))\n",
+ " print(\"New Trail arn: {}\".format(trail_arn))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'TrailAlreadyExistsException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"TrailAlreadyExistsException\":\n",
" print(\"Trail already exists. This is OK.\")\n",
- " print('------------------')\n",
- " t = cloudtrail.get_trail(Name='dsoaws')\n",
- " trail_name = t['Trail']['Name']\n",
- " trail_arn = t['Trail']['TrailARN']\n",
- " print('Trail name: {}'.format(trail_name))\n",
- " print('Trail arn: {}'.format(trail_arn))\n",
+ " print(\"------------------\")\n",
+ " t = cloudtrail.get_trail(Name=\"dsoaws\")\n",
+ " trail_name = t[\"Trail\"][\"Name\"]\n",
+ " trail_arn = t[\"Trail\"][\"TrailARN\"]\n",
+ " print(\"Trail name: {}\".format(trail_name))\n",
+ " print(\"Trail arn: {}\".format(trail_arn))\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)"
]
@@ -370,7 +356,7 @@
"metadata": {},
"outputs": [],
"source": [
- "events = boto3.client('events')"
+ "events = boto3.client(\"events\")"
]
},
{
@@ -379,9 +365,9 @@
"metadata": {},
"outputs": [],
"source": [
- "response = events.describe_event_bus(Name='default')\n",
- "eventbus_arn = response['Arn']\n",
- "print('Bus {}'.format(eventbus_arn))"
+ "response = events.describe_event_bus(Name=\"default\")\n",
+ "eventbus_arn = response[\"Arn\"]\n",
+ "print(\"Bus {}\".format(eventbus_arn))"
]
},
{
@@ -406,7 +392,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!aws cloudtrail get-event-selectors --trail-name $trail_name\n"
+ "!aws cloudtrail get-event-selectors --trail-name $trail_name"
]
},
{
@@ -425,7 +411,11 @@
"metadata": {},
"outputs": [],
"source": [
- "event_selector = '\\'[{ \"ReadWriteType\": \"WriteOnly\", \"IncludeManagementEvents\":true, \"DataResources\": [{ \"Type\": \"AWS::S3::Object\", \"Values\": [\"' + watched_bucket_arn + '\"] }] }]\\''\n"
+ "event_selector = (\n",
+ " '\\'[{ \"ReadWriteType\": \"WriteOnly\", \"IncludeManagementEvents\":true, \"DataResources\": [{ \"Type\": \"AWS::S3::Object\", \"Values\": [\"'\n",
+ " + watched_bucket_arn\n",
+ " + \"\\\"] }] }]'\"\n",
+ ")"
]
},
{
@@ -460,27 +450,13 @@
"outputs": [],
"source": [
"pattern = {\n",
- " \"source\": [\n",
- " \"aws.s3\"\n",
- " ],\n",
- " \"detail-type\": [\n",
- " \"AWS API Call via CloudTrail\"\n",
- " ],\n",
- " \"detail\": {\n",
- " \"eventSource\": [\n",
- " \"s3.amazonaws.com\"\n",
- " ],\n",
- " \"eventName\": [\n",
- " \"PutObject\",\n",
- " \"CompleteMultipartUpload\",\n",
- " \"CopyObject\"\n",
- " ],\n",
- " \"requestParameters\": {\n",
- " \"bucketName\": [\n",
- " \"{}\".format(watched_bucket)\n",
- " ]\n",
- " }\n",
- " }\n",
+ " \"source\": [\"aws.s3\"],\n",
+ " \"detail-type\": [\"AWS API Call via CloudTrail\"],\n",
+ " \"detail\": {\n",
+ " \"eventSource\": [\"s3.amazonaws.com\"],\n",
+ " \"eventName\": [\"PutObject\", \"CompleteMultipartUpload\", \"CopyObject\"],\n",
+ " \"requestParameters\": {\"bucketName\": [\"{}\".format(watched_bucket)]},\n",
+ " },\n",
"}\n",
"\n",
"pattern_json = json.dumps(pattern)\n",
@@ -494,11 +470,11 @@
"outputs": [],
"source": [
"response = events.put_rule(\n",
- " Name='S3-Trigger',\n",
+ " Name=\"S3-Trigger\",\n",
" EventPattern=pattern_json,\n",
- " State='ENABLED',\n",
- " Description='Triggers an event on S3 PUT',\n",
- " EventBusName='default'\n",
+ " State=\"ENABLED\",\n",
+ " Description=\"Triggers an event on S3 PUT\",\n",
+ " EventBusName=\"default\",\n",
")\n",
"print(response)"
]
@@ -509,7 +485,7 @@
"metadata": {},
"outputs": [],
"source": [
- "rule_arn = response['RuleArn']\n",
+ "rule_arn = response[\"RuleArn\"]\n",
"print(rule_arn)"
]
},
@@ -533,7 +509,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam = boto3.client('iam')"
+ "iam = boto3.client(\"iam\")"
]
},
{
@@ -542,7 +518,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_name_eventbridge = 'DSOAWS_EventBridge_Invoke_StepFunctions'"
+ "iam_role_name_eventbridge = \"DSOAWS_EventBridge_Invoke_StepFunctions\""
]
},
{
@@ -559,16 +535,8 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"events.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [{\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"events.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}],\n",
"}"
]
},
@@ -582,10 +550,10 @@
" iam_role_eventbridge = iam.create_role(\n",
" RoleName=iam_role_name_eventbridge,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS EventBridge Role'\n",
+ " Description=\"DSOAWS EventBridge Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)"
@@ -605,7 +573,7 @@
"outputs": [],
"source": [
"role_eventbridge = iam.get_role(RoleName=iam_role_name_eventbridge)\n",
- "iam_role_eventbridge_arn = role_eventbridge['Role']['Arn']\n",
+ "iam_role_eventbridge_arn = role_eventbridge[\"Role\"][\"Arn\"]\n",
"print(iam_role_eventbridge_arn)"
]
},
@@ -624,14 +592,7 @@
"source": [
"eventbridge_sfn_policy = {\n",
" \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Sid\": \"VisualEditor0\",\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": \"states:StartExecution\",\n",
- " \"Resource\": \"*\"\n",
- " }\n",
- " ]\n",
+ " \"Statement\": [{\"Sid\": \"VisualEditor0\", \"Effect\": \"Allow\", \"Action\": \"states:StartExecution\", \"Resource\": \"*\"}],\n",
"}\n",
"\n",
"print(eventbridge_sfn_policy)"
@@ -652,18 +613,16 @@
"source": [
"try:\n",
" policy_eventbridge_sfn = iam.create_policy(\n",
- " PolicyName='DSOAWS_EventBridgeInvokeStepFunction',\n",
- " PolicyDocument=json.dumps(eventbridge_sfn_policy)\n",
+ " PolicyName=\"DSOAWS_EventBridgeInvokeStepFunction\", PolicyDocument=json.dumps(eventbridge_sfn_policy)\n",
" )\n",
" print(\"Done.\")\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" print(\"Policy already exists\")\n",
- " policy_eventbridge_sfn_arn = f'arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction'\n",
+ " policy_eventbridge_sfn_arn = f\"arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction\"\n",
" iam.create_policy_version(\n",
- " PolicyArn=policy_eventbridge_sfn_arn,\n",
- " PolicyDocument=json.dumps(eventbridge_sfn_policy),\n",
- " SetAsDefault=True)\n",
+ " PolicyArn=policy_eventbridge_sfn_arn, PolicyDocument=json.dumps(eventbridge_sfn_policy), SetAsDefault=True\n",
+ " )\n",
" print(\"Policy updated.\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)"
@@ -682,7 +641,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_eventbridge_sfn_arn = f'arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction'\n",
+ "policy_eventbridge_sfn_arn = f\"arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction\"\n",
"print(policy_eventbridge_sfn_arn)"
]
},
@@ -700,13 +659,10 @@
"outputs": [],
"source": [
"try:\n",
- " response = iam.attach_role_policy(\n",
- " PolicyArn=policy_eventbridge_sfn_arn,\n",
- " RoleName=iam_role_name_eventbridge\n",
- " )\n",
+ " response = iam.attach_role_policy(PolicyArn=policy_eventbridge_sfn_arn, RoleName=iam_role_name_eventbridge)\n",
" print(\"Done.\")\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" print(\"Policy is already attached. This is ok.\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)"
@@ -725,7 +681,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sfn = boto3.client('stepfunctions')"
+ "sfn = boto3.client(\"stepfunctions\")"
]
},
{
@@ -742,9 +698,10 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())\n",
"\n",
- "execution_name = 'run-{}'.format(timestamp)\n",
+ "execution_name = \"run-{}\".format(timestamp)\n",
"print(execution_name)"
]
},
@@ -762,7 +719,7 @@
"metadata": {},
"outputs": [],
"source": [
- "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n",
+ "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n",
"print(raw_input_data_s3_uri)"
]
},
@@ -779,13 +736,13 @@
"metadata": {},
"outputs": [],
"source": [
- "max_seq_length=64\n",
- "train_split_percentage=0.90\n",
- "validation_split_percentage=0.05\n",
- "test_split_percentage=0.05\n",
- "balance_dataset=True\n",
- "processing_instance_count=2\n",
- "processing_instance_type='ml.c5.2xlarge'"
+ "max_seq_length = 64\n",
+ "train_split_percentage = 0.90\n",
+ "validation_split_percentage = 0.05\n",
+ "test_split_percentage = 0.05\n",
+ "balance_dataset = True\n",
+ "processing_instance_count = 2\n",
+ "processing_instance_type = \"ml.c5.2xlarge\""
]
},
{
@@ -801,31 +758,31 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=False\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True\n",
- "deploy_instance_count=1\n",
- "#deploy_instance_type='ml.m5.4xlarge'\n",
- "deploy_instance_type='ml.m5.large'"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = False\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True\n",
+ "deploy_instance_count = 1\n",
+ "# deploy_instance_type='ml.m5.4xlarge'\n",
+ "deploy_instance_type = \"ml.m5.large\""
]
},
{
@@ -866,10 +823,10 @@
"metadata": {},
"outputs": [],
"source": [
- "# You find the regional AWS ECR account IDs storing the docker images here: \n",
+ "# You find the regional AWS ECR account IDs storing the docker images here:\n",
"# https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-docker-containers-frameworks.html\n",
- "account_id_scikit_learn_image_us_east_1 = '683313688378'\n",
- "account_id_scikit_learn_image_us_west_2 = '246618743249'"
+ "account_id_scikit_learn_image_us_east_1 = \"683313688378\"\n",
+ "account_id_scikit_learn_image_us_west_2 = \"246618743249\""
]
},
{
@@ -878,13 +835,13 @@
"metadata": {},
"outputs": [],
"source": [
- "account_id_scikit_learn_image = ''\n",
- "if region == 'us-east-1':\n",
+ "account_id_scikit_learn_image = \"\"\n",
+ "if region == \"us-east-1\":\n",
" account_id_scikit_learn_image = account_id_scikit_learn_image_us_east_1\n",
- "elif region == 'us-west-2':\n",
+ "elif region == \"us-west-2\":\n",
" account_id_scikit_learn_image = account_id_scikit_learn_image_us_west_2\n",
"else:\n",
- " print('Please look up the correct AWS ECR Account ID per Link above.')"
+ " print(\"Please look up the correct AWS ECR Account ID per Link above.\")"
]
},
{
@@ -903,205 +860,204 @@
"outputs": [],
"source": [
"inputs = {\n",
- " \"Processing Job\": {\n",
- " \"ProcessingJobName\": \"training-pipeline-{}\".format(execution_name), \n",
- " \"ProcessingInputs\": [\n",
- " {\n",
- " \"InputName\": \"raw_input\",\n",
- " \"S3Input\": {\n",
- "# TODO: Change to watched_bucket + watched_s3_prefix \n",
- "# \"S3Uri\": \"s3://{}/{}/\".format(watched_bucket, watched_s3_prefix),\n",
- " \"S3Uri\": \"{}\".format(raw_input_data_s3_uri), \n",
- " \"LocalPath\": \"/opt/ml/processing/input/data/\",\n",
- " \"S3DataType\": \"S3Prefix\",\n",
- " \"S3InputMode\": \"File\",\n",
- " \"S3DataDistributionType\": \"ShardedByS3Key\",\n",
- " \"S3CompressionType\": \"None\"\n",
- " }\n",
- " },\n",
- " {\n",
- " \"InputName\": \"code\",\n",
- " \"S3Input\": {\n",
- " \"S3Uri\": \"s3://{}/{}/preprocess-scikit-text-to-bert.py\".format(bucket, processing_code_s3_prefix),\n",
- " \"LocalPath\": \"/opt/ml/processing/input/code\",\n",
- " \"S3DataType\": \"S3Prefix\",\n",
- " \"S3InputMode\": \"File\",\n",
- " \"S3DataDistributionType\": \"FullyReplicated\",\n",
- " \"S3CompressionType\": \"None\"\n",
- " }\n",
- " }\n",
- " ],\n",
- " \"ProcessingOutputConfig\": {\n",
- " \"Outputs\": [\n",
- " {\n",
- " \"OutputName\": \"bert-train\",\n",
- " \"S3Output\": {\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n",
- " \"LocalPath\": \"/opt/ml/processing/output/bert/train\",\n",
- " \"S3UploadMode\": \"EndOfJob\"\n",
- " }\n",
+ " \"Processing Job\": {\n",
+ " \"ProcessingJobName\": \"training-pipeline-{}\".format(execution_name),\n",
+ " \"ProcessingInputs\": [\n",
+ " {\n",
+ " \"InputName\": \"raw_input\",\n",
+ " \"S3Input\": {\n",
+ " # TODO: Change to watched_bucket + watched_s3_prefix\n",
+ " # \"S3Uri\": \"s3://{}/{}/\".format(watched_bucket, watched_s3_prefix),\n",
+ " \"S3Uri\": \"{}\".format(raw_input_data_s3_uri),\n",
+ " \"LocalPath\": \"/opt/ml/processing/input/data/\",\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3InputMode\": \"File\",\n",
+ " \"S3DataDistributionType\": \"ShardedByS3Key\",\n",
+ " \"S3CompressionType\": \"None\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"InputName\": \"code\",\n",
+ " \"S3Input\": {\n",
+ " \"S3Uri\": \"s3://{}/{}/preprocess-scikit-text-to-bert.py\".format(bucket, processing_code_s3_prefix),\n",
+ " \"LocalPath\": \"/opt/ml/processing/input/code\",\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3InputMode\": \"File\",\n",
+ " \"S3DataDistributionType\": \"FullyReplicated\",\n",
+ " \"S3CompressionType\": \"None\",\n",
+ " },\n",
+ " },\n",
+ " ],\n",
+ " \"ProcessingOutputConfig\": {\n",
+ " \"Outputs\": [\n",
+ " {\n",
+ " \"OutputName\": \"bert-train\",\n",
+ " \"S3Output\": {\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n",
+ " \"LocalPath\": \"/opt/ml/processing/output/bert/train\",\n",
+ " \"S3UploadMode\": \"EndOfJob\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"OutputName\": \"bert-validation\",\n",
+ " \"S3Output\": {\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n",
+ " \"LocalPath\": \"/opt/ml/processing/output/bert/validation\",\n",
+ " \"S3UploadMode\": \"EndOfJob\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"OutputName\": \"bert-test\",\n",
+ " \"S3Output\": {\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n",
+ " \"LocalPath\": \"/opt/ml/processing/output/bert/test\",\n",
+ " \"S3UploadMode\": \"EndOfJob\",\n",
+ " },\n",
+ " },\n",
+ " ]\n",
" },\n",
- " {\n",
- " \"OutputName\": \"bert-validation\",\n",
- " \"S3Output\": {\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n",
- " \"LocalPath\": \"/opt/ml/processing/output/bert/validation\",\n",
- " \"S3UploadMode\": \"EndOfJob\"\n",
- " }\n",
+ " \"AppSpecification\": {\n",
+ " \"ImageUri\": \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\".format(\n",
+ " account_id_scikit_learn_image, region\n",
+ " ),\n",
+ " \"ContainerArguments\": [\n",
+ " \"--train-split-percentage\",\n",
+ " \"{}\".format(train_split_percentage),\n",
+ " \"--validation-split-percentage\",\n",
+ " \"{}\".format(validation_split_percentage),\n",
+ " \"--test-split-percentage\",\n",
+ " \"{}\".format(test_split_percentage),\n",
+ " \"--max-seq-length\",\n",
+ " \"{}\".format(max_seq_length),\n",
+ " \"--balance-dataset\",\n",
+ " \"{}\".format(balance_dataset),\n",
+ " ],\n",
+ " \"ContainerEntrypoint\": [\"python3\", \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py\"],\n",
" },\n",
- " {\n",
- " \"OutputName\": \"bert-test\",\n",
- " \"S3Output\": {\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n",
- " \"LocalPath\": \"/opt/ml/processing/output/bert/test\",\n",
- " \"S3UploadMode\": \"EndOfJob\"\n",
- " }\n",
- " }\n",
- " ]\n",
- " },\n",
- " \"AppSpecification\": {\n",
- " \"ImageUri\": \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\".format(account_id_scikit_learn_image, region),\n",
- " \"ContainerArguments\": [\n",
- " \"--train-split-percentage\",\n",
- " \"{}\".format(train_split_percentage),\n",
- " \"--validation-split-percentage\",\n",
- " \"{}\".format(validation_split_percentage),\n",
- " \"--test-split-percentage\",\n",
- " \"{}\".format(test_split_percentage),\n",
- " \"--max-seq-length\",\n",
- " \"{}\".format(max_seq_length),\n",
- " \"--balance-dataset\",\n",
- " \"{}\".format(balance_dataset)\n",
- " ],\n",
- " \"ContainerEntrypoint\": [\n",
- " \"python3\",\n",
- " \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py\"\n",
- " ]\n",
- " },\n",
- " \"RoleArn\": \"{}\".format(role),\n",
- " \"ProcessingResources\": {\n",
- " \"ClusterConfig\": {\n",
- " \"InstanceCount\": processing_instance_count,\n",
- " \"InstanceType\": \"{}\".format(processing_instance_type),\n",
- " \"VolumeSizeInGB\": 30\n",
- " }\n",
- " },\n",
- " \"StoppingCondition\": {\n",
- " \"MaxRuntimeInSeconds\": 7200\n",
- " }\n",
- " }, \n",
- " \"Training\": {\n",
- " \"AlgorithmSpecification\": {\n",
- " \"TrainingImage\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04\".format(region),\n",
- " \"TrainingInputMode\": \"{}\".format(input_mode)\n",
- " },\n",
- " \"OutputDataConfig\": {\n",
- " \"S3OutputPath\": \"s3://{}/training-pipeline-{}/models\".format(bucket, execution_name)\n",
- " },\n",
- " \"StoppingCondition\": {\n",
- " \"MaxRuntimeInSeconds\": 7200\n",
- " },\n",
- " \"ResourceConfig\": {\n",
- " \"InstanceCount\": train_instance_count,\n",
- " \"InstanceType\": \"{}\".format(train_instance_type),\n",
- " \"VolumeSizeInGB\": train_volume_size\n",
+ " \"RoleArn\": \"{}\".format(role),\n",
+ " \"ProcessingResources\": {\n",
+ " \"ClusterConfig\": {\n",
+ " \"InstanceCount\": processing_instance_count,\n",
+ " \"InstanceType\": \"{}\".format(processing_instance_type),\n",
+ " \"VolumeSizeInGB\": 30,\n",
+ " }\n",
+ " },\n",
+ " \"StoppingCondition\": {\"MaxRuntimeInSeconds\": 7200},\n",
" },\n",
- " \"RoleArn\": \"{}\".format(role),\n",
- " \"InputDataConfig\": [\n",
- " {\n",
- " \"DataSource\": {\n",
- " \"S3DataSource\": {\n",
- " \"S3DataType\": \"S3Prefix\",\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n",
- " \"S3DataDistributionType\": \"ShardedByS3Key\"\n",
- " }\n",
+ " \"Training\": {\n",
+ " \"AlgorithmSpecification\": {\n",
+ " \"TrainingImage\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04\".format(\n",
+ " region\n",
+ " ),\n",
+ " \"TrainingInputMode\": \"{}\".format(input_mode),\n",
" },\n",
- " \"ChannelName\": \"train\"\n",
- " },\n",
- " {\n",
- " \"DataSource\": {\n",
- " \"S3DataSource\": {\n",
- " \"S3DataType\": \"S3Prefix\",\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n",
- " \"S3DataDistributionType\": \"ShardedByS3Key\"\n",
- " }\n",
+ " \"OutputDataConfig\": {\"S3OutputPath\": \"s3://{}/training-pipeline-{}/models\".format(bucket, execution_name)},\n",
+ " \"StoppingCondition\": {\"MaxRuntimeInSeconds\": 7200},\n",
+ " \"ResourceConfig\": {\n",
+ " \"InstanceCount\": train_instance_count,\n",
+ " \"InstanceType\": \"{}\".format(train_instance_type),\n",
+ " \"VolumeSizeInGB\": train_volume_size,\n",
" },\n",
- " \"ChannelName\": \"validation\"\n",
- " },\n",
- " {\n",
- " \"DataSource\": {\n",
- " \"S3DataSource\": {\n",
- " \"S3DataType\": \"S3Prefix\",\n",
- " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n",
- " \"S3DataDistributionType\": \"ShardedByS3Key\"\n",
- " }\n",
+ " \"RoleArn\": \"{}\".format(role),\n",
+ " \"InputDataConfig\": [\n",
+ " {\n",
+ " \"DataSource\": {\n",
+ " \"S3DataSource\": {\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n",
+ " \"S3DataDistributionType\": \"ShardedByS3Key\",\n",
+ " }\n",
+ " },\n",
+ " \"ChannelName\": \"train\",\n",
+ " },\n",
+ " {\n",
+ " \"DataSource\": {\n",
+ " \"S3DataSource\": {\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n",
+ " \"S3DataDistributionType\": \"ShardedByS3Key\",\n",
+ " }\n",
+ " },\n",
+ " \"ChannelName\": \"validation\",\n",
+ " },\n",
+ " {\n",
+ " \"DataSource\": {\n",
+ " \"S3DataSource\": {\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n",
+ " \"S3DataDistributionType\": \"ShardedByS3Key\",\n",
+ " }\n",
+ " },\n",
+ " \"ChannelName\": \"test\",\n",
+ " },\n",
+ " ],\n",
+ " \"HyperParameters\": {\n",
+ " \"epochs\": \"{}\".format(epochs),\n",
+ " \"learning_rate\": \"{}\".format(learning_rate),\n",
+ " \"epsilon\": \"{}\".format(epsilon),\n",
+ " \"train_batch_size\": \"{}\".format(train_batch_size),\n",
+ " \"validation_batch_size\": \"{}\".format(validation_batch_size),\n",
+ " \"test_batch_size\": \"{}\".format(test_batch_size),\n",
+ " \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n",
+ " \"validation_steps\": \"{}\".format(validation_steps),\n",
+ " \"test_steps\": \"{}\".format(test_steps),\n",
+ " \"use_xla\": \"{}\".format(str(use_xla).lower()),\n",
+ " \"use_amp\": \"{}\".format(str(use_amp).lower()),\n",
+ " \"max_seq_length\": \"{}\".format(max_seq_length),\n",
+ " \"freeze_bert_layer\": \"{}\".format(str(freeze_bert_layer).lower()),\n",
+ " \"enable_sagemaker_debugger\": \"{}\".format(str(enable_sagemaker_debugger).lower()),\n",
+ " \"enable_checkpointing\": \"{}\".format(str(enable_checkpointing).lower()),\n",
+ " \"enable_tensorboard\": \"{}\".format(str(enable_tensorboard).lower()),\n",
+ " \"run_validation\": \"{}\".format(str(run_validation).lower()),\n",
+ " \"run_test\": \"{}\".format(str(run_test).lower()),\n",
+ " \"run_sample_predictions\": \"{}\".format(str(run_sample_predictions).lower()),\n",
+ " \"sagemaker_submit_directory\": '\"s3://{}/{}/estimator-source/source/sourcedir.tar.gz\"'.format(\n",
+ " bucket, stepfunction_name\n",
+ " ),\n",
+ " \"sagemaker_program\": '\"tf_bert_reviews.py\"',\n",
+ " \"sagemaker_enable_cloudwatch_metrics\": \"false\",\n",
+ " \"sagemaker_container_log_level\": \"20\",\n",
+ " \"sagemaker_job_name\": '\"training-pipeline-{}/estimator-source\"'.format(execution_name),\n",
+ " \"sagemaker_region\": '\"{}\"'.format(region),\n",
+ " \"model_dir\": '\"s3://{}/training-pipeline-{}/estimator-source/model\"'.format(bucket, execution_name),\n",
" },\n",
- " \"ChannelName\": \"test\"\n",
- " }\n",
- " ],\n",
- " \"HyperParameters\": {\n",
- " \"epochs\": \"{}\".format(epochs),\n",
- " \"learning_rate\": \"{}\".format(learning_rate),\n",
- " \"epsilon\": \"{}\".format(epsilon),\n",
- " \"train_batch_size\": \"{}\".format(train_batch_size),\n",
- " \"validation_batch_size\": \"{}\".format(validation_batch_size),\n",
- " \"test_batch_size\": \"{}\".format(test_batch_size),\n",
- " \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n",
- " \"validation_steps\": \"{}\".format(validation_steps),\n",
- " \"test_steps\": \"{}\".format(test_steps),\n",
- " \"use_xla\": \"{}\".format(str(use_xla).lower()),\n",
- " \"use_amp\": \"{}\".format(str(use_amp).lower()),\n",
- " \"max_seq_length\": \"{}\".format(max_seq_length),\n",
- " \"freeze_bert_layer\": \"{}\".format(str(freeze_bert_layer).lower()),\n",
- " \"enable_sagemaker_debugger\": \"{}\".format(str(enable_sagemaker_debugger).lower()),\n",
- " \"enable_checkpointing\": \"{}\".format(str(enable_checkpointing).lower()),\n",
- " \"enable_tensorboard\": \"{}\".format(str(enable_tensorboard).lower()),\n",
- " \"run_validation\": \"{}\".format(str(run_validation).lower()),\n",
- " \"run_test\": \"{}\".format(str(run_test).lower()),\n",
- " \"run_sample_predictions\": \"{}\".format(str(run_sample_predictions).lower()),\n",
- " \"sagemaker_submit_directory\": \"\\\"s3://{}/{}/estimator-source/source/sourcedir.tar.gz\\\"\".format(bucket, stepfunction_name),\n",
- " \"sagemaker_program\": \"\\\"tf_bert_reviews.py\\\"\",\n",
- " \"sagemaker_enable_cloudwatch_metrics\": \"false\",\n",
- " \"sagemaker_container_log_level\": \"20\",\n",
- " \"sagemaker_job_name\": \"\\\"training-pipeline-{}/estimator-source\\\"\".format(execution_name),\n",
- " \"sagemaker_region\": \"\\\"{}\\\"\".format(region),\n",
- " \"model_dir\": \"\\\"s3://{}/training-pipeline-{}/estimator-source/model\\\"\".format(bucket, execution_name)\n",
- " }, \n",
- " \"TrainingJobName\": \"estimator-training-pipeline-{}\".format(execution_name),\n",
- " \"DebugHookConfig\": {\n",
- " \"S3OutputPath\": \"s3://{}/\".format(bucket)\n",
- " }\n",
- " },\n",
- " \"Create Model\": {\n",
- " \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n",
- " \"PrimaryContainer\": {\n",
- " \"Image\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu-py36-ubuntu18.04\".format(region),\n",
- " \"Environment\": {\n",
- " \"SAGEMAKER_PROGRAM\": \"null\",\n",
- " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"null\",\n",
- " \"SAGEMAKER_ENABLE_CLOUDWATCH_METRICS\": \"false\",\n",
- " \"SAGEMAKER_CONTAINER_LOG_LEVEL\": \"20\",\n",
- " \"SAGEMAKER_REGION\": \"{}\".format(region)\n",
- " },\n",
- " \"ModelDataUrl\": \"s3://{}/training-pipeline-{}/models/estimator-training-pipeline-{}/output/model.tar.gz\".format(bucket, execution_name, execution_name)\n",
+ " \"TrainingJobName\": \"estimator-training-pipeline-{}\".format(execution_name),\n",
+ " \"DebugHookConfig\": {\"S3OutputPath\": \"s3://{}/\".format(bucket)},\n",
" },\n",
- " \"ExecutionRoleArn\": \"{}\".format(role)\n",
- " },\n",
- " \"Configure Endpoint\": {\n",
- " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n",
- " \"ProductionVariants\": [\n",
- " {\n",
- " \"InitialInstanceCount\": deploy_instance_count,\n",
- " \"InstanceType\": \"{}\".format(deploy_instance_type),\n",
+ " \"Create Model\": {\n",
" \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n",
- " \"VariantName\": \"AllTraffic\"\n",
- " }\n",
- " ]\n",
- " },\n",
- " \"Deploy\": {\n",
- " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n",
- " \"EndpointName\": \"training-pipeline-{}\".format(execution_name)\n",
- " }\n",
+ " \"PrimaryContainer\": {\n",
+ " \"Image\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu-py36-ubuntu18.04\".format(\n",
+ " region\n",
+ " ),\n",
+ " \"Environment\": {\n",
+ " \"SAGEMAKER_PROGRAM\": \"null\",\n",
+ " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"null\",\n",
+ " \"SAGEMAKER_ENABLE_CLOUDWATCH_METRICS\": \"false\",\n",
+ " \"SAGEMAKER_CONTAINER_LOG_LEVEL\": \"20\",\n",
+ " \"SAGEMAKER_REGION\": \"{}\".format(region),\n",
+ " },\n",
+ " \"ModelDataUrl\": \"s3://{}/training-pipeline-{}/models/estimator-training-pipeline-{}/output/model.tar.gz\".format(\n",
+ " bucket, execution_name, execution_name\n",
+ " ),\n",
+ " },\n",
+ " \"ExecutionRoleArn\": \"{}\".format(role),\n",
+ " },\n",
+ " \"Configure Endpoint\": {\n",
+ " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n",
+ " \"ProductionVariants\": [\n",
+ " {\n",
+ " \"InitialInstanceCount\": deploy_instance_count,\n",
+ " \"InstanceType\": \"{}\".format(deploy_instance_type),\n",
+ " \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n",
+ " \"VariantName\": \"AllTraffic\",\n",
+ " }\n",
+ " ],\n",
+ " },\n",
+ " \"Deploy\": {\n",
+ " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n",
+ " \"EndpointName\": \"training-pipeline-{}\".format(execution_name),\n",
+ " },\n",
"}"
]
},
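The doubled quotes in values such as '"tf_bert_reviews.py"' are deliberate: once the whole inputs dict is serialized for EventBridge, those hyperparameters must still parse as JSON string literals inside the training container. A minimal sketch of that serialization, assuming inputs_json is the variable the put_targets call below expects:

    import json

    # Step Functions receives its input as a JSON string; the embedded quotes
    # in the sagemaker_* hyperparameters survive this round-trip.
    inputs_json = json.dumps(inputs)
    print(inputs_json[:200])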
@@ -1130,10 +1086,7 @@
"outputs": [],
"source": [
"# Check for exsting targets\n",
- "targets = events.list_targets_by_rule(\n",
- " Rule='S3-Trigger',\n",
- " EventBusName='default'\n",
- ")"
+ "targets = events.list_targets_by_rule(Rule=\"S3-Trigger\", EventBusName=\"default\")"
]
},
{
@@ -1142,18 +1095,13 @@
"metadata": {},
"outputs": [],
"source": [
- "number_targets = len(targets['Targets'])\n",
+ "number_targets = len(targets[\"Targets\"])\n",
"\n",
"if number_targets > 0:\n",
- " for target in targets['Targets']:\n",
- " print(target['Id'])\n",
- " events.remove_targets(\n",
- " Rule='S3-Trigger',\n",
- " EventBusName='default',\n",
- " Ids=[target['Id']],\n",
- " Force=True\n",
- ")\n",
- " print(\"Target: \" +target['Id']+ \" removed.\")\n",
+ " for target in targets[\"Targets\"]:\n",
+ " print(target[\"Id\"])\n",
+ " events.remove_targets(Rule=\"S3-Trigger\", EventBusName=\"default\", Ids=[target[\"Id\"]], Force=True)\n",
+ " print(\"Target: \" + target[\"Id\"] + \" removed.\")\n",
"else:\n",
" print(\"No targets defined yet.\")"
]
@@ -1169,16 +1117,9 @@
"target_id = str(uuid.uuid4())\n",
"\n",
"response = events.put_targets(\n",
- " Rule='S3-Trigger',\n",
- " EventBusName='default',\n",
- " Targets=[\n",
- " {\n",
- " 'Id': target_id,\n",
- " 'Arn': stepfunction_arn,\n",
- " 'RoleArn': iam_role_eventbridge_arn,\n",
- " 'Input': inputs_json\n",
- " }\n",
- " ]\n",
+ " Rule=\"S3-Trigger\",\n",
+ " EventBusName=\"default\",\n",
+ " Targets=[{\"Id\": target_id, \"Arn\": stepfunction_arn, \"RoleArn\": iam_role_eventbridge_arn, \"Input\": inputs_json}],\n",
")"
]
},
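put_targets only attaches the state machine to an already-existing rule named S3-Trigger. A sketch of how such a rule could be created, assuming CloudTrail data events are enabled for the watched bucket (the event pattern below is illustrative, not taken from this notebook):

    import json

    pattern = {
        "source": ["aws.s3"],
        "detail-type": ["AWS API Call via CloudTrail"],
        "detail": {
            "eventSource": ["s3.amazonaws.com"],
            "eventName": ["PutObject", "CompleteMultipartUpload", "CopyObject"],
            "requestParameters": {"bucketName": [watched_bucket]},
        },
    }

    # Creates (or updates) the rule that the put_targets call above points at
    events.put_rule(Name="S3-Trigger", EventPattern=json.dumps(pattern), State="ENABLED", EventBusName="default")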
@@ -1206,7 +1147,7 @@
"source": [
"execution_list_before_uploading = sfn.list_executions(stateMachineArn=stepfunction_arn)\n",
"\n",
- "number_of_executions_before_uploading = len(execution_list_before_uploading['executions'])\n",
+ "number_of_executions_before_uploading = len(execution_list_before_uploading[\"executions\"])\n",
"\n",
"print(number_of_executions_before_uploading)"
]
@@ -1225,6 +1166,7 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"time.sleep(15)"
]
},
@@ -1234,7 +1176,7 @@
"metadata": {},
"outputs": [],
"source": [
- "watched_s3_uri = 's3://{}/watched_input/'.format(watched_bucket)\n",
+ "watched_s3_uri = \"s3://{}/watched_input/\".format(watched_bucket)\n",
"\n",
"print('Uploading training data to \"{}\" to trigger a new training pipeline.'.format(watched_s3_uri))"
]
@@ -1299,7 +1241,7 @@
"metadata": {},
"outputs": [],
"source": [
- "number_of_executions_after_uploading = len(execution_list_after_uploading['executions'])\n",
+ "number_of_executions_after_uploading = len(execution_list_after_uploading[\"executions\"])\n",
"\n",
"print(number_of_executions_after_uploading)"
]
@@ -1310,9 +1252,9 @@
"metadata": {},
"outputs": [],
"source": [
- "current_execution = execution_list_after_uploading['executions'][0]\n",
+ "current_execution = execution_list_after_uploading[\"executions\"][0]\n",
"\n",
- "current_execution_arn = current_execution['executionArn']\n",
+ "current_execution_arn = current_execution[\"executionArn\"]\n",
"\n",
"print(current_execution_arn)"
]
@@ -1325,7 +1267,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Step Functions Pipeline'.format(region, current_execution_arn)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Step Functions Pipeline'.format(\n",
+ " region, current_execution_arn\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -1335,7 +1283,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
diff --git a/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb b/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb
index c8b71577..05477e55 100644
--- a/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb
+++ b/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb
@@ -103,6 +103,7 @@
"source": [
"# Restart the kernel to pick up pip installed libraries\n",
"from IPython.core.display import HTML\n",
+ "\n",
"HTML(\"\")"
]
},
@@ -133,11 +134,19 @@
"import tensorflow_transform.beam as tft_beam\n",
"from tensorflow_transform.beam.tft_beam_io import transform_fn_io\n",
"from tensorflow_transform.saved import saved_transform_io\n",
- "from tensorflow_transform.tf_metadata import (dataset_metadata, dataset_schema,\n",
- " metadata_io, schema_utils)\n",
- "from tfx.components import (Evaluator, ExampleValidator, ImportExampleGen,\n",
- " ModelValidator, Pusher, ResolverNode, SchemaGen,\n",
- " StatisticsGen, Trainer, Transform)\n",
+ "from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema, metadata_io, schema_utils\n",
+ "from tfx.components import (\n",
+ " Evaluator,\n",
+ " ExampleValidator,\n",
+ " ImportExampleGen,\n",
+ " ModelValidator,\n",
+ " Pusher,\n",
+ " ResolverNode,\n",
+ " SchemaGen,\n",
+ " StatisticsGen,\n",
+ " Trainer,\n",
+ " Transform,\n",
+ ")\n",
"from tfx.components.base import executor_spec\n",
"from tfx.components.trainer.executor import GenericExecutor\n",
"from tfx.dsl.experimental import latest_blessed_model_resolver\n",
@@ -150,8 +159,7 @@
"import tensorflow_model_analysis as tfma\n",
"import tensorflow_text as text\n",
"\n",
- "from tfx.orchestration.experimental.interactive.interactive_context import \\\n",
- " InteractiveContext\n",
+ "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\n",
"\n",
"%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip"
]
@@ -189,40 +197,40 @@
"source": [
"def clean_before_download(base_data_dir):\n",
" rmtree(base_data_dir)\n",
- " \n",
+ "\n",
+ "\n",
"def delete_unnecessary_files(base_path):\n",
" os.remove(base_path + \"dataset_info.json\")\n",
" os.remove(base_path + \"label.labels.txt\")\n",
- " \n",
+ "\n",
" counter = 2\n",
" for f in glob.glob(base_path + \"imdb_reviews-unsupervised.*\"):\n",
" os.remove(f)\n",
" counter += 1\n",
" print(f\"Deleted {counter} files\")\n",
"\n",
- "def get_dataset(name='imdb_reviews', version=\"1.0.0\"):\n",
+ "\n",
+ "def get_dataset(name=\"imdb_reviews\", version=\"1.0.0\"):\n",
"\n",
" base_data_dir = \"./content/tfds/\"\n",
- " config=\"plain_text\"\n",
- " version=\"1.0.0\"\n",
+ " config = \"plain_text\"\n",
+ " version = \"1.0.0\"\n",
"\n",
" clean_before_download(base_data_dir)\n",
" tfds.disable_progress_bar()\n",
- " builder = tfds.text.IMDBReviews(data_dir=base_data_dir, \n",
- " config=config, \n",
- " version=version)\n",
- " download_config = tfds.download.DownloadConfig(\n",
- " download_mode=tfds.GenerateMode.FORCE_REDOWNLOAD)\n",
+ " builder = tfds.text.IMDBReviews(data_dir=base_data_dir, config=config, version=version)\n",
+ " download_config = tfds.download.DownloadConfig(download_mode=tfds.GenerateMode.FORCE_REDOWNLOAD)\n",
" builder.download_and_prepare(download_config=download_config)\n",
"\n",
" base_tfrecords_filename = os.path.join(base_data_dir, \"imdb_reviews\", config, version, \"\")\n",
" train_tfrecords_filename = base_tfrecords_filename + \"imdb_reviews-train*\"\n",
" test_tfrecords_filename = base_tfrecords_filename + \"imdb_reviews-test*\"\n",
" label_filename = os.path.join(base_tfrecords_filename, \"label.labels.txt\")\n",
- " labels = [label.rstrip('\\n') for label in open(label_filename)]\n",
+ " labels = [label.rstrip(\"\\n\") for label in open(label_filename)]\n",
" delete_unnecessary_files(base_tfrecords_filename)\n",
" return (train_tfrecords_filename, test_tfrecords_filename), labels\n",
"\n",
+ "\n",
"tfrecords_filenames, labels = get_dataset()"
]
},
@@ -257,12 +265,11 @@
"\n",
"BERT_TFHUB_URL = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\n",
"\n",
+ "\n",
"def load_bert_layer(model_url=BERT_TFHUB_URL):\n",
" # Load the pre-trained BERT model as layer in Keras\n",
- " bert_layer = hub.KerasLayer(\n",
- " handle=model_url,\n",
- " trainable=True)\n",
- " return bert_layer\n"
+ " bert_layer = hub.KerasLayer(handle=model_url, trainable=True)\n",
+ " return bert_layer"
]
},
{
@@ -321,10 +328,13 @@
"outputs": [],
"source": [
"output = example_gen_pb2.Output(\n",
- " split_config=example_gen_pb2.SplitConfig(splits=[\n",
- " example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=45),\n",
- " example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=5)\n",
- " ]))\n",
+ " split_config=example_gen_pb2.SplitConfig(\n",
+ " splits=[\n",
+ " example_gen_pb2.SplitConfig.Split(name=\"train\", hash_buckets=45),\n",
+ " example_gen_pb2.SplitConfig.Split(name=\"eval\", hash_buckets=5),\n",
+ " ]\n",
+ " )\n",
+ ")\n",
"# Load the data from our prepared TFDS folder\n",
"examples = external_input(\"./content/tfds/imdb_reviews/plain_text/1.0.0\")\n",
"example_gen = ImportExampleGen(input=examples, output_config=output)\n",
@@ -344,7 +354,7 @@
"source": [
"%%skip_for_export\n",
"\n",
- "for artifact in example_gen.outputs['examples'].get():\n",
+ "for artifact in example_gen.outputs[\"examples\"].get():\n",
" print(artifact.uri)"
]
},
@@ -370,11 +380,10 @@
"source": [
"%%skip_for_export\n",
"\n",
- "statistics_gen = StatisticsGen(\n",
- " examples=example_gen.outputs['examples'])\n",
+ "statistics_gen = StatisticsGen(examples=example_gen.outputs[\"examples\"])\n",
"context.run(statistics_gen)\n",
"\n",
- "context.show(statistics_gen.outputs['statistics'])"
+ "context.show(statistics_gen.outputs[\"statistics\"])"
]
},
{
@@ -389,12 +398,10 @@
"source": [
"%%skip_for_export\n",
"\n",
- "schema_gen = SchemaGen(\n",
- " statistics=statistics_gen.outputs['statistics'],\n",
- " infer_feature_shape=True)\n",
+ "schema_gen = SchemaGen(statistics=statistics_gen.outputs[\"statistics\"], infer_feature_shape=True)\n",
"context.run(schema_gen)\n",
"\n",
- "context.show(schema_gen.outputs['schema'])"
+ "context.show(schema_gen.outputs[\"schema\"])"
]
},
{
@@ -410,7 +417,7 @@
"%%skip_for_export\n",
"\n",
"# check the data schema for the type of input tensors\n",
- "tfdv.load_schema_text(schema_gen.outputs['schema'].get()[0].uri + \"/schema.pbtxt\")"
+ "tfdv.load_schema_text(schema_gen.outputs[\"schema\"].get()[0].uri + \"/schema.pbtxt\")"
]
},
{
@@ -426,11 +433,11 @@
"%%skip_for_export\n",
"\n",
"example_validator = ExampleValidator(\n",
- " statistics=statistics_gen.outputs['statistics'],\n",
- " schema=schema_gen.outputs['schema'])\n",
+ " statistics=statistics_gen.outputs[\"statistics\"], schema=schema_gen.outputs[\"schema\"]\n",
+ ")\n",
"context.run(example_validator)\n",
"\n",
- "context.show(example_validator.outputs['anomalies'])"
+ "context.show(example_validator.outputs[\"anomalies\"])"
]
},
{
@@ -466,11 +473,12 @@
"MAX_SEQ_LEN = 64 # max number is 512\n",
"do_lower_case = load_bert_layer().resolved_object.do_lower_case.numpy()\n",
"\n",
+ "\n",
"def preprocessing_fn(inputs):\n",
" \"\"\"Preprocess input column of text into transformed columns of.\n",
- " * input token ids\n",
- " * input mask\n",
- " * input type ids\n",
+ " * input token ids\n",
+ " * input mask\n",
+ " * input type ids\n",
" \"\"\"\n",
"\n",
" CLS_ID = tf.constant(101, dtype=tf.int64)\n",
@@ -478,11 +486,11 @@
" PAD_ID = tf.constant(0, dtype=tf.int64)\n",
"\n",
" vocab_file_path = load_bert_layer().resolved_object.vocab_file.asset_path\n",
- " \n",
- " bert_tokenizer = text.BertTokenizer(vocab_lookup_table=vocab_file_path, \n",
- " token_out_type=tf.int64, \n",
- " lower_case=do_lower_case) \n",
- " \n",
+ "\n",
+ " bert_tokenizer = text.BertTokenizer(\n",
+ " vocab_lookup_table=vocab_file_path, token_out_type=tf.int64, lower_case=do_lower_case\n",
+ " )\n",
+ "\n",
" def tokenize_text(text, sequence_length=MAX_SEQ_LEN):\n",
" \"\"\"\n",
" Perform the BERT preprocessing from text -> input token ids\n",
@@ -490,14 +498,14 @@
"\n",
" # convert text into token ids\n",
" tokens = bert_tokenizer.tokenize(text)\n",
- " \n",
- " # flatten the output ragged tensors \n",
+ "\n",
+ " # flatten the output ragged tensors\n",
" tokens = tokens.merge_dims(1, 2)[:, :sequence_length]\n",
- " \n",
+ "\n",
" # Add start and end token ids to the id sequence\n",
" start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)\n",
" end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)\n",
- " tokens = tokens[:, :sequence_length - 2]\n",
+ " tokens = tokens[:, : sequence_length - 2]\n",
" tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)\n",
"\n",
" # truncate sequences greater than MAX_SEQ_LEN\n",
@@ -508,8 +516,8 @@
" pad = sequence_length - tf.shape(tokens)[1]\n",
" tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=PAD_ID)\n",
"\n",
- " # and finally reshape the word token ids to fit the output \n",
- " # data structure of TFT \n",
+ " # and finally reshape the word token ids to fit the output\n",
+ " # data structure of TFT\n",
" return tf.reshape(tokens, [-1, sequence_length])\n",
"\n",
" def preprocess_bert_input(text):\n",
@@ -519,25 +527,20 @@
" input_word_ids = tokenize_text(text)\n",
" input_mask = tf.cast(input_word_ids > 0, tf.int64)\n",
" input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])\n",
- " \n",
+ "\n",
" zeros_dims = tf.stack(tf.shape(input_mask))\n",
" input_type_ids = tf.fill(zeros_dims, 0)\n",
" input_type_ids = tf.cast(input_type_ids, tf.int64)\n",
"\n",
- " return (\n",
- " input_word_ids, \n",
- " input_mask,\n",
- " input_type_ids\n",
- " )\n",
+ " return (input_word_ids, input_mask, input_type_ids)\n",
"\n",
- " input_word_ids, input_mask, input_type_ids = \\\n",
- " preprocess_bert_input(tf.squeeze(inputs['text'], axis=1))\n",
+ " input_word_ids, input_mask, input_type_ids = preprocess_bert_input(tf.squeeze(inputs[\"text\"], axis=1))\n",
"\n",
" return {\n",
- " 'input_word_ids': input_word_ids,\n",
- " 'input_mask': input_mask,\n",
- " 'input_type_ids': input_type_ids,\n",
- " 'label': inputs['label']\n",
+ " \"input_word_ids\": input_word_ids,\n",
+ " \"input_mask\": input_mask,\n",
+ " \"input_type_ids\": input_type_ids,\n",
+ " \"label\": inputs[\"label\"],\n",
" }"
]
},
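For intuition, with CLS_ID = 101, SEP_ID = 102, and PAD_ID = 0, a two-word review padded to a sequence length of 8 leaves preprocessing_fn roughly as follows (the word-piece ids are illustrative, not computed here):

    # input_word_ids: [101, 2307, 3185, 102, 0, 0, 0, 0]  # [CLS] great movie [SEP] [PAD] ...
    # input_mask:     [  1,    1,    1,   1, 0, 0, 0, 0]  # 1 wherever a real token sits
    # input_type_ids: [  0,    0,    0,   0, 0, 0, 0, 0]  # single-segment input is all zeros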
@@ -552,9 +555,10 @@
"outputs": [],
"source": [
"transform = Transform(\n",
- " examples=example_gen.outputs['examples'],\n",
- " schema=schema_gen.outputs['schema'],\n",
- " module_file=os.path.abspath(\"transform.py\"))\n",
+ " examples=example_gen.outputs[\"examples\"],\n",
+ " schema=schema_gen.outputs[\"schema\"],\n",
+ " module_file=os.path.abspath(\"transform.py\"),\n",
+ ")\n",
"context.run(transform)"
]
},
@@ -583,7 +587,7 @@
"pp = pprint.PrettyPrinter()\n",
"\n",
"# Get the URI of the output artifact representing the transformed examples, which is a directory\n",
- "train_uri = transform.outputs['transformed_examples'].get()[0].uri\n",
+ "train_uri = transform.outputs[\"transformed_examples\"].get()[0].uri\n",
"\n",
"print(train_uri)\n",
"\n",
@@ -642,30 +646,29 @@
"from tfx.components.trainer.executor import TrainerFnArgs\n",
"\n",
"\n",
- "_LABEL_KEY = 'label'\n",
+ "_LABEL_KEY = \"label\"\n",
"BERT_TFHUB_URL = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\n",
"\n",
"\n",
"def _gzip_reader_fn(filenames):\n",
" \"\"\"Small utility returning a record reader that can read gzip'ed files.\"\"\"\n",
- " return tf.data.TFRecordDataset(filenames, compression_type='GZIP')\n",
+ " return tf.data.TFRecordDataset(filenames, compression_type=\"GZIP\")\n",
+ "\n",
"\n",
"def load_bert_layer(model_url=BERT_TFHUB_URL):\n",
" # Load the pre-trained BERT model as layer in Keras\n",
- " bert_layer = hub.KerasLayer(\n",
- " handle=model_url,\n",
- " trainable=False) # model can be fine-tuned \n",
+ " bert_layer = hub.KerasLayer(handle=model_url, trainable=False) # model can be fine-tuned\n",
" return bert_layer\n",
"\n",
+ "\n",
"def get_model(tf_transform_output, max_seq_length=64, num_labels=2):\n",
"\n",
" # dynamically create inputs for all outputs of our transform graph\n",
- " feature_spec = tf_transform_output.transformed_feature_spec() \n",
+ " feature_spec = tf_transform_output.transformed_feature_spec()\n",
" feature_spec.pop(_LABEL_KEY)\n",
"\n",
" inputs = {\n",
- " key: tf.keras.layers.Input(shape=(max_seq_length), name=key, dtype=tf.int64)\n",
- " for key in feature_spec.keys()\n",
+ " key: tf.keras.layers.Input(shape=(max_seq_length), name=key, dtype=tf.int64) for key in feature_spec.keys()\n",
" }\n",
"\n",
" input_word_ids = tf.cast(inputs[\"input_word_ids\"], dtype=tf.int32)\n",
@@ -673,28 +676,19 @@
" input_type_ids = tf.cast(inputs[\"input_type_ids\"], dtype=tf.int32)\n",
"\n",
" bert_layer = load_bert_layer()\n",
- " pooled_output, _ = bert_layer(\n",
- " [input_word_ids, \n",
- " input_mask, \n",
- " input_type_ids\n",
- " ]\n",
- " )\n",
- " \n",
+ " pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])\n",
+ "\n",
" # Add additional layers depending on your problem\n",
- " x = tf.keras.layers.Dense(256, activation='relu')(pooled_output)\n",
- " dense = tf.keras.layers.Dense(64, activation='relu')(x)\n",
- " pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)\n",
+ " x = tf.keras.layers.Dense(256, activation=\"relu\")(pooled_output)\n",
+ " dense = tf.keras.layers.Dense(64, activation=\"relu\")(x)\n",
+ " pred = tf.keras.layers.Dense(1, activation=\"sigmoid\")(dense)\n",
"\n",
" keras_model = tf.keras.Model(\n",
- " inputs=[\n",
- " inputs['input_word_ids'], \n",
- " inputs['input_mask'], \n",
- " inputs['input_type_ids']], \n",
- " outputs=pred)\n",
- " keras_model.compile(loss='binary_crossentropy', \n",
- " optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), \n",
- " metrics=['accuracy']\n",
- " )\n",
+ " inputs=[inputs[\"input_word_ids\"], inputs[\"input_mask\"], inputs[\"input_type_ids\"]], outputs=pred\n",
+ " )\n",
+ " keras_model.compile(\n",
+ " loss=\"binary_crossentropy\", optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=[\"accuracy\"]\n",
+ " )\n",
" return keras_model\n",
"\n",
"\n",
@@ -713,13 +707,12 @@
" transformed_features = model.tft_layer(parsed_features)\n",
"\n",
" outputs = model(transformed_features)\n",
- " return {'outputs': outputs}\n",
+ " return {\"outputs\": outputs}\n",
"\n",
" return serve_tf_examples_fn\n",
"\n",
- "def _input_fn(file_pattern: Text,\n",
- " tf_transform_output: tft.TFTransformOutput,\n",
- " batch_size: int = 32) -> tf.data.Dataset:\n",
+ "\n",
+ "def _input_fn(file_pattern: Text, tf_transform_output: tft.TFTransformOutput, batch_size: int = 32) -> tf.data.Dataset:\n",
" \"\"\"Generates features and label for tuning/training.\n",
"\n",
" Args:\n",
@@ -732,18 +725,19 @@
" A dataset that contains (features, indices) tuple where features is a\n",
" dictionary of Tensors, and indices is a single Tensor of label indices.\n",
" \"\"\"\n",
- " transformed_feature_spec = (\n",
- " tf_transform_output.transformed_feature_spec().copy())\n",
+ " transformed_feature_spec = tf_transform_output.transformed_feature_spec().copy()\n",
"\n",
" dataset = tf.data.experimental.make_batched_features_dataset(\n",
" file_pattern=file_pattern,\n",
" batch_size=batch_size,\n",
" features=transformed_feature_spec,\n",
" reader=_gzip_reader_fn,\n",
- " label_key=_LABEL_KEY)\n",
+ " label_key=_LABEL_KEY,\n",
+ " )\n",
"\n",
" return dataset\n",
"\n",
+ "\n",
"# TFX Trainer will call this function.\n",
"def run_fn(fn_args: TrainerFnArgs):\n",
" \"\"\"Train the model based on given args.\n",
@@ -764,18 +758,15 @@
" train_dataset,\n",
" steps_per_epoch=fn_args.train_steps,\n",
" validation_data=eval_dataset,\n",
- " validation_steps=fn_args.eval_steps)\n",
+ " validation_steps=fn_args.eval_steps,\n",
+ " )\n",
"\n",
" signatures = {\n",
- " 'serving_default':\n",
- " _get_serve_tf_examples_fn(model,\n",
- " tf_transform_output).get_concrete_function(\n",
- " tf.TensorSpec(\n",
- " shape=[None],\n",
- " dtype=tf.string,\n",
- " name='examples')),\n",
+ " \"serving_default\": _get_serve_tf_examples_fn(model, tf_transform_output).get_concrete_function(\n",
+ " tf.TensorSpec(shape=[None], dtype=tf.string, name=\"examples\")\n",
+ " ),\n",
" }\n",
- " model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)\n"
+ " model.save(fn_args.serving_model_dir, save_format=\"tf\", signatures=signatures)"
]
},
{
@@ -795,11 +786,12 @@
"trainer = Trainer(\n",
" module_file=os.path.abspath(\"trainer.py\"),\n",
" custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),\n",
- " examples=transform.outputs['transformed_examples'],\n",
- " transform_graph=transform.outputs['transform_graph'],\n",
- " schema=schema_gen.outputs['schema'],\n",
+ " examples=transform.outputs[\"transformed_examples\"],\n",
+ " transform_graph=transform.outputs[\"transform_graph\"],\n",
+ " schema=schema_gen.outputs[\"schema\"],\n",
" train_args=trainer_pb2.TrainArgs(num_steps=TRAINING_STEPS),\n",
- " eval_args=trainer_pb2.EvalArgs(num_steps=EVALUATION_STEPS))\n",
+ " eval_args=trainer_pb2.EvalArgs(num_steps=EVALUATION_STEPS),\n",
+ ")\n",
"context.run(trainer)"
]
},
@@ -814,10 +806,11 @@
"outputs": [],
"source": [
"model_resolver = ResolverNode(\n",
- " instance_name='latest_blessed_model_resolver',\n",
+ " instance_name=\"latest_blessed_model_resolver\",\n",
" resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,\n",
" model=Channel(type=Model),\n",
- " model_blessing=Channel(type=ModelBlessing))\n",
+ " model_blessing=Channel(type=ModelBlessing),\n",
+ ")\n",
"\n",
"context.run(model_resolver)"
]
@@ -843,34 +836,31 @@
"outputs": [],
"source": [
"eval_config = tfma.EvalConfig(\n",
- " model_specs=[\n",
- " tfma.ModelSpec(label_key='label')\n",
- " ],\n",
+ " model_specs=[tfma.ModelSpec(label_key=\"label\")],\n",
" metrics_specs=[\n",
" tfma.MetricsSpec(\n",
- " metrics=[\n",
- " tfma.MetricConfig(class_name='ExampleCount')\n",
- " ],\n",
- " thresholds = {\n",
- " 'binary_accuracy': tfma.MetricThreshold(\n",
- " value_threshold=tfma.GenericValueThreshold(\n",
- " lower_bound={'value': 0.5}),\n",
+ " metrics=[tfma.MetricConfig(class_name=\"ExampleCount\")],\n",
+ " thresholds={\n",
+ " \"binary_accuracy\": tfma.MetricThreshold(\n",
+ " value_threshold=tfma.GenericValueThreshold(lower_bound={\"value\": 0.5}),\n",
" change_threshold=tfma.GenericChangeThreshold(\n",
- " direction=tfma.MetricDirection.HIGHER_IS_BETTER,\n",
- " absolute={'value': -1e-10}))\n",
- " }\n",
+ " direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={\"value\": -1e-10}\n",
+ " ),\n",
+ " )\n",
+ " },\n",
" )\n",
" ],\n",
" slicing_specs=[\n",
" # An empty slice spec means the overall slice, i.e. the whole dataset.\n",
" tfma.SlicingSpec(),\n",
- " ])\n",
+ " ],\n",
+ ")\n",
"\n",
"evaluator = Evaluator(\n",
- " examples=example_gen.outputs['examples'],\n",
- " model=trainer.outputs['model'],\n",
- " baseline_model=model_resolver.outputs['model'],\n",
- " eval_config=eval_config\n",
+ " examples=example_gen.outputs[\"examples\"],\n",
+ " model=trainer.outputs[\"model\"],\n",
+ " baseline_model=model_resolver.outputs[\"model\"],\n",
+ " eval_config=eval_config,\n",
")\n",
"\n",
"context.run(evaluator)"
@@ -915,11 +905,12 @@
"serving_model_dir = \"./content/serving_model_dir\"\n",
"\n",
"pusher = Pusher(\n",
- " model=trainer.outputs['model'],\n",
- " model_blessing=evaluator.outputs['blessing'],\n",
+ " model=trainer.outputs[\"model\"],\n",
+ " model_blessing=evaluator.outputs[\"blessing\"],\n",
" push_destination=pusher_pb2.PushDestination(\n",
- " filesystem=pusher_pb2.PushDestination.Filesystem(\n",
- " base_directory=serving_model_dir)))\n",
+ " filesystem=pusher_pb2.PushDestination.Filesystem(base_directory=serving_model_dir)\n",
+ " ),\n",
+ ")\n",
"\n",
"context.run(pusher)"
]
@@ -947,14 +938,14 @@
"def _bytes_feature(value):\n",
" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n",
"\n",
+ "\n",
"push_uri = pusher.outputs.model_push.get()[0].uri\n",
"latest_version = max(os.listdir(push_uri))\n",
"latest_version_path = os.path.join(push_uri, latest_version)\n",
"loaded_model = tf.saved_model.load(latest_version_path)\n",
"\n",
"example_str = b\"This is the finest show ever produced for TV. Each episode is a triumph. The casting, the writing, the timing are all second to none. This cast performs miracles.\"\n",
- "example = tf.train.Example(features=tf.train.Features(feature={\n",
- " 'text': _bytes_feature(example_str)}))\n",
+ "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n",
"\n",
"serialized_example = example.SerializeToString()\n",
"f = loaded_model.signatures[\"serving_default\"]\n",
@@ -974,14 +965,14 @@
"def _bytes_feature(value):\n",
" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n",
"\n",
+ "\n",
"push_uri = pusher.outputs.model_push.get()[0].uri\n",
"latest_version = max(os.listdir(push_uri))\n",
"latest_version_path = os.path.join(push_uri, latest_version)\n",
"loaded_model = tf.saved_model.load(latest_version_path)\n",
"\n",
"example_str = b\"I loved it!\"\n",
- "example = tf.train.Example(features=tf.train.Features(feature={\n",
- " 'text': _bytes_feature(example_str)}))\n",
+ "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n",
"\n",
"serialized_example = example.SerializeToString()\n",
"f = loaded_model.signatures[\"serving_default\"]\n",
@@ -1001,14 +992,14 @@
"def _bytes_feature(value):\n",
" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n",
"\n",
+ "\n",
"push_uri = pusher.outputs.model_push.get()[0].uri\n",
"latest_version = max(os.listdir(push_uri))\n",
"latest_version_path = os.path.join(push_uri, latest_version)\n",
"loaded_model = tf.saved_model.load(latest_version_path)\n",
"\n",
"example_str = b\"It's OK.\"\n",
- "example = tf.train.Example(features=tf.train.Features(feature={\n",
- " 'text': _bytes_feature(example_str)}))\n",
+ "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n",
"\n",
"serialized_example = example.SerializeToString()\n",
"f = loaded_model.signatures[\"serving_default\"]\n",
@@ -1028,14 +1019,14 @@
"def _bytes_feature(value):\n",
" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n",
"\n",
+ "\n",
"push_uri = pusher.outputs.model_push.get()[0].uri\n",
"latest_version = max(os.listdir(push_uri))\n",
"latest_version_path = os.path.join(push_uri, latest_version)\n",
"loaded_model = tf.saved_model.load(latest_version_path)\n",
"\n",
"example_str = b\"The worst product ever.\"\n",
- "example = tf.train.Example(features=tf.train.Features(feature={\n",
- " 'text': _bytes_feature(example_str)}))\n",
+ "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n",
"\n",
"serialized_example = example.SerializeToString()\n",
"f = loaded_model.signatures[\"serving_default\"]\n",
@@ -1048,7 +1039,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print('Model has been exported to {}'.format(pusher.outputs.model_push.get()[0].uri))"
+ "print(\"Model has been exported to {}\".format(pusher.outputs.model_push.get()[0].uri))"
]
},
{
@@ -1057,7 +1048,7 @@
"metadata": {},
"outputs": [],
"source": [
- "for path in os.walk('{}/'.format(pusher.outputs.model_push.get()[0].uri)):\n",
+ "for path in os.walk(\"{}/\".format(pusher.outputs.model_push.get()[0].uri)):\n",
" print(path[0])"
]
},
@@ -1068,7 +1059,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
},
diff --git a/11_stream/01_Setup_IAM.ipynb b/11_stream/01_Setup_IAM.ipynb
index 5a8d0b2d..240819d9 100644
--- a/11_stream/01_Setup_IAM.ipynb
+++ b/11_stream/01_Setup_IAM.ipynb
@@ -17,13 +17,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_kinesis_role_name = 'DSOAWS_Kinesis'"
+ "iam_kinesis_role_name = \"DSOAWS_Kinesis\""
]
},
{
@@ -58,31 +58,13 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesis.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"firehose.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesisanalytics.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " } \n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesis.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"firehose.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " ],\n",
+ "}"
]
},
{
@@ -100,18 +82,18 @@
" iam_role_kinesis = iam.create_role(\n",
" RoleName=iam_kinesis_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Kinesis Role'\n",
+ " Description=\"DSOAWS Kinesis Role\",\n",
" )\n",
- " print('Role succesfully created.')\n",
+ " print(\"Role succesfully created.\")\n",
" iam_kinesis_role_passed = True\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role_kinesis = iam.get_role(RoleName=iam_kinesis_role_name)\n",
- " print('Role already exists. That is OK.')\n",
+ " print(\"Role already exists. That is OK.\")\n",
" iam_kinesis_role_passed = True\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " \n",
+ " print(\"Unexpected error: %s\" % e)\n",
+ "\n",
"time.sleep(30)"
]
},
@@ -121,8 +103,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_kinesis_name = iam_role_kinesis['Role']['RoleName']\n",
- "print('Role Name: {}'.format(iam_role_kinesis_name))"
+ "iam_role_kinesis_name = iam_role_kinesis[\"Role\"][\"RoleName\"]\n",
+ "print(\"Role Name: {}\".format(iam_role_kinesis_name))"
]
},
{
@@ -131,8 +113,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_kinesis_arn = iam_role_kinesis['Role']['Arn']\n",
- "print('Role ARN: {}'.format(iam_role_kinesis_arn))"
+ "iam_role_kinesis_arn = iam_role_kinesis[\"Role\"][\"Arn\"]\n",
+ "print(\"Role ARN: {}\".format(iam_role_kinesis_arn))"
]
},
{
@@ -141,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
- "account_id = sts.get_caller_identity()['Account']"
+ "account_id = sts.get_caller_identity()[\"Account\"]"
]
},
{
@@ -157,7 +139,7 @@
"metadata": {},
"outputs": [],
"source": [
- "stream_name = 'dsoaws-kinesis-data-stream'"
+ "stream_name = \"dsoaws-kinesis-data-stream\""
]
},
{
@@ -173,7 +155,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehose_name = 'dsoaws-kinesis-data-firehose'"
+ "firehose_name = \"dsoaws-kinesis-data-firehose\""
]
},
{
@@ -189,7 +171,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lambda_fn_name_cloudwatch = 'DeliverKinesisAnalyticsToCloudWatch'"
+ "lambda_fn_name_cloudwatch = \"DeliverKinesisAnalyticsToCloudWatch\""
]
},
{
@@ -198,7 +180,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lambda_fn_name_invoke_sm_endpoint = 'InvokeSageMakerEndpointFromKinesis'"
+ "lambda_fn_name_invoke_sm_endpoint = \"InvokeSageMakerEndpointFromKinesis\""
]
},
{
@@ -207,7 +189,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lambda_fn_name_sns = 'PushNotificationToSNS'"
+ "lambda_fn_name_sns = \"PushNotificationToSNS\""
]
},
{
@@ -226,79 +208,54 @@
"outputs": [],
"source": [
"kinesis_policy_doc = {\n",
- " \n",
" \"Version\": \"2012-10-17\",\n",
" \"Statement\": [\n",
- " { \n",
- " \"Effect\": \"Allow\", \n",
+ " {\n",
+ " \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"s3:AbortMultipartUpload\",\n",
" \"s3:GetBucketLocation\",\n",
" \"s3:GetObject\",\n",
" \"s3:ListBucket\",\n",
" \"s3:ListBucketMultipartUploads\",\n",
- " \"s3:PutObject\"\n",
- " ], \n",
- " \"Resource\": [ \n",
- " \"arn:aws:s3:::{}\".format(bucket),\n",
- " \"arn:aws:s3:::{}/*\".format(bucket)\n",
- " ] \n",
+ " \"s3:PutObject\",\n",
+ " ],\n",
+ " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket), \"arn:aws:s3:::{}/*\".format(bucket)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"logs:PutLogEvents\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)\n",
- " ]\n",
+ " \"Action\": [\"logs:PutLogEvents\"],\n",
+ " \"Resource\": [\"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"kinesis:*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)\n",
- " ]\n",
+ " \"Resource\": [\"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"firehose:*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)\n",
- " ]\n",
+ " \"Resource\": [\"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"kinesisanalytics:*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ]\n",
+ " \"Resource\": [\"*\"],\n",
" },\n",
" {\n",
" \"Sid\": \"UseLambdaFunction\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"lambda:InvokeFunction\",\n",
- " \"lambda:GetFunctionConfiguration\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ] \n",
+ " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n",
+ " \"Resource\": [\"*\"],\n",
" },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": \"iam:PassRole\",\n",
- " \"Resource\": [\n",
- " \"arn:aws:iam::*:role/service-role/kinesis*\"\n",
- " ] \n",
- " }\n",
- " ]\n",
+ " {\"Effect\": \"Allow\", \"Action\": \"iam:PassRole\", \"Resource\": [\"arn:aws:iam::*:role/service-role/kinesis*\"]},\n",
+ " ],\n",
"}\n",
"\n",
"print(json.dumps(kinesis_policy_doc, indent=4, sort_keys=True, default=str))"
@@ -320,9 +277,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=iam_role_kinesis_name,\n",
- " PolicyName='DSOAWS_KinesisPolicy',\n",
- " PolicyDocument=json.dumps(kinesis_policy_doc)\n",
+ " RoleName=iam_role_kinesis_name, PolicyName=\"DSOAWS_KinesisPolicy\", PolicyDocument=json.dumps(kinesis_policy_doc)\n",
")\n",
"\n",
"time.sleep(30)"
@@ -350,7 +305,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_lambda_role_name = 'DSOAWS_Lambda'"
+ "iam_lambda_role_name = \"DSOAWS_Lambda\""
]
},
{
@@ -371,21 +326,9 @@
"assume_role_policy_doc = {\n",
" \"Version\": \"2012-10-17\",\n",
" \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"lambda.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesisanalytics.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"lambda.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " ],\n",
"}"
]
},
@@ -403,18 +346,18 @@
" iam_role_lambda = iam.create_role(\n",
" RoleName=iam_lambda_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Lambda Role'\n",
+ " Description=\"DSOAWS Lambda Role\",\n",
" )\n",
- " print('Role succesfully created.')\n",
+ " print(\"Role succesfully created.\")\n",
" iam_lambda_role_passed = True\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role_lambda = iam.get_role(RoleName=iam_lambda_role_name)\n",
- " print('Role already exists. This is OK.')\n",
+ " print(\"Role already exists. This is OK.\")\n",
" iam_lambda_role_passed = True\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " \n",
+ " print(\"Unexpected error: %s\" % e)\n",
+ "\n",
"time.sleep(30)"
]
},
@@ -424,8 +367,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_lambda_name = iam_role_lambda['Role']['RoleName']\n",
- "print('Role Name: {}'.format(iam_role_lambda_name))"
+ "iam_role_lambda_name = iam_role_lambda[\"Role\"][\"RoleName\"]\n",
+ "print(\"Role Name: {}\".format(iam_role_lambda_name))"
]
},
{
@@ -434,8 +377,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_lambda_arn = iam_role_lambda['Role']['Arn']\n",
- "print('Role ARN: {}'.format(iam_role_lambda_arn))"
+ "iam_role_lambda_arn = iam_role_lambda[\"Role\"][\"Arn\"]\n",
+ "print(\"Role ARN: {}\".format(iam_role_lambda_arn))"
]
},
{
@@ -457,41 +400,23 @@
" {\n",
" \"Sid\": \"UseLambdaFunction\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"lambda:InvokeFunction\",\n",
- " \"lambda:GetFunctionConfiguration\"\n",
- " ],\n",
- " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id)\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": \"cloudwatch:*\",\n",
- " \"Resource\": \"*\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": \"sns:*\",\n",
- " \"Resource\": \"*\"\n",
+ " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n",
+ " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id),\n",
" },\n",
+ " {\"Effect\": \"Allow\", \"Action\": \"cloudwatch:*\", \"Resource\": \"*\"},\n",
+ " {\"Effect\": \"Allow\", \"Action\": \"sns:*\", \"Resource\": \"*\"},\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": \"logs:CreateLogGroup\",\n",
- " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id)\n",
+ " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id),\n",
" },\n",
+ " {\"Effect\": \"Allow\", \"Action\": \"sagemaker:InvokeEndpoint\", \"Resource\": \"*\"},\n",
" {\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": \"sagemaker:InvokeEndpoint\",\n",
- " \"Resource\": \"*\"\n",
- " }, \n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"logs:CreateLogStream\",\n",
- " \"logs:PutLogEvents\"\n",
- " ],\n",
- " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id)\n",
- " }\n",
- " ]\n",
+ " \"Action\": [\"logs:CreateLogStream\", \"logs:PutLogEvents\"],\n",
+ " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id),\n",
+ " },\n",
+ " ],\n",
"}"
]
},
@@ -513,9 +438,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=iam_role_lambda_name,\n",
- " PolicyName='DSOAWS_LambdaPolicy',\n",
- " PolicyDocument=json.dumps(lambda_policy_doc)\n",
+ " RoleName=iam_role_lambda_name, PolicyName=\"DSOAWS_LambdaPolicy\", PolicyDocument=json.dumps(lambda_policy_doc)\n",
")\n",
"\n",
"time.sleep(30)"
diff --git a/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb b/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb
index 5e83862d..2f0c944b 100644
--- a/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb
+++ b/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb
@@ -35,14 +35,14 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n",
- "lam = boto3.Session().client(service_name='lambda', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n",
+ "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)"
]
},
{
@@ -70,9 +70,9 @@
"try:\n",
" iam_lambda_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -102,9 +102,9 @@
"try:\n",
" iam_lambda_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -123,11 +123,11 @@
"outputs": [],
"source": [
"if not iam_lambda_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -148,9 +148,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -175,7 +175,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lambda_fn_name_invoke_ep='InvokeSageMakerEndpointFromKinesis'"
+ "lambda_fn_name_invoke_ep = \"InvokeSageMakerEndpointFromKinesis\""
]
},
{
@@ -222,11 +222,11 @@
"source": [
"try:\n",
" pytorch_endpoint_name\n",
- " print('[OK]')\n",
+ " print(\"[OK]\")\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -247,13 +247,13 @@
"outputs": [],
"source": [
"try:\n",
- " waiter = sm.get_waiter('endpoint_in_service')\n",
+ " waiter = sm.get_waiter(\"endpoint_in_service\")\n",
" waiter.wait(EndpointName=pytorch_endpoint_name)\n",
"except:\n",
- " print('###################')\n",
- " print('The endpoint is not running.')\n",
- " print('Please re-run the model deployment section to deploy the endpoint.')\n",
- " print('###################') "
+ " print(\"###################\")\n",
+ " print(\"The endpoint is not running.\")\n",
+ " print(\"Please re-run the model deployment section to deploy the endpoint.\")\n",
+ " print(\"###################\")"
]
},
{
@@ -265,30 +265,27 @@
"import json\n",
"import boto3\n",
"\n",
- "runtime = boto3.client('runtime.sagemaker')\n",
+ "runtime = boto3.client(\"runtime.sagemaker\")\n",
"\n",
- "inputs = [\n",
- " {\"features\": [\"This is great!\"]},\n",
- " {\"features\": [\"This is bad.\"]}\n",
- "] \n",
+ "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
"\n",
"response = runtime.invoke_endpoint(\n",
- " EndpointName=pytorch_endpoint_name,\n",
- " ContentType='application/jsonlines', \n",
- " Accept='application/jsonlines', \n",
- " Body=json.dumps(inputs).encode('utf-8')\n",
+ " EndpointName=pytorch_endpoint_name,\n",
+ " ContentType=\"application/jsonlines\",\n",
+ " Accept=\"application/jsonlines\",\n",
+ " Body=json.dumps(inputs).encode(\"utf-8\"),\n",
")\n",
- "print('response: {}'.format(response))\n",
+ "print(\"response: {}\".format(response))\n",
"\n",
- "predicted_classes_str = response['Body'].read().decode('utf-8')\n",
+ "predicted_classes_str = response[\"Body\"].read().decode(\"utf-8\")\n",
"predicted_classes_json = json.loads(predicted_classes_str)\n",
"\n",
"predicted_classes = predicted_classes_json.splitlines()\n",
- "print('predicted_classes: {}'.format(predicted_classes))\n",
+ "print(\"predicted_classes: {}\".format(predicted_classes))\n",
"\n",
"for predicted_class_json, input_data in zip(predicted_classes, inputs):\n",
- " predicted_class = json.loads(predicted_class_json)['predicted_label']\n",
- " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0])) "
+ " predicted_class = json.loads(predicted_class_json)[\"predicted_label\"]\n",
+ " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0]))"
]
},
{
@@ -313,7 +310,7 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('src/InvokeSageMakerEndpointFromKinesis.zip', 'rb') as f: \n",
+ "with open(\"src/InvokeSageMakerEndpointFromKinesis.zip\", \"rb\") as f:\n",
" code = f.read()"
]
},
@@ -332,33 +329,28 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = lam.create_function(\n",
- " FunctionName='{}'.format(lambda_fn_name_invoke_ep),\n",
- " Runtime='python3.7',\n",
- " Role='{}'.format(iam_role_lambda_arn),\n",
- " Handler='src/invoke_sm_endpoint_from_kinesis.lambda_handler',\n",
- " Code={\n",
- " 'ZipFile': code\n",
- " },\n",
- " Description='Query SageMaker Endpoint for star rating prediction on review input text.',\n",
+ " FunctionName=\"{}\".format(lambda_fn_name_invoke_ep),\n",
+ " Runtime=\"python3.7\",\n",
+ " Role=\"{}\".format(iam_role_lambda_arn),\n",
+ " Handler=\"src/invoke_sm_endpoint_from_kinesis.lambda_handler\",\n",
+ " Code={\"ZipFile\": code},\n",
+ " Description=\"Query SageMaker Endpoint for star rating prediction on review input text.\",\n",
" # max timeout supported by Firehose is 5min\n",
" Timeout=300,\n",
" MemorySize=128,\n",
- " Publish=True\n",
+ " Publish=True,\n",
" )\n",
- " print('Lambda Function {} successfully created.'.format(lambda_fn_name_invoke_ep))\n",
+ " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_invoke_ep))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceConflictException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n",
" response = lam.update_function_code(\n",
- " FunctionName='{}'.format(lambda_fn_name_invoke_ep),\n",
- " ZipFile=code,\n",
- " Publish=True,\n",
- " DryRun=False\n",
- " ) \n",
- " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_invoke_ep)) \n",
+ " FunctionName=\"{}\".format(lambda_fn_name_invoke_ep), ZipFile=code, Publish=True, DryRun=False\n",
+ " )\n",
+ " print(\"Updating existing Lambda Function {}. This is OK.\".format(lambda_fn_name_invoke_ep))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -369,7 +361,7 @@
"source": [
"response = lam.get_function(FunctionName=lambda_fn_name_invoke_ep)\n",
"\n",
- "lambda_fn_arn_invoke_ep = response['Configuration']['FunctionArn']\n",
+ "lambda_fn_arn_invoke_ep = response[\"Configuration\"][\"FunctionArn\"]\n",
"print(lambda_fn_arn_invoke_ep)"
]
},
@@ -396,8 +388,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_invoke_ep)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Function'.format(\n",
+ " region, lambda_fn_name_invoke_ep\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -414,13 +412,8 @@
"outputs": [],
"source": [
"response = lam.update_function_configuration(\n",
- " FunctionName=lambda_fn_name_invoke_ep,\n",
- " Environment={\n",
- " 'Variables': {\n",
- " 'ENDPOINT_NAME': pytorch_endpoint_name\n",
- " }\n",
- " }\n",
- " )"
+ " FunctionName=lambda_fn_name_invoke_ep, Environment={\"Variables\": {\"ENDPOINT_NAME\": pytorch_endpoint_name}}\n",
+ ")"
]
},
{
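
The notebook uploads src/InvokeSageMakerEndpointFromKinesis.zip and points the handler at src/invoke_sm_endpoint_from_kinesis.lambda_handler, but the handler source is not part of this diff. A hypothetical sketch of a Firehose transformation handler with that shape, assuming the TSV column order (review_id, product_category, review_body) used later in this section and treating each record as a single TSV line for brevity:

import base64
import json
import os

import boto3

runtime = boto3.client("runtime.sagemaker")


def lambda_handler(event, context):
    # Firehose transformation contract: return one result per incoming recordId.
    output = []
    for record in event["records"]:
        payload = base64.b64decode(record["data"]).decode("utf-8")
        review_body = payload.split("\t")[-1]
        response = runtime.invoke_endpoint(
            EndpointName=os.environ["ENDPOINT_NAME"],  # set by update_function_configuration above
            ContentType="application/jsonlines",
            Accept="application/jsonlines",
            Body=json.dumps({"features": [review_body]}).encode("utf-8"),
        )
        prediction = response["Body"].read().decode("utf-8").strip()
        transformed = "{}\t{}".format(payload.strip(), prediction)
        output.append(
            {
                "recordId": record["recordId"],
                "result": "Ok",
                "data": base64.b64encode(transformed.encode("utf-8")).decode("utf-8"),
            }
        )
    return {"records": output}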
diff --git a/11_stream/03_Create_Kinesis_Data_Firehose.ipynb b/11_stream/03_Create_Kinesis_Data_Firehose.ipynb
index 8491f8bc..49db44c0 100644
--- a/11_stream/03_Create_Kinesis_Data_Firehose.ipynb
+++ b/11_stream/03_Create_Kinesis_Data_Firehose.ipynb
@@ -27,13 +27,13 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)"
]
},
{
@@ -54,9 +54,9 @@
"try:\n",
" firehose_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -93,9 +93,9 @@
"try:\n",
" iam_kinesis_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -125,9 +125,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -157,9 +157,9 @@
"try:\n",
" iam_kinesis_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -178,11 +178,11 @@
"outputs": [],
"source": [
"if not iam_kinesis_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -210,9 +210,9 @@
"try:\n",
" lambda_fn_arn_invoke_ep\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -239,69 +239,59 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = firehose.create_delivery_stream(\n",
" DeliveryStreamName=firehose_name,\n",
- " DeliveryStreamType='DirectPut',\n",
+ " DeliveryStreamType=\"DirectPut\",\n",
" ExtendedS3DestinationConfiguration={\n",
- " 'RoleARN': iam_role_kinesis_arn,\n",
- " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n",
- " 'Prefix': 'kinesis-data-firehose/', \n",
- " 'ErrorOutputPrefix': 'kinesis-data-firehose-error/',\n",
- " 'BufferingHints': {\n",
- " 'SizeInMBs': 1,\n",
- " 'IntervalInSeconds': 60\n",
+ " \"RoleARN\": iam_role_kinesis_arn,\n",
+ " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n",
+ " \"Prefix\": \"kinesis-data-firehose/\",\n",
+ " \"ErrorOutputPrefix\": \"kinesis-data-firehose-error/\",\n",
+ " \"BufferingHints\": {\"SizeInMBs\": 1, \"IntervalInSeconds\": 60},\n",
+ " \"CompressionFormat\": \"UNCOMPRESSED\",\n",
+ " \"CloudWatchLoggingOptions\": {\n",
+ " \"Enabled\": True,\n",
+ " \"LogGroupName\": \"/aws/kinesisfirehose/dsoaws-kinesis-data-firehose\",\n",
+ " \"LogStreamName\": \"S3Delivery\",\n",
" },\n",
- " 'CompressionFormat': 'UNCOMPRESSED',\n",
- " 'CloudWatchLoggingOptions': {\n",
- " 'Enabled': True,\n",
- " 'LogGroupName': '/aws/kinesisfirehose/dsoaws-kinesis-data-firehose',\n",
- " 'LogStreamName': 'S3Delivery'\n",
+ " \"ProcessingConfiguration\": {\n",
+ " \"Enabled\": True,\n",
+ " \"Processors\": [\n",
+ " {\n",
+ " \"Type\": \"Lambda\",\n",
+ " \"Parameters\": [\n",
+ " {\n",
+ " \"ParameterName\": \"LambdaArn\",\n",
+ " \"ParameterValue\": \"{}:$LATEST\".format(lambda_fn_arn_invoke_ep),\n",
+ " },\n",
+ " {\"ParameterName\": \"BufferSizeInMBs\", \"ParameterValue\": \"1\"},\n",
+ " {\"ParameterName\": \"BufferIntervalInSeconds\", \"ParameterValue\": \"60\"},\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
" },\n",
- " 'ProcessingConfiguration': {\n",
- " 'Enabled': True,\n",
- " 'Processors': [{\n",
- " 'Type': 'Lambda',\n",
- " 'Parameters': [\n",
- " {\n",
- " 'ParameterName': 'LambdaArn',\n",
- " 'ParameterValue': '{}:$LATEST'.format(lambda_fn_arn_invoke_ep)\n",
- " },\n",
- " {\n",
- " 'ParameterName': 'BufferSizeInMBs',\n",
- " 'ParameterValue': '1'\n",
- " },\n",
- " {\n",
- " 'ParameterName': 'BufferIntervalInSeconds',\n",
- " 'ParameterValue': '60'\n",
- " }, \n",
- " ]\n",
- " }]\n",
+ " \"S3BackupMode\": \"Enabled\",\n",
+ " \"S3BackupConfiguration\": {\n",
+ " \"RoleARN\": iam_role_kinesis_arn,\n",
+ " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n",
+ " \"Prefix\": \"kinesis-data-firehose-source-record/\",\n",
+ " \"ErrorOutputPrefix\": \"!{firehose:error-output-type}/\",\n",
+ " \"BufferingHints\": {\"SizeInMBs\": 1, \"IntervalInSeconds\": 60},\n",
+ " \"CompressionFormat\": \"UNCOMPRESSED\",\n",
" },\n",
- " 'S3BackupMode': 'Enabled',\n",
- " 'S3BackupConfiguration': {\n",
- " 'RoleARN': iam_role_kinesis_arn,\n",
- " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n",
- " 'Prefix': 'kinesis-data-firehose-source-record/', \n",
- " 'ErrorOutputPrefix': '!{firehose:error-output-type}/',\n",
- " 'BufferingHints': {\n",
- " 'SizeInMBs': 1,\n",
- " 'IntervalInSeconds': 60\n",
- " },\n",
- " 'CompressionFormat': 'UNCOMPRESSED'\n",
+ " \"CloudWatchLoggingOptions\": {\n",
+ " \"Enabled\": False,\n",
" },\n",
- " 'CloudWatchLoggingOptions': {\n",
- " 'Enabled': False,\n",
- " }\n",
- " }\n",
+ " },\n",
" )\n",
- " print('Delivery stream {} successfully created.'.format(firehose_name))\n",
+ " print(\"Delivery stream {} successfully created.\".format(firehose_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Delivery stream {} already exists.'.format(firehose_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Delivery stream {} already exists.\".format(firehose_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)"
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -319,14 +309,14 @@
"source": [
"import time\n",
"\n",
- "status = ''\n",
- "while status != 'ACTIVE': \n",
+ "status = \"\"\n",
+ "while status != \"ACTIVE\":\n",
" r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n",
- " description = r.get('DeliveryStreamDescription')\n",
- " status = description.get('DeliveryStreamStatus')\n",
+ " description = r.get(\"DeliveryStreamDescription\")\n",
+ " status = description.get(\"DeliveryStreamStatus\")\n",
" time.sleep(5)\n",
- " \n",
- "print('Delivery Stream {} is active'.format(firehose_name))"
+ "\n",
+ "print(\"Delivery Stream {} is active\".format(firehose_name))"
]
},
{
@@ -335,7 +325,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehose_arn = r['DeliveryStreamDescription']['DeliveryStreamARN']\n",
+ "firehose_arn = r[\"DeliveryStreamDescription\"][\"DeliveryStreamARN\"]\n",
"print(firehose_arn)"
]
},
@@ -362,8 +352,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Firehose'.format(region, firehose_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Firehose'.format(\n",
+ " region, firehose_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
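
The while status != "ACTIVE" loop above never exits if stream creation fails. A bounded sketch of the same poll (timeout values are illustrative, not from the notebook):

import time

import boto3

firehose = boto3.client("firehose")


def wait_for_delivery_stream_active(name, timeout=300, interval=5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        description = firehose.describe_delivery_stream(DeliveryStreamName=name)[
            "DeliveryStreamDescription"
        ]
        status = description["DeliveryStreamStatus"]
        if status == "ACTIVE":
            return description
        if status in ("CREATING_FAILED", "DELETING", "DELETING_FAILED"):
            raise RuntimeError("delivery stream {} entered status {}".format(name, status))
        time.sleep(interval)
    raise TimeoutError("delivery stream {} not ACTIVE after {}s".format(name, timeout))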
diff --git a/11_stream/04_Create_Kinesis_Data_Stream.ipynb b/11_stream/04_Create_Kinesis_Data_Stream.ipynb
index b5cdc435..b64d4ab8 100644
--- a/11_stream/04_Create_Kinesis_Data_Stream.ipynb
+++ b/11_stream/04_Create_Kinesis_Data_Stream.ipynb
@@ -26,14 +26,14 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "kinesis = boto3.Session().client(service_name='kinesis', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "kinesis = boto3.Session().client(service_name=\"kinesis\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)"
]
},
{
@@ -68,9 +68,9 @@
"try:\n",
" stream_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -99,19 +99,16 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
- " response = kinesis.create_stream(\n",
- " StreamName=stream_name, \n",
- " ShardCount=shard_count\n",
- " )\n",
- " print('Data Stream {} successfully created.'.format(stream_name))\n",
+ "try:\n",
+ " response = kinesis.create_stream(StreamName=stream_name, ShardCount=shard_count)\n",
+ " print(\"Data Stream {} successfully created.\".format(stream_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
- " \n",
+ "\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Data Stream {} already exists.'.format(stream_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Data Stream {} already exists.\".format(stream_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)"
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -122,14 +119,14 @@
"source": [
"import time\n",
"\n",
- "status = ''\n",
- "while status != 'ACTIVE': \n",
+ "status = \"\"\n",
+ "while status != \"ACTIVE\":\n",
" r = kinesis.describe_stream(StreamName=stream_name)\n",
- " description = r.get('StreamDescription')\n",
- " status = description.get('StreamStatus')\n",
+ " description = r.get(\"StreamDescription\")\n",
+ " status = description.get(\"StreamStatus\")\n",
" time.sleep(5)\n",
- " \n",
- "print('Stream {} is active'.format(stream_name))"
+ "\n",
+ "print(\"Stream {} is active\".format(stream_name))"
]
},
{
@@ -145,9 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
- "stream_response = kinesis.describe_stream(\n",
- " StreamName=stream_name\n",
- ")\n",
+ "stream_response = kinesis.describe_stream(StreamName=stream_name)\n",
"\n",
"print(json.dumps(stream_response, indent=4, sort_keys=True, default=str))"
]
@@ -160,7 +155,7 @@
},
"outputs": [],
"source": [
- "stream_arn = stream_response['StreamDescription']['StreamARN']\n",
+ "stream_arn = stream_response[\"StreamDescription\"][\"StreamARN\"]\n",
"print(stream_arn)"
]
},
@@ -187,8 +182,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Stream'.format(region, stream_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Stream'.format(\n",
+ " region, stream_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb b/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb
index 641d3741..1eba16ab 100644
--- a/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb
+++ b/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb
@@ -38,16 +38,16 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "account_id = sts.get_caller_identity()['Account']\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"\n",
- "lam = boto3.Session().client(service_name='lambda', region_name=region)"
+ "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)"
]
},
{
@@ -75,9 +75,9 @@
"try:\n",
" lambda_fn_name_cloudwatch\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -114,9 +114,9 @@
"try:\n",
" iam_lambda_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -146,9 +146,9 @@
"try:\n",
" iam_lambda_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -167,11 +167,11 @@
"outputs": [],
"source": [
"if not iam_lambda_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -192,9 +192,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -253,7 +253,7 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('src/DeliverKinesisAnalyticsToCloudWatch.zip', 'rb') as f: \n",
+ "with open(\"src/DeliverKinesisAnalyticsToCloudWatch.zip\", \"rb\") as f:\n",
" code = f.read()"
]
},
@@ -272,33 +272,28 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = lam.create_function(\n",
- " FunctionName='{}'.format(lambda_fn_name_cloudwatch),\n",
- " Runtime='python3.7',\n",
- " Role='{}'.format(iam_role_lambda_arn),\n",
- " Handler='src/deliver_metrics_to_cloudwatch.lambda_handler',\n",
- " Code={\n",
- " 'ZipFile': code\n",
- " },\n",
- " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n",
+ " FunctionName=\"{}\".format(lambda_fn_name_cloudwatch),\n",
+ " Runtime=\"python3.7\",\n",
+ " Role=\"{}\".format(iam_role_lambda_arn),\n",
+ " Handler=\"src/deliver_metrics_to_cloudwatch.lambda_handler\",\n",
+ " Code={\"ZipFile\": code},\n",
+ " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n",
" Timeout=900,\n",
" MemorySize=128,\n",
- " Publish=True\n",
+ " Publish=True,\n",
" )\n",
- " print('Lambda Function {} successfully created.'.format(lambda_fn_name_cloudwatch))\n",
+ " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_cloudwatch))\n",
"\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceConflictException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n",
" response = lam.update_function_code(\n",
- " FunctionName='{}'.format(lambda_fn_name_cloudwatch),\n",
- " ZipFile=code,\n",
- " Publish=True,\n",
- " DryRun=False\n",
- " ) \n",
- " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_cloudwatch)) \n",
+ " FunctionName=\"{}\".format(lambda_fn_name_cloudwatch), ZipFile=code, Publish=True, DryRun=False\n",
+ " )\n",
+ " print(\"Updating existing Lambda Function {}. This is OK.\".format(lambda_fn_name_cloudwatch))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -309,7 +304,7 @@
"source": [
"response = lam.get_function(FunctionName=lambda_fn_name_cloudwatch)\n",
"\n",
- "lambda_fn_arn_cloudwatch = response['Configuration']['FunctionArn']\n",
+ "lambda_fn_arn_cloudwatch = response[\"Configuration\"][\"FunctionArn\"]\n",
"print(lambda_fn_arn_cloudwatch)"
]
},
@@ -338,8 +333,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_cloudwatch)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Function'.format(\n",
+ " region, lambda_fn_name_cloudwatch\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
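
As in the previous notebook, the zipped handler (src/deliver_metrics_to_cloudwatch.lambda_handler) is referenced but not included in this diff. A hypothetical sketch, assuming each Kinesis Data Analytics output record is a base64-encoded CSV line carrying one avg_star_rating value; the namespace and metric name are illustrative:

import base64

import boto3

cloudwatch = boto3.client("cloudwatch")


def lambda_handler(event, context):
    # Kinesis Data Analytics Lambda output contract: 'Ok' or 'DeliveryFailed'
    # per recordId.
    results = []
    for record in event["records"]:
        value = float(base64.b64decode(record["data"]).decode("utf-8").strip())
        cloudwatch.put_metric_data(
            Namespace="kinesis/analytics/AVGStarRating",  # illustrative namespace
            MetricData=[{"MetricName": "AVGStarRating", "Value": value, "Unit": "None"}],
        )
        results.append({"recordId": record["recordId"], "result": "Ok"})
    return {"records": results}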
diff --git a/11_stream/06_Create_Lambda_Destination_SNS.ipynb b/11_stream/06_Create_Lambda_Destination_SNS.ipynb
index 0d8c6de6..a1b05c27 100644
--- a/11_stream/06_Create_Lambda_Destination_SNS.ipynb
+++ b/11_stream/06_Create_Lambda_Destination_SNS.ipynb
@@ -38,17 +38,17 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "account_id = sts.get_caller_identity()['Account']\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"\n",
- "lam = boto3.Session().client(service_name='lambda', region_name=region)\n",
- "sns = boto3.Session().client(service_name='sns', region_name=region)"
+ "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)\n",
+ "sns = boto3.Session().client(service_name=\"sns\", region_name=region)"
]
},
{
@@ -76,9 +76,9 @@
"try:\n",
" lambda_fn_name_sns\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -115,9 +115,9 @@
"try:\n",
" iam_lambda_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -147,9 +147,9 @@
"try:\n",
" iam_lambda_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -168,11 +168,11 @@
"outputs": [],
"source": [
"if not iam_lambda_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -193,9 +193,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -231,7 +231,7 @@
"outputs": [],
"source": [
"response = sns.create_topic(\n",
- " Name='review_anomaly_scores',\n",
+ " Name=\"review_anomaly_scores\",\n",
")\n",
"print(response)"
]
@@ -242,7 +242,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sns_topic_arn = response['TopicArn']\n",
+ "sns_topic_arn = response[\"TopicArn\"]\n",
"print(sns_topic_arn)"
]
},
@@ -302,7 +302,7 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('src/PushNotificationToSNS.zip', 'rb') as f: \n",
+ "with open(\"src/PushNotificationToSNS.zip\", \"rb\") as f:\n",
" code = f.read()"
]
},
@@ -321,33 +321,28 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = lam.create_function(\n",
- " FunctionName='{}'.format(lambda_fn_name_sns),\n",
- " Runtime='python3.7',\n",
- " Role='{}'.format(iam_role_lambda_arn),\n",
- " Handler='src/push_notification_to_sns.lambda_handler',\n",
- " Code={\n",
- " 'ZipFile': code\n",
- " },\n",
- " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n",
+ " FunctionName=\"{}\".format(lambda_fn_name_sns),\n",
+ " Runtime=\"python3.7\",\n",
+ " Role=\"{}\".format(iam_role_lambda_arn),\n",
+ " Handler=\"src/push_notification_to_sns.lambda_handler\",\n",
+ " Code={\"ZipFile\": code},\n",
+ " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n",
" Timeout=300,\n",
" MemorySize=128,\n",
- " Publish=True\n",
+ " Publish=True,\n",
" )\n",
- " print('Lambda Function {} successfully created.'.format(lambda_fn_name_sns))\n",
+ " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_sns))\n",
"\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceConflictException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n",
" response = lam.update_function_code(\n",
- " FunctionName='{}'.format(lambda_fn_name_sns),\n",
- " ZipFile=code,\n",
- " Publish=True,\n",
- " DryRun=False\n",
- " ) \n",
- " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_sns)) \n",
+ " FunctionName=\"{}\".format(lambda_fn_name_sns), ZipFile=code, Publish=True, DryRun=False\n",
+ " )\n",
+ " print(\"Updating existing Lambda Function {}. This is OK.\".format(lambda_fn_name_sns))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -358,7 +353,7 @@
"source": [
"response = lam.get_function(FunctionName=lambda_fn_name_sns)\n",
"\n",
- "lambda_fn_arn_sns = response['Configuration']['FunctionArn']\n",
+ "lambda_fn_arn_sns = response[\"Configuration\"][\"FunctionArn\"]\n",
"print(lambda_fn_arn_sns)"
]
},
@@ -387,13 +382,8 @@
"outputs": [],
"source": [
"response = lam.update_function_configuration(\n",
- " FunctionName=lambda_fn_name_sns,\n",
- " Environment={\n",
- " 'Variables': {\n",
- " 'SNS_TOPIC_ARN': sns_topic_arn\n",
- " }\n",
- " }\n",
- " )"
+ " FunctionName=lambda_fn_name_sns, Environment={\"Variables\": {\"SNS_TOPIC_ARN\": sns_topic_arn}}\n",
+ ")"
]
},
{
@@ -410,8 +400,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_sns)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Function'.format(\n",
+ " region, lambda_fn_name_sns\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
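
The SNS notebook likewise ships src/PushNotificationToSNS.zip prebuilt. A hypothetical sketch of src/push_notification_to_sns.lambda_handler, assuming it reads the SNS_TOPIC_ARN environment variable set by update_function_configuration above and treats each record as a base64-encoded anomaly score:

import base64
import os

import boto3

sns = boto3.client("sns")


def lambda_handler(event, context):
    results = []
    for record in event["records"]:
        anomaly_score = base64.b64decode(record["data"]).decode("utf-8").strip()
        sns.publish(
            TopicArn=os.environ["SNS_TOPIC_ARN"],
            Subject="Review anomaly detected",
            Message="Anomaly score: {}".format(anomaly_score),
        )
        results.append({"recordId": record["recordId"], "result": "Ok"})
    return {"records": results}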
diff --git a/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb b/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb
index ed473b47..2b8e1f71 100644
--- a/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb
+++ b/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb
@@ -41,17 +41,17 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "account_id = sts.get_caller_identity()['Account']\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n",
- "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n",
+ "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)"
]
},
{
@@ -72,9 +72,9 @@
"try:\n",
" firehose_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -104,9 +104,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -136,9 +136,9 @@
"try:\n",
" stream_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -168,9 +168,9 @@
"try:\n",
" lambda_fn_arn_cloudwatch\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -200,9 +200,9 @@
"try:\n",
" lambda_fn_arn_sns\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -232,9 +232,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -264,9 +264,9 @@
"try:\n",
" lambda_fn_arn_invoke_ep\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -298,7 +298,7 @@
"metadata": {},
"outputs": [],
"source": [
- "kinesis_data_analytics_app_name = 'dsoaws-kinesis-data-analytics-sql-app'"
+ "kinesis_data_analytics_app_name = \"dsoaws-kinesis-data-analytics-sql-app\""
]
},
{
@@ -307,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
- "in_app_stream_name = 'SOURCE_SQL_STREAM_001' # Default\n",
+ "in_app_stream_name = \"SOURCE_SQL_STREAM_001\" # Default\n",
"print(in_app_stream_name)"
]
},
@@ -333,7 +333,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sql_code = ''' \\\n",
+ "sql_code = \"\"\" \\\n",
" CREATE OR REPLACE STREAM \"AVG_STAR_RATING_SQL_STREAM\" ( \\\n",
" avg_star_rating DOUBLE); \\\n",
" CREATE OR REPLACE PUMP \"AVG_STAR_RATING_SQL_STREAM_PUMP\" AS \\\n",
@@ -364,12 +364,9 @@
" {} \\\n",
" ) \\\n",
" ); \\\n",
- " '''.format(in_app_stream_name,\n",
- " in_app_stream_name,\n",
- " window_seconds,\n",
- " in_app_stream_name,\n",
- " in_app_stream_name,\n",
- " window_seconds)\n",
+ " \"\"\".format(\n",
+ " in_app_stream_name, in_app_stream_name, window_seconds, in_app_stream_name, in_app_stream_name, window_seconds\n",
+ ")\n",
"\n",
"print(sql_code)"
]
@@ -382,99 +379,73 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = kinesis_analytics.create_application(\n",
" ApplicationName=kinesis_data_analytics_app_name,\n",
" Inputs=[\n",
" {\n",
- " 'NamePrefix': 'SOURCE_SQL_STREAM',\n",
- " 'KinesisFirehoseInput': {\n",
- " 'ResourceARN': '{}'.format(firehose_arn),\n",
- " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n",
+ " \"NamePrefix\": \"SOURCE_SQL_STREAM\",\n",
+ " \"KinesisFirehoseInput\": {\n",
+ " \"ResourceARN\": \"{}\".format(firehose_arn),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n",
" },\n",
- " 'InputProcessingConfiguration': { \n",
- " 'InputLambdaProcessor': { \n",
- " 'ResourceARN': '{}'.format(lambda_fn_arn_invoke_ep),\n",
- " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n",
+ " \"InputProcessingConfiguration\": {\n",
+ " \"InputLambdaProcessor\": {\n",
+ " \"ResourceARN\": \"{}\".format(lambda_fn_arn_invoke_ep),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n",
" }\n",
- " }, \n",
- " 'InputSchema': {\n",
- " 'RecordFormat': {\n",
- " 'RecordFormatType': 'CSV',\n",
- " 'MappingParameters': {\n",
- " 'CSVMappingParameters': {\n",
- " 'RecordRowDelimiter': '\\n',\n",
- " 'RecordColumnDelimiter': '\\t'\n",
- " }\n",
- " }\n",
- " },\n",
- " 'RecordColumns': [\n",
- " {\n",
- " 'Name': 'review_id',\n",
- " 'Mapping': 'review_id',\n",
- " 'SqlType': 'VARCHAR(14)'\n",
- " }, \n",
- " {\n",
- " 'Name': 'star_rating',\n",
- " 'Mapping': 'star_rating',\n",
- " 'SqlType': 'INTEGER'\n",
+ " },\n",
+ " \"InputSchema\": {\n",
+ " \"RecordFormat\": {\n",
+ " \"RecordFormatType\": \"CSV\",\n",
+ " \"MappingParameters\": {\n",
+ " \"CSVMappingParameters\": {\"RecordRowDelimiter\": \"\\n\", \"RecordColumnDelimiter\": \"\\t\"}\n",
" },\n",
- " {\n",
- " 'Name': 'product_category',\n",
- " 'Mapping': 'product_category',\n",
- " 'SqlType': 'VARCHAR(24)'\n",
- " }, \n",
- " {\n",
- " 'Name': 'review_body',\n",
- " 'Mapping': 'review_body',\n",
- " 'SqlType': 'VARCHAR(65535)'\n",
- " } \n",
- " ]\n",
- " }\n",
+ " },\n",
+ " \"RecordColumns\": [\n",
+ " {\"Name\": \"review_id\", \"Mapping\": \"review_id\", \"SqlType\": \"VARCHAR(14)\"},\n",
+ " {\"Name\": \"star_rating\", \"Mapping\": \"star_rating\", \"SqlType\": \"INTEGER\"},\n",
+ " {\"Name\": \"product_category\", \"Mapping\": \"product_category\", \"SqlType\": \"VARCHAR(24)\"},\n",
+ " {\"Name\": \"review_body\", \"Mapping\": \"review_body\", \"SqlType\": \"VARCHAR(65535)\"},\n",
+ " ],\n",
+ " },\n",
" },\n",
" ],\n",
" Outputs=[\n",
" {\n",
- " 'Name': 'AVG_STAR_RATING_SQL_STREAM',\n",
- " 'LambdaOutput': {\n",
- " 'ResourceARN': '{}'.format(lambda_fn_arn_cloudwatch),\n",
- " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n",
+ " \"Name\": \"AVG_STAR_RATING_SQL_STREAM\",\n",
+ " \"LambdaOutput\": {\n",
+ " \"ResourceARN\": \"{}\".format(lambda_fn_arn_cloudwatch),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n",
" },\n",
- " 'DestinationSchema': {\n",
- " 'RecordFormatType': 'CSV'\n",
- " }\n",
+ " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n",
" },\n",
" {\n",
- " 'Name': 'ANOMALY_SCORE_SQL_STREAM', \n",
- " 'LambdaOutput': {\n",
- " 'ResourceARN': '{}'.format(lambda_fn_arn_sns),\n",
- " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n",
+ " \"Name\": \"ANOMALY_SCORE_SQL_STREAM\",\n",
+ " \"LambdaOutput\": {\n",
+ " \"ResourceARN\": \"{}\".format(lambda_fn_arn_sns),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n",
" },\n",
- " 'DestinationSchema': {\n",
- " 'RecordFormatType': 'CSV'\n",
- " }\n",
+ " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n",
" },\n",
" {\n",
- " 'Name': 'APPROXIMATE_COUNT_SQL_STREAM', \n",
- " 'KinesisStreamsOutput': {\n",
- " 'ResourceARN': '{}'.format(stream_arn),\n",
- " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n",
+ " \"Name\": \"APPROXIMATE_COUNT_SQL_STREAM\",\n",
+ " \"KinesisStreamsOutput\": {\n",
+ " \"ResourceARN\": \"{}\".format(stream_arn),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n",
" },\n",
- " 'DestinationSchema': {\n",
- " 'RecordFormatType': 'CSV'\n",
- " }\n",
- " }\n",
+ " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n",
+ " },\n",
" ],\n",
- " ApplicationCode=sql_code\n",
+ " ApplicationCode=sql_code,\n",
" )\n",
- " print('SQL application {} successfully created.'.format(kinesis_data_analytics_app_name))\n",
+ " print(\"SQL application {} successfully created.\".format(kinesis_data_analytics_app_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('SQL App {} already exists.'.format(kinesis_data_analytics_app_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"SQL App {} already exists.\".format(kinesis_data_analytics_app_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " "
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -495,7 +466,7 @@
"metadata": {},
"outputs": [],
"source": [
- "input_id = response['ApplicationDetail']['InputDescriptions'][0]['InputId']\n",
+ "input_id = response[\"ApplicationDetail\"][\"InputDescriptions\"][0][\"InputId\"]\n",
"print(input_id)"
]
},
@@ -512,24 +483,17 @@
"metadata": {},
"outputs": [],
"source": [
- "try: \n",
+ "try:\n",
" response = kinesis_analytics.start_application(\n",
" ApplicationName=kinesis_data_analytics_app_name,\n",
- " InputConfigurations=[\n",
- " {\n",
- " 'Id': input_id,\n",
- " 'InputStartingPositionConfiguration': {\n",
- " 'InputStartingPosition': 'NOW'\n",
- " }\n",
- " }\n",
- " ]\n",
+ " InputConfigurations=[{\"Id\": input_id, \"InputStartingPositionConfiguration\": {\"InputStartingPosition\": \"NOW\"}}],\n",
" )\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Application {} is already starting.'.format(kinesis_data_analytics_app_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Application {} is already starting.\".format(kinesis_data_analytics_app_name))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -555,8 +519,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -578,17 +548,16 @@
"\n",
"import time\n",
"\n",
- "app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- "print('Application status {}'.format(app_status))\n",
+ "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ "print(\"Application status {}\".format(app_status))\n",
"\n",
- "while app_status != 'RUNNING':\n",
+ "while app_status != \"RUNNING\":\n",
" time.sleep(5)\n",
- " response = kinesis_analytics.describe_application(\n",
- " ApplicationName=kinesis_data_analytics_app_name)\n",
- " app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- " print('Application status {}'.format(app_status))\n",
+ " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n",
+ " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ " print(\"Application status {}\".format(app_status))\n",
"\n",
- "print('Application status {}'.format(app_status))"
+ "print(\"Application status {}\".format(app_status))"
]
},
{
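
The reformatted sql_code cell above still passes in_app_stream_name and window_seconds positionally six times to .format(). An f-string (Python 3.6+) names each value at its use site; sketched here on an abbreviated query, since the full SQL body is elided in this hunk:

in_app_stream_name = "SOURCE_SQL_STREAM_001"
window_seconds = 5

# .format() repeats the same two values positionally...
template = (
    'SELECT STREAM AVG(CAST("star_rating" AS DOUBLE)) FROM "{}" '
    "GROUP BY STEP(\"{}\".ROWTIME BY INTERVAL '{}' SECOND);"
)
sql_old = template.format(in_app_stream_name, in_app_stream_name, window_seconds)

# ...while an f-string names each value where it is used.
sql_new = (
    f'SELECT STREAM AVG(CAST("star_rating" AS DOUBLE)) FROM "{in_app_stream_name}" '
    f"GROUP BY STEP(\"{in_app_stream_name}\".ROWTIME BY INTERVAL '{window_seconds}' SECOND);"
)

assert sql_old == sql_new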
diff --git a/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb b/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
index 86ffdea6..6fdb3b51 100644
--- a/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
+++ b/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
@@ -25,14 +25,14 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n",
- "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)\n"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n",
+ "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)"
]
},
{
@@ -53,9 +53,9 @@
"try:\n",
" firehose_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -85,9 +85,9 @@
"try:\n",
" firehose_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -117,9 +117,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -149,9 +149,9 @@
"try:\n",
" kinesis_data_analytics_app_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -181,9 +181,9 @@
"try:\n",
" lambda_fn_name_cloudwatch\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -201,7 +201,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehoses = firehose.list_delivery_streams(DeliveryStreamType='DirectPut')\n",
+ "firehoses = firehose.list_delivery_streams(DeliveryStreamType=\"DirectPut\")\n",
"\n",
"print(json.dumps(firehoses, indent=4, sort_keys=True, default=str))"
]
@@ -231,10 +231,12 @@
"import csv\n",
"import pandas as pd\n",
"\n",
- "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df.shape"
]
},
@@ -255,14 +257,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_star_rating_and_review_body = df[['review_id', \n",
- " 'star_rating', \n",
- " 'product_category', \n",
- " 'review_body']][0:1]\n",
+ "df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][0:1]\n",
"\n",
- "df_star_rating_and_review_body.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)"
+ "df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)"
]
},
{
@@ -291,16 +288,15 @@
"\n",
"import time\n",
"\n",
- "app_status = response['ApplicationDetail']['ApplicationStatus']\n",
+ "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
"\n",
- "while app_status != 'RUNNING':\n",
+ "while app_status != \"RUNNING\":\n",
" time.sleep(5)\n",
- " response = kinesis_analytics.describe_application(\n",
- " ApplicationName=kinesis_data_analytics_app_name)\n",
- " app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- " print('Application status {}'.format(app_status))\n",
+ " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n",
+ " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ " print(\"Application status {}\".format(app_status))\n",
"\n",
- "print('Application status {}'.format(app_status))"
+ "print(\"Application status {}\".format(app_status))"
]
},
{
@@ -331,8 +327,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Logs'.format(region, lambda_fn_name_cloudwatch)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Logs'.format(\n",
+ " region, lambda_fn_name_cloudwatch\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -356,8 +358,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML(\"\"\"Review CloudWatch Metrics\"\"\".format(region, region)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " \"\"\"Review CloudWatch Metrics\"\"\".format(\n",
+ " region, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -381,8 +389,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -407,9 +421,7 @@
},
"outputs": [],
"source": [
- "firehose_response = firehose.describe_delivery_stream(\n",
- " DeliveryStreamName=firehose_name\n",
- ")\n",
+ "firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n",
"\n",
"print(json.dumps(firehose_response, indent=4, sort_keys=True, default=str))"
]
@@ -427,22 +439,13 @@
"for start_idx in range(0, 500, step):\n",
" end_idx = start_idx + step\n",
"\n",
- " df_star_rating_and_review_body = df[['review_id', \n",
- " 'product_category', \n",
- " 'review_body']][start_idx:end_idx]\n",
+ " df_star_rating_and_review_body = df[[\"review_id\", \"product_category\", \"review_body\"]][start_idx:end_idx]\n",
+ "\n",
+ " reviews_tsv = df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)\n",
"\n",
- " reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)\n",
- " \n",
" # print(reviews_tsv.encode('utf-8'))\n",
- " \n",
- " response = firehose.put_record( \n",
- " Record={\n",
- " 'Data': reviews_tsv.encode('utf-8')\n",
- " },\n",
- " DeliveryStreamName=firehose_name\n",
- " )"
+ "\n",
+ " response = firehose.put_record(Record={\"Data\": reviews_tsv.encode(\"utf-8\")}, DeliveryStreamName=firehose_name)"
]
},
{
@@ -452,8 +455,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -488,8 +497,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review S3 Source Records'.format(bucket, region)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Source Records'.format(\n",
+ " bucket, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -520,8 +535,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review S3 Transformed Records'.format(bucket, region)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Transformed Records'.format(\n",
+ " bucket, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -552,8 +573,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To UI Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To UI Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -612,8 +639,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -639,21 +672,21 @@
"for start_idx in range(0, 10000, anomaly_step):\n",
" timestamp = int(time.time())\n",
"\n",
- " df_anomalies = pd.DataFrame([\n",
- " {'review_id': str(timestamp), \n",
- " 'product_category': 'Digital_Software', \n",
- " 'review_body': 'This is an awful waste of time.'}, \n",
- " ], columns=['review_id', 'star_rating', 'product_category', 'review_body'])\n",
+ " df_anomalies = pd.DataFrame(\n",
+ " [\n",
+ " {\n",
+ " \"review_id\": str(timestamp),\n",
+ " \"product_category\": \"Digital_Software\",\n",
+ " \"review_body\": \"This is an awful waste of time.\",\n",
+ " },\n",
+ " ],\n",
+ " columns=[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"],\n",
+ " )\n",
"\n",
- " reviews_tsv_anomalies = df_anomalies.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)\n",
- " \n",
- " response = firehose.put_record( \n",
- " Record={\n",
- " 'Data': reviews_tsv_anomalies.encode('utf-8')\n",
- " },\n",
- " DeliveryStreamName=firehose_name\n",
+ " reviews_tsv_anomalies = df_anomalies.to_csv(sep=\"\\t\", header=None, index=False)\n",
+ "\n",
+ " response = firehose.put_record(\n",
+ " Record={\"Data\": reviews_tsv_anomalies.encode(\"utf-8\")}, DeliveryStreamName=firehose_name\n",
" )"
]
},
@@ -664,8 +697,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -699,7 +738,7 @@
"\n",
"# Shutting down your kernel for this notebook to release resources.
\n",
"# \n",
- " \n",
+ "\n",
"# "
]
},
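The cells reformatted above all follow one streaming pattern: slice the reviews DataFrame, serialize the slice as TSV, and put it on the Firehose delivery stream. A minimal standalone sketch of that loop, assuming the stream name and the DataFrame columns used in this chapter:

    # Sketch of the notebook's Firehose streaming loop; assumes `df` holds the
    # reviews dataset and the delivery stream from this chapter already exists.
    import boto3
    import pandas as pd

    firehose = boto3.Session().client(service_name="firehose")
    firehose_name = "dsoaws-kinesis-data-firehose"  # name used in this chapter

    def stream_reviews(df: pd.DataFrame, step: int = 1) -> None:
        for start_idx in range(0, len(df), step):
            batch = df[["review_id", "product_category", "review_body"]][start_idx : start_idx + step]
            reviews_tsv = batch.to_csv(sep="\t", header=None, index=False)
            firehose.put_record(
                Record={"Data": reviews_tsv.encode("utf-8")},
                DeliveryStreamName=firehose_name,
            )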
diff --git a/11_stream/archive/11_stream.orig/00_Overview.ipynb b/11_stream/archive/11_stream.orig/00_Overview.ipynb
index c009d721..59211e9c 100644
--- a/11_stream/archive/11_stream.orig/00_Overview.ipynb
+++ b/11_stream/archive/11_stream.orig/00_Overview.ipynb
@@ -88,7 +88,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
diff --git a/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb b/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb
index e0750488..4abc7b09 100644
--- a/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb
+++ b/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb
@@ -17,13 +17,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_kinesis_role_name = 'DSOAWS_Kinesis'"
+ "iam_kinesis_role_name = \"DSOAWS_Kinesis\""
]
},
{
@@ -58,31 +58,13 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesis.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"firehose.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesisanalytics.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " } \n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesis.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"firehose.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " ],\n",
+ "}"
]
},
{
@@ -100,18 +82,18 @@
" iam_role_kinesis = iam.create_role(\n",
" RoleName=iam_kinesis_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Kinesis Role'\n",
+ " Description=\"DSOAWS Kinesis Role\",\n",
" )\n",
- " print('Role succesfully created.')\n",
+ " print(\"Role succesfully created.\")\n",
" iam_kinesis_role_passed = True\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role_kinesis = iam.get_role(RoleName=iam_kinesis_role_name)\n",
- " print('Role already exists. That is OK.')\n",
+ " print(\"Role already exists. That is OK.\")\n",
" iam_kinesis_role_passed = True\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " \n",
+ " print(\"Unexpected error: %s\" % e)\n",
+ "\n",
"time.sleep(30)"
]
},
@@ -121,8 +103,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_kinesis_name = iam_role_kinesis['Role']['RoleName']\n",
- "print('Role Name: {}'.format(iam_role_kinesis_name))"
+ "iam_role_kinesis_name = iam_role_kinesis[\"Role\"][\"RoleName\"]\n",
+ "print(\"Role Name: {}\".format(iam_role_kinesis_name))"
]
},
{
@@ -131,8 +113,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_kinesis_arn = iam_role_kinesis['Role']['Arn']\n",
- "print('Role ARN: {}'.format(iam_role_kinesis_arn))"
+ "iam_role_kinesis_arn = iam_role_kinesis[\"Role\"][\"Arn\"]\n",
+ "print(\"Role ARN: {}\".format(iam_role_kinesis_arn))"
]
},
{
@@ -141,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
- "account_id = sts.get_caller_identity()['Account']"
+ "account_id = sts.get_caller_identity()[\"Account\"]"
]
},
{
@@ -157,7 +139,7 @@
"metadata": {},
"outputs": [],
"source": [
- "stream_name = 'dsoaws-kinesis-data-stream'"
+ "stream_name = \"dsoaws-kinesis-data-stream\""
]
},
{
@@ -173,7 +155,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehose_name = 'dsoaws-kinesis-data-firehose'"
+ "firehose_name = \"dsoaws-kinesis-data-firehose\""
]
},
{
@@ -189,7 +171,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lambda_fn_name = 'DeliverKinesisAnalyticsToCloudWatch'"
+ "lambda_fn_name = \"DeliverKinesisAnalyticsToCloudWatch\""
]
},
{
@@ -208,32 +190,27 @@
"outputs": [],
"source": [
"kinesis_policy_doc = {\n",
- " \n",
" \"Version\": \"2012-10-17\",\n",
" \"Statement\": [\n",
- " { \n",
- " \"Effect\": \"Allow\", \n",
+ " {\n",
+ " \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"s3:AbortMultipartUpload\",\n",
" \"s3:GetBucketLocation\",\n",
" \"s3:GetObject\",\n",
" \"s3:ListBucket\",\n",
" \"s3:ListBucketMultipartUploads\",\n",
- " \"s3:PutObject\"\n",
- " ], \n",
- " \"Resource\": [ \n",
+ " \"s3:PutObject\",\n",
+ " ],\n",
+ " \"Resource\": [\n",
" \"arn:aws:s3:::{}/kinesis-data-firehose\".format(bucket),\n",
- " \"arn:aws:s3:::{}/kinesis-data-firehose/*\".format(bucket)\n",
- " ] \n",
+ " \"arn:aws:s3:::{}/kinesis-data-firehose/*\".format(bucket),\n",
+ " ],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"logs:PutLogEvents\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)\n",
- " ]\n",
+ " \"Action\": [\"logs:PutLogEvents\"],\n",
+ " \"Resource\": [\"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
@@ -243,43 +220,34 @@
" \"kinesis:Put*\",\n",
" \"kinesis:List*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)\n",
- " ]\n",
+ " \"Resource\": [\"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"firehose:*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)\n",
- " ]\n",
+ " \"Resource\": [\"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)],\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": [\n",
" \"kinesisanalytics:*\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ]\n",
+ " \"Resource\": [\"*\"],\n",
" },\n",
" {\n",
" \"Sid\": \"UseLambdaFunction\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"lambda:InvokeFunction\",\n",
- " \"lambda:GetFunctionConfiguration\"\n",
- " ],\n",
- " \"Resource\": \"arn:aws:lambda:{}:{}:function:{}:$LATEST\".format(region, account_id, lambda_fn_name)\n",
+ " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n",
+ " \"Resource\": \"arn:aws:lambda:{}:{}:function:{}:$LATEST\".format(region, account_id, lambda_fn_name),\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": \"iam:PassRole\",\n",
- " \"Resource\": \"arn:aws:iam::*:role/service-role/kinesis-analytics*\"\n",
- " }\n",
- " ]\n",
+ " \"Resource\": \"arn:aws:iam::*:role/service-role/kinesis-analytics*\",\n",
+ " },\n",
+ " ],\n",
"}\n",
"\n",
"print(json.dumps(kinesis_policy_doc, indent=4, sort_keys=True, default=str))"
@@ -301,9 +269,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=iam_role_kinesis_name,\n",
- " PolicyName='DSOAWS_KinesisPolicy',\n",
- " PolicyDocument=json.dumps(kinesis_policy_doc)\n",
+ " RoleName=iam_role_kinesis_name, PolicyName=\"DSOAWS_KinesisPolicy\", PolicyDocument=json.dumps(kinesis_policy_doc)\n",
")\n",
"\n",
"time.sleep(30)"
@@ -331,7 +297,7 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_lambda_role_name = 'DSOAWS_Lambda'"
+ "iam_lambda_role_name = \"DSOAWS_Lambda\""
]
},
{
@@ -352,21 +318,9 @@
"assume_role_policy_doc = {\n",
" \"Version\": \"2012-10-17\",\n",
" \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"lambda.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"kinesisanalytics.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"lambda.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n",
+ " ],\n",
"}"
]
},
@@ -384,18 +338,18 @@
" iam_role_lambda = iam.create_role(\n",
" RoleName=iam_lambda_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Lambda Role'\n",
+ " Description=\"DSOAWS Lambda Role\",\n",
" )\n",
- " print('Role succesfully created.')\n",
+ " print(\"Role succesfully created.\")\n",
" iam_lambda_role_passed = True\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role_lambda = iam.get_role(RoleName=iam_lambda_role_name)\n",
- " print('Role already exists. This is OK.')\n",
+ " print(\"Role already exists. This is OK.\")\n",
" iam_lambda_role_passed = True\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " \n",
+ " print(\"Unexpected error: %s\" % e)\n",
+ "\n",
"time.sleep(30)"
]
},
@@ -405,8 +359,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_lambda_name = iam_role_lambda['Role']['RoleName']\n",
- "print('Role Name: {}'.format(iam_role_lambda_name))"
+ "iam_role_lambda_name = iam_role_lambda[\"Role\"][\"RoleName\"]\n",
+ "print(\"Role Name: {}\".format(iam_role_lambda_name))"
]
},
{
@@ -415,8 +369,8 @@
"metadata": {},
"outputs": [],
"source": [
- "iam_role_lambda_arn = iam_role_lambda['Role']['Arn']\n",
- "print('Role ARN: {}'.format(iam_role_lambda_arn))"
+ "iam_role_lambda_arn = iam_role_lambda[\"Role\"][\"Arn\"]\n",
+ "print(\"Role ARN: {}\".format(iam_role_lambda_arn))"
]
},
{
@@ -438,31 +392,21 @@
" {\n",
" \"Sid\": \"UseLambdaFunction\",\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"lambda:InvokeFunction\",\n",
- " \"lambda:GetFunctionConfiguration\"\n",
- " ],\n",
- " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id)\n",
- " },\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Action\": \"cloudwatch:*\",\n",
- " \"Resource\": \"*\"\n",
+ " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n",
+ " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id),\n",
" },\n",
+ " {\"Effect\": \"Allow\", \"Action\": \"cloudwatch:*\", \"Resource\": \"*\"},\n",
" {\n",
" \"Effect\": \"Allow\",\n",
" \"Action\": \"logs:CreateLogGroup\",\n",
- " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id)\n",
+ " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id),\n",
" },\n",
" {\n",
" \"Effect\": \"Allow\",\n",
- " \"Action\": [\n",
- " \"logs:CreateLogStream\",\n",
- " \"logs:PutLogEvents\"\n",
- " ],\n",
- " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id)\n",
- " }\n",
- " ]\n",
+ " \"Action\": [\"logs:CreateLogStream\", \"logs:PutLogEvents\"],\n",
+ " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id),\n",
+ " },\n",
+ " ],\n",
"}"
]
},
@@ -484,9 +428,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=iam_role_lambda_name,\n",
- " PolicyName='DSOAWS_LambdaPolicy',\n",
- " PolicyDocument=json.dumps(lambda_policy_doc)\n",
+ " RoleName=iam_role_lambda_name, PolicyName=\"DSOAWS_LambdaPolicy\", PolicyDocument=json.dumps(lambda_policy_doc)\n",
")\n",
"\n",
"time.sleep(30)"
@@ -605,7 +547,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
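The try/except dance around create_role recurs throughout this notebook; the intent is idempotency, so re-running a cell neither fails nor duplicates the role. A hedged sketch of the pattern, with the role description and trust policy taken from the cells above:

    import json
    import boto3
    from botocore.exceptions import ClientError

    iam = boto3.Session().client(service_name="iam")

    assume_role_policy_doc = {
        "Version": "2012-10-17",
        "Statement": [
            {"Effect": "Allow", "Principal": {"Service": "kinesis.amazonaws.com"}, "Action": "sts:AssumeRole"},
        ],
    }

    def ensure_role(role_name: str) -> dict:
        try:
            return iam.create_role(
                RoleName=role_name,
                AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
                Description="DSOAWS Kinesis Role",
            )
        except ClientError as e:
            if e.response["Error"]["Code"] == "EntityAlreadyExists":
                return iam.get_role(RoleName=role_name)  # safe to re-run
            raise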
diff --git a/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb b/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb
index 5892eb1d..5f281451 100644
--- a/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb
+++ b/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb
@@ -27,13 +27,13 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)"
]
},
{
@@ -54,9 +54,9 @@
"try:\n",
" firehose_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -93,9 +93,9 @@
"try:\n",
" iam_kinesis_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -125,9 +125,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -157,9 +157,9 @@
"try:\n",
" iam_kinesis_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -178,11 +178,11 @@
"outputs": [],
"source": [
"if not iam_kinesis_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -200,24 +200,23 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = firehose.create_delivery_stream(\n",
" DeliveryStreamName=firehose_name,\n",
- " DeliveryStreamType='DirectPut',\n",
+ " DeliveryStreamType=\"DirectPut\",\n",
" S3DestinationConfiguration={\n",
- " 'RoleARN': iam_role_kinesis_arn,\n",
- " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n",
- " 'Prefix': 'kinesis-data-firehose', \n",
- " }\n",
+ " \"RoleARN\": iam_role_kinesis_arn,\n",
+ " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n",
+ " \"Prefix\": \"kinesis-data-firehose\",\n",
+ " },\n",
" )\n",
- " print('Delivery stream {} successfully created.'.format(firehose_name))\n",
+ " print(\"Delivery stream {} successfully created.\".format(firehose_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Delivery stream {} already exists.'.format(firehose_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Delivery stream {} already exists.\".format(firehose_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " "
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -228,14 +227,14 @@
"source": [
"import time\n",
"\n",
- "status = ''\n",
- "while status != 'ACTIVE': \n",
+ "status = \"\"\n",
+ "while status != \"ACTIVE\":\n",
" r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n",
- " description = r.get('DeliveryStreamDescription')\n",
- " status = description.get('DeliveryStreamStatus')\n",
+ " description = r.get(\"DeliveryStreamDescription\")\n",
+ " status = description.get(\"DeliveryStreamStatus\")\n",
" time.sleep(5)\n",
- " \n",
- "print('Delivery Stream {} is active'.format(firehose_name))"
+ "\n",
+ "print(\"Delivery Stream {} is active\".format(firehose_name))"
]
},
{
@@ -253,12 +252,12 @@
"source": [
"r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n",
"\n",
- "status = description.get('DeliveryStreamStatus')\n",
+ "status = description.get(\"DeliveryStreamStatus\")\n",
"print(status)\n",
"\n",
"print()\n",
"\n",
- "description = r.get('DeliveryStreamDescription')\n",
+ "description = r.get(\"DeliveryStreamDescription\")\n",
"print(json.dumps(description, indent=4, sort_keys=True, default=str))"
]
},
@@ -268,7 +267,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehose_arn = r['DeliveryStreamDescription']['DeliveryStreamARN']\n",
+ "firehose_arn = r[\"DeliveryStreamDescription\"][\"DeliveryStreamARN\"]\n",
"print(firehose_arn)"
]
},
@@ -295,8 +294,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Firehose'.format(region, firehose_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Firehose'.format(\n",
+ " region, firehose_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -322,7 +327,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
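The notebook creates the delivery stream, swallows ResourceInUseException on re-runs, and then polls describe_delivery_stream until the status is ACTIVE. Condensed into one helper (the bucket and Kinesis role ARN are assumed to exist already):

    import time
    import boto3
    from botocore.exceptions import ClientError

    firehose = boto3.Session().client(service_name="firehose")

    def ensure_delivery_stream(name: str, role_arn: str, bucket: str) -> None:
        try:
            firehose.create_delivery_stream(
                DeliveryStreamName=name,
                DeliveryStreamType="DirectPut",
                S3DestinationConfiguration={
                    "RoleARN": role_arn,
                    "BucketARN": "arn:aws:s3:::{}".format(bucket),
                    "Prefix": "kinesis-data-firehose",
                },
            )
        except ClientError as e:
            if e.response["Error"]["Code"] != "ResourceInUseException":
                raise
        # Poll until the stream reports ACTIVE, as the notebook does.
        status = ""
        while status != "ACTIVE":
            r = firehose.describe_delivery_stream(DeliveryStreamName=name)
            status = r["DeliveryStreamDescription"]["DeliveryStreamStatus"]
            time.sleep(5)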
diff --git a/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb b/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb
index d9ba7992..db2bba69 100644
--- a/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb
+++ b/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb
@@ -26,14 +26,14 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "kinesis = boto3.Session().client(service_name='kinesis', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "kinesis = boto3.Session().client(service_name=\"kinesis\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)"
]
},
{
@@ -61,9 +61,9 @@
"try:\n",
" stream_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -92,19 +92,16 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
- " response = kinesis.create_stream(\n",
- " StreamName=stream_name, \n",
- " ShardCount=shard_count\n",
- " )\n",
- " print('Data Stream {} successfully created.'.format(stream_name))\n",
+ "try:\n",
+ " response = kinesis.create_stream(StreamName=stream_name, ShardCount=shard_count)\n",
+ " print(\"Data Stream {} successfully created.\".format(stream_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
- " \n",
+ "\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Data Stream {} already exists.'.format(stream_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Data Stream {} already exists.\".format(stream_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)"
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -115,14 +112,14 @@
"source": [
"import time\n",
"\n",
- "status = ''\n",
- "while status != 'ACTIVE': \n",
+ "status = \"\"\n",
+ "while status != \"ACTIVE\":\n",
" r = kinesis.describe_stream(StreamName=stream_name)\n",
- " description = r.get('StreamDescription')\n",
- " status = description.get('StreamStatus')\n",
+ " description = r.get(\"StreamDescription\")\n",
+ " status = description.get(\"StreamStatus\")\n",
" time.sleep(5)\n",
- " \n",
- "print('Stream {} is active'.format(stream_name))"
+ "\n",
+ "print(\"Stream {} is active\".format(stream_name))"
]
},
{
@@ -138,9 +135,7 @@
"metadata": {},
"outputs": [],
"source": [
- "stream_response = kinesis.describe_stream(\n",
- " StreamName=stream_name\n",
- ")\n",
+ "stream_response = kinesis.describe_stream(StreamName=stream_name)\n",
"\n",
"print(json.dumps(stream_response, indent=4, sort_keys=True, default=str))"
]
@@ -153,7 +148,7 @@
},
"outputs": [],
"source": [
- "stream_arn = stream_response['StreamDescription']['StreamARN']\n",
+ "stream_arn = stream_response[\"StreamDescription\"][\"StreamARN\"]\n",
"print(stream_arn)"
]
},
@@ -180,8 +175,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Stream'.format(region, stream_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Stream'.format(\n",
+ " region, stream_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -207,7 +208,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
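The same create-then-poll shape appears for the data stream. boto3 also ships a stream_exists waiter that can stand in for the manual sleep loop; a sketch under that assumption:

    import boto3
    from botocore.exceptions import ClientError

    kinesis = boto3.Session().client(service_name="kinesis")

    def ensure_stream(stream_name: str, shard_count: int = 1) -> str:
        try:
            kinesis.create_stream(StreamName=stream_name, ShardCount=shard_count)
        except ClientError as e:
            if e.response["Error"]["Code"] != "ResourceInUseException":
                raise
        # The built-in waiter replaces the notebook's describe/sleep loop.
        kinesis.get_waiter("stream_exists").wait(StreamName=stream_name)
        return kinesis.describe_stream(StreamName=stream_name)["StreamDescription"]["StreamARN"]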
diff --git a/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb b/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb
index 6e99c2c7..7a6faf3f 100644
--- a/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb
+++ b/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb
@@ -38,16 +38,16 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "account_id = sts.get_caller_identity()['Account']\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"\n",
- "lam = boto3.Session().client(service_name='lambda', region_name=region)"
+ "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)"
]
},
{
@@ -75,9 +75,9 @@
"try:\n",
" lambda_fn_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -114,9 +114,9 @@
"try:\n",
" iam_lambda_role_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -146,9 +146,9 @@
"try:\n",
" iam_lambda_role_passed\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -167,11 +167,11 @@
"outputs": [],
"source": [
"if not iam_lambda_role_passed:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
"else:\n",
- " print('[OK]')"
+ " print(\"[OK]\")"
]
},
{
@@ -192,9 +192,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -253,7 +253,7 @@
"metadata": {},
"outputs": [],
"source": [
- "with open('src/DeliverKinesisAnalyticsToCloudWatch.zip', 'rb') as f: \n",
+ "with open(\"src/DeliverKinesisAnalyticsToCloudWatch.zip\", \"rb\") as f:\n",
" code = f.read()"
]
},
@@ -272,27 +272,25 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = lam.create_function(\n",
- " FunctionName='{}'.format(lambda_fn_name),\n",
- " Runtime='python2.7',\n",
- " Role='{}'.format(iam_role_lambda_arn),\n",
- " Handler='src/lambda_function.lambda_handler',\n",
- " Code={\n",
- " 'ZipFile': code\n",
- " },\n",
- " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n",
+ " FunctionName=\"{}\".format(lambda_fn_name),\n",
+ " Runtime=\"python2.7\",\n",
+ " Role=\"{}\".format(iam_role_lambda_arn),\n",
+ " Handler=\"src/lambda_function.lambda_handler\",\n",
+ " Code={\"ZipFile\": code},\n",
+ " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n",
" Timeout=60,\n",
" MemorySize=128,\n",
- " Publish=True\n",
+ " Publish=True,\n",
" )\n",
- " print('Lambda Function {} successfully created.'.format(lambda_fn_name))\n",
+ " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name))\n",
"\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceConflictException':\n",
- " print('Lambda Function {} already exists. This is OK.'.format(lambda_fn_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n",
+ " print(\"Lambda Function {} already exists. This is OK.\".format(lambda_fn_name))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -313,7 +311,7 @@
"source": [
"response = lam.get_function(FunctionName=lambda_fn_name)\n",
"\n",
- "lambda_fn_arn = response['Configuration']['FunctionArn']\n",
+ "lambda_fn_arn = response[\"Configuration\"][\"FunctionArn\"]\n",
"print(lambda_fn_arn)"
]
},
@@ -342,8 +340,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Function'.format(region, lambda_fn_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Function'.format(\n",
+ " region, lambda_fn_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -369,7 +373,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
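Creating the Lambda destination follows the pattern once more: read the deployment zip, call create_function, and treat ResourceConflictException as "already exists". A sketch; the handler path and runtime here are assumptions (the notebook itself pins the long-deprecated python2.7 runtime):

    import boto3
    from botocore.exceptions import ClientError

    lam = boto3.Session().client(service_name="lambda")

    def ensure_function(fn_name: str, role_arn: str, zip_path: str) -> None:
        with open(zip_path, "rb") as f:
            code = f.read()
        try:
            lam.create_function(
                FunctionName=fn_name,
                Runtime="python3.9",  # assumed; the notebook uses python2.7
                Role=role_arn,
                Handler="lambda_function.lambda_handler",  # assumed module layout
                Code={"ZipFile": code},
                Timeout=60,
                MemorySize=128,
                Publish=True,
            )
        except ClientError as e:
            if e.response["Error"]["Code"] != "ResourceConflictException":
                raise  # an existing function is fine; anything else is not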
diff --git a/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb b/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb
index 7ddc0be0..6c7aae36 100644
--- a/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb
+++ b/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb
@@ -27,17 +27,17 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sts = boto3.Session().client(service_name='sts', region_name=region)\n",
- "account_id = sts.get_caller_identity()['Account']\n",
+ "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n",
+ "account_id = sts.get_caller_identity()[\"Account\"]\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n",
- "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n",
+ "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)"
]
},
{
@@ -58,9 +58,9 @@
"try:\n",
" firehose_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -90,9 +90,9 @@
"try:\n",
" firehose_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -122,9 +122,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -154,9 +154,9 @@
"try:\n",
" stream_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -186,9 +186,9 @@
"try:\n",
" lambda_fn_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -218,9 +218,9 @@
"try:\n",
" iam_role_lambda_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -252,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
- "kinesis_data_analytics_app_name = 'dsoaws-kinesis-data-analytics-sql-app'"
+ "kinesis_data_analytics_app_name = \"dsoaws-kinesis-data-analytics-sql-app\""
]
},
{
@@ -261,7 +261,7 @@
"metadata": {},
"outputs": [],
"source": [
- "in_app_stream_name = 'firehose_001' # Default"
+ "in_app_stream_name = \"firehose_001\" # Default"
]
},
{
@@ -286,7 +286,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sql_code = ''' \\\n",
+ "sql_code = \"\"\" \\\n",
" CREATE OR REPLACE STREAM \"AVG_STAR_RATING_SQL_STREAM\" ( \\\n",
" avg_star_rating DOUBLE); \\\n",
" CREATE OR REPLACE PUMP \"AVG_STAR_RATING_STREAM_PUMP\" AS \\\n",
@@ -317,12 +317,9 @@
" {} \\\n",
" ) \\\n",
" ); \\\n",
- " '''.format(in_app_stream_name, \n",
- " in_app_stream_name, \n",
- " window_seconds,\n",
- " in_app_stream_name, \n",
- " in_app_stream_name, \n",
- " window_seconds)\n",
+ " \"\"\".format(\n",
+ " in_app_stream_name, in_app_stream_name, window_seconds, in_app_stream_name, in_app_stream_name, window_seconds\n",
+ ")\n",
"\n",
"print(sql_code)"
]
@@ -335,83 +332,59 @@
"source": [
"from botocore.exceptions import ClientError\n",
"\n",
- "try: \n",
+ "try:\n",
" response = kinesis_analytics.create_application(\n",
" ApplicationName=kinesis_data_analytics_app_name,\n",
" Inputs=[\n",
" {\n",
- " 'NamePrefix': 'firehose',\n",
- " 'KinesisFirehoseInput': {\n",
- " 'ResourceARN': '{}'.format(firehose_arn),\n",
- " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n",
+ " \"NamePrefix\": \"firehose\",\n",
+ " \"KinesisFirehoseInput\": {\n",
+ " \"ResourceARN\": \"{}\".format(firehose_arn),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n",
" },\n",
- " 'InputSchema': {\n",
- " 'RecordFormat': {\n",
- " 'RecordFormatType': 'CSV',\n",
- " 'MappingParameters': {\n",
- " 'CSVMappingParameters': {\n",
- " 'RecordRowDelimiter': '\\n',\n",
- " 'RecordColumnDelimiter': '\\t'\n",
- " }\n",
- " }\n",
- " },\n",
- " 'RecordColumns': [\n",
- " {\n",
- " 'Name': 'review_id',\n",
- " 'Mapping': 'review_id',\n",
- " 'SqlType': 'VARCHAR(14)'\n",
- " }, \n",
- " {\n",
- " 'Name': 'star_rating',\n",
- " 'Mapping': 'star_rating',\n",
- " 'SqlType': 'INTEGER'\n",
+ " \"InputSchema\": {\n",
+ " \"RecordFormat\": {\n",
+ " \"RecordFormatType\": \"CSV\",\n",
+ " \"MappingParameters\": {\n",
+ " \"CSVMappingParameters\": {\"RecordRowDelimiter\": \"\\n\", \"RecordColumnDelimiter\": \"\\t\"}\n",
" },\n",
- " {\n",
- " 'Name': 'product_category',\n",
- " 'Mapping': 'product_category',\n",
- " 'SqlType': 'VARCHAR(24)'\n",
- " }, \n",
- " {\n",
- " 'Name': 'review_body',\n",
- " 'Mapping': 'review_body',\n",
- " 'SqlType': 'VARCHAR(65535)'\n",
- " } \n",
- " ]\n",
- " }\n",
+ " },\n",
+ " \"RecordColumns\": [\n",
+ " {\"Name\": \"review_id\", \"Mapping\": \"review_id\", \"SqlType\": \"VARCHAR(14)\"},\n",
+ " {\"Name\": \"star_rating\", \"Mapping\": \"star_rating\", \"SqlType\": \"INTEGER\"},\n",
+ " {\"Name\": \"product_category\", \"Mapping\": \"product_category\", \"SqlType\": \"VARCHAR(24)\"},\n",
+ " {\"Name\": \"review_body\", \"Mapping\": \"review_body\", \"SqlType\": \"VARCHAR(65535)\"},\n",
+ " ],\n",
+ " },\n",
" },\n",
" ],\n",
" Outputs=[\n",
" {\n",
- " 'Name': 'AVG_STAR_RATING_STREAM', \n",
- " 'KinesisStreamsOutput': {\n",
- " 'ResourceARN': '{}'.format(stream_arn),\n",
- " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n",
+ " \"Name\": \"AVG_STAR_RATING_STREAM\",\n",
+ " \"KinesisStreamsOutput\": {\n",
+ " \"ResourceARN\": \"{}\".format(stream_arn),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n",
" },\n",
- " 'DestinationSchema': {\n",
- " 'RecordFormatType': 'CSV'\n",
- " }\n",
+ " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n",
" },\n",
" {\n",
- " 'Name': 'AVG_STAR_RATING_SQL_STREAM',\n",
- " 'LambdaOutput': {\n",
- " 'ResourceARN': '{}'.format(lambda_fn_arn),\n",
- " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n",
+ " \"Name\": \"AVG_STAR_RATING_SQL_STREAM\",\n",
+ " \"LambdaOutput\": {\n",
+ " \"ResourceARN\": \"{}\".format(lambda_fn_arn),\n",
+ " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n",
" },\n",
- " 'DestinationSchema': {\n",
- " 'RecordFormatType': 'CSV'\n",
- " }\n",
- " }\n",
+ " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n",
+ " },\n",
" ],\n",
- " ApplicationCode=sql_code\n",
+ " ApplicationCode=sql_code,\n",
" )\n",
- " print('SQL application {} successfully created.'.format(kinesis_data_analytics_app_name))\n",
+ " print(\"SQL application {} successfully created.\".format(kinesis_data_analytics_app_name))\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('SQL App {} already exists.'.format(kinesis_data_analytics_app_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"SQL App {} already exists.\".format(kinesis_data_analytics_app_name))\n",
" else:\n",
- " print('Unexpected error: %s' % e)\n",
- " "
+ " print(\"Unexpected error: %s\" % e)"
]
},
{
@@ -432,7 +405,7 @@
"metadata": {},
"outputs": [],
"source": [
- "input_id = response['ApplicationDetail']['InputDescriptions'][0]['InputId']\n",
+ "input_id = response[\"ApplicationDetail\"][\"InputDescriptions\"][0][\"InputId\"]\n",
"print(input_id)"
]
},
@@ -449,24 +422,17 @@
"metadata": {},
"outputs": [],
"source": [
- "try: \n",
+ "try:\n",
" response = kinesis_analytics.start_application(\n",
" ApplicationName=kinesis_data_analytics_app_name,\n",
- " InputConfigurations=[\n",
- " {\n",
- " 'Id': input_id,\n",
- " 'InputStartingPositionConfiguration': {\n",
- " 'InputStartingPosition': 'NOW'\n",
- " }\n",
- " }\n",
- " ]\n",
+ " InputConfigurations=[{\"Id\": input_id, \"InputStartingPositionConfiguration\": {\"InputStartingPosition\": \"NOW\"}}],\n",
" )\n",
" print(json.dumps(response, indent=4, sort_keys=True, default=str))\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceInUseException':\n",
- " print('Application {} is already starting.'.format(kinesis_data_analytics_app_name))\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n",
+ " print(\"Application {} is already starting.\".format(kinesis_data_analytics_app_name))\n",
" else:\n",
- " print('Error: {}'.format(e))"
+ " print(\"Error: {}\".format(e))"
]
},
{
@@ -492,8 +458,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -515,17 +487,16 @@
"\n",
"import time\n",
"\n",
- "app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- "print('Application status {}'.format(app_status))\n",
+ "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ "print(\"Application status {}\".format(app_status))\n",
"\n",
- "while app_status != 'RUNNING':\n",
+ "while app_status != \"RUNNING\":\n",
" time.sleep(5)\n",
- " response = kinesis_analytics.describe_application(\n",
- " ApplicationName=kinesis_data_analytics_app_name)\n",
- " app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- " print('Application status {}'.format(app_status))\n",
+ " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n",
+ " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ " print(\"Application status {}\".format(app_status))\n",
"\n",
- "print('Application status {}'.format(app_status))"
+ "print(\"Application status {}\".format(app_status))"
]
},
{
@@ -560,7 +531,7 @@
"outputs": [],
"source": [
"%%javascript\n",
- "Jupyter.notebook.save_checkpoint();\n",
+ "Jupyter.notebook.save_checkpoint()\n",
"Jupyter.notebook.session.delete();"
]
}
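Starting the Kinesis Data Analytics application needs the InputId from describe_application, and the app only begins processing once its status reaches RUNNING. Roughly (start_application also raises ResourceInUseException if the app is already starting, which the notebook tolerates):

    import time
    import boto3

    kinesis_analytics = boto3.Session().client(service_name="kinesisanalytics")

    def start_and_wait(app_name: str) -> None:
        desc = kinesis_analytics.describe_application(ApplicationName=app_name)
        input_id = desc["ApplicationDetail"]["InputDescriptions"][0]["InputId"]
        kinesis_analytics.start_application(
            ApplicationName=app_name,
            InputConfigurations=[
                {"Id": input_id, "InputStartingPositionConfiguration": {"InputStartingPosition": "NOW"}}
            ],
        )
        status = ""
        while status != "RUNNING":
            time.sleep(5)
            desc = kinesis_analytics.describe_application(ApplicationName=app_name)
            status = desc["ApplicationDetail"]["ApplicationStatus"]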
diff --git a/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb b/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
index d2196329..afb6adf5 100644
--- a/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
+++ b/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb
@@ -25,14 +25,14 @@
"import pandas as pd\n",
"import json\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n",
- "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)\n"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n",
+ "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)"
]
},
{
@@ -53,9 +53,9 @@
"try:\n",
" firehose_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -85,9 +85,9 @@
"try:\n",
" firehose_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -117,9 +117,9 @@
"try:\n",
" iam_role_kinesis_arn\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -149,9 +149,9 @@
"try:\n",
" kinesis_data_analytics_app_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -181,9 +181,9 @@
"try:\n",
" lambda_fn_name\n",
"except NameError:\n",
- " print('+++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in this section before you continue.')\n",
- " print('+++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++\")"
]
},
{
@@ -201,7 +201,7 @@
"metadata": {},
"outputs": [],
"source": [
- "firehoses = firehose.list_delivery_streams(DeliveryStreamType='DirectPut')\n",
+ "firehoses = firehose.list_delivery_streams(DeliveryStreamType=\"DirectPut\")\n",
"\n",
"print(json.dumps(firehoses, indent=4, sort_keys=True, default=str))"
]
@@ -231,10 +231,12 @@
"import csv\n",
"import pandas as pd\n",
"\n",
- "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n",
- " delimiter='\\t', \n",
- " quoting=csv.QUOTE_NONE,\n",
- " compression='gzip')\n",
+ "df = pd.read_csv(\n",
+ " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n",
+ " delimiter=\"\\t\",\n",
+ " quoting=csv.QUOTE_NONE,\n",
+ " compression=\"gzip\",\n",
+ ")\n",
"df.shape"
]
},
@@ -255,14 +257,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_star_rating_and_review_body = df[['review_id', \n",
- " 'star_rating', \n",
- " 'product_category', \n",
- " 'review_body']][0:1]\n",
+ "df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][0:1]\n",
"\n",
- "df_star_rating_and_review_body.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)"
+ "df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)"
]
},
{
@@ -291,16 +288,15 @@
"\n",
"import time\n",
"\n",
- "app_status = response['ApplicationDetail']['ApplicationStatus']\n",
+ "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
"\n",
- "while app_status != 'RUNNING':\n",
+ "while app_status != \"RUNNING\":\n",
" time.sleep(5)\n",
- " response = kinesis_analytics.describe_application(\n",
- " ApplicationName=kinesis_data_analytics_app_name)\n",
- " app_status = response['ApplicationDetail']['ApplicationStatus']\n",
- " print('Application status {}'.format(app_status))\n",
+ " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n",
+ " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n",
+ " print(\"Application status {}\".format(app_status))\n",
"\n",
- "print('Application status {}'.format(app_status))"
+ "print(\"Application status {}\".format(app_status))"
]
},
{
@@ -331,8 +327,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Lambda Logs'.format(region, lambda_fn_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Lambda Logs'.format(\n",
+ " region, lambda_fn_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -356,8 +358,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML(\"\"\"Review CloudWatch Metrics\"\"\".format(region, region)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " \"\"\"Review CloudWatch Metrics\"\"\".format(\n",
+ " region, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -381,8 +389,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -407,9 +421,7 @@
},
"outputs": [],
"source": [
- "firehose_response = firehose.describe_delivery_stream(\n",
- " DeliveryStreamName=firehose_name\n",
- ")\n",
+ "firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n",
"\n",
"print(json.dumps(firehose_response, indent=4, sort_keys=True, default=str))"
]
@@ -424,23 +436,15 @@
"for start_idx in range(0, 10000, step):\n",
" end_idx = start_idx + step\n",
"\n",
- " df_star_rating_and_review_body = df[['review_id', \n",
- " 'star_rating', \n",
- " 'product_category', \n",
- " 'review_body']][start_idx:end_idx]\n",
+ " df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][\n",
+ " start_idx:end_idx\n",
+ " ]\n",
+ "\n",
+ " reviews_tsv = df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)\n",
"\n",
- " reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)\n",
- " \n",
" # print(reviews_tsv.encode('utf-8'))\n",
- " \n",
- " response = firehose.put_record( \n",
- " Record={\n",
- " 'Data': reviews_tsv.encode('utf-8')\n",
- " },\n",
- " DeliveryStreamName=firehose_name\n",
- " )"
+ "\n",
+ " response = firehose.put_record(Record={\"Data\": reviews_tsv.encode(\"utf-8\")}, DeliveryStreamName=firehose_name)"
]
},
{
@@ -450,8 +454,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Review Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -486,8 +496,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To UI Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To UI Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -546,8 +562,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -570,22 +592,22 @@
"for start_idx in range(0, 10000, anomaly_step):\n",
" timestamp = int(time.time())\n",
"\n",
- " df_anomalies = pd.DataFrame([\n",
- " {'review_id': str(timestamp), \n",
- " 'star_rating': 100, \n",
- " 'product_category': 'Digital_Software', \n",
- " 'review_body': 'blahblah'}, \n",
- " ], columns=['review_id', 'star_rating', 'product_category', 'review_body'])\n",
+ " df_anomalies = pd.DataFrame(\n",
+ " [\n",
+ " {\n",
+ " \"review_id\": str(timestamp),\n",
+ " \"star_rating\": 100,\n",
+ " \"product_category\": \"Digital_Software\",\n",
+ " \"review_body\": \"blahblah\",\n",
+ " },\n",
+ " ],\n",
+ " columns=[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"],\n",
+ " )\n",
+ "\n",
+ " reviews_tsv_anomalies = df_anomalies.to_csv(sep=\"\\t\", header=None, index=False)\n",
"\n",
- " reviews_tsv_anomalies = df_anomalies.to_csv(sep='\\t',\n",
- " header=None,\n",
- " index=False)\n",
- " \n",
- " response = firehose.put_record( \n",
- " Record={\n",
- " 'Data': reviews_tsv_anomalies.encode('utf-8')\n",
- " },\n",
- " DeliveryStreamName=firehose_name\n",
+ " response = firehose.put_record(\n",
+ " Record={\"Data\": reviews_tsv_anomalies.encode(\"utf-8\")}, DeliveryStreamName=firehose_name\n",
" )"
]
},
@@ -596,8 +618,14 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
- " \n",
- "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n"
+ "\n",
+ "display(\n",
+ " HTML(\n",
+ " 'Go To Kinesis Data Analytics App'.format(\n",
+ " region, kinesis_data_analytics_app_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -621,8 +649,8 @@
"outputs": [],
"source": [
"#%%javascript\n",
- "#Jupyter.notebook.save_checkpoint();\n",
- "#Jupyter.notebook.session.delete();"
+ "# Jupyter.notebook.save_checkpoint();\n",
+ "# Jupyter.notebook.session.delete();"
]
}
],
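The anomaly cells above push a deliberately out-of-range record (a star_rating of 100 against a 1-5 scale) so the downstream analytics stream has something to flag. As a one-record helper, assuming the delivery stream from this chapter:

    import time
    import boto3
    import pandas as pd

    firehose = boto3.Session().client(service_name="firehose")

    def put_anomaly(firehose_name: str) -> None:
        # star_rating=100 is far outside the 1-5 range of real reviews.
        record = {
            "review_id": str(int(time.time())),
            "star_rating": 100,
            "product_category": "Digital_Software",
            "review_body": "blahblah",
        }
        df = pd.DataFrame([record], columns=["review_id", "star_rating", "product_category", "review_body"])
        firehose.put_record(
            Record={"Data": df.to_csv(sep="\t", header=None, index=False).encode("utf-8")},
            DeliveryStreamName=firehose_name,
        )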
diff --git a/11_stream/archive/11_stream.orig/src/lambda_function.py b/11_stream/archive/11_stream.orig/src/lambda_function.py
index 889896bf..a867f165 100644
--- a/11_stream/archive/11_stream.orig/src/lambda_function.py
+++ b/11_stream/archive/11_stream.orig/src/lambda_function.py
@@ -10,51 +10,51 @@
logger = logging.getLogger()
logger.setLevel(logging.INFO)
-client = boto3.client('cloudwatch')
+client = boto3.client("cloudwatch")
+
def lambda_handler(event, context):
output = []
success = 0
failure = 0
- for record in event['records']:
+ for record in event["records"]:
try:
- #logger.info(f'event: {event}')
- payload = base64.b64decode(record['data'])
+ # logger.info(f'event: {event}')
+ payload = base64.b64decode(record["data"])
datapoint = float(payload)
# logger.info(f'avg_star_rating: {payload}')
client.put_metric_data(
- Namespace='kinesis/analytics/AVGStarRating',
+ Namespace="kinesis/analytics/AVGStarRating",
MetricData=[
{
- 'MetricName': 'AVGStarRating',
- 'Dimensions': [
- {
- 'Name': 'Product Category',
- 'Value': 'All'
- },
+ "MetricName": "AVGStarRating",
+ "Dimensions": [
+ {"Name": "Product Category", "Value": "All"},
],
- 'Value': datapoint,
- 'StorageResolution': 1
+ "Value": datapoint,
+ "StorageResolution": 1,
}
- ]
+ ],
)
- output.append({'recordId': record['recordId'], 'result': 'Ok'})
+ output.append({"recordId": record["recordId"], "result": "Ok"})
success += 1
print(datapoint)
-
+
except Exception as exp:
- output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'})
+ output.append({"recordId": record["recordId"], "result": "DeliveryFailed"})
failure += 1
exception_type, exception_value, exception_traceback = sys.exc_info()
traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback)
- err_msg = json.dumps({
- "errorType": exception_type.__name__,
- "errorMessage": str(exception_value),
- "stackTrace": traceback_string
- })
+ err_msg = json.dumps(
+ {
+ "errorType": exception_type.__name__,
+ "errorMessage": str(exception_value),
+ "stackTrace": traceback_string,
+ }
+ )
logger.error(err_msg)
- print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure))
- return {'records': output}
\ No newline at end of file
+ print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure))
+ return {"records": output}
diff --git a/11_stream/src/deliver_metrics_to_cloudwatch.py b/11_stream/src/deliver_metrics_to_cloudwatch.py
index 889896bf..a867f165 100644
--- a/11_stream/src/deliver_metrics_to_cloudwatch.py
+++ b/11_stream/src/deliver_metrics_to_cloudwatch.py
@@ -10,51 +10,51 @@
logger = logging.getLogger()
logger.setLevel(logging.INFO)
-client = boto3.client('cloudwatch')
+client = boto3.client("cloudwatch")
+
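+# For each incoming Firehose record, decode the average star rating and publish it to CloudWatch as a custom metric.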
def lambda_handler(event, context):
output = []
success = 0
failure = 0
- for record in event['records']:
+ for record in event["records"]:
try:
- #logger.info(f'event: {event}')
- payload = base64.b64decode(record['data'])
+ # logger.info(f'event: {event}')
+ payload = base64.b64decode(record["data"])
datapoint = float(payload)
# logger.info(f'avg_star_rating: {payload}')
client.put_metric_data(
- Namespace='kinesis/analytics/AVGStarRating',
+ Namespace="kinesis/analytics/AVGStarRating",
MetricData=[
{
- 'MetricName': 'AVGStarRating',
- 'Dimensions': [
- {
- 'Name': 'Product Category',
- 'Value': 'All'
- },
+ "MetricName": "AVGStarRating",
+ "Dimensions": [
+ {"Name": "Product Category", "Value": "All"},
],
- 'Value': datapoint,
- 'StorageResolution': 1
+ "Value": datapoint,
+ "StorageResolution": 1,
}
- ]
+ ],
)
- output.append({'recordId': record['recordId'], 'result': 'Ok'})
+ output.append({"recordId": record["recordId"], "result": "Ok"})
success += 1
print(datapoint)
-
+
except Exception as exp:
- output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'})
+ output.append({"recordId": record["recordId"], "result": "DeliveryFailed"})
failure += 1
exception_type, exception_value, exception_traceback = sys.exc_info()
traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback)
- err_msg = json.dumps({
- "errorType": exception_type.__name__,
- "errorMessage": str(exception_value),
- "stackTrace": traceback_string
- })
+ err_msg = json.dumps(
+ {
+ "errorType": exception_type.__name__,
+ "errorMessage": str(exception_value),
+ "stackTrace": traceback_string,
+ }
+ )
logger.error(err_msg)
- print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure))
- return {'records': output}
\ No newline at end of file
+ print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure))
+ return {"records": output}
diff --git a/11_stream/src/invoke_sm_endpoint_from_kinesis.py b/11_stream/src/invoke_sm_endpoint_from_kinesis.py
index 2994ec12..3fc01ed3 100644
--- a/11_stream/src/invoke_sm_endpoint_from_kinesis.py
+++ b/11_stream/src/invoke_sm_endpoint_from_kinesis.py
@@ -7,26 +7,27 @@
import json
# grab environment variables
-ENDPOINT_NAME = os.environ['ENDPOINT_NAME']
-print('Endpoint: {}'.format(ENDPOINT_NAME))
-runtime = boto3.client('runtime.sagemaker')
+ENDPOINT_NAME = os.environ["ENDPOINT_NAME"]
+print("Endpoint: {}".format(ENDPOINT_NAME))
+runtime = boto3.client("runtime.sagemaker")
+
+print("Loading function")
-print('Loading function')
def lambda_handler(event, context):
outputs = []
-
- r = event['records']
- print('records: {}'.format(r))
- print('type_records: {}'.format(type(r)))
-
+
+ r = event["records"]
+ print("records: {}".format(r))
+ print("type_records: {}".format(type(r)))
+
# TODO: Handle batches
- for record in event['records']:
- print(record['recordId'])
- payload = base64.b64decode(record['data'])
- print('payload: {}'.format(payload))
+ for record in event["records"]:
+ print(record["recordId"])
+ payload = base64.b64decode(record["data"])
+ print("payload: {}".format(payload))
text = payload.decode("utf-8")
- print('text: {}'.format(text))
+ print("text: {}".format(text))
# Do custom processing on the payload here
split_inputs = text.split("\t")
@@ -34,44 +35,44 @@ def lambda_handler(event, context):
print(split_inputs)
review_body = split_inputs[2]
print(review_body)
-
- inputs = [
- {"features": [review_body]}
- ]
+
+ inputs = [{"features": [review_body]}]
response = runtime.invoke_endpoint(
- EndpointName=pytorch_endpoint_name,
- ContentType='application/jsonlines',
- Accept='application/jsonlines',
- Body=json.dumps(inputs).encode('utf-8')
+            EndpointName=ENDPOINT_NAME,
+ ContentType="application/jsonlines",
+ Accept="application/jsonlines",
+ Body=json.dumps(inputs).encode("utf-8"),
)
- print('response: {}'.format(response))
+ print("response: {}".format(response))
- predicted_classes_str = response['Body'].read().decode()
+ predicted_classes_str = response["Body"].read().decode()
predicted_classes_json = json.loads(predicted_classes_str)
predicted_classes = predicted_classes_json.splitlines()
- print('predicted_classes: {}'.format(predicted_classes))
+ print("predicted_classes: {}".format(predicted_classes))
for predicted_class_json, input_data in zip(predicted_classes, inputs):
- predicted_class = json.loads(predicted_class_json)['predicted_label']
- print('Predicted star_rating: {} for review_body "{}"'.format(predicted_class, input_data["features"][0]))
+ predicted_class = json.loads(predicted_class_json)["predicted_label"]
+ print('Predicted star_rating: {} for review_body "{}"'.format(predicted_class, input_data["features"][0]))
        # Build output_record
# review_id, star_rating, product_category, review_body
- output_data = '{}\t{}\t{}\t{}'.format(split_inputs[0], str(predicted_class), split_inputs[1], input_data["review_body"])
- print('output_data: {}'.format(output_data))
- output_data_encoded = output_data.encode('utf-8')
+ output_data = "{}\t{}\t{}\t{}".format(
+            split_inputs[0], str(predicted_class), split_inputs[1], input_data["features"][0]
+ )
+ print("output_data: {}".format(output_data))
+ output_data_encoded = output_data.encode("utf-8")
output_record = {
- 'recordId': record['recordId'],
- 'result': 'Ok',
- 'data': base64.b64encode(output_data_encoded).decode('utf-8')
+ "recordId": record["recordId"],
+ "result": "Ok",
+ "data": base64.b64encode(output_data_encoded).decode("utf-8"),
}
outputs.append(output_record)
- print('Successfully processed {} records.'.format(len(event['records'])))
- print('type(output): {}'.format(type(outputs)))
- print('Output Length: {} .'.format(len(outputs)))
+ print("Successfully processed {} records.".format(len(event["records"])))
+ print("type(output): {}".format(type(outputs)))
+ print("Output Length: {} .".format(len(outputs)))
- return {'records': outputs}
\ No newline at end of file
+ return {"records": outputs}
diff --git a/11_stream/src/push_notification_to_sns.py b/11_stream/src/push_notification_to_sns.py
index e2d31cc7..3d9bb587 100644
--- a/11_stream/src/push_notification_to_sns.py
+++ b/11_stream/src/push_notification_to_sns.py
@@ -3,11 +3,12 @@
import base64
import os
-SNS_TOPIC_ARN = os.environ['SNS_TOPIC_ARN']
+SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
-sns = boto3.client('sns')
+sns = boto3.client("sns")
+
+print("Loading function")
-print('Loading function')
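+# Track the highest anomaly score in the batch; a single SNS notification is published per invocation.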
def lambda_handler(event, context):
output = []
@@ -15,30 +16,34 @@ def lambda_handler(event, context):
failure = 0
highest_score = 0
- print('event: {}'.format(event))
- r = event['records']
- print('records: {}'.format(r))
- print('type_records: {}'.format(type(r)))
-
- for record in event['records']:
+ print("event: {}".format(event))
+ r = event["records"]
+ print("records: {}".format(r))
+ print("type_records: {}".format(type(r)))
+
+ for record in event["records"]:
try:
# Uncomment the below line to publish the decoded data to the SNS topic.
- payload = base64.b64decode(record['data'])
- print('payload: {}'.format(payload))
+ payload = base64.b64decode(record["data"])
+ print("payload: {}".format(payload))
text = payload.decode("utf-8")
- print('text: {}'.format(text))
+ print("text: {}".format(text))
score = float(text)
if (score != 0) and (score > highest_score):
highest_score = score
- print('New highest_score: {}'.format(highest_score))
+ print("New highest_score: {}".format(highest_score))
# sns.publish(TopicArn=SNS_TOPIC_ARN, Message='New anomaly score: {}'.format(text), Subject='New Reviews Anomaly Score Detected')
- output.append({'recordId': record['recordId'], 'result': 'Ok'})
+ output.append({"recordId": record["recordId"], "result": "Ok"})
success += 1
except Exception as e:
print(e)
- output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'})
+ output.append({"recordId": record["recordId"], "result": "DeliveryFailed"})
failure += 1
- if (highest_score != 0):
- sns.publish(TopicArn=SNS_TOPIC_ARN, Message='New anomaly score: {}'.format(str(highest_score)), Subject='New Reviews Anomaly Score Detected')
- print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure))
- return {'records': output}
\ No newline at end of file
+ if highest_score != 0:
+ sns.publish(
+ TopicArn=SNS_TOPIC_ARN,
+ Message="New anomaly score: {}".format(str(highest_score)),
+ Subject="New Reviews Anomaly Score Detected",
+ )
+ print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure))
+ return {"records": output}
diff --git a/12_security/01_Secrets_Manager.ipynb b/12_security/01_Secrets_Manager.ipynb
index bffc6d57..5a5e4dc6 100644
--- a/12_security/01_Secrets_Manager.ipynb
+++ b/12_security/01_Secrets_Manager.ipynb
@@ -19,12 +19,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "secretsmanager = boto3.client('secretsmanager')"
+ "secretsmanager = boto3.client(\"secretsmanager\")"
]
},
{
@@ -37,18 +37,15 @@
"\n",
"try:\n",
" response = secretsmanager.create_secret(\n",
- " Name='dsoaws_redshift_login',\n",
- " Description='DSOAWS Redshift Login',\n",
+ " Name=\"dsoaws_redshift_login\",\n",
+ " Description=\"DSOAWS Redshift Login\",\n",
" SecretString='[{\"username\":\"dsoaws\"},{\"password\":\"Password9\"}]',\n",
" Tags=[\n",
- " {\n",
- " 'Key': 'name',\n",
- " 'Value': 'dsoaws_redshift_login'\n",
- " },\n",
- " ]\n",
+ " {\"Key\": \"name\", \"Value\": \"dsoaws_redshift_login\"},\n",
+ " ],\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'ResourceExistsException':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"ResourceExistsException\":\n",
" print(\"Secret already exists. This is ok.\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)"
@@ -62,14 +59,14 @@
"source": [
"import json\n",
"\n",
- "secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')\n",
- "cred = json.loads(secret['SecretString'])\n",
+ "secret = secretsmanager.get_secret_value(SecretId=\"dsoaws_redshift_login\")\n",
+ "cred = json.loads(secret[\"SecretString\"])\n",
"\n",
- "redshift_username = cred[0]['username']\n",
- "redshift_pw = cred[1]['password']\n",
+ "redshift_username = cred[0][\"username\"]\n",
+ "redshift_pw = cred[1][\"password\"]\n",
"\n",
- "print('redshift_username: {}'.format(redshift_username))\n",
- "print('redshift_pw: {}'.format(redshift_pw))"
+ "print(\"redshift_username: {}\".format(redshift_username))\n",
+ "print(\"redshift_pw: {}\".format(redshift_pw))"
]
},
{
diff --git a/12_security/02_Insecure_DataAccess_S3.ipynb b/12_security/02_Insecure_DataAccess_S3.ipynb
index c2b1b38c..26c6559f 100644
--- a/12_security/02_Insecure_DataAccess_S3.ipynb
+++ b/12_security/02_Insecure_DataAccess_S3.ipynb
@@ -10,12 +10,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
diff --git a/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb b/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb
index 2d4588c3..dceedfa9 100644
--- a/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb
+++ b/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb
@@ -17,13 +17,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)"
]
},
{
@@ -36,7 +36,7 @@
"\n",
"timestamp = int(time.time())\n",
"\n",
- "bucket_secure_name = 'bucket-secure-bucket-policy-role-{}'.format(timestamp)\n",
+ "bucket_secure_name = \"bucket-secure-bucket-policy-role-{}\".format(timestamp)\n",
"print(bucket_secure_name)"
]
},
@@ -66,19 +66,17 @@
"source": [
"# Create the bucket policy\n",
"bucket_policy_deny = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Deny',\n",
- " 'Principal': '*',\n",
- " 'Action': [\n",
- " 's3:ListBucket'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\n",
+ " \"Sid\": \"\",\n",
+ " \"Effect\": \"Deny\",\n",
+ " \"Principal\": \"*\",\n",
+ " \"Action\": [\"s3:ListBucket\"],\n",
+ " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n",
+ " }\n",
+ " ],\n",
+ "}"
]
},
{
diff --git a/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb b/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb
index 8db2ebdb..b36dc50c 100644
--- a/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb
+++ b/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb
@@ -17,14 +17,14 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)\n",
- "ec2 = boto3.Session().client(service_name='ec2', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n",
+ "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)"
]
},
{
@@ -37,7 +37,7 @@
"\n",
"timestamp = int(time.time())\n",
"\n",
- "bucket_secure_name = 'bucket-secure-bucket-policy-vpc-{}'.format(timestamp)\n",
+ "bucket_secure_name = \"bucket-secure-bucket-policy-vpc-{}\".format(timestamp)\n",
"print(bucket_secure_name)"
]
},
@@ -65,7 +65,7 @@
"metadata": {},
"outputs": [],
"source": [
- "different_vpc_id='blah'"
+ "different_vpc_id = \"blah\""
]
},
{
@@ -76,7 +76,7 @@
"source": [
"all_vpcs = ec2.describe_vpcs()\n",
"\n",
- "vpc_id=all_vpcs['Vpcs'][0]['VpcId']\n",
+ "vpc_id = all_vpcs[\"Vpcs\"][0][\"VpcId\"]\n",
"\n",
"print(vpc_id)"
]
@@ -89,26 +89,22 @@
"source": [
"# Create the bucket policy\n",
"bucket_policy_deny = {\n",
- " \"Version\": \"2008-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Deny\",\n",
- " \"Principal\": \"*\",\n",
- " \"Action\": [\n",
- " \"s3:ListBucket\"\n",
- " ],\n",
- " \"Resource\": [\n",
- " \"arn:aws:s3:::{}\".format(bucket_secure_name)\n",
- " ],\n",
- " \"Condition\": {\n",
- " \"StringNotEquals\": {\n",
- "# \"aws:sourceVpc\": different_vpc_id\n",
- " \"aws:sourceVpc\": vpc_id\n",
- " }\n",
+ " \"Version\": \"2008-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\n",
+ " \"Effect\": \"Deny\",\n",
+ " \"Principal\": \"*\",\n",
+ " \"Action\": [\"s3:ListBucket\"],\n",
+ " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n",
+ " \"Condition\": {\n",
+ " \"StringNotEquals\": {\n",
+ " # \"aws:sourceVpc\": different_vpc_id\n",
+ " \"aws:sourceVpc\": vpc_id\n",
" }\n",
- " }\n",
- " ]\n",
- " }"
+ " },\n",
+ " }\n",
+ " ],\n",
+ "}"
]
},
{
@@ -127,8 +123,7 @@
"import json\n",
"import time\n",
"\n",
- "response = s3.put_bucket_policy(Bucket=bucket_secure_name, \n",
- " Policy=json.dumps(bucket_policy_deny))\n",
+ "response = s3.put_bucket_policy(Bucket=bucket_secure_name, Policy=json.dumps(bucket_policy_deny))\n",
"\n",
"print(response)\n",
"\n",
diff --git a/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb b/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb
index 4bcf0548..e5f7706c 100644
--- a/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb
+++ b/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb
@@ -16,13 +16,13 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -31,7 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
- "role_name = role.split('/')[-1]\n",
+ "role_name = role.split(\"/\")[-1]\n",
"print(role_name)"
]
},
@@ -42,6 +42,7 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())"
]
},
@@ -51,7 +52,7 @@
"metadata": {},
"outputs": [],
"source": [
- "bucket_secure_name = 'bucket-secure-iam-policy-role-{}'.format(timestamp)\n",
+ "bucket_secure_name = \"bucket-secure-iam-policy-role-{}\".format(timestamp)\n",
"print(bucket_secure_name)"
]
},
@@ -79,7 +80,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_name='DSOAWS_SecureBucket_Policy_IAM_{}'.format(timestamp)\n",
+ "policy_name = \"DSOAWS_SecureBucket_Policy_IAM_{}\".format(timestamp)\n",
"print(policy_name)"
]
},
@@ -91,18 +92,16 @@
"source": [
"# Create the IAM policy\n",
"iam_policy_deny = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Deny',\n",
- " 'Action': [\n",
- " 's3:ListBucket'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\n",
+ " \"Sid\": \"\",\n",
+ " \"Effect\": \"Deny\",\n",
+ " \"Action\": [\"s3:ListBucket\"],\n",
+ " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n",
+ " }\n",
+ " ],\n",
+ "}"
]
},
{
@@ -113,13 +112,9 @@
},
"outputs": [],
"source": [
- "import json \n",
+ "import json\n",
"\n",
- "response = iam.put_role_policy(\n",
- " RoleName=role_name,\n",
- " PolicyName=policy_name,\n",
- " PolicyDocument=json.dumps(iam_policy_deny)\n",
- ")\n",
+ "response = iam.put_role_policy(RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(iam_policy_deny))\n",
"\n",
"print(response)\n",
"\n",
@@ -159,10 +154,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=role_name,\n",
- " PolicyName=policy_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=role_name, PolicyName=policy_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
diff --git a/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb b/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb
index fc23205e..66dcc3d3 100644
--- a/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb
+++ b/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb
@@ -16,13 +16,13 @@
"import boto3\n",
"import sagemaker\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "s3 = boto3.Session().client(service_name='s3', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -31,7 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
- "role_name = role.split('/')[-1]\n",
+ "role_name = role.split(\"/\")[-1]\n",
"print(role_name)"
]
},
@@ -42,6 +42,7 @@
"outputs": [],
"source": [
"import time\n",
+ "\n",
"timestamp = int(time.time())"
]
},
@@ -51,7 +52,7 @@
"metadata": {},
"outputs": [],
"source": [
- "bucket_secure_name = 'bucket-secure-iam-policy-vpc-{}'.format(timestamp)\n",
+ "bucket_secure_name = \"bucket-secure-iam-policy-vpc-{}\".format(timestamp)\n",
"print(bucket_secure_name)"
]
},
@@ -79,7 +80,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_name='DSOAWS_Secure_IAMPolicy_VPC_{}'.format(timestamp)\n",
+ "policy_name = \"DSOAWS_Secure_IAMPolicy_VPC_{}\".format(timestamp)\n",
"print(policy_name)"
]
},
@@ -89,7 +90,7 @@
"metadata": {},
"outputs": [],
"source": [
- "different_vpc_id='blah'"
+ "different_vpc_id = \"blah\""
]
},
{
@@ -100,24 +101,17 @@
"source": [
"# Create the IAM policy\n",
"iam_policy_deny = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Deny',\n",
- " 'Action': [\n",
- " 's3:ListBucket'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n",
- " ],\n",
- " 'Condition': {\n",
- " 'StringNotEquals': {\n",
- " 'aws:sourceVpc': different_vpc_id\n",
- " }\n",
- " }\n",
- " \n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\n",
+ " \"Sid\": \"\",\n",
+ " \"Effect\": \"Deny\",\n",
+ " \"Action\": [\"s3:ListBucket\"],\n",
+ " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n",
+ " \"Condition\": {\"StringNotEquals\": {\"aws:sourceVpc\": different_vpc_id}},\n",
+ " }\n",
+ " ],\n",
+ "}"
]
},
{
@@ -126,13 +120,9 @@
"metadata": {},
"outputs": [],
"source": [
- "import json \n",
+ "import json\n",
"\n",
- "response = iam.put_role_policy(\n",
- " RoleName=role_name,\n",
- " PolicyName=policy_name,\n",
- " PolicyDocument=json.dumps(iam_policy_deny)\n",
- ")\n",
+ "response = iam.put_role_policy(RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(iam_policy_deny))\n",
"\n",
"print(response)\n",
"\n",
@@ -163,10 +153,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=role_name,\n",
- " PolicyName=policy_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=role_name, PolicyName=policy_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
diff --git a/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb b/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb
index 6fc42392..1709f5e9 100644
--- a/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb
+++ b/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb
@@ -19,17 +19,17 @@
"outputs": [],
"source": [
"sm.create_notebook_instance(\n",
- " NotebookInstanceName='dsoaws',\n",
- " InstanceType='ml.t3.medium',\n",
- " SubnetId='',\n",
+ " NotebookInstanceName=\"dsoaws\",\n",
+ " InstanceType=\"ml.t3.medium\",\n",
+ " SubnetId=\"\",\n",
" SecurityGroupIds=[\n",
- " '',\n",
+ " \"\",\n",
" ],\n",
- " RoleArn='arn:aws:iam:::role/service-role/',\n",
- " KmsKeyId='',\n",
- " DirectInternetAccess='Disabled',\n",
+ " RoleArn=\"arn:aws:iam:::role/service-role/\",\n",
+ " KmsKeyId=\"\",\n",
+ " DirectInternetAccess=\"Disabled\",\n",
" VolumeSizeInGB=10,\n",
- " RootAccess='Disabled'\n",
+ " RootAccess=\"Disabled\",\n",
")"
]
},
diff --git a/12_security/07_Insecure_Train.ipynb b/12_security/07_Insecure_Train.ipynb
index b40992be..6a9439af 100644
--- a/12_security/07_Insecure_Train.ipynb
+++ b/12_security/07_Insecure_Train.ipynb
@@ -10,12 +10,12 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
]
},
{
@@ -36,9 +36,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -68,9 +68,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -100,9 +100,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -192,12 +192,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -226,28 +223,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='Pipe'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"Pipe\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -257,10 +254,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -280,38 +277,41 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- "# max_run=7200, # number of seconds\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " # max_run=7200, # number of seconds\n",
+ ")"
]
},
{
@@ -327,11 +327,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -341,7 +340,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -352,7 +351,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -363,7 +368,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -374,7 +385,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/08_Secure_Train_IAMPolicy_Role.ipynb b/12_security/08_Secure_Train_IAMPolicy_Role.ipynb
index d9ab9bc6..6e4f97b4 100644
--- a/12_security/08_Secure_Train_IAMPolicy_Role.ipynb
+++ b/12_security/08_Secure_Train_IAMPolicy_Role.ipynb
@@ -10,13 +10,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -37,9 +37,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -69,9 +69,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -101,9 +101,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -193,12 +193,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -227,28 +224,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='Pipe'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"Pipe\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -258,10 +255,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -280,17 +277,11 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"sagemaker.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n",
+ " ],\n",
+ "}"
]
},
{
@@ -310,7 +301,7 @@
"metadata": {},
"outputs": [],
"source": [
- "secure_iam_role_name = 'DSOAWS_Secure_Train_Role_{}'.format(timestamp)"
+ "secure_iam_role_name = \"DSOAWS_Secure_Train_Role_{}\".format(timestamp)"
]
},
{
@@ -328,12 +319,12 @@
" secure_iam_role = iam.create_role(\n",
" RoleName=secure_iam_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Secure Role'\n",
+ " Description=\"DSOAWS Secure Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role = iam.get_role(RoleName=secure_iam_role_name)\n",
- "# print(\"Role already exists\")\n",
+ " # print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)\n",
"\n",
@@ -349,18 +340,9 @@
"outputs": [],
"source": [
"iam_policy_allow_s3 = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Allow',\n",
- " 'Action': [\n",
- " 's3:*'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n",
+ "}"
]
},
{
@@ -369,7 +351,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)"
+ "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)"
]
},
{
@@ -381,9 +363,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name,\n",
- " PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
+ " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
")\n",
"\n",
"print(response)\n",
@@ -406,11 +386,9 @@
" \"Action\": [\n",
" \"sagemaker:CreateTrainingJob\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ]\n",
+ " \"Resource\": [\"*\"],\n",
" }\n",
- " ]\n",
+ " ],\n",
"}"
]
},
@@ -420,8 +398,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_Role_{}'.format(timestamp)\n",
- "\n"
+ "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_Role_{}\".format(timestamp)"
]
},
{
@@ -432,38 +409,41 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=secure_iam_role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- "# max_run=7200, # number of seconds\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=secure_iam_role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " # max_run=7200, # number of seconds\n",
+ ")"
]
},
{
@@ -479,11 +459,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -493,7 +472,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -504,7 +483,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -515,7 +500,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -526,7 +517,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -576,10 +573,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_deny_create_training_job_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
@@ -591,10 +585,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
diff --git a/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb b/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb
index b4d93d0a..d9c3d6c1 100644
--- a/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb
+++ b/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb
@@ -11,14 +11,14 @@
"import pandas as pd\n",
"\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "ec2 = boto3.Session().client(service_name='ec2', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)"
]
},
{
@@ -39,9 +39,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -71,9 +71,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -103,9 +103,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -195,12 +195,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -229,29 +226,29 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "#input_mode='Pipe'\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "# input_mode='Pipe'\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -261,10 +258,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -283,17 +280,11 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"sagemaker.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n",
+ " ],\n",
+ "}"
]
},
{
@@ -313,7 +304,7 @@
"metadata": {},
"outputs": [],
"source": [
- "secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)"
+ "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)"
]
},
{
@@ -331,12 +322,12 @@
" secure_iam_role = iam.create_role(\n",
" RoleName=secure_iam_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Secure Role'\n",
+ " Description=\"DSOAWS Secure Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role = iam.get_role(RoleName=secure_iam_role_name)\n",
- "# print(\"Role already exists\")\n",
+ " # print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)\n",
"\n",
@@ -359,18 +350,9 @@
"outputs": [],
"source": [
"iam_policy_allow_s3 = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Allow',\n",
- " 'Action': [\n",
- " 's3:*'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n",
+ "}"
]
},
{
@@ -379,7 +361,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)"
+ "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)"
]
},
{
@@ -391,9 +373,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name,\n",
- " PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
+ " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
")\n",
"\n",
"print(response)\n",
@@ -460,17 +440,10 @@
" \"Action\": [\n",
" \"sagemaker:CreateTrainingJob\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ],\n",
- " \"Condition\": {\n",
- " \"Null\": {\n",
- " \"sagemaker:VpcSubnets\": \"true\",\n",
- " \"sagemaker:VpcSecurityGroupIds\": \"true\"\n",
- " }\n",
- " }\n",
+ " \"Resource\": [\"*\"],\n",
+ " \"Condition\": {\"Null\": {\"sagemaker:VpcSubnets\": \"true\", \"sagemaker:VpcSecurityGroupIds\": \"true\"}},\n",
" }\n",
- " ]\n",
+ " ],\n",
"}"
]
},
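The "Null" condition operator tests for the absence of a request context key, so this statement denies sagemaker:CreateTrainingJob whenever no VPC subnets or security groups are supplied, effectively forcing training jobs into a VPC. The behavior can be checked with the IAM policy simulator; a minimal sketch, assuming the iam client and the policy_deny_create_training_job dict from the surrounding cells (the subnet and security-group IDs are illustrative placeholders):

import json

response = iam.simulate_custom_policy(
    PolicyInputList=[json.dumps(policy_deny_create_training_job)],
    ActionNames=["sagemaker:CreateTrainingJob"],
    ContextEntries=[
        {
            "ContextKeyName": "sagemaker:VpcSubnets",
            "ContextKeyValues": ["subnet-0123456789abcdef0"],  # illustrative
            "ContextKeyType": "stringList",
        },
        {
            "ContextKeyName": "sagemaker:VpcSecurityGroupIds",
            "ContextKeyValues": ["sg-0123456789abcdef0"],  # illustrative
            "ContextKeyType": "stringList",
        },
    ],
)

# With the VPC keys present, the Deny should not match: expect "implicitDeny"
# (no Allow is being simulated here) rather than "explicitDeny".
for result in response["EvaluationResults"]:
    print(result["EvalActionName"], "=>", result["EvalDecision"])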
@@ -480,7 +453,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)"
+ "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)"
]
},
{
@@ -494,7 +467,7 @@
"response = iam.put_role_policy(\n",
" RoleName=secure_iam_role_name,\n",
" PolicyName=policy_deny_create_training_job_name,\n",
- " PolicyDocument=json.dumps(policy_deny_create_training_job)\n",
+ " PolicyDocument=json.dumps(policy_deny_create_training_job),\n",
")\n",
"\n",
"print(response)\n",
@@ -510,41 +483,44 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- "# role=role,\n",
- " role=secure_iam_role_name,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- "# subnets=None,\n",
- "# security_group_ids=None,\n",
- "# max_run=7200, # number of seconds\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " # role=role,\n",
+ " role=secure_iam_role_name,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " # subnets=None,\n",
+ " # security_group_ids=None,\n",
+ " # max_run=7200, # number of seconds\n",
+ ")"
]
},
{
@@ -563,11 +539,9 @@
"outputs": [],
"source": [
"estimator.fit(\n",
- " inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
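Because wait=False returns as soon as the job is submitted, the notebook can re-attach to it later. A minimal sketch using the names defined above:

# Block until the submitted job finishes, streaming its logs into the notebook
estimator.latest_training_job.wait()

# Or poll the job status with the low-level client instead
status = sm.describe_training_job(TrainingJobName=estimator.latest_training_job.name)["TrainingJobStatus"]
print(status)  # InProgress | Completed | Failed | Stopping | Stopped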
@@ -577,7 +551,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -588,7 +562,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -599,7 +579,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -610,7 +596,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -651,10 +643,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_deny_create_training_job_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
@@ -666,10 +655,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
@@ -707,19 +693,20 @@
"outputs": [],
"source": [
"import json\n",
+ "\n",
"notebook_instance_name = None\n",
"\n",
"try:\n",
- " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n",
+ " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n",
" data = json.load(notebook_info)\n",
- " resource_arn = data['ResourceArn']\n",
- " region = resource_arn.split(':')[3]\n",
- " notebook_instance_name = data['ResourceName']\n",
- " print('Notebook Instance Name: {}'.format(notebook_instance_name))\n",
+ " resource_arn = data[\"ResourceArn\"]\n",
+ " region = resource_arn.split(\":\")[3]\n",
+ " notebook_instance_name = data[\"ResourceName\"]\n",
+ " print(\"Notebook Instance Name: {}\".format(notebook_instance_name))\n",
"except:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -728,9 +715,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = sm.describe_notebook_instance(\n",
- " NotebookInstanceName=notebook_instance_name\n",
- ")\n",
+ "response = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)\n",
"\n",
"print(response)"
]
@@ -741,11 +726,11 @@
"metadata": {},
"outputs": [],
"source": [
- "print('SubnetId: {}'.format(response['SubnetId']))\n",
- "print('SecurityGroups: {}'.format(response['SecurityGroups']))\n",
- "print('IAM Role: {}'.format(response['RoleArn']))\n",
- "print('NetworkInterfaceId: {}'.format(response['NetworkInterfaceId']))\n",
- "print('DirectInternetAccess: {}'.format(response['DirectInternetAccess']))"
+ "print(\"SubnetId: {}\".format(response[\"SubnetId\"]))\n",
+ "print(\"SecurityGroups: {}\".format(response[\"SecurityGroups\"]))\n",
+ "print(\"IAM Role: {}\".format(response[\"RoleArn\"]))\n",
+ "print(\"NetworkInterfaceId: {}\".format(response[\"NetworkInterfaceId\"]))\n",
+ "print(\"DirectInternetAccess: {}\".format(response[\"DirectInternetAccess\"]))"
]
},
{
@@ -754,7 +739,7 @@
"metadata": {},
"outputs": [],
"source": [
- "subnet_id=response['SubnetId']\n",
+ "subnet_id = response[\"SubnetId\"]\n",
"print(subnet_id)"
]
},
@@ -764,7 +749,7 @@
"metadata": {},
"outputs": [],
"source": [
- "security_group_ids=response['SecurityGroups']\n",
+ "security_group_ids = response[\"SecurityGroups\"]\n",
"print(security_group_ids)"
]
},
@@ -776,11 +761,11 @@
"source": [
"from pprint import pprint\n",
"\n",
- "all_vpcs = ec2.describe_vpcs()['Vpcs']\n",
+ "all_vpcs = ec2.describe_vpcs()[\"Vpcs\"]\n",
"\n",
"print(len(all_vpcs))\n",
"\n",
- "pprint(all_vpcs)\n"
+ "pprint(all_vpcs)"
]
},
{
@@ -789,7 +774,7 @@
"metadata": {},
"outputs": [],
"source": [
- "vpc_id = ec2.describe_vpcs()['Vpcs'][-1]['VpcId']\n",
+ "vpc_id = ec2.describe_vpcs()[\"Vpcs\"][-1][\"VpcId\"]\n",
"print(vpc_id)"
]
},
@@ -808,43 +793,44 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- "# role=secure_iam_role_name,\n",
- " role=role, \n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " subnets=[\n",
- " subnet_id\n",
- " ],\n",
- " security_group_ids=security_group_ids\n",
- "# max_run=7200, # number of seconds\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " # role=secure_iam_role_name,\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " subnets=[subnet_id],\n",
+ " security_group_ids=security_group_ids\n",
+ " # max_run=7200, # number of seconds\n",
+ ")"
]
},
{
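After fit() below has submitted the job, the VPC placement can be verified on the job description itself. A minimal sketch:

job_description = sm.describe_training_job(TrainingJobName=estimator.latest_training_job.name)

# VpcConfig appears in the description only when subnets and security groups were attached
print(job_description.get("VpcConfig"))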
@@ -871,11 +857,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -885,7 +870,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -896,7 +881,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -907,7 +898,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -918,7 +915,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb b/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb
index 023bbda8..53283aff 100644
--- a/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb
+++ b/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb
@@ -11,14 +11,14 @@
"import pandas as pd\n",
"\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)\n",
- "ec2 = boto3.Session().client(service_name='ec2', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n",
+ "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)"
]
},
{
@@ -39,9 +39,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -71,9 +71,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -103,9 +103,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -195,12 +195,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -229,29 +226,29 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "#input_mode='Pipe'\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "# input_mode='Pipe'\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -261,10 +258,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -283,17 +280,11 @@
"outputs": [],
"source": [
"assume_role_policy_doc = {\n",
- " \"Version\": \"2012-10-17\",\n",
- " \"Statement\": [\n",
- " {\n",
- " \"Effect\": \"Allow\",\n",
- " \"Principal\": {\n",
- " \"Service\": \"sagemaker.amazonaws.com\"\n",
- " },\n",
- " \"Action\": \"sts:AssumeRole\"\n",
- " }\n",
- " ]\n",
- "} "
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [\n",
+ " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n",
+ " ],\n",
+ "}"
]
},
{
@@ -313,7 +304,7 @@
"metadata": {},
"outputs": [],
"source": [
- "secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)"
+ "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)"
]
},
{
@@ -331,12 +322,12 @@
" secure_iam_role = iam.create_role(\n",
" RoleName=secure_iam_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Secure Role'\n",
+ " Description=\"DSOAWS Secure Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role = iam.get_role(RoleName=secure_iam_role_name)\n",
- "# print(\"Role already exists\")\n",
+ " # print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)\n",
"\n",
@@ -352,18 +343,9 @@
"outputs": [],
"source": [
"iam_policy_allow_s3 = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Allow',\n",
- " 'Action': [\n",
- " 's3:*'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n",
+ "}"
]
},
{
@@ -372,7 +354,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)"
+ "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)"
]
},
{
@@ -384,9 +366,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name,\n",
- " PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
+ " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
")\n",
"\n",
"print(response)\n",
@@ -400,8 +380,8 @@
"metadata": {},
"outputs": [],
"source": [
- "different_subnet_id='blah'\n",
- "different_security_group_ids=['blah']"
+ "different_subnet_id = \"blah\"\n",
+ "different_security_group_ids = [\"blah\"]"
]
},
{
@@ -419,20 +399,15 @@
" \"Action\": [\n",
" \"sagemaker:CreateTrainingJob\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ],\n",
- " \"Condition\": {\n",
+ " \"Resource\": [\"*\"],\n",
+ " \"Condition\": {\n",
" \"StringNotEquals\": {\n",
" \"sagemaker:VpcSecurityGroupIds\": different_security_group_ids,\n",
- " \"sagemaker:VpcSubnets\": [\n",
- " different_subnet_id\n",
- " ]\n",
+ " \"sagemaker:VpcSubnets\": [different_subnet_id],\n",
" }\n",
- " }\n",
- "\n",
+ " },\n",
" }\n",
- " ]\n",
+ " ],\n",
"}"
]
},
@@ -442,7 +417,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)"
+ "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)"
]
},
{
@@ -456,7 +431,7 @@
"response = iam.put_role_policy(\n",
" RoleName=secure_iam_role_name,\n",
" PolicyName=policy_deny_create_training_job_name,\n",
- " PolicyDocument=json.dumps(policy_deny_create_training_job)\n",
+ " PolicyDocument=json.dumps(policy_deny_create_training_job),\n",
")\n",
"\n",
"print(response)\n",
@@ -472,37 +447,40 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=secure_iam_role_name,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " subnets=None,\n",
- " security_group_ids=None,\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=secure_iam_role_name,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " subnets=None,\n",
+ " security_group_ids=None,\n",
+ ")"
]
},
{
@@ -521,11 +499,9 @@
"outputs": [],
"source": [
"estimator.fit(\n",
- " inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -535,7 +511,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -546,7 +522,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -557,7 +539,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -568,7 +556,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -595,10 +589,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_deny_create_training_job_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
@@ -610,10 +601,7 @@
"metadata": {},
"outputs": [],
"source": [
- "response = iam.delete_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name\n",
- ")\n",
+ "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n",
"print(response)\n",
"\n",
"time.sleep(30)"
@@ -651,19 +639,20 @@
"outputs": [],
"source": [
"import json\n",
+ "\n",
"notebook_instance_name = None\n",
"\n",
"try:\n",
- " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n",
+ " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n",
" data = json.load(notebook_info)\n",
- " resource_arn = data['ResourceArn']\n",
- " region = resource_arn.split(':')[3]\n",
- " notebook_instance_name = data['ResourceName']\n",
- " print('Notebook Instance Name: {}'.format(notebook_instance_name))\n",
+ " resource_arn = data[\"ResourceArn\"]\n",
+ " region = resource_arn.split(\":\")[3]\n",
+ " notebook_instance_name = data[\"ResourceName\"]\n",
+ " print(\"Notebook Instance Name: {}\".format(notebook_instance_name))\n",
"except:\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.')\n",
- " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.\")\n",
+ " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -674,9 +663,7 @@
},
"outputs": [],
"source": [
- "response = sm.describe_notebook_instance(\n",
- " NotebookInstanceName=notebook_instance_name\n",
- ")\n",
+ "response = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)\n",
"\n",
"print(response)"
]
@@ -687,11 +674,11 @@
"metadata": {},
"outputs": [],
"source": [
- "print('SubnetId: {}'.format(response['SubnetId']))\n",
- "print('SecurityGroups: {}'.format(response['SecurityGroups']))\n",
- "print('IAM Role: {}'.format(response['RoleArn']))\n",
- "print('NetworkInterfaceId: {}'.format(response['NetworkInterfaceId']))\n",
- "print('DirectInternetAccess: {}'.format(response['DirectInternetAccess']))"
+ "print(\"SubnetId: {}\".format(response[\"SubnetId\"]))\n",
+ "print(\"SecurityGroups: {}\".format(response[\"SecurityGroups\"]))\n",
+ "print(\"IAM Role: {}\".format(response[\"RoleArn\"]))\n",
+ "print(\"NetworkInterfaceId: {}\".format(response[\"NetworkInterfaceId\"]))\n",
+ "print(\"DirectInternetAccess: {}\".format(response[\"DirectInternetAccess\"]))"
]
},
{
@@ -700,7 +687,7 @@
"metadata": {},
"outputs": [],
"source": [
- "subnet_id=response['SubnetId']\n",
+ "subnet_id = response[\"SubnetId\"]\n",
"print(subnet_id)"
]
},
@@ -710,7 +697,7 @@
"metadata": {},
"outputs": [],
"source": [
- "security_group_ids=response['SecurityGroups']\n",
+ "security_group_ids = response[\"SecurityGroups\"]\n",
"print(security_group_ids)"
]
},
@@ -731,7 +718,7 @@
"metadata": {},
"outputs": [],
"source": [
- "secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)"
+ "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)"
]
},
{
@@ -749,12 +736,12 @@
" secure_iam_role = iam.create_role(\n",
" RoleName=secure_iam_role_name,\n",
" AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
- " Description='DSOAWS Secure Role'\n",
+ " Description=\"DSOAWS Secure Role\",\n",
" )\n",
"except ClientError as e:\n",
- " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
+ " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n",
" iam_role = iam.get_role(RoleName=secure_iam_role_name)\n",
- "# print(\"Role already exists\")\n",
+ " # print(\"Role already exists\")\n",
" else:\n",
" print(\"Unexpected error: %s\" % e)\n",
"\n",
@@ -770,18 +757,9 @@
"outputs": [],
"source": [
"iam_policy_allow_s3 = {\n",
- " 'Version': '2012-10-17',\n",
- " 'Statement': [{\n",
- " 'Sid': '',\n",
- " 'Effect': 'Allow',\n",
- " 'Action': [\n",
- " 's3:*'\n",
- " ],\n",
- " 'Resource': [\n",
- " 'arn:aws:s3:::{}'.format(bucket)\n",
- " ]\n",
- " }]\n",
- " }"
+ " \"Version\": \"2012-10-17\",\n",
+ " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n",
+ "}"
]
},
{
@@ -790,7 +768,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)"
+ "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)"
]
},
{
@@ -802,9 +780,7 @@
"import time\n",
"\n",
"response = iam.put_role_policy(\n",
- " RoleName=secure_iam_role_name,\n",
- " PolicyName=policy_allow_s3_name,\n",
- " PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
+ " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n",
")\n",
"\n",
"print(response)\n",
@@ -827,20 +803,15 @@
" \"Action\": [\n",
" \"sagemaker:CreateTrainingJob\",\n",
" ],\n",
- " \"Resource\": [\n",
- " \"*\"\n",
- " ],\n",
- " \"Condition\": {\n",
+ " \"Resource\": [\"*\"],\n",
+ " \"Condition\": {\n",
" \"StringNotEquals\": {\n",
" \"sagemaker:VpcSecurityGroupIds\": security_group_ids,\n",
- " \"sagemaker:VpcSubnets\": [\n",
- " subnet_id\n",
- " ]\n",
+ " \"sagemaker:VpcSubnets\": [subnet_id],\n",
" }\n",
- " }\n",
- "\n",
+ " },\n",
" }\n",
- " ]\n",
+ " ],\n",
"}"
]
},
@@ -850,7 +821,7 @@
"metadata": {},
"outputs": [],
"source": [
- "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)"
+ "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)"
]
},
{
@@ -864,7 +835,7 @@
"response = iam.put_role_policy(\n",
" RoleName=secure_iam_role_name,\n",
" PolicyName=policy_deny_create_training_job_name,\n",
- " PolicyDocument=json.dumps(policy_deny_create_training_job)\n",
+ " PolicyDocument=json.dumps(policy_deny_create_training_job),\n",
")\n",
"\n",
"print(response)\n",
@@ -887,39 +858,40 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=secure_iam_role_name,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " subnets=[\n",
- " subnet_id\n",
- " ],\n",
- " security_group_ids=security_group_ids\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=secure_iam_role_name,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " subnets=[subnet_id],\n",
+ " security_group_ids=security_group_ids,\n",
+ ")"
]
},
{
@@ -946,11 +918,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -960,7 +931,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -971,7 +942,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -982,7 +959,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -993,7 +976,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb b/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb
index 159b8976..9e3537b9 100644
--- a/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb
+++ b/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb
@@ -10,13 +10,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "kms = boto3.Session().client(service_name='kms', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "kms = boto3.Session().client(service_name=\"kms\", region_name=region)"
]
},
{
@@ -37,9 +37,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -77,9 +77,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -117,9 +117,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -268,12 +268,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -310,28 +307,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=10\n",
- "validation_steps=10\n",
- "test_steps=10\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 10\n",
+ "validation_steps = 10\n",
+ "test_steps = 10\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -341,10 +338,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -363,8 +360,8 @@
"source": [
"create_ebs_key_response = kms.create_key()\n",
"\n",
- "ebs_kms_key_id=create_ebs_key_response['KeyMetadata']['KeyId']\n",
- "ebs_kms_key_arn=create_ebs_key_response['KeyMetadata']['Arn']"
+ "ebs_kms_key_id = create_ebs_key_response[\"KeyMetadata\"][\"KeyId\"]\n",
+ "ebs_kms_key_arn = create_ebs_key_response[\"KeyMetadata\"][\"Arn\"]"
]
},
{
@@ -375,8 +372,8 @@
"source": [
"create_s3_key_response = kms.create_key()\n",
"\n",
- "s3_kms_key_id=create_s3_key_response['KeyMetadata']['KeyId']\n",
- "s3_kms_key_arn=create_s3_key_response['KeyMetadata']['Arn']"
+ "s3_kms_key_id = create_s3_key_response[\"KeyMetadata\"][\"KeyId\"]\n",
+ "s3_kms_key_arn = create_s3_key_response[\"KeyMetadata\"][\"Arn\"]"
]
},
{
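Two separate customer-managed keys are created here: one for the training instances' EBS volumes and one for the model artifacts written to S3. Once fit() below has submitted the job, the key assignments can be confirmed on the job description. A minimal sketch:

job_description = sm.describe_training_job(TrainingJobName=estimator.latest_training_job.name)

print(job_description["ResourceConfig"]["VolumeKmsKeyId"])  # should reference ebs_kms_key_id
print(job_description["OutputDataConfig"]["KmsKeyId"])      # should reference s3_kms_key_id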
@@ -395,37 +392,40 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " volume_kms_key=ebs_kms_key_id,\n",
- " output_kms_key=s3_kms_key_id,\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " volume_kms_key=ebs_kms_key_id,\n",
+ " output_kms_key=s3_kms_key_id,\n",
+ ")"
]
},
{
@@ -441,11 +441,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -463,7 +462,7 @@
],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -487,7 +486,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -511,7 +516,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -535,7 +546,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/10_Secure_Train_EncryptionInTransit.ipynb b/12_security/10_Secure_Train_EncryptionInTransit.ipynb
index 693d3132..4504a149 100644
--- a/12_security/10_Secure_Train_EncryptionInTransit.ipynb
+++ b/12_security/10_Secure_Train_EncryptionInTransit.ipynb
@@ -10,13 +10,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -37,9 +37,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -77,9 +77,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -117,9 +117,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -268,12 +268,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -310,28 +307,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=10\n",
- "validation_steps=10\n",
- "test_steps=10\n",
- "train_instance_count=2\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='File'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 10\n",
+ "validation_steps = 10\n",
+ "test_steps = 10\n",
+ "train_instance_count = 2\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"File\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -341,10 +338,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -364,36 +361,39 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " encrypt_inter_container_traffic=True,\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " encrypt_inter_container_traffic=True,\n",
+ ")"
]
},
{
@@ -409,11 +409,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -423,7 +422,7 @@
"outputs": [],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
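Since fit() was called with wait=False, the cell returns while the job is still provisioning. One way to check on it from the notebook is to poll the boto3 SageMaker client (a sketch; the client creation mirrors the setup cells of these notebooks):

    import boto3

    sm = boto3.Session().client(service_name="sagemaker", region_name=region)

    # TrainingJobStatus moves through InProgress -> Completed (or Failed / Stopped)
    status = sm.describe_training_job(TrainingJobName=training_job_name)["TrainingJobStatus"]
    print("Training Job Status: {}".format(status))

    # Or block until the job finishes:
    # estimator.latest_training_job.wait()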
@@ -434,7 +433,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -445,7 +450,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -456,7 +467,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/11_Secure_Train_NetworkIsolation.ipynb b/12_security/11_Secure_Train_NetworkIsolation.ipynb
index a8bb4bc5..a33d64f0 100644
--- a/12_security/11_Secure_Train_NetworkIsolation.ipynb
+++ b/12_security/11_Secure_Train_NetworkIsolation.ipynb
@@ -10,13 +10,13 @@
"import sagemaker\n",
"import pandas as pd\n",
"\n",
- "sess = sagemaker.Session()\n",
+ "sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"role = sagemaker.get_execution_role()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "iam = boto3.Session().client(service_name='iam', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
]
},
{
@@ -37,9 +37,9 @@
"try:\n",
" processed_train_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -77,9 +77,9 @@
"try:\n",
" processed_validation_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -117,9 +117,9 @@
"try:\n",
" processed_test_data_s3_uri\n",
"except NameError:\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n",
- " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n",
- " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')"
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+ " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
+ " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
]
},
{
@@ -268,12 +268,9 @@
"source": [
"from sagemaker.inputs import TrainingInput\n",
"\n",
- "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n",
- " distribution='ShardedByS3Key') \n",
- "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
- "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n",
- " distribution='ShardedByS3Key')\n",
+ "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
+ "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
"\n",
"print(s3_input_train_data.config)\n",
"print(s3_input_validation_data.config)\n",
@@ -310,28 +307,28 @@
"metadata": {},
"outputs": [],
"source": [
- "epochs=1\n",
- "learning_rate=0.00001\n",
- "epsilon=0.00000001\n",
- "train_batch_size=128\n",
- "validation_batch_size=128\n",
- "test_batch_size=128\n",
- "train_steps_per_epoch=100\n",
- "validation_steps=100\n",
- "test_steps=100\n",
- "train_instance_count=1\n",
- "train_instance_type='ml.c5.9xlarge'\n",
- "train_volume_size=1024\n",
- "use_xla=True\n",
- "use_amp=True\n",
- "freeze_bert_layer=False\n",
- "enable_sagemaker_debugger=True\n",
- "enable_checkpointing=False\n",
- "enable_tensorboard=False\n",
- "input_mode='Pipe'\n",
- "run_validation=True\n",
- "run_test=True\n",
- "run_sample_predictions=True"
+ "epochs = 1\n",
+ "learning_rate = 0.00001\n",
+ "epsilon = 0.00000001\n",
+ "train_batch_size = 128\n",
+ "validation_batch_size = 128\n",
+ "test_batch_size = 128\n",
+ "train_steps_per_epoch = 100\n",
+ "validation_steps = 100\n",
+ "test_steps = 100\n",
+ "train_instance_count = 1\n",
+ "train_instance_type = \"ml.c5.9xlarge\"\n",
+ "train_volume_size = 1024\n",
+ "use_xla = True\n",
+ "use_amp = True\n",
+ "freeze_bert_layer = False\n",
+ "enable_sagemaker_debugger = True\n",
+ "enable_checkpointing = False\n",
+ "enable_tensorboard = False\n",
+ "input_mode = \"Pipe\"\n",
+ "run_validation = True\n",
+ "run_test = True\n",
+ "run_sample_predictions = True"
]
},
{
@@ -341,10 +338,10 @@
"outputs": [],
"source": [
"metrics_definitions = [\n",
- " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
- " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+ " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
"]"
]
},
@@ -364,38 +361,41 @@
"source": [
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
- "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n",
- " source_dir='src',\n",
- " role=role,\n",
- " instance_count=train_instance_count,\n",
- " instance_type=train_instance_type,\n",
- " volume_size=train_volume_size,\n",
- "# use_spot_instances=True,\n",
- "# max_wait=7200, # Seconds to wait for spot instances to become available\n",
- " py_version='py3',\n",
- " framework_version='2.1.0',\n",
- " hyperparameters={'epochs': epochs,\n",
- " 'learning_rate': learning_rate,\n",
- " 'epsilon': epsilon,\n",
- " 'train_batch_size': train_batch_size,\n",
- " 'validation_batch_size': validation_batch_size,\n",
- " 'test_batch_size': test_batch_size, \n",
- " 'train_steps_per_epoch': train_steps_per_epoch,\n",
- " 'validation_steps': validation_steps,\n",
- " 'test_steps': test_steps,\n",
- " 'use_xla': use_xla,\n",
- " 'use_amp': use_amp, \n",
- " 'max_seq_length': max_seq_length,\n",
- " 'freeze_bert_layer': freeze_bert_layer,\n",
- " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n",
- " 'enable_checkpointing': enable_checkpointing,\n",
- " 'enable_tensorboard': enable_tensorboard, \n",
- " 'run_validation': run_validation,\n",
- " 'run_test': run_test,\n",
- " 'run_sample_predictions': run_sample_predictions},\n",
- " input_mode=input_mode,\n",
- " enable_network_isolation=True\n",
- " )"
+ "estimator = TensorFlow(\n",
+ " entry_point=\"tf_bert_reviews.py\",\n",
+ " source_dir=\"src\",\n",
+ " role=role,\n",
+ " instance_count=train_instance_count,\n",
+ " instance_type=train_instance_type,\n",
+ " volume_size=train_volume_size,\n",
+ " # use_spot_instances=True,\n",
+ " # max_wait=7200, # Seconds to wait for spot instances to become available\n",
+ " py_version=\"py3\",\n",
+ " framework_version=\"2.1.0\",\n",
+ " hyperparameters={\n",
+ " \"epochs\": epochs,\n",
+ " \"learning_rate\": learning_rate,\n",
+ " \"epsilon\": epsilon,\n",
+ " \"train_batch_size\": train_batch_size,\n",
+ " \"validation_batch_size\": validation_batch_size,\n",
+ " \"test_batch_size\": test_batch_size,\n",
+ " \"train_steps_per_epoch\": train_steps_per_epoch,\n",
+ " \"validation_steps\": validation_steps,\n",
+ " \"test_steps\": test_steps,\n",
+ " \"use_xla\": use_xla,\n",
+ " \"use_amp\": use_amp,\n",
+ " \"max_seq_length\": max_seq_length,\n",
+ " \"freeze_bert_layer\": freeze_bert_layer,\n",
+ " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
+ " \"enable_checkpointing\": enable_checkpointing,\n",
+ " \"enable_tensorboard\": enable_tensorboard,\n",
+ " \"run_validation\": run_validation,\n",
+ " \"run_test\": run_test,\n",
+ " \"run_sample_predictions\": run_sample_predictions,\n",
+ " },\n",
+ " input_mode=input_mode,\n",
+ " enable_network_isolation=True,\n",
+ ")"
]
},
{
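Note the two deliberate differences from the previous notebook's estimator: input_mode="Pipe" streams each channel into the container instead of downloading it up front, and enable_network_isolation=True blocks outbound network access from the training container. The training script detects Pipe mode from the SM_INPUT_DATA_CONFIG environment variable; a sketch of that check with an illustrative value:

    import json
    import os

    # Illustrative value of SM_INPUT_DATA_CONFIG inside the training container
    os.environ["SM_INPUT_DATA_CONFIG"] = json.dumps(
        {"train": {"TrainingInputMode": "Pipe"}, "validation": {"TrainingInputMode": "Pipe"}}
    )

    # The same substring check tf_bert_reviews.py uses (see the diff further down)
    pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
    pipe_mode = pipe_mode_str.find("Pipe") >= 0
    print("Using pipe_mode: {}".format(pipe_mode))  # -> True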
@@ -415,11 +415,10 @@
"metadata": {},
"outputs": [],
"source": [
- "estimator.fit(inputs={'train': s3_input_train_data, \n",
- " 'validation': s3_input_validation_data,\n",
- " 'test': s3_input_test_data\n",
- " }, \n",
- " wait=False)"
+ "estimator.fit(\n",
+ " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
+ " wait=False,\n",
+ ")"
]
},
{
@@ -437,7 +436,7 @@
],
"source": [
"training_job_name = estimator.latest_training_job.name\n",
- "print('Training Job Name: {}'.format(training_job_name))"
+ "print(\"Training Job Name: {}\".format(training_job_name))"
]
},
{
@@ -461,7 +460,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review Training Job After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -485,7 +490,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review CloudWatch Logs After About 5 Minutes'.format(\n",
+ " region, training_job_name\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
@@ -509,7 +520,13 @@
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
- "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n"
+ "display(\n",
+ " HTML(\n",
+ " 'Review S3 Output Data After The Training Job Has Completed'.format(\n",
+ " bucket, training_job_name, region\n",
+ " )\n",
+ " )\n",
+ ")"
]
},
{
diff --git a/12_security/src/inference.py b/12_security/src/inference.py
index 2975dc2d..53196737 100644
--- a/12_security/src/inference.py
+++ b/12_security/src/inference.py
@@ -1,102 +1,97 @@
import json
import subprocess
import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
# Workaround for https://github.com/huggingface/tokenizers/issues/120 and
# https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
import tensorflow as tf
from transformers import DistilBertTokenizer
-classes=[1, 2, 3, 4, 5]
-max_seq_length=64
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def input_handler(data, context):
- data_str = data.read().decode('utf-8')
- print('data_str: {}'.format(data_str))
- print('type data_str: {}'.format(type(data_str)))
-
+ data_str = data.read().decode("utf-8")
+ print("data_str: {}".format(data_str))
+ print("type data_str: {}".format(type(data_str)))
+
jsonlines = data_str.split("\n")
- print('jsonlines: {}'.format(jsonlines))
- print('type jsonlines: {}'.format(type(jsonlines)))
-
+ print("jsonlines: {}".format(jsonlines))
+ print("type jsonlines: {}".format(type(jsonlines)))
+
transformed_instances = []
-
+
for jsonline in jsonlines:
- print('jsonline: {}'.format(jsonline))
- print('type jsonline: {}'.format(type(jsonline)))
+ print("jsonline: {}".format(jsonline))
+ print("type jsonline: {}".format(type(jsonline)))
# features[0] is review_body
        # features[1..n] are others (i.e. 1: product_category, etc.)
review_body = json.loads(jsonline)["features"][0]
print("""review_body: {}""".format(review_body))
-
- encode_plus_tokens = tokenizer.encode_plus(review_body,
- pad_to_max_length=True,
- max_length=max_seq_length,
- truncation=True)
+
+ encode_plus_tokens = tokenizer.encode_plus(
+ review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+ )
# Convert the text-based tokens to ids from the pre-trained BERT vocabulary
- input_ids = encode_plus_tokens['input_ids']
-
+ input_ids = encode_plus_tokens["input_ids"]
+
# Specifies which tokens BERT should pay attention to (0 or 1)
- input_mask = encode_plus_tokens['attention_mask']
-
- transformed_instance = {
- "input_ids": input_ids,
- "input_mask": input_mask
- }
-
+ input_mask = encode_plus_tokens["attention_mask"]
+
+ transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
transformed_instances.append(transformed_instance)
-
- transformed_data = {
- "signature_name":"serving_default",
- "instances": transformed_instances
- }
+
+ transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
transformed_data_json = json.dumps(transformed_data)
- print('transformed_data_json: {}'.format(transformed_data_json))
-
+ print("transformed_data_json: {}".format(transformed_data_json))
+
return transformed_data_json
def output_handler(response, context):
- print('response: {}'.format(response))
+ print("response: {}".format(response))
response_json = response.json()
- print('response_json: {}'.format(response_json))
-
+ print("response_json: {}".format(response_json))
+
log_probabilities = response_json["predictions"]
- print('log_probabilities: {}'.format(log_probabilities))
-
+ print("log_probabilities: {}".format(log_probabilities))
+
predicted_classes = []
for log_probability in log_probabilities:
- print('log_probability in loop: {}'.format(log_probability))
- print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
- softmax = tf.nn.softmax(log_probability)
-
- predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+ print("log_probability in loop: {}".format(log_probability))
+ print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+ softmax = tf.nn.softmax(log_probability)
+
+ predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
predicted_class = classes[predicted_class_idx]
- print('predicted_class: {}'.format(predicted_class))
+ print("predicted_class: {}".format(predicted_class))
prediction_dict = {}
- prediction_dict['predicted_label'] = predicted_class
-
+ prediction_dict["predicted_label"] = predicted_class
+
jsonline = json.dumps(prediction_dict)
- print('jsonline: {}'.format(jsonline))
-
+ print("jsonline: {}".format(jsonline))
+
predicted_classes.append(jsonline)
- print('predicted_classes in the loop: {}'.format(predicted_classes))
-
- predicted_classes_jsonlines = '\n'.join(predicted_classes)
- print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+ print("predicted_classes in the loop: {}".format(predicted_classes))
+
+ predicted_classes_jsonlines = "\n".join(predicted_classes)
+ print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
response_content_type = context.accept_header
-
- return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+ return predicted_classes_jsonlines, response_content_type
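To make the request/response contract of these handlers concrete: both sides speak JSON Lines, with the review text as features[0] on the way in and one predicted_label object per line on the way out. A sketch of the payloads (values illustrative):

    import json

    # Request body parsed by input_handler(): one JSON object per line
    request_body = "\n".join(
        [
            json.dumps({"features": ["I loved it! I will recommend this to everyone."]}),
            json.dumps({"features": ["Really bad. I hope they don't make this anymore."]}),
        ]
    )
    print(request_body)

    # input_handler() tokenizes each review_body and forwards the TensorFlow Serving
    # payload: {"signature_name": "serving_default",
    #           "instances": [{"input_ids": [...], "input_mask": [...]}, ...]}
    #
    # output_handler() softmaxes each entry of response_json["predictions"] and
    # returns JSON Lines such as:
    print(json.dumps({"predicted_label": 5}))
    print(json.dumps({"predicted_label": 1}))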
diff --git a/12_security/src/tf_bert_reviews.py b/12_security/src/tf_bert_reviews.py
index 8c46a2ef..30ad69de 100644
--- a/12_security/src/tf_bert_reviews.py
+++ b/12_security/src/tf_bert_reviews.py
@@ -9,91 +9,92 @@
import sys
import os
import tensorflow as tf
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==2.8.0"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers.configuration_distilbert import DistilBertConfig
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
CLASSES = [1, 2, 3, 4, 5]
def select_data_and_label_from_record(record):
- x = {
- 'input_ids': record['input_ids'],
- 'input_mask': record['input_mask'],
- 'segment_ids': record['segment_ids']
- }
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
- y = record['label_ids']
+ y = record["label_ids"]
return (x, y)
-def file_based_input_dataset_builder(channel,
- input_filenames,
- pipe_mode,
- is_training,
- drop_remainder,
- batch_size,
- epochs,
- steps_per_epoch,
- max_seq_length):
+def file_based_input_dataset_builder(
+ channel,
+ input_filenames,
+ pipe_mode,
+ is_training,
+ drop_remainder,
+ batch_size,
+ epochs,
+ steps_per_epoch,
+ max_seq_length,
+):
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode:
- print('***** Using pipe_mode with channel {}'.format(channel))
+ print("***** Using pipe_mode with channel {}".format(channel))
from sagemaker_tensorflow import PipeModeDataset
- dataset = PipeModeDataset(channel=channel,
- record_format='TFRecord')
+
+ dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
else:
- print('***** Using input_filenames {}'.format(input_filenames))
+ print("***** Using input_filenames {}".format(input_filenames))
dataset = tf.data.TFRecordDataset(input_filenames)
dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
name_to_features = {
- "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
- "label_ids": tf.io.FixedLenFeature([], tf.int64),
+ "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+ "label_ids": tf.io.FixedLenFeature([], tf.int64),
}
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
record = tf.io.parse_single_example(record, name_to_features)
# TODO: wip/bert/bert_attention_head_view/train.py
- # Convert input_ids into input_tokens with DistilBert vocabulary
+ # Convert input_ids into input_tokens with DistilBert vocabulary
# if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
# hook._write_raw_tensor_simple("input_tokens", input_tokens)
return record
-
+
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
- lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- drop_remainder=drop_remainder,
- num_parallel_calls=tf.data.experimental.AUTOTUNE))
+ lambda record: _decode_record(record, name_to_features),
+ batch_size=batch_size,
+ drop_remainder=drop_remainder,
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ )
+ )
-# dataset.cache()
+ # dataset.cache()
- dataset = dataset.shuffle(buffer_size=1000,
- reshuffle_each_iteration=True)
+ dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
row_count = 0
- print('**************** {} *****************'.format(channel))
+ print("**************** {} *****************".format(channel))
for row in dataset.as_numpy_iterator():
print(row)
if row_count == 5:
@@ -106,236 +107,178 @@ def _decode_record(record, name_to_features):
def load_checkpoint_model(checkpoint_path):
import glob
import os
-
- glob_pattern = os.path.join(checkpoint_path, '*.h5')
- print('glob pattern {}'.format(glob_pattern))
+
+ glob_pattern = os.path.join(checkpoint_path, "*.h5")
+ print("glob pattern {}".format(glob_pattern))
list_of_checkpoint_files = glob.glob(glob_pattern)
- print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+ print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
latest_checkpoint_file = max(list_of_checkpoint_files)
- print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+ print("Latest checkpoint file {}".format(latest_checkpoint_file))
- initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+ initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
initial_epoch_number = int(initial_epoch_number_str)
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
- latest_checkpoint_file,
- config=config)
-    print('loaded_model {}'.format(loaded_model))
-    print('initial_epoch_number {}'.format(initial_epoch_number))
-
+    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config)
+
+    print("loaded_model {}".format(loaded_model))
+    print("initial_epoch_number {}".format(initial_epoch_number))
return loaded_model, initial_epoch_number
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--train_data',
- type=str,
- default=os.environ['SM_CHANNEL_TRAIN'])
- parser.add_argument('--validation_data',
- type=str,
- default=os.environ['SM_CHANNEL_VALIDATION'])
- parser.add_argument('--test_data',
- type=str,
- default=os.environ['SM_CHANNEL_TEST'])
- parser.add_argument('--output_dir',
- type=str,
- default=os.environ['SM_OUTPUT_DIR'])
- parser.add_argument('--hosts',
- type=list,
- default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current_host',
- type=str,
- default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--num_gpus',
- type=int,
- default=os.environ['SM_NUM_GPUS'])
- parser.add_argument('--checkpoint_base_path',
- type=str,
- default='/opt/ml/checkpoints')
- parser.add_argument('--use_xla',
- type=eval,
- default=False)
- parser.add_argument('--use_amp',
- type=eval,
- default=False)
- parser.add_argument('--max_seq_length',
- type=int,
- default=64)
- parser.add_argument('--train_batch_size',
- type=int,
- default=128)
- parser.add_argument('--validation_batch_size',
- type=int,
- default=256)
- parser.add_argument('--test_batch_size',
- type=int,
- default=256)
- parser.add_argument('--epochs',
- type=int,
- default=2)
- parser.add_argument('--learning_rate',
- type=float,
- default=0.00003)
- parser.add_argument('--epsilon',
- type=float,
- default=0.00000001)
- parser.add_argument('--train_steps_per_epoch',
- type=int,
- default=None)
- parser.add_argument('--validation_steps',
- type=int,
- default=None)
- parser.add_argument('--test_steps',
- type=int,
- default=None)
- parser.add_argument('--freeze_bert_layer',
- type=eval,
- default=False)
- parser.add_argument('--enable_sagemaker_debugger',
- type=eval,
- default=False)
- parser.add_argument('--run_validation',
- type=eval,
- default=False)
- parser.add_argument('--run_test',
- type=eval,
- default=False)
- parser.add_argument('--run_sample_predictions',
- type=eval,
- default=False)
- parser.add_argument('--enable_tensorboard',
- type=eval,
- default=False)
- parser.add_argument('--enable_checkpointing',
- type=eval,
- default=False)
- parser.add_argument('--output_data_dir', # This is unused
- type=str,
- default=os.environ['SM_OUTPUT_DATA_DIR'])
-
+ parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
+ parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"])
+ parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"])
+ parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints")
+ parser.add_argument("--use_xla", type=eval, default=False)
+ parser.add_argument("--use_amp", type=eval, default=False)
+ parser.add_argument("--max_seq_length", type=int, default=64)
+ parser.add_argument("--train_batch_size", type=int, default=128)
+ parser.add_argument("--validation_batch_size", type=int, default=256)
+ parser.add_argument("--test_batch_size", type=int, default=256)
+ parser.add_argument("--epochs", type=int, default=2)
+ parser.add_argument("--learning_rate", type=float, default=0.00003)
+ parser.add_argument("--epsilon", type=float, default=0.00000001)
+ parser.add_argument("--train_steps_per_epoch", type=int, default=None)
+ parser.add_argument("--validation_steps", type=int, default=None)
+ parser.add_argument("--test_steps", type=int, default=None)
+ parser.add_argument("--freeze_bert_layer", type=eval, default=False)
+ parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False)
+ parser.add_argument("--run_validation", type=eval, default=False)
+ parser.add_argument("--run_test", type=eval, default=False)
+ parser.add_argument("--run_sample_predictions", type=eval, default=False)
+ parser.add_argument("--enable_tensorboard", type=eval, default=False)
+ parser.add_argument("--enable_checkpointing", type=eval, default=False)
+ parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused
+
# This points to the S3 location - this should not be used by our code
# We should use /opt/ml/model/ instead
- # parser.add_argument('--model_dir',
- # type=str,
+ # parser.add_argument('--model_dir',
+ # type=str,
# default=os.environ['SM_MODEL_DIR'])
-
+
args, _ = parser.parse_known_args()
- print("Args:")
+ print("Args:")
print(args)
-
- env_var = os.environ
- print("Environment Variables:")
- pprint.pprint(dict(env_var), width = 1)
-
- print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
- sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
- is_master = sm_training_env_json['is_master']
- print('is_master {}'.format(is_master))
-
+
+ env_var = os.environ
+ print("Environment Variables:")
+ pprint.pprint(dict(env_var), width=1)
+
+ print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"]))
+ sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"])
+ is_master = sm_training_env_json["is_master"]
+ print("is_master {}".format(is_master))
+
train_data = args.train_data
- print('train_data {}'.format(train_data))
+ print("train_data {}".format(train_data))
validation_data = args.validation_data
- print('validation_data {}'.format(validation_data))
+ print("validation_data {}".format(validation_data))
test_data = args.test_data
- print('test_data {}'.format(test_data))
- local_model_dir = os.environ['SM_MODEL_DIR']
+ print("test_data {}".format(test_data))
+ local_model_dir = os.environ["SM_MODEL_DIR"]
output_dir = args.output_dir
- print('output_dir {}'.format(output_dir))
+ print("output_dir {}".format(output_dir))
hosts = args.hosts
- print('hosts {}'.format(hosts))
+ print("hosts {}".format(hosts))
current_host = args.current_host
- print('current_host {}'.format(current_host))
+ print("current_host {}".format(current_host))
num_gpus = args.num_gpus
- print('num_gpus {}'.format(num_gpus))
- job_name = os.environ['SAGEMAKER_JOB_NAME']
- print('job_name {}'.format(job_name))
+ print("num_gpus {}".format(num_gpus))
+ job_name = os.environ["SAGEMAKER_JOB_NAME"]
+ print("job_name {}".format(job_name))
use_xla = args.use_xla
- print('use_xla {}'.format(use_xla))
+ print("use_xla {}".format(use_xla))
use_amp = args.use_amp
- print('use_amp {}'.format(use_amp))
+ print("use_amp {}".format(use_amp))
max_seq_length = args.max_seq_length
- print('max_seq_length {}'.format(max_seq_length))
+ print("max_seq_length {}".format(max_seq_length))
train_batch_size = args.train_batch_size
- print('train_batch_size {}'.format(train_batch_size))
+ print("train_batch_size {}".format(train_batch_size))
validation_batch_size = args.validation_batch_size
- print('validation_batch_size {}'.format(validation_batch_size))
+ print("validation_batch_size {}".format(validation_batch_size))
test_batch_size = args.test_batch_size
- print('test_batch_size {}'.format(test_batch_size))
+ print("test_batch_size {}".format(test_batch_size))
epochs = args.epochs
- print('epochs {}'.format(epochs))
+ print("epochs {}".format(epochs))
learning_rate = args.learning_rate
- print('learning_rate {}'.format(learning_rate))
+ print("learning_rate {}".format(learning_rate))
epsilon = args.epsilon
- print('epsilon {}'.format(epsilon))
+ print("epsilon {}".format(epsilon))
train_steps_per_epoch = args.train_steps_per_epoch
- print('train_steps_per_epoch {}'.format(train_steps_per_epoch))
+ print("train_steps_per_epoch {}".format(train_steps_per_epoch))
validation_steps = args.validation_steps
- print('validation_steps {}'.format(validation_steps))
+ print("validation_steps {}".format(validation_steps))
test_steps = args.test_steps
- print('test_steps {}'.format(test_steps))
+ print("test_steps {}".format(test_steps))
freeze_bert_layer = args.freeze_bert_layer
- print('freeze_bert_layer {}'.format(freeze_bert_layer))
+ print("freeze_bert_layer {}".format(freeze_bert_layer))
enable_sagemaker_debugger = args.enable_sagemaker_debugger
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
run_validation = args.run_validation
- print('run_validation {}'.format(run_validation))
+ print("run_validation {}".format(run_validation))
run_test = args.run_test
- print('run_test {}'.format(run_test))
+ print("run_test {}".format(run_test))
run_sample_predictions = args.run_sample_predictions
- print('run_sample_predictions {}'.format(run_sample_predictions))
+ print("run_sample_predictions {}".format(run_sample_predictions))
enable_tensorboard = args.enable_tensorboard
- print('enable_tensorboard {}'.format(enable_tensorboard))
+ print("enable_tensorboard {}".format(enable_tensorboard))
enable_checkpointing = args.enable_checkpointing
- print('enable_checkpointing {}'.format(enable_checkpointing))
+ print("enable_checkpointing {}".format(enable_checkpointing))
checkpoint_base_path = args.checkpoint_base_path
- print('checkpoint_base_path {}'.format(checkpoint_base_path))
+ print("checkpoint_base_path {}".format(checkpoint_base_path))
if is_master:
checkpoint_path = checkpoint_base_path
else:
- checkpoint_path = '/tmp/checkpoints'
- print('checkpoint_path {}'.format(checkpoint_path))
-
- # Determine if PipeMode is enabled
- pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
- pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
- print('Using pipe_mode: {}'.format(pipe_mode))
-
- # Model Output
- transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')
+ checkpoint_path = "/tmp/checkpoints"
+ print("checkpoint_path {}".format(checkpoint_path))
+
+ # Determine if PipeMode is enabled
+ pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "")
+ pipe_mode = pipe_mode_str.find("Pipe") >= 0
+ print("Using pipe_mode: {}".format(pipe_mode))
+
+ # Model Output
+ transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/")
os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)
# SavedModel Output
- tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
+ tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0")
os.makedirs(tensorflow_saved_model_path, exist_ok=True)
- # Tensorboard Logs
- tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
+ # Tensorboard Logs
+ tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/")
os.makedirs(tensorboard_logs_path, exist_ok=True)
# Commented out due to incompatibility with transformers library (possibly)
- # Set the global precision mixed_precision policy to "mixed_float16"
-# mixed_precision_policy = 'mixed_float16'
-# print('Mixed precision policy {}'.format(mixed_precision_policy))
-# policy = mixed_precision.Policy(mixed_precision_policy)
-# mixed_precision.set_policy(policy)
-
+ # Set the global precision mixed_precision policy to "mixed_float16"
+ # mixed_precision_policy = 'mixed_float16'
+ # print('Mixed precision policy {}'.format(mixed_precision_policy))
+ # policy = mixed_precision.Policy(mixed_precision_policy)
+ # mixed_precision.set_policy(policy)
+
distributed_strategy = tf.distribute.MirroredStrategy()
# Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0
- #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+ # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with distributed_strategy.scope():
tf.config.optimizer.set_jit(use_xla)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})
- train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
- print('train_data_filenames {}'.format(train_data_filenames))
+ train_data_filenames = glob(os.path.join(train_data, "*.tfrecord"))
+ print("train_data_filenames {}".format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
- channel='train',
+ channel="train",
input_filenames=train_data_filenames,
pipe_mode=pipe_mode,
is_training=True,
@@ -343,7 +286,8 @@ def load_checkpoint_model(checkpoint_path):
batch_size=train_batch_size,
epochs=epochs,
steps_per_epoch=train_steps_per_epoch,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
tokenizer = None
config = None
@@ -352,83 +296,82 @@ def load_checkpoint_model(checkpoint_path):
# This is required when launching many instances at once... the urllib request seems to get denied periodically
successful_download = False
retries = 0
- while (retries < 5 and not successful_download):
+ while retries < 5 and not successful_download:
try:
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
- num_labels=len(CLASSES))
- model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
- config=config)
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=len(CLASSES))
+ model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
successful_download = True
- print('Sucessfully downloaded after {} retries.'.format(retries))
+ print("Sucessfully downloaded after {} retries.".format(retries))
except:
retries = retries + 1
random_sleep = random.randint(1, 30)
- print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+ print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
time.sleep(random_sleep)
callbacks = []
- initial_epoch_number = 0
+ initial_epoch_number = 0
if enable_checkpointing:
- print('***** Checkpoint enabled *****')
-
- os.makedirs(checkpoint_path, exist_ok=True)
+ print("***** Checkpoint enabled *****")
+
+ os.makedirs(checkpoint_path, exist_ok=True)
if os.listdir(checkpoint_path):
- print('***** Found checkpoint *****')
+ print("***** Found checkpoint *****")
print(checkpoint_path)
model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
- print('***** Using checkpoint model {} *****'.format(model))
-
+ print("***** Using checkpoint model {} *****".format(model))
+
checkpoint_callback = ModelCheckpoint(
- filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
- save_weights_only=False,
- verbose=1,
- monitor='val_accuracy')
- print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+ filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+ save_weights_only=False,
+ verbose=1,
+ monitor="val_accuracy",
+ )
+ print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
callbacks.append(checkpoint_callback)
if not tokenizer or not model or not config:
- print('Not properly initialized...')
+ print("Not properly initialized...")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
- print('** use_amp {}'.format(use_amp))
+ print("** use_amp {}".format(use_amp))
if use_amp:
# loss scaling is currently required when using mixed precision
- optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+ optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
- print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+ print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
if enable_sagemaker_debugger:
- print('*** DEBUGGING ***')
+ print("*** DEBUGGING ***")
import smdebug.tensorflow as smd
+
# This assumes that we specified debugger_hook_config
debugger_callback = smd.KerasHook.create_from_json_file()
- print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+ print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
callbacks.append(debugger_callback)
optimizer = debugger_callback.wrap_optimizer(optimizer)
- if enable_tensorboard:
- tensorboard_callback = tf.keras.callbacks.TensorBoard(
- log_dir=tensorboard_logs_path)
- print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
+ if enable_tensorboard:
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
+ print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback))
callbacks.append(tensorboard_callback)
-
- print('*** OPTIMIZER {} ***'.format(optimizer))
-
+
+ print("*** OPTIMIZER {} ***".format(optimizer))
+
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+ metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
- print('Compiled model {}'.format(model))
+ print("Compiled model {}".format(model))
model.layers[0].trainable = not freeze_bert_layer
print(model.summary())
if run_validation:
- validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
- print('validation_data_filenames {}'.format(validation_data_filenames))
+ validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord"))
+ print("validation_data_filenames {}".format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
- channel='validation',
+ channel="validation",
input_filenames=validation_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -436,34 +379,39 @@ def load_checkpoint_model(checkpoint_path):
batch_size=validation_batch_size,
epochs=epochs,
steps_per_epoch=validation_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting Training and Validation...')
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting Training and Validation...")
validation_dataset = validation_dataset.take(validation_steps)
- train_and_validation_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- validation_data=validation_dataset,
- validation_steps=validation_steps,
- callbacks=callbacks)
+ train_and_validation_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ validation_data=validation_dataset,
+ validation_steps=validation_steps,
+ callbacks=callbacks,
+ )
print(train_and_validation_history)
- else: # Not running validation
- print('Starting Training (Without Validation)...')
- train_history = model.fit(train_dataset,
- shuffle=True,
- epochs=epochs,
- initial_epoch=initial_epoch_number,
- steps_per_epoch=train_steps_per_epoch,
- callbacks=callbacks)
+ else: # Not running validation
+ print("Starting Training (Without Validation)...")
+ train_history = model.fit(
+ train_dataset,
+ shuffle=True,
+ epochs=epochs,
+ initial_epoch=initial_epoch_number,
+ steps_per_epoch=train_steps_per_epoch,
+ callbacks=callbacks,
+ )
print(train_history)
if run_test:
- test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
- print('test_data_filenames {}'.format(test_data_filenames))
+ test_data_filenames = glob(os.path.join(test_data, "*.tfrecord"))
+ print("test_data_filenames {}".format(test_data_filenames))
test_dataset = file_based_input_dataset_builder(
- channel='test',
+ channel="test",
input_filenames=test_data_filenames,
pipe_mode=pipe_mode,
is_training=False,
@@ -471,138 +419,139 @@ def load_checkpoint_model(checkpoint_path):
batch_size=test_batch_size,
epochs=epochs,
steps_per_epoch=test_steps,
- max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
- print('Starting test...')
- test_history = model.evaluate(test_dataset,
- steps=test_steps,
- callbacks=callbacks)
-
- print('Test history {}'.format(test_history))
-
+ max_seq_length=max_seq_length,
+ ).map(select_data_and_label_from_record)
+
+ print("Starting test...")
+ test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+ print("Test history {}".format(test_history))
+
    # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
- print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+ print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
model.save_pretrained(transformer_fine_tuned_model_path)
# Save the TensorFlow SavedModel for Serving Predictions
- print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
- model.save(tensorflow_saved_model_path, save_format='tf')
-
+ print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+ model.save(tensorflow_saved_model_path, save_format="tf")
+
# Copy inference.py and requirements.txt to the code/ directory
# Note: This is required for the SageMaker Endpoint to pick them up.
# This appears to be hard-coded and must be called code/
- inference_path = os.path.join(local_model_dir, 'code/')
- print('Copying inference source files to {}'.format(inference_path))
- os.makedirs(inference_path, exist_ok=True)
- os.system('cp inference.py {}'.format(inference_path))
- print(glob(inference_path))
-# os.system('cp requirements.txt {}/code'.format(inference_path))
-
+ inference_path = os.path.join(local_model_dir, "code/")
+ print("Copying inference source files to {}".format(inference_path))
+ os.makedirs(inference_path, exist_ok=True)
+ os.system("cp inference.py {}".format(inference_path))
+ print(glob(inference_path))
+ # os.system('cp requirements.txt {}/code'.format(inference_path))
+
if run_sample_predictions:
- loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path,
- id2label={
- 0: 1,
- 1: 2,
- 2: 3,
- 3: 4,
- 4: 5
- },
- label2id={
- 1: 0,
- 2: 1,
- 3: 2,
- 4: 3,
- 5: 4
- })
-
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+ loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
+ transformer_fine_tuned_model_path,
+ id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+ label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+ )
+
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
if num_gpus >= 1:
- inference_device = 0 # GPU 0
+ inference_device = 0 # GPU 0
else:
- inference_device = -1 # CPU
- print('inference_device {}'.format(inference_device))
+ inference_device = -1 # CPU
+ print("inference_device {}".format(inference_device))
- inference_pipeline = TextClassificationPipeline(model=loaded_model,
- tokenizer=tokenizer,
- framework='tf',
- device=inference_device)
+ inference_pipeline = TextClassificationPipeline(
+ model=loaded_model, tokenizer=tokenizer, framework="tf", device=inference_device
+ )
- print("""I loved it! I will recommend this to everyone.""", inference_pipeline("""I loved it! I will recommend this to everyone."""))
+ print(
+ """I loved it! I will recommend this to everyone.""",
+ inference_pipeline("""I loved it! I will recommend this to everyone."""),
+ )
print("""It's OK.""", inference_pipeline("""It's OK."""))
- print("""Really bad. I hope they don't make this anymore.""", inference_pipeline("""Really bad. I hope they don't make this anymore."""))
+ print(
+ """Really bad. I hope they don't make this anymore.""",
+ inference_pipeline("""Really bad. I hope they don't make this anymore."""),
+ )
import csv
- df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
- delimiter='\t',
- quoting=csv.QUOTE_NONE,
- compression='gzip')[['review_body', 'star_rating']]
+ df_test_reviews = pd.read_csv(
+ "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+ delimiter="\t",
+ quoting=csv.QUOTE_NONE,
+ compression="gzip",
+ )[["review_body", "star_rating"]]
df_test_reviews = df_test_reviews.sample(n=100)
df_test_reviews.shape
df_test_reviews.head()
-
+
import pandas as pd
def predict(review_body):
prediction_map = inference_pipeline(review_body)
- return prediction_map[0]['label']
+ return prediction_map[0]["label"]
- y_test = df_test_reviews['review_body'].map(predict)
+ y_test = df_test_reviews["review_body"].map(predict)
y_test
-
- y_actual = df_test_reviews['star_rating']
+
+ y_actual = df_test_reviews["star_rating"]
y_actual
from sklearn.metrics import classification_report
+
    print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
from sklearn.metrics import accuracy_score
- print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=y_actual))
-
+
+ print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_actual))
+
import matplotlib.pyplot as plt
import pandas as pd
- def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+ def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
print(cm)
- plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.imshow(cm, interpolation="nearest", cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
- fmt = 'd'
- thresh = cm.max() / 2.
+ fmt = "d"
+ thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plt.text(j, i, format(cm[i, j], fmt),
- horizontalalignment="center",
- color="black" if cm[i, j] > thresh else "black")
+ plt.text(
+ j,
+ i,
+ format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="black" if cm[i, j] > thresh else "black",
+ )
plt.tight_layout()
- plt.ylabel('True label')
- plt.xlabel('Predicted label')
-
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label")
+
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
+
#%matplotlib inline
#%config InlineBackend.figure_format='retina'
    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)
plt.figure()
- fig, ax = plt.subplots(figsize=(10,5))
- plot_conf_mat(cm,
- classes=['1', '2', '3', '4', '5'],
- title='Confusion Matrix')
+ fig, ax = plt.subplots(figsize=(10, 5))
+ plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")
- # Save the confusion matrix
+ # Save the confusion matrix
plt.show()
-
- # Model Output
- metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+ # Model Output
+ metrics_path = os.path.join(local_model_dir, "metrics/")
os.makedirs(metrics_path, exist_ok=True)
- plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+ plt.savefig("{}/confusion_matrix.png".format(metrics_path))
diff --git a/99_cleanup/01_Cleanup.ipynb b/99_cleanup/01_Cleanup.ipynb
index 75d00ab1..8573ab45 100644
--- a/99_cleanup/01_Cleanup.ipynb
+++ b/99_cleanup/01_Cleanup.ipynb
@@ -14,8 +14,8 @@
"bucket = sagemaker_session.default_bucket()\n",
"region = boto3.Session().region_name\n",
"\n",
- "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
- "comprehend = boto3.Session().client(service_name='comprehend', region_name=region)"
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+ "comprehend = boto3.Session().client(service_name=\"comprehend\", region_name=region)"
]
},
{
@@ -70,9 +70,7 @@
"outputs": [],
"source": [
"try:\n",
- " comprehend.delete_endpoint(\n",
- " EndpointArn=comprehend_endpoint_arn\n",
- " )\n",
+ " comprehend.delete_endpoint(EndpointArn=comprehend_endpoint_arn)\n",
"except:\n",
" pass"
]
@@ -84,9 +82,7 @@
"outputs": [],
"source": [
"try:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=autopilot_endpoint_name\n",
- " )\n",
+ " sm.delete_endpoint(EndpointName=autopilot_endpoint_name)\n",
"except:\n",
" pass"
]
@@ -98,9 +94,7 @@
"outputs": [],
"source": [
"try:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=tensorflow_endpoint_name\n",
- " )\n",
+ " sm.delete_endpoint(EndpointName=tensorflow_endpoint_name)\n",
"except:\n",
" pass"
]
@@ -112,9 +106,7 @@
"outputs": [],
"source": [
"try:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=pytorch_endpoint_name\n",
- " )\n",
+ " sm.delete_endpoint(EndpointName=pytorch_endpoint_name)\n",
"except:\n",
" pass"
]
@@ -126,9 +118,7 @@
"outputs": [],
"source": [
"try:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=bandit_experiment_name\n",
- " )\n",
+ " sm.delete_endpoint(EndpointName=bandit_experiment_name)\n",
"except:\n",
" pass"
]
@@ -140,9 +130,7 @@
"outputs": [],
"source": [
"try:\n",
- " sm.delete_endpoint(\n",
- " EndpointName=pipeline_endpoint_name\n",
- " )\n",
+ " sm.delete_endpoint(EndpointName=pipeline_endpoint_name)\n",
"except:\n",
" pass"
]
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..fa2ea559
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.black]
+line-length = 119
+exclude = '''
+(
+ /(
+ \.eggs
+ | \.git
+ | \.mypy_cache
+ | build
+ | dist
+ | spark
+ | jars
+ | \.jar
+ | wip
+ )
+)
+'''