From 21631fa620b3e109a0720e7ec8ea8b9ea3536b71 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Thu, 11 Feb 2021 15:24:13 -0800
Subject: [PATCH] Format all working *.py and *.ipynb

---
 00_quickstart/01_Setup_Dependencies.ipynb | 4 +-
 00_quickstart/02_Copy_TSV_To_S3.ipynb | 24 +-
 00_quickstart/03_Create_Athena_Database.ipynb | 27 +-
 .../04_Register_S3_TSV_With_Athena.ipynb | 59 +-
 ...onvert_S3_TSV_To_Parquet_With_Athena.ipynb | 59 +-
 .../06_Visualize_Reviews_Dataset.ipynb | 340 +++--
 .../07_Prepare_Dataset_Bias_Analysis.ipynb | 84 +-
 .../08_Run_Data_Bias_Analysis_AdHoc.ipynb | 40 +-
 ...Run_Data_Bias_Analysis_ProcessingJob.ipynb | 73 +-
 ...eate_SageMaker_Pipeline_BERT_Reviews.ipynb | 563 ++++-----
 .../11_Evaluate_Pipeline_Execution.ipynb | 73 +-
 00_quickstart/12_Register_Deploy_Model.ipynb | 107 +-
 00_quickstart/13_Cleanup.ipynb | 4 +-
 00_quickstart/evaluate_model_metrics.py | 218 ++--
 ...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
 00_quickstart/src/inference.py | 113 +-
 00_quickstart/src/tf_bert_reviews.py | 676 +++++-----
 01_setup/01_Setup_Dependencies.ipynb | 4 +-
 01_setup/02_Check_Environment.ipynb | 57 +-
 01_setup/03_Create_S3_Bucket.ipynb | 10 +-
 .../04_Update_IAM_Roles_And_Policies.ipynb | 362 +++---
 02_usecases/01_Setup.ipynb | 2 +-
 02_usecases/03_Celebrity_Recognition.ipynb | 127 +-
 02_usecases/04_Content_Moderation.ipynb | 73 +-
 .../05_Inappropriate_Text_Detection.ipynb | 158 ++-
 ..._Text_Classification_Prepare_Dataset.ipynb | 104 +-
 .../07_Text_Classification_Train_Model.ipynb | 232 ++--
 .../08_Text_Classification_Predict.ipynb | 67 +-
 .../archive/05_Celebrity_Detection.ipynb | 126 +-
 03_automl/01_Prepare_Dataset_Autopilot.ipynb | 156 ++-
 03_automl/02_Train_Reviews_Autopilot.ipynb | 502 +++++---
 03_automl/03_Predict_Reviews_Autopilot.ipynb | 52 +-
 .../candidate_data_processors/dpp0.py | 27 +-
 .../candidate_data_processors/dpp1.py | 28 +-
 .../candidate_data_processors/dpp2.py | 27 +-
 .../sagemaker_serve.py | 86 +-
 03_automl/generated_module/setup.py | 16 +-
 ...AutopilotCandidateDefinitionNotebook.ipynb | 245 ++--
 .../notebooks/sagemaker_automl/common.py | 71 +-
 .../notebooks/sagemaker_automl/config.py | 39 +-
 .../sagemaker_automl/interactive_runner.py | 46 +-
 .../sagemaker_automl/local_candidate.py | 84 +-
 03_automl/notebooks/sagemaker_automl/steps.py | 46 +-
 04_ingest/01_Copy_TSV_To_S3.ipynb | 72 +-
 04_ingest/02_Create_Athena_Database.ipynb | 27 +-
 .../03_Register_S3_TSV_With_Athena.ipynb | 59 +-
 ...onvert_S3_TSV_To_Parquet_With_Athena.ipynb | 59 +-
 .../05_Query_Data_With_AWS_DataWrangler.ipynb | 76 +-
 05_explore/01_Visualize_Reviews_Dataset.ipynb | 338 +++--
 .../02_Prepare_Dataset_Bias_Analysis.ipynb | 84 +-
 .../03_Run_Data_Bias_Analysis_AdHoc.ipynb | 40 +-
 ...Run_Data_Bias_Analysis_ProcessingJob.ipynb | 77 +-
 ...e_Data_Quality_ProcessingJob_PySpark.ipynb | 105 +-
 ...GENERATED_Data_Wrangler_Job_Notebook.ipynb | 35 +-
 05_explore/99_GENERATED_Python_Code.py | 151 ++-
 ...TED_SageMaker_Feature_Store_Notebook.ipynb | 215 ++--
 ...ENERATED_SageMaker_Pipeline_Notebook.ipynb | 95 +-
 .../01_Visualize_Reviews_Dataset.ipynb | 402 +++---
 .../archive/02_Explore_Redshift_Data.ipynb | 113 +-
 05_explore/preprocess-deequ-pyspark.py | 166 ++-
 ...taset_BERT_Scikit_AdHoc_FeatureStore.ipynb | 268 ++--
 ..._BERT_Scikit_ScriptMode_FeatureStore.ipynb | 250 ++--
 .../data-wrangler/DataWranglerJob_Antje.ipynb | 35 +-
 .../DataWrangler_To_FeatureStore_Antje.ipynb | 219 ++--
 .../DataWrangler_To_Pipeline_Antje.ipynb | 110 +-
 .../data-wrangler/data_wrangler_antje.py | 148 ++-
 ...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
 ...s_BERT_Transformers_TensorFlow_AdHoc.ipynb | 197 ++-
 ...T_Transformers_TensorFlow_ScriptMode.ipynb | 367 +++---
 ...T_Transformers_TensorFlow_To_PyTorch.ipynb | 105 +-
 07_train/04_Evaluate_Model_Metrics.ipynb | 161 +--
 .../00_Prepare_Dataset_BERT.ipynb | 143 ++-
 .../00_setup_eks/00_01_Setup_EKS.ipynb | 3 +-
 .../00_setup_eks/00_04_Setup_FSX.ipynb | 51 +-
 .../01_Develop_Code_Notebook.ipynb | 413 +++---
 .../03_Run_ML_Training_SageMaker.ipynb | 155 ++-
 07_train/container-demo/code/train.py | 417 +++---
 07_train/evaluate_model_metrics.py | 218 ++--
 07_train/src/inference.py | 113 +-
 07_train/src/tf_bert_reviews.py | 676 +++++-----
 ...meter_Tuning_Reviews_BERT_TensorFlow.ipynb | 233 ++--
 ...meter_Tuning_Reviews_BERT_TensorFlow.ipynb | 300 +++--
 08_optimize/src/inference.py | 113 +-
 08_optimize/src/tf_bert_reviews.py | 676 +++++-----
 ...ageMaker_Autopilot_Model_From_Athena.ipynb | 140 ++-
 ...y_Reviews_BERT_PyTorch_REST_Endpoint.ipynb | 87 +-
 ...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 105 +-
 ...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 93 +-
 ...views_BERT_TensorFlow_REST_Endpoints.ipynb | 389 +++---
 09_deploy/code-pytorch/inference.py | 96 +-
 09_deploy/code/inference.py | 113 +-
 09_deploy/common/docker_utils.py | 52 +-
 09_deploy/common/env_utils.py | 72 +-
 09_deploy/common/markdown_helper.py | 23 +-
 09_deploy/common/misc.py | 90 +-
 .../common/sagemaker_rl/coach_launcher.py | 158 +--
 .../common/sagemaker_rl/configuration_list.py | 27 +-
 09_deploy/common/sagemaker_rl/docker_utils.py | 10 +-
 09_deploy/common/sagemaker_rl/mpi_launcher.py | 104 +-
 09_deploy/common/sagemaker_rl/onnx_utils.py | 28 +-
 .../clients/ddb/experiment_db_client.py | 124 +-
 .../clients/ddb/join_db_client.py | 98 +-
 .../clients/ddb/model_db_client.py | 126 +-
 .../exceptions/ddb_client_exceptions.py | 4 +-
 .../exceptions/workflow_exceptions.py | 8 +-
 .../orchestrator/resource_manager.py | 350 +++---
 .../orchestrator/utils/cloudwatch_logger.py | 173 +--
 .../workflow/datatypes/experiment_record.py | 39 +-
 .../workflow/datatypes/join_job_record.py | 73 +-
 .../workflow/datatypes/model_record.py | 103 +-
 .../workflow/manager/experiment_manager.py | 1116 +++++++++--------
 .../workflow/manager/join_manager.py | 271 ++--
 .../workflow/manager/model_manager.py | 314 ++---
 09_deploy/common/sagemaker_rl/ray_launcher.py | 99 +-
 .../sagemaker_rl/sage_cluster_communicator.py | 21 +-
 .../sagemaker_rl/stable_baselines_launcher.py | 89 +-
 .../common/sagemaker_rl/tf_serving_utils.py | 17 +-
 09_deploy/src/eval-cfa-vw.py | 38 +-
 09_deploy/src/io_utils.py | 14 +-
 09_deploy/src/train-vw.py | 47 +-
 09_deploy/src/vw_model.py | 39 +-
 09_deploy/src/vw_utils.py | 2 +-
 ...eate_SageMaker_Pipeline_BERT_Reviews.ipynb | 567 ++++-----
 .../02_Evaluate_Pipeline_Execution.ipynb | 77 +-
 10_pipeline/03_Register_Deploy_Model.ipynb | 156 +--
 10_pipeline/airflow/00_Create_S3_Bucket.ipynb | 34 +-
 .../01_Setup_Airflow_Dependencies.ipynb | 152 ++-
 .../02_Create_Airflow_Environment.ipynb | 74 +-
 .../03_Trigger_Airflow_Environment.ipynb | 52 +-
 10_pipeline/airflow/dags/bert_reviews.py | 83 +-
 10_pipeline/airflow/dags/config.py | 29 +-
 10_pipeline/airflow/dags/pipeline/prepare.py | 65 +-
 .../airflow/dags/pipeline/preprocess.py | 79 +-
 10_pipeline/airflow/src/config.py | 29 +-
 .../dag_ml_pipeline_amazon_video_reviews.py | 91 +-
 10_pipeline/evaluate_model_metrics.py | 218 ++--
 10_pipeline/human/00_Overview.ipynb | 2 +-
 .../01_Setup_Augmented_AI_Workflow.ipynb | 64 +-
 ...om_Comprehend_Custom_Text_Classifier.ipynb | 103 +-
 10_pipeline/kubeflow/00_00_Setup_EKS.ipynb | 8 +-
 ..._05_Launch_Kubeflow_Jupyter_Notebook.ipynb | 4 +-
 .../02_Kubeflow_Pipeline_Simple.ipynb | 38 +-
 ...flow_Pipeline_Reviews_BERT_SageMaker.ipynb | 334 ++---
 ...LIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb | 2 +-
 10_pipeline/kubeflow/code/inference.py | 113 +-
 10_pipeline/kubeflow/code/tf_bert_reviews.py | 676 +++++-----
 .../kubeflow/evaluate_model_metrics.py | 218 ++--
 ...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
 ...ageMaker_Pipeline_BERT_Reviews_MLOps.ipynb | 389 +++---
 .../dsoaws/evaluate_model_metrics.py | 218 ++--
 .../pipelines/dsoaws/inference.py | 113 +-
 .../pipelines/dsoaws/pipeline.py | 458 +++----
 ...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
 .../pipelines/dsoaws/tf_bert_reviews.py | 676 +++++-----
 .../pipelines/run_pipeline.py | 137 +-
 .../sagemaker-project-modelbuild/setup.py | 8 +-
 .../test/test.py | 4 +-
 ...ocess-scikit-text-to-bert-feature-store.py | 695 +++++-----
 10_pipeline/src/inference.py | 113 +-
 10_pipeline/src/tf_bert_reviews.py | 676 +++++-----
 ...eviews_BERT_TensorFlow_REST_Endpoint.ipynb | 34 +-
 ...y_Reviews_BERT_TensorFlow_S3_Trigger.ipynb | 716 +++++------
 ...d_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb | 291 +++--
 11_stream/01_Setup_IAM.ipynb | 211 +---
 ...02_Create_Lambda_To_Invoke_SageMaker.ipynb | 135 +-
 .../03_Create_Kinesis_Data_Firehose.ipynb | 162 ++-
 11_stream/04_Create_Kinesis_Data_Stream.ipynb | 59 +-
 ...Create_Lambda_Destination_CloudWatch.ipynb | 89 +-
 .../06_Create_Lambda_Destination_SNS.ipynb | 104 +-
 ...07_Create_Kinesis_Data_Analytics_App.ipynb | 227 ++--
 ...Put_Reviews_On_Kinesis_Data_Firehose.ipynb | 217 ++--
 .../archive/11_stream.orig/00_Overview.ipynb | 2 +-
 .../archive/11_stream.orig/01_Setup_IAM.ipynb | 192 +--
 .../02_Create_Kinesis_Data_Firehose.ipynb | 89 +-
 .../03_Create_Kinesis_Data_Stream.ipynb | 61 +-
 .../04_Create_Lambda_Destination.ipynb | 84 +-
 ...05_Create_Kinesis_Data_Analytics_App.ipynb | 201 ++-
 ...Put_Reviews_On_Kinesis_Data_Firehose.ipynb | 202 +--
 .../11_stream.orig/src/lambda_function.py | 48 +-
 .../src/deliver_metrics_to_cloudwatch.py | 48 +-
 .../src/invoke_sm_endpoint_from_kinesis.py | 75 +-
 11_stream/src/push_notification_to_sns.py | 43 +-
 12_security/01_Secrets_Manager.ipynb | 29 +-
 12_security/02_Insecure_DataAccess_S3.ipynb | 4 +-
 ...cure_DataAccess_S3_BucketPolicy_Role.ipynb | 32 +-
 ...ecure_DataAccess_S3_BucketPolicy_VPC.ipynb | 51 +-
 ..._Secure_DataAccess_S3_IAMPolicy_Role.ipynb | 48 +-
 ...a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb | 57 +-
 ...5_Secure_SageMaker_Notebook_Instance.ipynb | 16 +-
 12_security/07_Insecure_Train.ipynb | 185 +--
 .../08_Secure_Train_IAMPolicy_Role.ipynb | 251 ++--
 .../08a_Secure_Train_IAMPolicy_VPC.ipynb | 419 ++++---
 ...ure_Train_IAMPolicy_VPC_ConditionKey.ipynb | 451 ++++---
 ...09_Secure_Train_EncryptionAtRest_KMS.ipynb | 193 +--
 .../10_Secure_Train_EncryptionInTransit.ipynb | 183 +--
 .../11_Secure_Train_NetworkIsolation.ipynb | 187 +--
 12_security/src/inference.py | 113 +-
 12_security/src/tf_bert_reviews.py | 639 +++++-----
 99_cleanup/01_Cleanup.ipynb | 28 +-
 pyproject.toml | 17 +
 200 files changed, 16361 insertions(+), 16815 deletions(-)
 create mode 100644 pyproject.toml

diff --git a/00_quickstart/01_Setup_Dependencies.ipynb b/00_quickstart/01_Setup_Dependencies.ipynb
index f4194b22..1755c0c8 100644
--- a/00_quickstart/01_Setup_Dependencies.ipynb
+++ b/00_quickstart/01_Setup_Dependencies.ipynb
@@ -95,7 +95,7 @@
    "metadata": {},
    "outputs": [],
"source": [ - "!conda install -y pytorch==1.6.0 -c pytorch " + "!conda install -y pytorch==1.6.0 -c pytorch" ] }, { @@ -260,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_dependencies_passed=True" + "setup_dependencies_passed = True" ] }, { diff --git a/00_quickstart/02_Copy_TSV_To_S3.ipynb b/00_quickstart/02_Copy_TSV_To_S3.ipynb index a9aa8bfc..a6664daf 100644 --- a/00_quickstart/02_Copy_TSV_To_S3.ipynb +++ b/00_quickstart/02_Copy_TSV_To_S3.ipynb @@ -72,13 +72,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -99,9 +99,9 @@ "try:\n", " setup_dependencies_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -126,7 +126,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'" + "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\"" ] }, { @@ -151,7 +151,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n", + "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n", "print(s3_private_path_tsv)" ] }, @@ -223,7 +223,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Bucket'.format(region, account_id, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Bucket'.format(\n", + " region, account_id, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/00_quickstart/03_Create_Athena_Database.ipynb b/00_quickstart/03_Create_Athena_Database.ipynb index 80a857be..d4a918f9 100644 --- a/00_quickstart/03_Create_Athena_Database.ipynb +++ b/00_quickstart/03_Create_Athena_Database.ipynb @@ -29,7 +29,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -62,10 +62,10 @@ "try:\n", " s3_public_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. 
********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -95,10 +95,10 @@ "try:\n", " s3_private_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -141,7 +141,7 @@ "metadata": {}, "outputs": [], "source": [ - "database_name = 'dsoaws'" + "database_name = \"dsoaws\"" ] }, { @@ -160,7 +160,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'CREATE DATABASE IF NOT EXISTS {}'.format(database_name)\n", + "statement = \"CREATE DATABASE IF NOT EXISTS {}\".format(database_name)\n", "print(statement)" ] }, @@ -189,6 +189,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "pd.read_sql(statement, conn)" ] }, @@ -205,7 +206,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW DATABASES'\n", + "statement = \"SHOW DATABASES\"\n", "\n", "df_show = pd.read_sql(statement, conn)\n", "df_show.head(5)" diff --git a/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb b/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb index e8b935b4..33fff7c9 100644 --- a/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb +++ b/00_quickstart/04_Register_S3_TSV_With_Athena.ipynb @@ -31,7 +31,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -64,9 +64,9 @@ "try:\n", " ingest_create_athena_db_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -85,11 +85,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_db_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not create the Athena Database.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]') " + " print(\"[OK]\")" ] }, { @@ -110,10 +110,10 @@ "try:\n", " s3_private_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -179,7 +179,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -189,8 +189,8 @@ "outputs": [], "source": [ "# Set Athena parameters\n", - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'" + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"" ] }, { @@ -226,7 +226,9 @@ " review_body string,\n", " review_date string\n", ") ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\\\t' LINES TERMINATED BY '\\\\n' LOCATION '{}'\n", - "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(database_name, table_name_tsv, s3_private_path_tsv)\n", + "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(\n", + " database_name, table_name_tsv, s3_private_path_tsv\n", + ")\n", "\n", "print(statement)" ] @@ -238,6 +240,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "pd.read_sql(statement, conn)" ] }, @@ -254,7 +257,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW TABLES in {}'.format(database_name)\n", + "statement = \"SHOW TABLES in {}\".format(database_name)\n", "\n", "df_show = pd.read_sql(statement, conn)\n", "df_show.head(5)" @@ -292,10 +295,12 @@ "metadata": {}, "outputs": [], "source": [ - "product_category = 'Digital_Software'\n", + "product_category = \"Digital_Software\"\n", "\n", "statement = \"\"\"SELECT * FROM {}.{}\n", - " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_tsv, product_category)\n", + " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n", + " database_name, table_name_tsv, product_category\n", + ")\n", "\n", "print(statement)" ] @@ -317,11 +322,11 @@ "outputs": [], "source": [ "if not df.empty:\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "else:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. 
LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -339,7 +344,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review AWS Glue Catalog'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review AWS Glue Catalog'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb b/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb index 454235c0..a41e834f 100644 --- a/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb +++ b/00_quickstart/05_Convert_S3_TSV_To_Parquet_With_Athena.ipynb @@ -30,7 +30,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -63,9 +63,9 @@ "try:\n", " ingest_create_athena_table_tsv_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -84,11 +84,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_tsv_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -123,12 +123,12 @@ "outputs": [], "source": [ "# Set S3 path to Parquet data\n", - "s3_path_parquet = 's3://{}/amazon-reviews-pds/parquet'.format(bucket)\n", + "s3_path_parquet = \"s3://{}/amazon-reviews-pds/parquet\".format(bucket)\n", "\n", "# Set Athena parameters\n", - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'\n", - "table_name_parquet = 'amazon_reviews_parquet'" + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"\n", + "table_name_parquet = \"amazon_reviews_parquet\"" ] }, { @@ -138,7 +138,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -185,7 +185,9 @@ " CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,\n", " DATE(review_date) AS review_date,\n", " product_category\n", - "FROM {}.{}\"\"\".format(database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv)\n", + "FROM {}.{}\"\"\".format(\n", + " database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv\n", + ")\n", "\n", "print(statement)" ] @@ -221,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'MSCK REPAIR TABLE {}.{}'.format(database_name, table_name_parquet)\n", + "statement = \"MSCK REPAIR TABLE {}.{}\".format(database_name, table_name_parquet)\n", "\n", "print(statement)" ] @@ -233,6 +235,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "df = pd.read_sql(statement, conn)\n", "df.head(5)" ] @@ -250,7 +253,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW PARTITIONS {}.{}'.format(database_name, table_name_parquet)\n", + "statement = \"SHOW PARTITIONS {}.{}\".format(database_name, table_name_parquet)\n", "\n", "print(statement)" ] @@ -278,7 +281,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW TABLES in {}'.format(database_name)" + "statement = \"SHOW TABLES in {}\".format(database_name)" ] }, { @@ -323,10 +326,12 @@ "metadata": {}, "outputs": [], "source": [ - "product_category = 'Digital_Software'\n", + "product_category = \"Digital_Software\"\n", "\n", "statement = \"\"\"SELECT * FROM {}.{}\n", - " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_parquet, product_category)\n", + " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n", + " database_name, table_name_parquet, product_category\n", + ")\n", "\n", "print(statement)" ] @@ -348,11 +353,11 @@ "outputs": [], "source": [ "if not df.empty:\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "else:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. 
LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -370,7 +375,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review AWS Glue Catalog'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review AWS Glue Catalog'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/00_quickstart/06_Visualize_Reviews_Dataset.ipynb b/00_quickstart/06_Visualize_Reviews_Dataset.ipynb index 616653c8..bbd9ffbd 100644 --- a/00_quickstart/06_Visualize_Reviews_Dataset.ipynb +++ b/00_quickstart/06_Visualize_Reviews_Dataset.ipynb @@ -39,7 +39,7 @@ "import sagemaker\n", "import boto3\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -63,9 +63,9 @@ "try:\n", " ingest_create_athena_table_parquet_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -84,11 +84,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_parquet_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not convert into Parquet data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -97,9 +97,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Set Athena database & table \n", - "database_name = 'dsoaws'\n", - "table_name = 'amazon_reviews_parquet'" + "# Set Athena database & table\n", + "database_name = \"dsoaws\"\n", + "table_name = \"amazon_reviews_parquet\"" ] }, { @@ -118,7 +118,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -148,6 +148,7 @@ "import seaborn as sns\n", "\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'" ] @@ -158,23 +159,27 @@ "metadata": {}, "outputs": [], "source": [ - "sns.set_style = 'seaborn-whitegrid'\n", - "\n", - "sns.set(rc={\"font.style\":\"normal\",\n", - " \"axes.facecolor\":\"white\",\n", - " 'grid.color': '.8',\n", - " 'grid.linestyle': '-',\n", - " \"figure.facecolor\":\"white\",\n", - " \"figure.titlesize\":20,\n", - " \"text.color\":\"black\",\n", - " \"xtick.color\":\"black\",\n", - " \"ytick.color\":\"black\",\n", - " \"axes.labelcolor\":\"black\",\n", - " \"axes.grid\":True,\n", - " 'axes.labelsize':10,\n", - " 'xtick.labelsize':10,\n", - " 'font.size':10,\n", - " 'ytick.labelsize':10})" + "sns.set_style = \"seaborn-whitegrid\"\n", + "\n", + "sns.set(\n", + " rc={\n", + " \"font.style\": \"normal\",\n", + " \"axes.facecolor\": \"white\",\n", + " \"grid.color\": \".8\",\n", + " \"grid.linestyle\": \"-\",\n", + " \"figure.facecolor\": \"white\",\n", + " \"figure.titlesize\": 20,\n", + " \"text.color\": \"black\",\n", + " \"xtick.color\": \"black\",\n", + " \"ytick.color\": \"black\",\n", + " \"axes.labelcolor\": \"black\",\n", + " \"axes.grid\": True,\n", + " \"axes.labelsize\": 10,\n", + " \"xtick.labelsize\": 10,\n", + " \"font.size\": 10,\n", + " \"ytick.labelsize\": 10,\n", + " }\n", + ")" ] }, { @@ -195,7 +200,7 @@ " for p in ax.patches:\n", " _x = p.get_x() + p.get_width() + float(space)\n", " _y = p.get_y() + p.get_height()\n", - " value = round(float(p.get_width()),2)\n", + " value = round(float(p.get_width()), 2)\n", " ax.text(_x, _y, value, ha=\"left\")\n", "\n", " if isinstance(axs, np.ndarray):\n", @@ -224,7 +229,9 @@ "FROM {}.{} \n", "GROUP BY product_category \n", "ORDER BY avg_star_rating DESC\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -271,16 +278,16 @@ "outputs": [], "source": [ "# Create plot\n", - "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df, saturation=1)\n", "\n", "if num_categories < 10:\n", - " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n", - " \n", - "# Set title and x-axis ticks \n", - "plt.title('Average Rating by Product Category')\n", - "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n", + " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n", "\n", - "# Helper code to show actual values afters bars \n", + "# Set title and x-axis ticks\n", + "plt.title(\"Average Rating by Product Category\")\n", + "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n", + "\n", + 
"# Helper code to show actual values afters bars\n", "show_values_barplot(barplot, 0.1)\n", "\n", "plt.xlabel(\"Average Rating\")\n", @@ -323,7 +330,9 @@ "FROM {}.{}\n", "GROUP BY product_category \n", "ORDER BY count_star_rating DESC\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -345,10 +354,10 @@ "outputs": [], "source": [ "# Store counts\n", - "count_ratings = df['count_star_rating']\n", + "count_ratings = df[\"count_star_rating\"]\n", "\n", "# Store max ratings\n", - "max_ratings = df['count_star_rating'].max()\n", + "max_ratings = df[\"count_star_rating\"].max()\n", "print(max_ratings)" ] }, @@ -366,20 +375,20 @@ "outputs": [], "source": [ "# Create Seaborn barplot\n", - "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n", "\n", "if num_categories < 10:\n", - " sns.set(rc={'figure.figsize':(10.0, 5.0)})\n", + " sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n", "\n", "# Set title\n", "plt.title(\"Number of Ratings per Product Category for Subset of Product Categories\")\n", "\n", - "# Set x-axis ticks to match scale \n", + "# Set x-axis ticks to match scale\n", "if max_ratings > 200000:\n", - " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n", + " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n", " plt.xlim(0, 20000000)\n", "elif max_ratings <= 200000:\n", - " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '150K', '200K'])\n", + " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"150K\", \"200K\"])\n", " plt.xlim(0, 200000)\n", "\n", "plt.xlabel(\"Number of Ratings\")\n", @@ -417,13 +426,15 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_category, MIN(review_date) AS first_review_date\n", "FROM {}.{}\n", "GROUP BY product_category\n", "ORDER BY first_review_date \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -446,7 +457,8 @@ "source": [ "# Convert date strings (e.g. 
2014-10-18) to datetime\n", "import datetime as datetime\n", - "dates = pd.to_datetime(df['first_review_date'])\n" + "\n", + "dates = pd.to_datetime(df[\"first_review_date\"])" ] }, { @@ -457,16 +469,18 @@ "source": [ "# See: https://stackoverflow.com/questions/60761410/how-to-graph-events-on-a-timeline\n", "\n", + "\n", "def modify_dataframe(df):\n", " \"\"\" Modify dataframe to include new columns \"\"\"\n", - " df['year'] = pd.to_datetime(df['first_review_date'], format='%Y-%m-%d').dt.year\n", + " df[\"year\"] = pd.to_datetime(df[\"first_review_date\"], format=\"%Y-%m-%d\").dt.year\n", " return df\n", "\n", + "\n", "def get_x_y(df):\n", " \"\"\" Get X and Y coordinates; return tuple \"\"\"\n", - " series = df['year'].value_counts().sort_index()\n", + " series = df[\"year\"].value_counts().sort_index()\n", " # new_series = series.reindex(range(1,21)).fillna(0).astype(int)\n", - " return series.index, series.values\n" + " return series.index, series.values" ] }, { @@ -494,20 +508,20 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plt.figure(figsize=(12,5))\n", + "fig = plt.figure(figsize=(12, 5))\n", "ax = plt.gca()\n", "\n", - "ax.set_title('Number Of First Product Category Reviews Per Year for Subset of Categories')\n", - "ax.set_xlabel('Year')\n", - "ax.set_ylabel('Count')\n", + "ax.set_title(\"Number Of First Product Category Reviews Per Year for Subset of Categories\")\n", + "ax.set_xlabel(\"Year\")\n", + "ax.set_ylabel(\"Count\")\n", "\n", "ax.plot(X, Y, color=\"black\", linewidth=2, marker=\"o\")\n", - "ax.fill_between(X, [0]*len(X), Y, facecolor='lightblue')\n", + "ax.fill_between(X, [0] * len(X), Y, facecolor=\"lightblue\")\n", "\n", "ax.locator_params(integer=True)\n", "\n", "ax.set_xticks(range(1995, 2016, 1))\n", - "ax.set_yticks(range(0, max(Y)+2, 1))\n", + "ax.set_yticks(range(0, max(Y) + 2, 1))\n", "\n", "plt.xticks(rotation=45)\n", "\n", @@ -538,7 +552,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_category,\n", " star_rating,\n", @@ -546,7 +560,9 @@ "FROM {}.{}\n", "GROUP BY product_category, star_rating\n", "ORDER BY product_category ASC, star_rating DESC, count_reviews\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -575,14 +591,14 @@ "outputs": [], "source": [ "# Create grouped DataFrames by category and by star rating\n", - "grouped_category = df.groupby('product_category')\n", - "grouped_star = df.groupby('star_rating')\n", + "grouped_category = df.groupby(\"product_category\")\n", + "grouped_star = df.groupby(\"star_rating\")\n", "\n", "# Create sum of ratings per star rating\n", - "df_sum = df.groupby(['star_rating']).sum()\n", + "df_sum = df.groupby([\"star_rating\"]).sum()\n", "\n", "# Calculate total number of star ratings\n", - "total = df_sum['count_reviews'].sum()\n", + "total = df_sum[\"count_reviews\"].sum()\n", "print(total)" ] }, @@ -595,17 +611,17 @@ "# Create dictionary of product categories and array of star rating distribution per category\n", "distribution = {}\n", "count_reviews_per_star = []\n", - "i=0\n", - " \n", + "i = 0\n", + "\n", "for category, ratings in grouped_category:\n", " count_reviews_per_star = []\n", - " for star in ratings['star_rating']:\n", - " count_reviews_per_star.append(ratings.at[i, 'count_reviews'])\n", - " i=i+1;\n", + " for star in ratings[\"star_rating\"]:\n", + " count_reviews_per_star.append(ratings.at[i, \"count_reviews\"])\n", 
+ " i = i + 1\n", " distribution[category] = count_reviews_per_star\n", "\n", "# Check if distribution has been created succesfully\n", - "print(distribution)\n" + "print(distribution)" ] }, { @@ -644,8 +660,8 @@ "# Sort distribution by highest average rating per category\n", "sorted_distribution = {}\n", "\n", - "average_star_ratings.iloc[:,0]\n", - "for index, value in average_star_ratings.iloc[:,0].items():\n", + "average_star_ratings.iloc[:, 0]\n", + "for index, value in average_star_ratings.iloc[:, 0].items():\n", " sorted_distribution[value] = distribution[value]" ] }, @@ -706,7 +722,7 @@ "proportion_star5 = np.true_divide(star5, total) * 100\n", "\n", "# Add colors\n", - "colors = ['red', 'purple','blue','orange','green']\n", + "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n", "\n", "# The position of the bars on the x-axis\n", "r = range(len(categories))\n", @@ -714,21 +730,53 @@ "\n", "# Plot bars\n", "if num_categories > 10:\n", - " plt.figure(figsize=(10,10))\n", - "else: \n", - " plt.figure(figsize=(10,5))\n", - "\n", - "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n", - "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n", - "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n", - "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n", - "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n", + " plt.figure(figsize=(10, 10))\n", + "else:\n", + " plt.figure(figsize=(10, 5))\n", + "\n", + "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n", + "ax4 = plt.barh(\n", + " r,\n", + " proportion_star4,\n", + " left=proportion_star5,\n", + " color=colors[3],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"4-Star Ratings\",\n", + ")\n", + "ax3 = plt.barh(\n", + " r,\n", + " proportion_star3,\n", + " left=proportion_star5 + proportion_star4,\n", + " color=colors[2],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"3-Star Ratings\",\n", + ")\n", + "ax2 = plt.barh(\n", + " r,\n", + " proportion_star2,\n", + " left=proportion_star5 + proportion_star4 + proportion_star3,\n", + " color=colors[1],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"2-Star Ratings\",\n", + ")\n", + "ax1 = plt.barh(\n", + " r,\n", + " proportion_star1,\n", + " left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n", + " color=colors[0],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"1-Star Ratings\",\n", + ")\n", "\n", - "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n", - "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", - "plt.yticks(r, categories, fontweight='regular')\n", + "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n", + "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n", + "plt.yticks(r, categories, fontweight=\"regular\")\n", "\n", - "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n", + 
"plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n", "plt.gca().invert_yaxis()\n", "plt.tight_layout()\n", "\n", @@ -759,14 +807,16 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT star_rating,\n", " COUNT(*) AS count_reviews\n", "FROM {}.{}\n", "GROUP BY star_rating\n", "ORDER BY star_rating DESC, count_reviews \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -797,15 +847,12 @@ "metadata": {}, "outputs": [], "source": [ - "chart = df.plot.bar(x='star_rating', \n", - " y='count_reviews', \n", - " rot='0',\n", - " figsize=(10,5), \n", - " title='Review Count by Star Ratings', \n", - " legend=False)\n", + "chart = df.plot.bar(\n", + " x=\"star_rating\", y=\"count_reviews\", rot=\"0\", figsize=(10, 5), title=\"Review Count by Star Ratings\", legend=False\n", + ")\n", "\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")\n", "\n", "plt.show(chart)" ] @@ -842,13 +889,15 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT year, ROUND(AVG(star_rating),4) AS avg_rating\n", "FROM {}.{}\n", "GROUP BY year\n", "ORDER BY year\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -869,7 +918,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year\n" + "df[\"year\"] = pd.to_datetime(df[\"year\"], format=\"%Y\").dt.year" ] }, { @@ -886,21 +935,21 @@ "outputs": [], "source": [ "fig = plt.gcf()\n", - "fig.set_size_inches(12,5)\n", + "fig.set_size_inches(12, 5)\n", "\n", - "fig.suptitle('Average Star Rating Over Time (Across Subset of Product Categories)')\n", + "fig.suptitle(\"Average Star Rating Over Time (Across Subset of Product Categories)\")\n", "\n", "ax = plt.gca()\n", - "#ax = plt.gca().set_xticks(df['year'])\n", + "# ax = plt.gca().set_xticks(df['year'])\n", "ax.locator_params(integer=True)\n", - "ax.set_xticks(df['year'].unique())\n", + "ax.set_xticks(df[\"year\"].unique())\n", "\n", - "df.plot(kind='line',x='year',y='avg_rating', color='red', ax=ax)\n", + "df.plot(kind=\"line\", x=\"year\", y=\"avg_rating\", color=\"red\", ax=ax)\n", "\n", - "#plt.xticks(range(1995, 2016, 1))\n", - "#plt.yticks(range(0,6,1))\n", - "plt.xlabel('Years')\n", - "plt.ylabel('Average Star Rating')\n", + "# plt.xticks(range(1995, 2016, 1))\n", + "# plt.yticks(range(0,6,1))\n", + "plt.xlabel(\"Years\")\n", + "plt.ylabel(\"Average Star Rating\")\n", "plt.xticks(rotation=45)\n", "\n", "# fig.savefig('average-rating.png', dpi=300)\n", @@ -930,13 +979,15 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_category, year, ROUND(AVG(star_rating), 4) AS avg_rating_category\n", "FROM {}.{}\n", "GROUP BY product_category, year\n", "ORDER BY year \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -965,11 +1016,20 @@ "outputs": [], "source": [ "def plot_categories(df):\n", - " df_categories = df['product_category'].unique()\n", + " df_categories = df[\"product_category\"].unique()\n", " for category in df_categories:\n", " # print(category)\n", - " df_plot = 
df.loc[df['product_category'] == category]\n", - " df_plot.plot(kind='line',x='year',y='avg_rating_category', c=np.random.rand(3,), ax=ax, label=category)" + " df_plot = df.loc[df[\"product_category\"] == category]\n", + " df_plot.plot(\n", + " kind=\"line\",\n", + " x=\"year\",\n", + " y=\"avg_rating_category\",\n", + " c=np.random.rand(\n", + " 3,\n", + " ),\n", + " ax=ax,\n", + " label=category,\n", + " )" ] }, { @@ -979,19 +1039,19 @@ "outputs": [], "source": [ "fig = plt.gcf()\n", - "fig.set_size_inches(12,5)\n", + "fig.set_size_inches(12, 5)\n", + "\n", + "fig.suptitle(\"Average Star Rating Over Time Across Subset Of Categories\")\n", "\n", - "fig.suptitle('Average Star Rating Over Time Across Subset Of Categories')\n", - " \n", "ax = plt.gca()\n", "\n", "ax.locator_params(integer=True)\n", - "ax.set_xticks(df['year'].unique())\n", + "ax.set_xticks(df[\"year\"].unique())\n", "\n", "plot_categories(df)\n", "\n", - "plt.xlabel('Year')\n", - "plt.ylabel('Average Star Rating')\n", + "plt.xlabel(\"Year\")\n", + "plt.ylabel(\"Average Star Rating\")\n", "plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n", "\n", "# fig.savefig('average_rating_category_all_data.png', dpi=300)\n", @@ -1021,14 +1081,16 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT star_rating,\n", " AVG(helpful_votes) AS avg_helpful_votes\n", "FROM {}.{}\n", "GROUP BY star_rating\n", "ORDER BY star_rating ASC\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1066,10 +1128,12 @@ "metadata": {}, "outputs": [], "source": [ - "chart = df.plot.bar(x='star_rating', y='avg_helpful_votes', rot='0', figsize=(10,5), title='Helpfulness Of Star Ratings', legend=False )\n", + "chart = df.plot.bar(\n", + " x=\"star_rating\", y=\"avg_helpful_votes\", rot=\"0\", figsize=(10, 5), title=\"Helpfulness Of Star Ratings\", legend=False\n", + ")\n", "\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Average Helpful Votes')\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Average Helpful Votes\")\n", "\n", "# chart.get_figure().savefig('helpful-votes.png', dpi=300)\n", "plt.show(chart)" @@ -1098,7 +1162,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_title,\n", " helpful_votes,\n", @@ -1107,7 +1171,9 @@ " SUBSTR(review_body, 1, 100) AS review_body_substr\n", "FROM {}.{}\n", "ORDER BY helpful_votes DESC LIMIT 10 \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1145,7 +1211,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT (CAST(positive_review_count AS DOUBLE) / CAST(negative_review_count AS DOUBLE)) AS positive_to_negative_sentiment_ratio\n", "FROM (\n", @@ -1157,7 +1223,9 @@ " FROM {}.{}\n", " WHERE star_rating < 4\n", ")\n", - "\"\"\".format(database_name, table_name, database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name, database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1195,7 +1263,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT customer_id, product_category, product_title, \n", "ROUND(AVG(star_rating),4) AS avg_star_rating, COUNT(*) AS 
review_count \n", @@ -1204,7 +1272,9 @@ "HAVING COUNT(*) > 1 \n", "ORDER BY review_count DESC\n", "LIMIT 5\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1265,7 +1335,7 @@ "metadata": {}, "outputs": [], "source": [ - "summary = df['num_words'].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n", + "summary = df[\"num_words\"].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n", "summary" ] }, @@ -1275,9 +1345,9 @@ "metadata": {}, "outputs": [], "source": [ - "df['num_words'].plot.hist(xticks=[0, 16, 32, 64, 128, 256], \n", - " bins=100,\n", - " range=[0, 256]).axvline(x=summary['80%'], c='red')" + "df[\"num_words\"].plot.hist(xticks=[0, 16, 32, 64, 128, 256], bins=100, range=[0, 256]).axvline(\n", + " x=summary[\"80%\"], c=\"red\"\n", + ")" ] }, { @@ -1332,4 +1402,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb b/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb index c1d4cf89..d206f883 100644 --- a/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb +++ b/00_quickstart/07_Prepare_Dataset_Bias_Analysis.ipynb @@ -44,7 +44,7 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -85,10 +85,12 @@ "source": [ "import csv\n", "\n", - "df_giftcards = pd.read_csv('./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_giftcards = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_giftcards.shape" ] }, @@ -109,10 +111,12 @@ "source": [ "import csv\n", "\n", - "df_software = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_software = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_software.shape" ] }, @@ -133,10 +137,12 @@ "source": [ "import csv\n", "\n", - "df_videogames = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_videogames = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_videogames.shape" ] }, @@ -163,12 +169,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_giftcards[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_giftcards[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + 
"plt.ylabel(\"Review Count\")" ] }, { @@ -178,12 +187,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_software[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_software[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -193,12 +205,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_videogames[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_videogames[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -270,7 +285,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df, x='star_rating', hue='product_category')" + "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -286,7 +301,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n", + "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n", "df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n", "df_balanced.shape" ] @@ -299,7 +314,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')" + "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -331,7 +346,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = './data-clarify/amazon_reviews_us_giftcards_software_videogames.csv'\n", + "path = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\"\n", "df.to_csv(path, index=False, header=True)" ] }, @@ -357,7 +372,7 @@ "metadata": {}, "outputs": [], "source": [ - "path_balanced = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv'\n", + "path_balanced = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv\"\n", "df_balanced.to_csv(path_balanced, index=False, header=True)" ] }, @@ -374,8 +389,8 @@ "metadata": {}, "outputs": [], "source": [ - "path_jsonlines = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl'\n", - "df_balanced.to_json(path_or_buf=path_jsonlines, orient='records', lines=True)" + "path_jsonlines = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl\"\n", + "df_balanced.to_json(path_or_buf=path_jsonlines, orient=\"records\", lines=True)" ] }, { @@ -392,9 +407,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path)\n", + "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path)\n", "bias_data_s3_uri" ] }, @@ -413,7 
+429,9 @@ "metadata": {}, "outputs": [], "source": [ - "balanced_bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_balanced)\n", + "balanced_bias_data_s3_uri = sess.upload_data(\n", + " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_balanced\n", + ")\n", "balanced_bias_data_s3_uri" ] }, @@ -432,7 +450,9 @@ "metadata": {}, "outputs": [], "source": [ - "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_jsonlines)\n", + "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(\n", + " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_jsonlines\n", + ")\n", "balanced_bias_data_jsonlines_s3_uri" ] }, diff --git a/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb b/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb index cb27a9c5..03d6d536 100644 --- a/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb +++ b/00_quickstart/08_Run_Data_Bias_Analysis_AdHoc.ipynb @@ -114,7 +114,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n", + "df = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n", "df.shape" ] }, @@ -132,7 +132,7 @@ }, "outputs": [], "source": [ - "sns.countplot(data=df, x='star_rating', hue='product_category')" + "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -166,11 +166,9 @@ }, "outputs": [], "source": [ - "facet_column = report.FacetColumn(name='product_category')\n", - "label_column = report.LabelColumn(name='star_rating', \n", - " data=df['star_rating'], \n", - " positive_label_values=[5, 4])\n", - "group_variable = df['product_category']" + "facet_column = report.FacetColumn(name=\"product_category\")\n", + "label_column = report.LabelColumn(name=\"star_rating\", data=df[\"star_rating\"], positive_label_values=[5, 4])\n", + "group_variable = df[\"product_category\"]" ] }, { @@ -194,11 +192,9 @@ }, "outputs": [], "source": [ - "report.bias_report(df, \n", - " facet_column, \n", - " label_column, \n", - " stage_type=report.StageType.PRE_TRAINING, \n", - " group_variable=group_variable)" + "report.bias_report(\n", + " df, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n", + ")" ] }, { @@ -214,7 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n", + "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n", "df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n", "df_balanced.shape" ] @@ -227,7 +223,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')" + "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -255,12 +251,10 @@ "source": [ "from smclarify.bias import report\n", "\n", - "facet_column = report.FacetColumn(name='product_category')\n", - "label_column = report.LabelColumn(name='star_rating',\n", - " data=df_balanced['star_rating'],\n", - " positive_label_values=[5, 4])\n", + "facet_column = report.FacetColumn(name=\"product_category\")\n", + "label_column = report.LabelColumn(name=\"star_rating\", data=df_balanced[\"star_rating\"], 
positive_label_values=[5, 4])\n", "\n", - "group_variable = df_balanced['product_category']" + "group_variable = df_balanced[\"product_category\"]" ] }, { @@ -276,11 +270,9 @@ "metadata": {}, "outputs": [], "source": [ - "report.bias_report(df_balanced,\n", - " facet_column,\n", - " label_column,\n", - " stage_type=report.StageType.PRE_TRAINING,\n", - " group_variable=group_variable)" + "report.bias_report(\n", + " df_balanced, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n", + ")" ] }, { diff --git a/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb b/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb index 9635551f..d37e1e1e 100644 --- a/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb +++ b/00_quickstart/09_Run_Data_Bias_Analysis_ProcessingJob.ipynb @@ -20,12 +20,12 @@ "import pandas as pd\n", "import numpy as np\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -72,7 +72,7 @@ "source": [ "import pandas as pd\n", "\n", - "data = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n", + "data = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n", "data.head()" ] }, @@ -101,7 +101,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=data, x='star_rating', hue='product_category')" + "sns.countplot(data=data, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -121,10 +121,9 @@ "source": [ "from sagemaker import clarify\n", "\n", - "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n", - " instance_count=1,\n", - " instance_type='ml.c5.2xlarge',\n", - " sagemaker_session=sess)" + "clarify_processor = clarify.SageMakerClarifyProcessor(\n", + " role=role, instance_count=1, instance_type=\"ml.c5.2xlarge\", sagemaker_session=sess\n", + ")" ] }, { @@ -151,13 +150,15 @@ "metadata": {}, "outputs": [], "source": [ - "bias_report_output_path = 's3://{}/clarify'.format(bucket)\n", + "bias_report_output_path = \"s3://{}/clarify\".format(bucket)\n", "\n", - "bias_data_config = clarify.DataConfig(s3_data_input_path=bias_data_s3_uri,\n", - " s3_output_path=bias_report_output_path,\n", - " label='star_rating',\n", - " headers=data.columns.to_list(),\n", - " dataset_type='text/csv')" + "bias_data_config = clarify.DataConfig(\n", + " s3_data_input_path=bias_data_s3_uri,\n", + " s3_output_path=bias_report_output_path,\n", + " label=\"star_rating\",\n", + " headers=data.columns.to_list(),\n", + " dataset_type=\"text/csv\",\n", + ")" ] }, { @@ -176,9 +177,9 @@ "metadata": {}, "outputs": [], "source": [ - "bias_config = clarify.BiasConfig(label_values_or_threshold=[5, 4],\n", - " facet_name='product_category',\n", - " group_name='product_category')" + "bias_config = clarify.BiasConfig(\n", + " label_values_or_threshold=[5, 4], facet_name=\"product_category\", group_name=\"product_category\"\n", + ")" ] }, { @@ -195,11 +196,8 @@ "outputs": [], "source": [ "clarify_processor.run_pre_training_bias(\n", - " data_config=bias_data_config,\n", - " data_bias_config=bias_config,\n", - " methods='all',\n", - " wait=False,\n", - " logs=False)" + " data_config=bias_data_config, data_bias_config=bias_config, 
methods=\"all\", wait=False, logs=False\n", + ")" ] }, { @@ -220,7 +218,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Processing Job'.format(region, run_pre_training_bias_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Processing Job'.format(\n", + " region, run_pre_training_bias_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -231,7 +235,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, run_pre_training_bias_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, run_pre_training_bias_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -242,7 +252,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, run_pre_training_bias_processing_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n", + " bucket, run_pre_training_bias_processing_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -251,8 +267,9 @@ "metadata": {}, "outputs": [], "source": [ - "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=run_pre_training_bias_processing_job_name,\n", - " sagemaker_session=sess)\n", + "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n", + " processing_job_name=run_pre_training_bias_processing_job_name, sagemaker_session=sess\n", + ")\n", "\n", "processing_job_description = running_processor.describe()\n", "\n", @@ -302,7 +319,7 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Bias Report'))\n" + "display(HTML('Review Bias Report'))" ] }, { diff --git a/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb b/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb index 41439e6d..516a2567 100644 --- a/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb +++ b/00_quickstart/10_Create_SageMaker_Pipeline_BERT_Reviews.ipynb @@ -32,12 +32,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -54,9 +54,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "pipeline_name = 'BERT-pipeline-{}'.format(timestamp)" + "pipeline_name = \"BERT-pipeline-{}\".format(timestamp)" ] }, { @@ -95,12 +96,13 @@ "from smexperiments.experiment import Experiment\n", "\n", "pipeline_experiment = Experiment.create(\n", - " experiment_name=pipeline_name,\n", - " description='Amazon Customer Reviews BERT Pipeline Experiment', \n", - " sagemaker_boto_client=sm)\n", + " experiment_name=pipeline_name,\n", + " description=\"Amazon Customer Reviews BERT Pipeline Experiment\",\n", + " sagemaker_boto_client=sm,\n", + ")\n", "\n", "pipeline_experiment_name = pipeline_experiment.experiment_name\n", - "print('Pipeline experiment name: {}'.format(pipeline_experiment_name))" + "print(\"Pipeline experiment name: 
{}\".format(pipeline_experiment_name))" ] }, { @@ -128,12 +130,12 @@ "import time\n", "from smexperiments.trial import Trial\n", "\n", - "pipeline_trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n", - " experiment_name=pipeline_experiment_name,\n", - " sagemaker_boto_client=sm)\n", + "pipeline_trial = Trial.create(\n", + " trial_name=\"trial-{}\".format(timestamp), experiment_name=pipeline_experiment_name, sagemaker_boto_client=sm\n", + ")\n", "\n", "pipeline_trial_name = pipeline_trial.trial_name\n", - "print('Trial name: {}'.format(pipeline_trial_name))" + "print(\"Trial name: {}\".format(pipeline_trial_name))" ] }, { @@ -238,7 +240,7 @@ "metadata": {}, "outputs": [], "source": [ - "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n", + "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n", "print(raw_input_data_s3_uri)" ] }, @@ -258,6 +260,7 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", "input_data = ParameterString(\n", @@ -265,15 +268,9 @@ " default_value=raw_input_data_s3_uri,\n", ")\n", "\n", - "processing_instance_count = ParameterInteger(\n", - " name=\"ProcessingInstanceCount\",\n", - " default_value=1\n", - ")\n", + "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n", "\n", - "processing_instance_type = ParameterString(\n", - " name=\"ProcessingInstanceType\",\n", - " default_value=\"ml.c5.2xlarge\"\n", - ")\n", + "processing_instance_type = ParameterString(name=\"ProcessingInstanceType\", default_value=\"ml.c5.2xlarge\")\n", "\n", "max_seq_length = ParameterInteger(\n", " name=\"MaxSeqLength\",\n", @@ -284,7 +281,7 @@ " name=\"BalanceDataset\",\n", " default_value=\"True\",\n", ")\n", - " \n", + "\n", "train_split_percentage = ParameterFloat(\n", " name=\"TrainSplitPercentage\",\n", " default_value=0.90,\n", @@ -305,10 +302,7 @@ " default_value=\"reviews-feature-store-\" + str(timestamp),\n", ")\n", "\n", - "feature_group_name = ParameterString(\n", - " name=\"FeatureGroupName\",\n", - " default_value=\"reviews-feature-group-\" + str(timestamp)\n", - ")" + "feature_group_name = ParameterString(name=\"FeatureGroupName\", default_value=\"reviews-feature-group-\" + str(timestamp))" ] }, { @@ -341,12 +335,13 @@ "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " env={'AWS_DEFAULT_REGION': region}, \n", - " )" + "processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " env={\"AWS_DEFAULT_REGION\": region},\n", + ")" ] }, { @@ -358,45 +353,56 @@ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "from sagemaker.workflow.steps import ProcessingStep\n", "\n", - "processing_inputs=[\n", - " ProcessingInput(\n", - " input_name='raw-input-data',\n", - " source=input_data,\n", - " destination='/opt/ml/processing/input/data/',\n", - " s3_data_distribution_type='ShardedByS3Key'\n", - " )\n", + "processing_inputs = [\n", + " ProcessingInput(\n", + " input_name=\"raw-input-data\",\n", + " source=input_data,\n", + " destination=\"/opt/ml/processing/input/data/\",\n", + " s3_data_distribution_type=\"ShardedByS3Key\",\n", + " )\n", "]\n", "\n", - "processing_outputs=[\n", - " 
ProcessingOutput(output_name='bert-train',\n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/bert/train',\n", - " ),\n", - " ProcessingOutput(output_name='bert-validation',\n", - " s3_upload_mode='EndOfJob', \n", - " source='/opt/ml/processing/output/bert/validation',\n", - " ),\n", - " ProcessingOutput(output_name='bert-test',\n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/bert/test',\n", - " ),\n", - "] \n", + "processing_outputs = [\n", + " ProcessingOutput(\n", + " output_name=\"bert-train\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/train\",\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-validation\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/validation\",\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-test\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/test\",\n", + " ),\n", + "]\n", "\n", "processing_step = ProcessingStep(\n", - " name='Processing', \n", - " code='preprocess-scikit-text-to-bert-feature-store.py',\n", + " name=\"Processing\",\n", + " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n", " processor=processor,\n", " inputs=processing_inputs,\n", " outputs=processing_outputs,\n", - " job_arguments=['--train-split-percentage', str(train_split_percentage.default_value), \n", - " '--validation-split-percentage', str(validation_split_percentage.default_value),\n", - " '--test-split-percentage', str(test_split_percentage.default_value),\n", - " '--max-seq-length', str(max_seq_length.default_value),\n", - " '--balance-dataset', str(balance_dataset.default_value),\n", - " '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),\n", - " '--feature-group-name', str(feature_group_name.default_value)\n", - " ]\n", - ") \n", + " job_arguments=[\n", + " \"--train-split-percentage\",\n", + " str(train_split_percentage.default_value),\n", + " \"--validation-split-percentage\",\n", + " str(validation_split_percentage.default_value),\n", + " \"--test-split-percentage\",\n", + " str(test_split_percentage.default_value),\n", + " \"--max-seq-length\",\n", + " str(max_seq_length.default_value),\n", + " \"--balance-dataset\",\n", + " str(balance_dataset.default_value),\n", + " \"--feature-store-offline-prefix\",\n", + " str(feature_store_offline_prefix.default_value),\n", + " \"--feature-group-name\",\n", + " str(feature_group_name.default_value),\n", + " ],\n", + ")\n", "\n", "print(processing_step)" ] @@ -439,15 +445,9 @@ "metadata": {}, "outputs": [], "source": [ - "train_instance_type = ParameterString(\n", - " name=\"TrainingInstanceType\",\n", - " default_value=\"ml.c5.9xlarge\"\n", - ")\n", + "train_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.c5.9xlarge\")\n", "\n", - "train_instance_count = ParameterInteger(\n", - " name=\"TrainingInstanceCount\",\n", - " default_value=1\n", - ")" + "train_instance_count = ParameterInteger(name=\"TrainingInstanceCount\", default_value=1)" ] }, { @@ -464,56 +464,26 @@ "metadata": {}, "outputs": [], "source": [ - "epochs = ParameterInteger(\n", - " name=\"Epochs\",\n", - " default_value=1\n", - ")\n", - " \n", - "learning_rate = ParameterFloat(\n", - " name=\"LearningRate\",\n", - " default_value=0.00001\n", - ") \n", - " \n", - "epsilon = ParameterFloat(\n", - " name=\"Epsilon\",\n", - " default_value=0.00000001\n", - ")\n", - " \n", - "train_batch_size = 
ParameterInteger(\n", - " name=\"TrainBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "validation_batch_size = ParameterInteger(\n", - " name=\"ValidationBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "test_batch_size = ParameterInteger(\n", - " name=\"TestBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "train_steps_per_epoch = ParameterInteger(\n", - " name=\"TrainStepsPerEpoch\",\n", - " default_value=50\n", - ")\n", - " \n", - "validation_steps = ParameterInteger(\n", - " name=\"ValidationSteps\",\n", - " default_value=50\n", - ")\n", - " \n", - "test_steps = ParameterInteger(\n", - " name=\"TestSteps\",\n", - " default_value=50\n", - ")\n", - " \n", - "train_volume_size = ParameterInteger(\n", - " name=\"TrainVolumeSize\",\n", - " default_value=1024\n", - ") \n", - " \n", + "epochs = ParameterInteger(name=\"Epochs\", default_value=1)\n", + "\n", + "learning_rate = ParameterFloat(name=\"LearningRate\", default_value=0.00001)\n", + "\n", + "epsilon = ParameterFloat(name=\"Epsilon\", default_value=0.00000001)\n", + "\n", + "train_batch_size = ParameterInteger(name=\"TrainBatchSize\", default_value=128)\n", + "\n", + "validation_batch_size = ParameterInteger(name=\"ValidationBatchSize\", default_value=128)\n", + "\n", + "test_batch_size = ParameterInteger(name=\"TestBatchSize\", default_value=128)\n", + "\n", + "train_steps_per_epoch = ParameterInteger(name=\"TrainStepsPerEpoch\", default_value=50)\n", + "\n", + "validation_steps = ParameterInteger(name=\"ValidationSteps\", default_value=50)\n", + "\n", + "test_steps = ParameterInteger(name=\"TestSteps\", default_value=50)\n", + "\n", + "train_volume_size = ParameterInteger(name=\"TrainVolumeSize\", default_value=1024)\n", + "\n", "use_xla = ParameterString(\n", " name=\"UseXLA\",\n", " default_value=\"True\",\n", @@ -523,7 +493,7 @@ " name=\"UseAMP\",\n", " default_value=\"True\",\n", ")\n", - " \n", + "\n", "freeze_bert_layer = ParameterString(\n", " name=\"FreezeBERTLayer\",\n", " default_value=\"False\",\n", @@ -533,7 +503,7 @@ " name=\"EnableSageMakerDebugger\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "enable_checkpointing = ParameterString(\n", " name=\"EnableCheckpointing\",\n", " default_value=\"False\",\n", @@ -543,7 +513,7 @@ " name=\"EnableTensorboard\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "input_mode = ParameterString(\n", " name=\"InputMode\",\n", " default_value=\"File\",\n", @@ -558,7 +528,7 @@ " name=\"RunTest\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "run_sample_predictions = ParameterString(\n", " name=\"RunSamplePredictions\",\n", " default_value=\"False\",\n", @@ -579,10 +549,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -618,36 +588,39 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " 
source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size, \n", - " py_version='py37',\n", - " framework_version='2.3.1',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " metric_definitions=metrics_definitions,\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py37\",\n", + " framework_version=\"2.3.1\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " metric_definitions=metrics_definitions,\n", + ")" ] }, { @@ -669,27 +642,21 @@ "from sagemaker.workflow.steps import TrainingStep\n", "\n", "training_step = TrainingStep(\n", - " name='Train',\n", + " name=\"Train\",\n", " estimator=estimator,\n", " inputs={\n", - " 'train': TrainingInput(\n", - " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-train'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", + " \"train\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-train\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " \"validation\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-validation\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", " ),\n", - " 'validation': TrainingInput(\n",
- " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-validation'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", + " \"test\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-test\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", " ),\n", - " 'test': TrainingInput(\n", - " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-test'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", - " ) \n", " },\n", ")\n", "\n", @@ -743,12 +710,14 @@ "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "evaluation_processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " env={'AWS_DEFAULT_REGION': region},\n", - " max_runtime_in_seconds=7200)" + "evaluation_processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " env={\"AWS_DEFAULT_REGION\": region},\n", + " max_runtime_in_seconds=7200,\n", + ")" ] }, { @@ -759,7 +728,7 @@ }, "outputs": [], "source": [ - "!pygmentize evaluate_model_metrics.py\n" + "!pygmentize evaluate_model_metrics.py" ] }, { @@ -779,11 +748,7 @@ "source": [ "from sagemaker.workflow.properties import PropertyFile\n", "\n", - "evaluation_report = PropertyFile(\n", - " name='EvaluationReport',\n", - " output_name='metrics',\n", - " path='evaluation.json'\n", - ")" + "evaluation_report = PropertyFile(name=\"EvaluationReport\", output_name=\"metrics\", path=\"evaluation.json\")" ] }, { @@ -793,27 +758,28 @@ "outputs": [], "source": [ "evaluation_step = ProcessingStep(\n", - " name='EvaluateModel',\n", + " name=\"EvaluateModel\",\n", " processor=evaluation_processor,\n", - " code='evaluate_model_metrics.py',\n", + " code=\"evaluate_model_metrics.py\",\n", " inputs=[\n", " ProcessingInput(\n", " source=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n", - " destination='/opt/ml/processing/input/model'\n", + " destination=\"/opt/ml/processing/input/model\",\n", " ),\n", " ProcessingInput(\n", - " source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri,\n", - " destination='/opt/ml/processing/input/data'\n", - " )\n", + " source=processing_step.properties.ProcessingInputs[\"raw-input-data\"].S3Input.S3Uri,\n", + " destination=\"/opt/ml/processing/input/data\",\n", + " ),\n", " ],\n", " outputs=[\n", - " ProcessingOutput(output_name='metrics', \n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/metrics/'),\n", + " ProcessingOutput(\n", + " output_name=\"metrics\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/metrics/\"\n", + " ),\n", " ],\n", " job_arguments=[\n", - " '--max-seq-length', str(max_seq_length.default_value),\n", - " ],\n", + " \"--max-seq-length\",\n", + " str(max_seq_length.default_value),\n", + " ],\n", " property_files=[evaluation_report],\n", ")" ] @@ -831,14 +797,14 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.model_metrics import MetricsSource, ModelMetrics \n", + "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", "\n", "model_metrics = ModelMetrics(\n", " model_statistics=MetricsSource(\n", " s3_uri=\"{}/evaluation.json\".format(\n", " evaluation_step.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", " 
),\n", - " content_type=\"application/json\"\n", + " content_type=\"application/json\",\n", " )\n", ")\n", "\n", @@ -870,20 +836,11 @@ "metadata": {}, "outputs": [], "source": [ - "model_approval_status = ParameterString(\n", - " name=\"ModelApprovalStatus\",\n", - " default_value=\"PendingManualApproval\"\n", - ")\n", + "model_approval_status = ParameterString(name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\")\n", "\n", - "deploy_instance_type = ParameterString(\n", - " name=\"DeployInstanceType\",\n", - " default_value=\"ml.m5.4xlarge\"\n", - ")\n", + "deploy_instance_type = ParameterString(name=\"DeployInstanceType\", default_value=\"ml.m5.4xlarge\")\n", "\n", - "deploy_instance_count = ParameterInteger(\n", - " name=\"DeployInstanceCount\",\n", - " default_value=1\n", - ")" + "deploy_instance_count = ParameterInteger(name=\"DeployInstanceCount\", default_value=1)" ] }, { @@ -909,7 +866,7 @@ " version=\"2.3.1\",\n", " py_version=\"py37\",\n", " instance_type=deploy_instance_type,\n", - " image_scope=\"inference\"\n", + " image_scope=\"inference\",\n", ")\n", "print(inference_image_uri)" ] @@ -924,10 +881,10 @@ "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", - "# entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n", - "# source_dir='src',\n", + " # entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n", + " # source_dir='src',\n", " estimator=estimator,\n", - " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n", + " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n", " model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " content_types=[\"application/json\"],\n", " response_types=[\"application/json\"],\n", @@ -935,7 +892,7 @@ " transform_instances=[\"ml.c5.18xlarge\"],\n", " model_package_group_name=model_package_group_name,\n", " approval_status=model_approval_status,\n", - " model_metrics=model_metrics\n", + " model_metrics=model_metrics,\n", ")" ] }, @@ -956,7 +913,7 @@ "source": [ "from sagemaker.model import Model\n", "\n", - "model_name = 'bert-model-{}'.format(timestamp)\n", + "model_name = \"bert-model-{}\".format(timestamp)\n", "\n", "model = Model(\n", " name=model_name,\n", @@ -976,7 +933,7 @@ "from sagemaker.inputs import CreateModelInput\n", "\n", "create_inputs = CreateModelInput(\n", - " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n", + " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n", ")" ] }, @@ -1018,10 +975,7 @@ "metadata": {}, "outputs": [], "source": [ - "min_accuracy_value = ParameterFloat(\n", - " name=\"MinAccuracyValue\",\n", - " default_value=0.01\n", - ")" + "min_accuracy_value = ParameterFloat(name=\"MinAccuracyValue\", default_value=0.01)" ] }, { @@ -1042,14 +996,14 @@ " property_file=evaluation_report,\n", " json_path=\"metrics.accuracy.value\",\n", " ),\n", - " right=min_accuracy_value # accuracy\n", + " right=min_accuracy_value, # accuracy\n", ")\n", "\n", "minimum_accuracy_condition_step = ConditionStep(\n", " name=\"AccuracyCondition\",\n", " conditions=[minimum_accuracy_condition],\n", - " if_steps=[register_step, create_step], # success, continue with model registration\n", - " else_steps=[], # fail, end the pipeline\n", + " 
if_steps=[register_step, create_step], # success, continue with model registration\n", + " else_steps=[], # fail, end the pipeline\n", ")" ] }, @@ -1125,7 +1079,7 @@ " min_accuracy_value,\n", " model_approval_status,\n", " deploy_instance_type,\n", - " deploy_instance_count\n", + " deploy_instance_count,\n", " ],\n", " steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],\n", " sagemaker_session=sess,\n", @@ -1213,16 +1167,16 @@ " parameters=dict(\n", " InputData=raw_input_data_s3_uri,\n", " ProcessingInstanceCount=1,\n", - " ProcessingInstanceType='ml.c5.2xlarge',\n", + " ProcessingInstanceType=\"ml.c5.2xlarge\",\n", " MaxSeqLength=64,\n", - " BalanceDataset='True',\n", + " BalanceDataset=\"True\",\n", " TrainSplitPercentage=0.9,\n", " ValidationSplitPercentage=0.05,\n", " TestSplitPercentage=0.05,\n", - " FeatureStoreOfflinePrefix='reviews-feature-store-'+str(timestamp),\n", - " FeatureGroupName='reviews-feature-group-'+str(timestamp),\n", + " FeatureStoreOfflinePrefix=\"reviews-feature-store-\" + str(timestamp),\n", + " FeatureGroupName=\"reviews-feature-group-\" + str(timestamp),\n", " LearningRate=0.000012,\n", - " TrainingInstanceType='ml.c5.9xlarge',\n", + " TrainingInstanceType=\"ml.c5.9xlarge\",\n", " TrainingInstanceCount=1,\n", " Epochs=1,\n", " Epsilon=0.00000001,\n", @@ -1233,20 +1187,20 @@ " ValidationSteps=50,\n", " TestSteps=50,\n", " TrainVolumeSize=1024,\n", - " UseXLA='True',\n", - " UseAMP='True',\n", - " FreezeBERTLayer='False',\n", - " EnableSageMakerDebugger='False',\n", - " EnableCheckpointing='False',\n", - " EnableTensorboard='False',\n", - " InputMode='File',\n", - " RunValidation='True',\n", - " RunTest='Fasle',\n", - " RunSamplePredictions='False', \n", + " UseXLA=\"True\",\n", + " UseAMP=\"True\",\n", + " FreezeBERTLayer=\"False\",\n", + " EnableSageMakerDebugger=\"False\",\n", + " EnableCheckpointing=\"False\",\n", + " EnableTensorboard=\"False\",\n", + " InputMode=\"File\",\n", + " RunValidation=\"True\",\n", + " RunTest=\"False\",\n", + " RunSamplePredictions=\"False\",\n", " MinAccuracyValue=0.01,\n", - " ModelApprovalStatus='PendingManualApproval', \n", - " DeployInstanceType='ml.m5.4xlarge',\n", - " DeployInstanceCount=1 \n", + " ModelApprovalStatus=\"PendingManualApproval\",\n", + " DeployInstanceType=\"ml.m5.4xlarge\",\n", + " DeployInstanceCount=1,\n", " )\n", ")\n", "\n", @@ -1287,7 +1241,7 @@ "metadata": {}, "outputs": [], "source": [ - "execution_run_name = execution_run['PipelineExecutionDisplayName']\n", + "execution_run_name = execution_run[\"PipelineExecutionDisplayName\"]\n", "print(execution_run_name)" ] }, @@ -1297,7 +1251,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = execution_run['PipelineExecutionArn']\n", + "pipeline_execution_arn = execution_run[\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -1349,18 +1303,18 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = 
sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -1377,7 +1331,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -1387,7 +1341,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -1411,7 +1365,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -1441,8 +1395,8 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_name=None\n", - "training_job_name=None" + "processing_job_name = None\n", + "training_job_name = None" ] }, { @@ -1456,15 +1410,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -1486,7 +1440,7 @@ "outputs": [], "source": [ "# -aws-processing-job is the default name assigned by ProcessingJob\n", - "processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)\n", + "processing_job_tc = \"{}-aws-processing-job\".format(processing_job_name)\n", "print(processing_job_tc)" ] }, @@ -1514,10 +1468,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = sm.associate_trial_component(\n", - " TrialComponentName=processing_job_tc,\n", - " TrialName=pipeline_trial_name\n", - ")" + "response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=pipeline_trial_name)" ] }, { @@ -1527,7 +1478,7 @@ "outputs": [], "source": [ "# -aws-training-job is the default name assigned by TrainingJob\n", - 
"training_job_tc = '{}-aws-training-job'.format(training_job_name)\n", + "training_job_tc = \"{}-aws-training-job\".format(training_job_name)\n", "print(training_job_tc)" ] }, @@ -1537,10 +1488,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = sm.associate_trial_component(\n", - " TrialComponentName=training_job_tc,\n", - " TrialName=pipeline_trial_name\n", - ")" + "response = sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=pipeline_trial_name)" ] }, { @@ -1560,9 +1508,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"balance_dataset\": str(balance_dataset), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"balance_dataset\": str(balance_dataset),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1574,9 +1524,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"train_split_percentage\": str(train_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"train_split_percentage\": str(train_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1588,9 +1540,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"validation_split_percentage\": str(validation_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"validation_split_percentage\": str(validation_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1602,9 +1556,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"test_split_percentage\": str(test_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"test_split_percentage\": str(test_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1616,9 +1572,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"max_seq_length\": str(max_seq_length), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"max_seq_length\": str(max_seq_length),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1630,11 +1588,13 @@ "metadata": {}, "outputs": [], "source": [ - "time.sleep(5) # avoid throttling exception \n", + "time.sleep(5) # avoid throttling exception\n", "\n", - "processing_job_tracker.log_parameters({\n", - " \"feature_store_offline_prefix\": str(feature_store_offline_prefix), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"feature_store_offline_prefix\": str(feature_store_offline_prefix),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1646,11 +1606,13 @@ "metadata": {}, "outputs": [], "source": [ - "time.sleep(5) # avoid throttling exception \n", + "time.sleep(5) # avoid throttling exception\n", "\n", - "processing_job_tracker.log_parameters({\n", - " \"feature_group_name\": str(feature_group_name), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"feature_group_name\": str(feature_group_name),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", 
"processing_job_tracker.trial_component.save()" @@ -1671,9 +1633,10 @@ "source": [ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", - "time.sleep(30) # avoid throttling exception\n", + "time.sleep(30) # avoid throttling exception\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", diff --git a/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb b/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb index bbe0e00f..d281310e 100644 --- a/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb +++ b/00_quickstart/11_Evaluate_Pipeline_Execution.ipynb @@ -24,12 +24,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -79,18 +79,18 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -107,7 +107,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -117,7 +117,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -147,14 +147,16 @@ "metadata": {}, "outputs": [], "source": [ - "#for execution_step in reversed(execution.list_steps()):\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", - " if execution_step['StepName'] == 'EvaluateModel':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + "# for execution_step in reversed(execution.list_steps()):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", + " if execution_step[\"StepName\"] == \"EvaluateModel\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", "\n", 
"describe_evaluation_processing_job_response = sm.describe_processing_job(ProcessingJobName=processing_job_name)\n", "\n", - "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n", + "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response[\"ProcessingOutputConfig\"][\"Outputs\"][0][\n", + " \"S3Output\"\n", + "][\"S3Uri\"]\n", "evaluation_metrics_s3_uri" ] }, @@ -167,9 +169,7 @@ "import json\n", "from pprint import pprint\n", "\n", - "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(\n", - " evaluation_metrics_s3_uri\n", - "))\n", + "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(evaluation_metrics_s3_uri))\n", "\n", "pprint(json.loads(evaluation_json))" ] @@ -187,15 +187,15 @@ "metadata": {}, "outputs": [], "source": [ - "training_job_arn=None\n", + "training_job_arn = None\n", "\n", - "for execution_step in steps['PipelineExecutionSteps']:\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", " if execution_step[\"StepName\"] == \"Train\":\n", " training_job_arn = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"]\n", - " \n", + "\n", " break\n", - " \n", - "training_job_name = training_job_arn.split('/')[-1]\n", + "\n", + "training_job_name = training_job_arn.split(\"/\")[-1]\n", "print(training_job_name)" ] }, @@ -205,7 +205,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)['ModelArtifacts']['S3ModelArtifacts']" + "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)[\"ModelArtifacts\"][\"S3ModelArtifacts\"]" ] }, { @@ -223,8 +223,8 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -p ./model \n", - "!tar -zxvf model.tar.gz -C ./model " + "!mkdir -p ./model\n", + "!tar -zxvf model.tar.gz -C ./model" ] }, { @@ -261,8 +261,8 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_name=None\n", - "training_job_name=None" + "processing_job_name = None\n", + "training_job_name = None" ] }, { @@ -276,15 +276,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -307,9 +307,10 @@ "source": [ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", - "time.sleep(30) # avoid throttling exception\n", + "time.sleep(30) # avoid throttling exception\n", "\n", "import pandas 
as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", diff --git a/00_quickstart/12_Register_Deploy_Model.ipynb b/00_quickstart/12_Register_Deploy_Model.ipynb index cee60eea..d72054e1 100644 --- a/00_quickstart/12_Register_Deploy_Model.ipynb +++ b/00_quickstart/12_Register_Deploy_Model.ipynb @@ -28,12 +28,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -65,20 +65,20 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", - "# print('Executions for our pipeline...')\n", - "# print(pipeline_execution_status)\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", + " # print('Executions for our pipeline...')\n", + " # print(pipeline_execution_status)\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -135,9 +135,9 @@ "metadata": {}, "outputs": [], "source": [ - "for execution_step in steps['PipelineExecutionSteps']:\n", - " if execution_step['StepName'] == 'RegisterModel':\n", - " model_package_arn = execution_step['Metadata']['RegisterModel']['Arn']\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", + " if execution_step[\"StepName\"] == \"RegisterModel\":\n", + " model_package_arn = execution_step[\"Metadata\"][\"RegisterModel\"][\"Arn\"]\n", " break\n", "print(model_package_arn)" ] @@ -150,7 +150,7 @@ "source": [ "model_package_update_response = sm.update_model_package(\n", " ModelPackageArn=model_package_arn,\n", - " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n", + " ModelApprovalStatus=\"Approved\", # Other options are 
Rejected and PendingManualApproval\n", ")" ] }, @@ -167,13 +167,13 @@ "metadata": {}, "outputs": [], "source": [ - "for execution_step in steps['PipelineExecutionSteps']:\n", - " if execution_step['StepName'] == 'CreateModel':\n", - " model_arn = execution_step['Metadata']['Model']['Arn']\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", + " if execution_step[\"StepName\"] == \"CreateModel\":\n", + " model_arn = execution_step[\"Metadata\"][\"Model\"][\"Arn\"]\n", " break\n", "print(model_arn)\n", "\n", - "model_name = model_arn.split('/')[-1]\n", + "model_name = model_arn.split(\"/\")[-1]\n", "print(model_name)" ] }, @@ -192,13 +192,14 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "model_from_registry_name = 'bert-model-from-registry-{}'.format(timestamp)\n", + "model_from_registry_name = \"bert-model-from-registry-{}\".format(timestamp)\n", "print(\"Model from registry name : {}\".format(model_from_registry_name))\n", "\n", "model_registry_package_container = {\n", - " 'ModelPackageName': model_package_arn,\n", + " \"ModelPackageName\": model_package_arn,\n", "}" ] }, @@ -208,12 +209,10 @@ "metadata": {}, "outputs": [], "source": [ - "from pprint import pprint \n", + "from pprint import pprint\n", "\n", "create_model_from_registry_respose = sm.create_model(\n", - " ModelName = model_from_registry_name,\n", - " ExecutionRoleArn = role,\n", - " PrimaryContainer = model_registry_package_container\n", + " ModelName=model_from_registry_name, ExecutionRoleArn=role, PrimaryContainer=model_registry_package_container\n", ")\n", "pprint(create_model_from_registry_respose)" ] @@ -224,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_from_registry_arn = create_model_from_registry_respose['ModelArn']\n", + "model_from_registry_arn = create_model_from_registry_respose[\"ModelArn\"]\n", "model_from_registry_arn" ] }, @@ -234,17 +233,21 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint_config_name = 'bert-model-from-registry-epc-{}'.format(timestamp)\n", + "endpoint_config_name = \"bert-model-from-registry-epc-{}\".format(timestamp)\n", "print(endpoint_config_name)\n", "\n", "create_endpoint_config_response = sm.create_endpoint_config(\n", - " EndpointConfigName = endpoint_config_name,\n", - " ProductionVariants=[{\n", - " 'InstanceType':'ml.m5.4xlarge',\n", - " 'InitialVariantWeight':1,\n", - " 'InitialInstanceCount':1,\n", - " 'ModelName': model_name,\n", - " 'VariantName':'AllTraffic'}])" + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"InstanceType\": \"ml.m5.4xlarge\",\n", + " \"InitialVariantWeight\": 1,\n", + " \"InitialInstanceCount\": 1,\n", + " \"ModelName\": model_name,\n", + " \"VariantName\": \"AllTraffic\",\n", + " }\n", + " ],\n", + ")" ] }, { @@ -253,13 +256,13 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_endpoint_name = 'bert-model-from-registry-ep-{}'.format(timestamp)\n", + "pipeline_endpoint_name = \"bert-model-from-registry-ep-{}\".format(timestamp)\n", "print(\"EndpointName={}\".format(pipeline_endpoint_name))\n", "\n", "create_endpoint_response = sm.create_endpoint(\n", - " EndpointName=pipeline_endpoint_name,\n", - " EndpointConfigName=endpoint_config_name)\n", - "print(create_endpoint_response['EndpointArn'])" + " EndpointName=pipeline_endpoint_name, EndpointConfigName=endpoint_config_name\n", + ")\n", + "print(create_endpoint_response[\"EndpointArn\"])" ] }, { @@ -270,7 +273,13 @@ "source": [ "from IPython.core.display import 
display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, pipeline_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, pipeline_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -297,7 +306,7 @@ "source": [ "%%time\n", "\n", - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=pipeline_endpoint_name)" ] }, @@ -348,20 +357,20 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", " display(viz.show(pipeline_execution_step=execution_step))\n", - " time.sleep(5)\n" + " time.sleep(5)" ] }, { diff --git a/00_quickstart/13_Cleanup.ipynb b/00_quickstart/13_Cleanup.ipynb index d724df10..6dbe74c8 100644 --- a/00_quickstart/13_Cleanup.ipynb +++ b/00_quickstart/13_Cleanup.ipynb @@ -31,9 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "sm.delete_endpoint(\n", - " EndpointName=pipeline_endpoint_name\n", - ")" + "sm.delete_endpoint(EndpointName=pipeline_endpoint_name)" ] } ], diff --git a/00_quickstart/evaluate_model_metrics.py b/00_quickstart/evaluate_model_metrics.py index 024afdec..f3523174 100644 --- a/00_quickstart/evaluate_model_metrics.py +++ b/00_quickstart/evaluate_model_metrics.py @@ -4,13 +4,16 @@ from datetime import datetime import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) import pandas as pd import os import re @@ -33,99 +36,99 @@ from sklearn.utils import resample -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] -config = 
DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) +config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, +) def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs to need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--input-model', type=str, - default='/opt/ml/processing/input/model', + parser.add_argument( + "--input-model", + type=str, + default="/opt/ml/processing/input/model", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - + ) + return parser.parse_args() - + def process(args): - print('Current host: {}'.format(args.current_host)) - - print('input_data: {}'.format(args.input_data)) - print('input_model: {}'.format(args.input_model)) - - print('Listing contents of input model dir: {}'.format(args.input_model)) + print("Current host: {}".format(args.current_host)) + + print("input_data: {}".format(args.input_data)) + print("input_model: {}".format(args.input_model)) + + print("Listing contents of input model dir: {}".format(args.input_model)) input_files = os.listdir(args.input_model) for file in input_files: print(file) - model_tar_path = '{}/model.tar.gz'.format(args.input_model) + model_tar_path = "{}/model.tar.gz".format(args.input_model) model_tar = tarfile.open(model_tar_path) model_tar.extractall(args.input_model) - model_tar.close() + model_tar.close() - model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model)) + model = 
keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model)) print(model) - + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=args.max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -133,81 +136,86 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) - + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""),
+    )

    ###########################################################################################
    # TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz #
-    ###########################################################################################
-#    evaluation_data_path = '/opt/ml/processing/input/data/'
-    
-    print('Listing contents of input data dir: {}'.format(args.input_data))
+    ###########################################################################################
+    # evaluation_data_path = '/opt/ml/processing/input/data/'
+
+    print("Listing contents of input data dir: {}".format(args.input_data))
    input_files = os.listdir(args.input_data)

-    test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data)
-    print('Using only {} to evaluate.'.format(test_data_path))
-    df_test_reviews = pd.read_csv(test_data_path, 
-                                  delimiter='\t', 
-                                  quoting=csv.QUOTE_NONE,
-                                  compression='gzip')[['review_body', 'star_rating']]
+    test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data)
+    print("Using only {} to evaluate.".format(test_data_path))
+    df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[
+        ["review_body", "star_rating"]
+    ]

    df_test_reviews = df_test_reviews.sample(n=100)
    df_test_reviews.shape
    df_test_reviews.head()

-    y_test = df_test_reviews['review_body'].map(predict)
+    y_test = df_test_reviews["review_body"].map(predict)
    y_test

-    y_actual = df_test_reviews['star_rating']
+    y_actual = df_test_reviews["star_rating"]
    y_actual

    # Note: y_test holds the model predictions and y_actual holds the ground-truth labels
    print(classification_report(y_true=y_actual, y_pred=y_test))

-    accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) 
-    print('Test accuracy: ', accuracy)
+    accuracy = accuracy_score(y_true=y_actual, y_pred=y_test)
+    print("Test accuracy: ", accuracy)

    def plot_conf_mat(cm, classes, title, cmap):
        print(cm)

-        plt.imshow(cm, interpolation='nearest', cmap=cmap)
+        plt.imshow(cm, interpolation="nearest", cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

-        fmt = 'd'
-        thresh = cm.max() / 2. 
+        fmt = "d"
+        thresh = cm.max() / 2.0
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
-            plt.text(j, i, format(cm[i, j], fmt),
-                    horizontalalignment="center",
-                    color="black" if cm[i, j] > thresh else "black")
+            plt.text(
+                j,
+                i,
+                format(cm[i, j], fmt),
+                horizontalalignment="center",
+                color="white" if cm[i, j] > thresh else "black",
+            )

        plt.tight_layout()
-        plt.ylabel('True label')
-        plt.xlabel('Predicted label')
+        plt.ylabel("True label")
+        plt.xlabel("Predicted label")

    cm = confusion_matrix(y_true=y_actual, y_pred=y_test)

    plt.figure()
-    fig, ax = plt.subplots(figsize=(10,5))
-    plot_conf_mat(cm, 
-                  classes=CLASSES, 
-                  title='Confusion Matrix',
-                  cmap=plt.cm.Greens)
+    fig, ax = plt.subplots(figsize=(10, 5))
+    plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens)

-    # Save the confusion matrix 
+    # Save the confusion matrix
    plt.show()

-    # Model Output 
-    metrics_path = os.path.join(args.output_data, 'metrics/')
+    # Model Output
+    metrics_path = os.path.join(args.output_data, "metrics/")
    os.makedirs(metrics_path, exist_ok=True)
-    plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+    plt.savefig("{}/confusion_matrix.png".format(metrics_path))

    report_dict = {
        "metrics": {
@@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap):
    evaluation_path = "{}/evaluation.json".format(metrics_path)
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))
-        
-    print('Listing contents of output dir: {}'.format(args.output_data))
+
+    print("Listing contents of output dir: {}".format(args.output_data))
    output_files = os.listdir(args.output_data)
    for file in output_files:
        print(file)

-    print('Listing contents of output/metrics dir: {}'.format(metrics_path))
-    output_files = os.listdir('{}'.format(metrics_path))
+    print("Listing contents of output/metrics dir: {}".format(metrics_path))
+    output_files = os.listdir("{}".format(metrics_path))
    for file in output_files:
        print(file)

-    print('Complete')
-    
-    
+    print("Complete")
+
+
if __name__ == "__main__":
    args = parse_args()
-    print('Loaded arguments:')
+    print("Loaded arguments:")
    print(args)
-    
-    print('Environment variables:')
+
+    print("Environment variables:")
    print(os.environ)

-    process(args)
+    process(args)
diff --git a/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py b/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
index 1211ba85..7e1cd385 100644
--- a/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
+++ b/00_quickstart/preprocess-scikit-text-to-bert-feature-store.py
@@ -20,16 +20,18 @@
import subprocess

## PIP INSTALLS ##
-# This is 2.3.0 (vs. 
2.3.1 everywhere else) because we need to # use anaconda and anaconda only supports 2.3.0 at this time -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"]) import pandas as pd import re import sagemaker @@ -40,51 +42,55 @@ FeatureTypeEnum, ) -region = os.environ['AWS_DEFAULT_REGION'] -print('Region: {}'.format(region)) +region = os.environ["AWS_DEFAULT_REGION"] +print("Region: {}".format(region)) ############################# ## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc. ## Role and Bucket are malformed if we do this later. -sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region) +sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region) caller_identity = sts.get_caller_identity() -print('caller_identity: {}'.format(caller_identity)) +print("caller_identity: {}".format(caller_identity)) -assumed_role_arn = caller_identity['Arn'] -print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn)) +assumed_role_arn = caller_identity["Arn"] +print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn)) -assumed_role_name = assumed_role_arn.split('/')[-2] +assumed_role_name = assumed_role_arn.split("/")[-2] -iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region) -get_role_response = iam.get_role(RoleName=assumed_role_name) -print('get_role_response {}'.format(get_role_response)) -role = get_role_response['Role']['Arn'] -print('role {}'.format(role)) +iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region) +get_role_response = iam.get_role(RoleName=assumed_role_name) +print("get_role_response {}".format(get_role_response)) +role = get_role_response["Role"]["Arn"] +print("role {}".format(role)) bucket = sagemaker.Session().default_bucket() -print('The DEFAULT BUCKET is {}'.format(bucket)) +print("The DEFAULT BUCKET is {}".format(bucket)) ############################# -sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region) +sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region) -featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region) +featurestore_runtime = boto3.Session(region_name=region).client( + service_name="sagemaker-featurestore-runtime", region_name=region +) -s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region) +s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region) -sagemaker_session = 
sagemaker.Session(boto_session=boto3.Session(region_name=region), - sagemaker_client=sm, - sagemaker_featurestore_runtime_client=featurestore_runtime) +sagemaker_session = sagemaker.Session( + boto_session=boto3.Session(region_name=region), + sagemaker_client=sm, + sagemaker_featurestore_runtime_client=featurestore_runtime, +) -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -REVIEW_BODY_COLUMN = 'review_body' -REVIEW_ID_COLUMN = 'review_id' +REVIEW_BODY_COLUMN = "review_body" +REVIEW_ID_COLUMN = "review_id" # DATE_COLUMN = 'date' -LABEL_COLUMN = 'star_rating' +LABEL_COLUMN = "star_rating" LABEL_VALUES = [1, 2, 3, 4, 5] - + label_map = {} for (i, label) in enumerate(LABEL_VALUES): label_map[label] = i @@ -92,94 +98,88 @@ def cast_object_to_string(data_frame): for label in data_frame.columns: - if data_frame.dtypes[label] == 'object': + if data_frame.dtypes[label] == "object": data_frame[label] = data_frame[label].astype("str").astype("string") return data_frame - + def wait_for_feature_group_creation_complete(feature_group): try: status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) if status != "Created": - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.") except: - print('No feature group created yet.') - - + print("No feature group created yet.") + + def create_or_load_feature_group(prefix, feature_group_name): # Feature Definitions for our records - feature_definitions= [ - FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL), -# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) + feature_definitions = [ + FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL), + # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="split_type", 
feature_type=FeatureTypeEnum.STRING),
    ]
-    
+
    feature_group = FeatureGroup(
-        name=feature_group_name,
-        feature_definitions=feature_definitions,
-        sagemaker_session=sagemaker_session)
-    
-    print('Feature Group: {}'.format(feature_group))
-    
-    try:
-        print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...')
+        name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
+    )
+
+    print("Feature Group: {}".format(feature_group))
+
+    try:
+        print(
+            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
+        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
-        print('Before CREATE FG wait exeption: {}'.format(e))
-#        pass
-        
+        print("Before CREATE FG wait exception: {}".format(e))
+        # pass
+
    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"
-        
-        print('Creating Feature Group with role {}...'.format(role))
+
+        print("Creating Feature Group with role {}...".format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
-            enable_online_store=True
+            enable_online_store=True,
        )
-        print('Creating Feature Group. Completed.')
-        
-        print('Waiting for new Feature Group to become available...')
+        print("Creating Feature Group. Completed.")
+
+        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
-        print('Feature Group available.')
+        print("Feature Group available.")
        feature_group.describe()
-    
+
    except Exception as e:
-        print('Exception: {}'.format(e))
-    
+        print("Exception: {}".format(e))
+
    return feature_group
-    
+

class InputFeatures(object):
-  """BERT feature vectors."""
-
-  def __init__(self,
-               input_ids,
-               input_mask,
-               segment_ids,
-               label_id,
-               review_id,
-               date,
-               label):
-#               review_body):
+    """BERT feature vectors."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
+        # review_body):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
@@ -187,36 +187,38 @@ def __init__(self,
        self.review_id = review_id
        self.date = date
        self.label = label
+
+
#        self.review_body = review_body
-        
-        
+
+
class Input(object):
-    """A single training/test input for sequence classification."""
-
-    def __init__(self, text, review_id, date, label=None):
-        """Constructs an Input.
-        Args:
-          text: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-          label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.text = text
-        self.review_id = review_id
-        self.date = date
-        self.label = label
-
-
+    """A single training/test input for sequence classification."""
+
+    def __init__(self, text, review_id, date, label=None):
+        """Constructs an Input.
+        Args:
+        text: string. The untokenized text of the first sequence. For single
+        sequence tasks, only this sequence must be specified.
+        label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples. 
+ """ + self.text = text + self.review_id = review_id + self.date = date + self.label = label + + def convert_input(the_input, max_seq_length): # First, we need to preprocess our data so that it matches the data BERT was trained on: # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) - # + # # Fortunately, the Transformers tokenizer does this for us! # - tokens = tokenizer.tokenize(the_input.text) + tokens = tokenizer.tokenize(the_input.text) # Next, we need to do the following: # @@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length): # # Again, the Transformers tokenizer does this for us! # - encode_plus_tokens = tokenizer.encode_plus(the_input.text, - pad_to_max_length=True, - max_length=max_seq_length, -# truncation=True - ) + encode_plus_tokens = tokenizer.encode_plus( + the_input.text, + pad_to_max_length=True, + max_length=max_seq_length, + # truncation=True + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] - - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + input_ids = encode_plus_tokens["input_ids"] + + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction. 
segment_ids = [0] * max_seq_length @@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length): label_id=label_id, review_id=the_input.review_id, date=the_input.date, - label=the_input.label) -# review_body=the_input.text) - -# print('**input_ids**\n{}\n'.format(features.input_ids)) -# print('**input_mask**\n{}\n'.format(features.input_mask)) -# print('**segment_ids**\n{}\n'.format(features.segment_ids)) -# print('**label_id**\n{}\n'.format(features.label_id)) -# print('**review_id**\n{}\n'.format(features.review_id)) -# print('**date**\n{}\n'.format(features.date)) -# print('**label**\n{}\n'.format(features.label)) -# print('**review_body**\n{}\n'.format(features.review_body)) + label=the_input.label, + ) + # review_body=the_input.text) + + # print('**input_ids**\n{}\n'.format(features.input_ids)) + # print('**input_mask**\n{}\n'.format(features.input_mask)) + # print('**segment_ids**\n{}\n'.format(features.segment_ids)) + # print('**label_id**\n{}\n'.format(features.label_id)) + # print('**review_id**\n{}\n'.format(features.review_id)) + # print('**date**\n{}\n'.format(features.date)) + # print('**label**\n{}\n'.format(features.label)) + # print('**review_body**\n{}\n'.format(features.review_body)) return features -def transform_inputs_to_tfrecord(inputs, - output_file, - max_seq_length): +def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length): """Convert a set of `Input`s to a TFRecord file.""" records = [] tf_record_writer = tf.io.TFRecordWriter(output_file) - + for (input_idx, the_input) in enumerate(inputs): if input_idx % 10000 == 0: - print('Writing input {} of {}\n'.format(input_idx, len(inputs))) + print("Writing input {} of {}\n".format(input_idx, len(inputs))) features = convert_input(the_input, max_seq_length) all_features = collections.OrderedDict() - all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) - all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) - all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) - all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) + all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) + all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) + all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) + all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) tf_record = tf.train.Example(features=tf.train.Features(feature=all_features)) tf_record_writer.write(tf_record.SerializeToString()) - records.append({#'tf_record': tf_record.SerializeToString(), - 'input_ids': features.input_ids, - 'input_mask': features.input_mask, - 'segment_ids': features.segment_ids, - 'label_id': features.label_id, - 'review_id': the_input.review_id, - 'date': the_input.date, - 'label': features.label, -# 'review_body': features.review_body - }) + records.append( + { #'tf_record': tf_record.SerializeToString(), + "input_ids": features.input_ids, + "input_mask": features.input_mask, + "segment_ids": features.segment_ids, + "label_id": features.label_id, + "review_id": the_input.review_id, + "date": the_input.date, + "label": features.label, + # 'review_body': features.review_body + } + ) ##################################### ####### TODO: REMOVE THIS BREAK 
####### 
-    ##################################### 
+    #####################################
    # break
-    
+
    tf_record_writer.close()
-    
+
    return records
-    
+

def list_arg(raw_value):
    """argparse type for a list of strings"""
-    return str(raw_value).split(',')
+    return str(raw_value).split(",")


def parse_args():
    # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly
    resconfig = {}
    try:
-        with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile:
+        with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile:
            resconfig = json.load(cfgfile)
    except FileNotFoundError:
-        print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.')
-        pass # Ignore
+        print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.")
+        pass  # Ignore

    # Local testing with CLI args
-    parser = argparse.ArgumentParser(description='Process')
+    parser = argparse.ArgumentParser(description="Process")

-    parser.add_argument('--hosts', type=list_arg,
-        default=resconfig.get('hosts', ['unknown']),
-        help='Comma-separated list of host names running the job'
+    parser.add_argument(
+        "--hosts",
+        type=list_arg,
+        default=resconfig.get("hosts", ["unknown"]),
+        help="Comma-separated list of host names running the job",
    )
-    parser.add_argument('--current-host', type=str,
-        default=resconfig.get('current_host', 'unknown'),
-        help='Name of this host running the job'
+    parser.add_argument(
+        "--current-host",
+        type=str,
+        default=resconfig.get("current_host", "unknown"),
+        help="Name of this host running the job",
    )
-    parser.add_argument('--input-data', type=str,
-        default='/opt/ml/processing/input/data',
+    parser.add_argument(
+        "--input-data",
+        type=str,
+        default="/opt/ml/processing/input/data",
    )
-    parser.add_argument('--output-data', type=str,
-        default='/opt/ml/processing/output',
+    parser.add_argument(
+        "--output-data",
+        type=str,
+        default="/opt/ml/processing/output",
    )
-    parser.add_argument('--train-split-percentage', type=float,
+    parser.add_argument(
+        "--train-split-percentage",
+        type=float,
        default=0.90,
    )
-    parser.add_argument('--validation-split-percentage', type=float,
-        default=0.05,
-    )
-    parser.add_argument('--test-split-percentage', type=float,
+    parser.add_argument(
+        "--validation-split-percentage",
+        type=float,
        default=0.05,
    )
-    parser.add_argument('--balance-dataset', type=eval,
-        default=True
+    parser.add_argument(
+        "--test-split-percentage",
+        type=float,
+        default=0.05,
    )
-    parser.add_argument('--max-seq-length', type=int,
+    parser.add_argument("--balance-dataset", type=eval, default=True)
+    parser.add_argument(
+        "--max-seq-length",
+        type=int,
        default=64,
-    )
-    parser.add_argument('--feature-store-offline-prefix', type=str,
+    )
+    parser.add_argument(
+        "--feature-store-offline-prefix",
+        type=str,
        default=None,
-    )
-    parser.add_argument('--feature-group-name', type=str,
+    )
+    parser.add_argument(
+        "--feature-group-name",
+        type=str,
        default=None,
-    )
-    
+    )
+
    return parser.parse_args()
-    

-def _transform_tsv_to_tfrecord(file,
-                               max_seq_length,
-                               balance_dataset,
-                               prefix,
-                               feature_group_name):
-    print('file {}'.format(file))
-    print('max_seq_length {}'.format(max_seq_length))
-    print('balance_dataset {}'.format(balance_dataset))
-    print('prefix {}'.format(prefix))
-    print('feature_group_name {}'.format(feature_group_name))
+
+def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name):
+    print("file {}".format(file))
+    
print("max_seq_length {}".format(max_seq_length)) + print("balance_dataset {}".format(balance_dataset)) + print("prefix {}".format(prefix)) + print("feature_group_name {}".format(feature_group_name)) # need to re-load since we can't pass feature_group object in _partial functions for some reason feature_group = create_or_load_feature_group(prefix, feature_group_name) - + filename_without_extension = Path(Path(file).stem).stem - df = pd.read_csv(file, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip') + df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip") df.isna().values.any() df = df.dropna() df = df.reset_index(drop=True) - print('Shape of dataframe {}'.format(df.shape)) + print("Shape of dataframe {}".format(df.shape)) - if balance_dataset: + if balance_dataset: # Balance the dataset down to the minority class from sklearn.utils import resample - five_star_df = df.query('star_rating == 5') - four_star_df = df.query('star_rating == 4') - three_star_df = df.query('star_rating == 3') - two_star_df = df.query('star_rating == 2') - one_star_df = df.query('star_rating == 1') - - minority_count = min(five_star_df.shape[0], - four_star_df.shape[0], - three_star_df.shape[0], - two_star_df.shape[0], - one_star_df.shape[0]) - - five_star_df = resample(five_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - four_star_df = resample(four_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - three_star_df = resample(three_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - two_star_df = resample(two_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - one_star_df = resample(one_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) + five_star_df = df.query("star_rating == 5") + four_star_df = df.query("star_rating == 4") + three_star_df = df.query("star_rating == 3") + two_star_df = df.query("star_rating == 2") + one_star_df = df.query("star_rating == 1") + + minority_count = min( + five_star_df.shape[0], + four_star_df.shape[0], + three_star_df.shape[0], + two_star_df.shape[0], + one_star_df.shape[0], + ) + + five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27) + + four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27) + + three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27) + + two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27) + + one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27) df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df]) - df_balanced = df_balanced.reset_index(drop=True) - print('Shape of balanced dataframe {}'.format(df_balanced.shape)) - print(df_balanced['star_rating'].head(100)) + df_balanced = df_balanced.reset_index(drop=True) + print("Shape of balanced dataframe {}".format(df_balanced.shape)) + print(df_balanced["star_rating"].head(100)) df = df_balanced - - print('Shape of dataframe before splitting {}'.format(df.shape)) - - print('train split percentage {}'.format(args.train_split_percentage)) - print('validation split percentage {}'.format(args.validation_split_percentage)) - print('test split percentage {}'.format(args.test_split_percentage)) - + + print("Shape of dataframe before splitting {}".format(df.shape)) + + print("train split 
percentage {}".format(args.train_split_percentage)) + print("validation split percentage {}".format(args.validation_split_percentage)) + print("test split percentage {}".format(args.test_split_percentage)) + holdout_percentage = 1.00 - args.train_split_percentage - print('holdout percentage {}'.format(holdout_percentage)) - df_train, df_holdout = train_test_split(df, - test_size=holdout_percentage, - stratify=df['star_rating']) + print("holdout percentage {}".format(holdout_percentage)) + df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"]) test_holdout_percentage = args.test_split_percentage / holdout_percentage - print('test holdout percentage {}'.format(test_holdout_percentage)) - df_validation, df_test = train_test_split(df_holdout, - test_size=test_holdout_percentage, - stratify=df_holdout['star_rating']) - + print("test holdout percentage {}".format(test_holdout_percentage)) + df_validation, df_test = train_test_split( + df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"] + ) + df_train = df_train.reset_index(drop=True) df_validation = df_validation.reset_index(drop=True) df_test = df_test.reset_index(drop=True) - print('Shape of train dataframe {}'.format(df_train.shape)) - print('Shape of validation dataframe {}'.format(df_validation.shape)) - print('Shape of test dataframe {}'.format(df_test.shape)) + print("Shape of train dataframe {}".format(df_train.shape)) + print("Shape of validation dataframe {}".format(df_validation.shape)) + print("Shape of test dataframe {}".format(df_test.shape)) timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) - train_inputs = df_train.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - validation_inputs = df_validation.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - test_inputs = df_test.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) + train_inputs = df_train.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + validation_inputs = df_validation.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + test_inputs = df_test.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): - # - # + # + # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) # 4. Map our words to indexes using a vocab file that BERT provides # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert)) # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)) - # + # # We don't have to worry about these details. The Transformers tokenizer does this for us. 
- # - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) + # + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow. - train_records = transform_inputs_to_tfrecord(train_inputs, - '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension), - max_seq_length) - - validation_records = transform_inputs_to_tfrecord(validation_inputs, - '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension), - max_seq_length) - - test_records = transform_inputs_to_tfrecord(test_inputs, - '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension), - max_seq_length) - + train_records = transform_inputs_to_tfrecord( + train_inputs, + "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + validation_records = transform_inputs_to_tfrecord( + validation_inputs, + "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + test_records = transform_inputs_to_tfrecord( + test_inputs, + "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension), + max_seq_length, + ) + df_train_records = pd.DataFrame.from_dict(train_records) - df_train_records['split_type'] = 'train' - df_train_records.head() - + df_train_records["split_type"] = "train" + df_train_records.head() + df_validation_records = pd.DataFrame.from_dict(validation_records) - df_validation_records['split_type'] = 'validation' - df_validation_records.head() + df_validation_records["split_type"] = "validation" + df_validation_records.head() df_test_records = pd.DataFrame.from_dict(test_records) - df_test_records['split_type'] = 'test' - df_test_records.head() - - # Add record to feature store + df_test_records["split_type"] = "test" + df_test_records.head() + + # Add record to feature store df_fs_train_records = cast_object_to_string(df_train_records) df_fs_validation_records = cast_object_to_string(df_validation_records) df_fs_test_records = cast_object_to_string(df_test_records) - print('Ingesting Features...') - feature_group.ingest( - data_frame=df_fs_train_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_validation_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_test_records, max_workers=3, wait=True - ) - print('Feature ingest completed.') + print("Ingesting Features...") + feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True) + print("Feature ingest completed.") def process(args): - print('Current host: {}'.format(args.current_host)) - - feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) + print("Current host: {}".format(args.current_host)) + + feature_group = create_or_load_feature_group( + prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name + ) feature_group.describe() - + 
print(feature_group.as_hive_ddl()) - - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) - - transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord, - max_seq_length=args.max_seq_length, - balance_dataset=args.balance_dataset, - prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) - - input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data)) + + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) + + transform_tsv_to_tfrecord = functools.partial( + _transform_tsv_to_tfrecord, + max_seq_length=args.max_seq_length, + balance_dataset=args.balance_dataset, + prefix=args.feature_store_offline_prefix, + feature_group_name=args.feature_group_name, + ) + + input_files = glob.glob("{}/*.tsv.gz".format(args.input_data)) num_cpus = multiprocessing.cpu_count() - print('num_cpus {}'.format(num_cpus)) + print("num_cpus {}".format(num_cpus)) p = multiprocessing.Pool(num_cpus) p.map(transform_tsv_to_tfrecord, input_files) - print('Listing contents of {}'.format(args.output_data)) + print("Listing contents of {}".format(args.output_data)) dirs_output = os.listdir(args.output_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(train_data)) + print("Listing contents of {}".format(train_data)) dirs_output = os.listdir(train_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(validation_data)) + print("Listing contents of {}".format(validation_data)) dirs_output = os.listdir(validation_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(test_data)) + print("Listing contents of {}".format(test_data)) dirs_output = os.listdir(test_data) for file in dirs_output: print(file) - + offline_store_contents = None - while (offline_store_contents is None): - objects_in_bucket = s3.list_objects(Bucket=bucket, - Prefix=args.feature_store_offline_prefix) - if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1): - offline_store_contents = objects_in_bucket['Contents'] + while offline_store_contents is None: + objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix) + if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1: + offline_store_contents = objects_in_bucket["Contents"] else: - print('Waiting for data in offline store...\n') + print("Waiting for data in offline store...\n") sleep(60) - print('Data available.') - - print('Complete') - - + print("Data available.") + + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) process(args) diff --git a/00_quickstart/src/inference.py b/00_quickstart/src/inference.py index 2975dc2d..53196737 100644 --- a/00_quickstart/src/inference.py +++ b/00_quickstart/src/inference.py @@ -1,102 +1,97 @@ import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", 
"pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: {}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (ie. 1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = 
tf.nn.softmax(log_probability)
-        
-        predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+        print("log_probability in loop: {}".format(log_probability))
+        print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+        softmax = tf.nn.softmax(log_probability)
+
+        predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
        predicted_class = classes[predicted_class_idx]
-        print('predicted_class: {}'.format(predicted_class))
+        print("predicted_class: {}".format(predicted_class))

        prediction_dict = {}
-        prediction_dict['predicted_label'] = predicted_class
-        
+        prediction_dict["predicted_label"] = predicted_class
+
        jsonline = json.dumps(prediction_dict)
-        print('jsonline: {}'.format(jsonline))
-        
+        print("jsonline: {}".format(jsonline))
+
        predicted_classes.append(jsonline)
-        print('predicted_classes in the loop: {}'.format(predicted_classes))
-        
-    predicted_classes_jsonlines = '\n'.join(predicted_classes)
-    print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+        print("predicted_classes in the loop: {}".format(predicted_classes))
+
+    predicted_classes_jsonlines = "\n".join(predicted_classes)
+    print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))

    response_content_type = context.accept_header
-    
-    return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+    return predicted_classes_jsonlines, response_content_type
diff --git a/00_quickstart/src/tf_bert_reviews.py b/00_quickstart/src/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/00_quickstart/src/tf_bert_reviews.py
+++ b/00_quickstart/src/tf_bert_reviews.py
@@ -9,96 +9,99 @@
import sys
import os
import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
import tensorflow as tf
import pandas as pd
import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
from transformers import DistilBertTokenizer
from transformers import DistilBertConfig
from transformers import TFDistilBertModel
+from transformers import TFDistilBertForSequenceClassification  # needed by load_checkpoint_model() below
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision

CLASSES = [1, 2, 3, 4, 5]


def select_data_and_label_from_record(record):
-    x = {
-        'input_ids': record['input_ids'],
-        'input_mask': record['input_mask'],
-        'segment_ids': record['segment_ids']
-    } 
+ x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -111,236 +114,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + 
print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 
+ parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + 
print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' 
- print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + # Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Sucessfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}. 
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Yuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path 
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + print(classification_report(y_true=y_test, y_pred=y_actual)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="black" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/01_setup/01_Setup_Dependencies.ipynb b/01_setup/01_Setup_Dependencies.ipynb index d74392a0..66967a9e 100644 --- a/01_setup/01_Setup_Dependencies.ipynb +++ b/01_setup/01_Setup_Dependencies.ipynb @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "!conda install -y pytorch==1.6.0 -c pytorch " + "!conda install -y pytorch==1.6.0 -c pytorch" ] }, { @@ -260,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_dependencies_passed=True" + "setup_dependencies_passed = True" ] }, { diff --git 
a/01_setup/02_Check_Environment.ipynb b/01_setup/02_Check_Environment.ipynb index 45ff455e..90925adb 100644 --- a/01_setup/02_Check_Environment.ipynb +++ b/01_setup/02_Check_Environment.ipynb @@ -18,8 +18,8 @@ "region = boto3.Session().region_name\n", "session = boto3.session.Session()\n", "\n", - "ec2 = boto3.Session().client(service_name='ec2', region_name=region)\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)\n", + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -36,21 +36,22 @@ "outputs": [], "source": [ "import json\n", + "\n", "notebook_instance_name = None\n", "\n", "try:\n", - " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n", + " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n", " data = json.load(notebook_info)\n", - " domain_id = data['DomainId']\n", - " resource_arn = data['ResourceArn']\n", - " region = resource_arn.split(':')[3]\n", - " name = data['ResourceName']\n", - " print('DomainId: {}'.format(domain_id))\n", - " print('Name: {}'.format(name)) \n", + " domain_id = data[\"DomainId\"]\n", + " resource_arn = data[\"ResourceArn\"]\n", + " region = resource_arn.split(\":\")[3]\n", + " name = data[\"ResourceName\"]\n", + " print(\"DomainId: {}\".format(domain_id))\n", + " print(\"Name: {}\".format(name))\n", "except:\n", - " print('+++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR]: COULD NOT RETRIEVE THE METADATA.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR]: COULD NOT RETRIEVE THE METADATA.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -59,10 +60,8 @@ "metadata": {}, "outputs": [], "source": [ - "describe_domain_response = sm.describe_domain(\n", - " DomainId=domain_id\n", - ")\n", - "print(describe_domain_response['Status'])" + "describe_domain_response = sm.describe_domain(DomainId=domain_id)\n", + "print(describe_domain_response[\"Status\"])" ] }, { @@ -73,7 +72,7 @@ "source": [ "try:\n", " get_status_response = sm.get_sagemaker_servicecatalog_portfolio_status()\n", - " print(get_status_response['Status'])\n", + " print(get_status_response[\"Status\"])\n", "except:\n", " pass" ] @@ -91,17 +90,21 @@ "metadata": {}, "outputs": [], "source": [ - "if describe_domain_response['Status'] == 'InService' and get_status_response['Status'] == 'Enabled' and 'datascience' in name:\n", - " setup_instance_check_passed=True\n", - " print('[OK] Checks passed! Great Job!! Please Continue.')\n", + "if (\n", + " describe_domain_response[\"Status\"] == \"InService\"\n", + " and get_status_response[\"Status\"] == \"Enabled\"\n", + " and \"datascience\" in name\n", + "):\n", + " setup_instance_check_passed = True\n", + " print(\"[OK] Checks passed! Great Job!! 
Please Continue.\")\n", "else:\n", - " setup_instance_check_passed=False\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR]: WE HAVE IDENTIFIED A MISCONFIGURATION.')\n", - " print(describe_domain_response['Status'])\n", - " print(get_status_response['Status'])\n", + " setup_instance_check_passed = False\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR]: WE HAVE IDENTIFIED A MISCONFIGURATION.\")\n", + " print(describe_domain_response[\"Status\"])\n", + " print(get_status_response[\"Status\"])\n", " print(name)\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -146,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "%store " + "%store" ] }, { diff --git a/01_setup/03_Create_S3_Bucket.ipynb b/01_setup/03_Create_S3_Bucket.ipynb index 30ee5836..a7072e17 100644 --- a/01_setup/03_Create_S3_Bucket.ipynb +++ b/01_setup/03_Create_S3_Bucket.ipynb @@ -21,7 +21,7 @@ "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket()\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)" + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)" ] }, { @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_s3_bucket_passed=False" + "setup_s3_bucket_passed = False" ] }, { @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Default bucket: {}'.format(bucket))" + "print(\"Default bucket: {}\".format(bucket))" ] }, { @@ -73,9 +73,9 @@ "try:\n", " response = s3.head_bucket(Bucket=bucket)\n", " print(response)\n", - " setup_s3_bucket_passed=True\n", + " setup_s3_bucket_passed = True\n", "except ClientError as e:\n", - " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(bucket, response, e))" + " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(bucket, response, e))" ] }, { diff --git a/01_setup/04_Update_IAM_Roles_And_Policies.ipynb b/01_setup/04_Update_IAM_Roles_And_Policies.ipynb index fef8781a..587ade3e 100644 --- a/01_setup/04_Update_IAM_Roles_And_Policies.ipynb +++ b/01_setup/04_Update_IAM_Roles_And_Policies.ipynb @@ -25,14 +25,9 @@ "\n", "from botocore.config import Config\n", "\n", - "config = Config(\n", - " retries = {\n", - " 'max_attempts': 10,\n", - " 'mode': 'adaptive'\n", - " }\n", - ")\n", - "\n", - "iam = boto3.client('iam', config=config)" + "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n", + "\n", + "iam = boto3.client(\"iam\", config=config)" ] }, { @@ -48,9 +43,9 @@ "metadata": {}, "outputs": [], "source": [ - "role_name = role.split('/')[-1]\n", + "role_name = role.split(\"/\")[-1]\n", "\n", - "print('Role name: {}'.format(role_name))" + "print(\"Role name: {}\".format(role_name))" ] }, { @@ -59,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_iam_roles_passed=False" + "setup_iam_roles_passed = False" ] }, { @@ -76,14 +71,14 @@ "outputs": [], "source": [ "admin = False\n", - "post_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n", + "post_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n", "for post_policy in post_policies:\n", - " if post_policy['PolicyName'] == 'AdministratorAccess':\n", + " if post_policy[\"PolicyName\"] == \"AdministratorAccess\":\n", " admin = True\n", " break\n", "\n", - "setup_iam_roles_passed=True\n", - "print('[OK] You are all set up to continue with this 
workshop!')" + "setup_iam_roles_passed = True\n", + "print(\"[OK] You are all set up to continue with this workshop!\")" ] }, { @@ -93,27 +88,29 @@ "outputs": [], "source": [ "if not admin:\n", - " pre_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n", + " pre_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n", + "\n", + " required_policies = [\"IAMFullAccess\"]\n", "\n", - " required_policies = ['IAMFullAccess']\n", - " \n", " for pre_policy in pre_policies:\n", " for role_req in required_policies:\n", - " if pre_policy['PolicyName'] == role_req:\n", - " print('Attached: {}'.format(pre_policy['PolicyName']))\n", + " if pre_policy[\"PolicyName\"] == role_req:\n", + " print(\"Attached: {}\".format(pre_policy[\"PolicyName\"]))\n", " try:\n", - " required_policies.remove(pre_policy['PolicyName'])\n", + " required_policies.remove(pre_policy[\"PolicyName\"])\n", " except:\n", " pass\n", "\n", " if len(required_policies) > 0:\n", - " print('*************** [ERROR] You need to attach the following policies in order to continue with this workshop *****************\\n')\n", + " print(\n", + " \"*************** [ERROR] You need to attach the following policies in order to continue with this workshop *****************\\n\"\n", + " )\n", " for required_policy in required_policies:\n", - " print('Not Attached: {}'.format(required_policy))\n", + " print(\"Not Attached: {}\".format(required_policy))\n", " else:\n", - " print('[OK] You are all set to continue with this notebook!')\n", + " print(\"[OK] You are all set to continue with this notebook!\")\n", "else:\n", - " print('[OK] You are all set to continue with this notebook!')" + " print(\"[OK] You are all set to continue with this notebook!\")" ] }, { @@ -132,19 +129,16 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AdministratorAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AdministratorAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", "\n", "time.sleep(5)" ] @@ -158,19 +152,16 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonSageMakerFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonSageMakerFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), 
RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", "\n", "time.sleep(5)" ] @@ -184,19 +175,16 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='IAMFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"IAMFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", "\n", "time.sleep(5)" ] @@ -210,20 +198,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonS3FullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonS3FullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -236,20 +221,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='ComprehendFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to 
role: {}'.format(policy, role_name))\n", + " policy = \"ComprehendFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -262,20 +244,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonAthenaFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonAthenaFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -288,20 +267,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='SecretsManagerReadWrite'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"SecretsManagerReadWrite\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -314,20 +290,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - 
" policy='AmazonRedshiftFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonRedshiftFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -340,20 +313,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonEC2ContainerRegistryFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonEC2ContainerRegistryFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -366,20 +336,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AWSStepFunctionsFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AWSStepFunctionsFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " 
else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -392,20 +359,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonKinesisFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonKinesisFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -418,20 +382,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonKinesisFirehoseFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonKinesisFirehoseFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -444,20 +405,17 @@ "from botocore.exceptions import ClientError\n", "\n", "try:\n", - " policy='AmazonKinesisAnalyticsFullAccess'\n", - " response = iam.attach_role_policy(\n", - " PolicyArn='arn:aws:iam::aws:policy/{}'.format(policy),\n", - " RoleName=role_name\n", - " )\n", - " print('Policy {} has been succesfully attached to role: {}'.format(policy, role_name))\n", + " policy = \"AmazonKinesisAnalyticsFullAccess\"\n", + " response = iam.attach_role_policy(PolicyArn=\"arn:aws:iam::aws:policy/{}\".format(policy), RoleName=role_name)\n", + " print(\"Policy {} has been succesfully attached to role: {}\".format(policy, role_name))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", - " print('[OK] Policy is already attached.')\n", - " elif 
e.response['Error']['Code'] == 'LimitExceeded':\n", - " print('[OK]')\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", + " print(\"[OK] Policy is already attached.\")\n", + " elif e.response[\"Error\"][\"Code\"] == \"LimitExceeded\":\n", + " print(\"[OK]\")\n", " else:\n", - " print('*************** [ERROR] {} *****************'.format(e))\n", - " \n", + " print(\"*************** [ERROR] {} *****************\".format(e))\n", + "\n", "time.sleep(5)" ] }, @@ -474,47 +432,47 @@ "metadata": {}, "outputs": [], "source": [ - "#role = iam.get_role(RoleName=role_name)\n", - "post_policies = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']\n", + "# role = iam.get_role(RoleName=role_name)\n", + "post_policies = iam.list_attached_role_policies(RoleName=role_name)[\"AttachedPolicies\"]\n", "\n", "required_policies = [\n", - " 'AdministratorAccess',\n", - " 'SecretsManagerReadWrite', \n", - " 'IAMFullAccess', \n", - " 'AmazonS3FullAccess', \n", - " 'AmazonAthenaFullAccess', \n", - " 'ComprehendFullAccess',\n", - " 'AmazonEC2ContainerRegistryFullAccess',\n", - " 'AmazonRedshiftFullAccess',\n", - " 'AWSStepFunctionsFullAccess',\n", - " 'AmazonSageMakerFullAccess',\n", - " 'AmazonKinesisFullAccess',\n", - " 'AmazonKinesisFirehoseFullAccess',\n", - " 'AmazonKinesisAnalyticsFullAccess'\n", - " ]\n", + " \"AdministratorAccess\",\n", + " \"SecretsManagerReadWrite\",\n", + " \"IAMFullAccess\",\n", + " \"AmazonS3FullAccess\",\n", + " \"AmazonAthenaFullAccess\",\n", + " \"ComprehendFullAccess\",\n", + " \"AmazonEC2ContainerRegistryFullAccess\",\n", + " \"AmazonRedshiftFullAccess\",\n", + " \"AWSStepFunctionsFullAccess\",\n", + " \"AmazonSageMakerFullAccess\",\n", + " \"AmazonKinesisFullAccess\",\n", + " \"AmazonKinesisFirehoseFullAccess\",\n", + " \"AmazonKinesisAnalyticsFullAccess\",\n", + "]\n", "\n", "admin = False\n", "\n", "for post_policy in post_policies:\n", - " if post_policy['PolicyName'] == 'AdministratorAccess':\n", + " if post_policy[\"PolicyName\"] == \"AdministratorAccess\":\n", " admin = True\n", " try:\n", - " required_policies.remove(post_policy['PolicyName'])\n", + " required_policies.remove(post_policy[\"PolicyName\"])\n", " except:\n", " break\n", - " else: \n", + " else:\n", " try:\n", - " required_policies.remove(post_policy['PolicyName'])\n", + " required_policies.remove(post_policy[\"PolicyName\"])\n", " except:\n", " pass\n", "\n", "if not admin and len(required_policies) > 0:\n", - " print('*************** [ERROR] RE-RUN THIS NOTEBOOK *****************')\n", + " print(\"*************** [ERROR] RE-RUN THIS NOTEBOOK *****************\")\n", " for required_policy in required_policies:\n", - " print('Not Attached: {}'.format(required_policy))\n", + " print(\"Not Attached: {}\".format(required_policy))\n", "else:\n", - " setup_iam_roles_passed=True\n", - " print('[OK] You are all set up to continue with this workshop!')" + " setup_iam_roles_passed = True\n", + " print(\"[OK] You are all set up to continue with this workshop!\")" ] }, { diff --git a/02_usecases/01_Setup.ipynb b/02_usecases/01_Setup.ipynb index b7703836..ee112f14 100644 --- a/02_usecases/01_Setup.ipynb +++ b/02_usecases/01_Setup.ipynb @@ -21,7 +21,7 @@ "bucket = sagemaker_session.default_bucket()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { diff --git a/02_usecases/03_Celebrity_Recognition.ipynb 
b/02_usecases/03_Celebrity_Recognition.ipynb index 82a20d00..4e38bbe0 100644 --- a/02_usecases/03_Celebrity_Recognition.ipynb +++ b/02_usecases/03_Celebrity_Recognition.ipynb @@ -61,8 +61,8 @@ "metadata": {}, "outputs": [], "source": [ - "rekognition = boto3.client('rekognition')\n", - "s3 = boto3.client('s3')" + "rekognition = boto3.client(\"rekognition\")\n", + "s3 = boto3.client(\"s3\")" ] }, { @@ -72,7 +72,7 @@ "outputs": [], "source": [ "!mkdir -p ./tmp\n", - "temp_folder = 'tmp/'" + "temp_folder = \"tmp/\"" ] }, { @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "imageName = 'content-moderation/media/GrandTourjc.png'" + "imageName = \"content-moderation/media/GrandTourjc.png\"" ] }, { @@ -97,7 +97,7 @@ "metadata": {}, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))" + "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))" ] }, { @@ -116,9 +116,9 @@ "source": [ "recognizeCelebritiesResponse = rekognition.recognize_celebrities(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': imageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": imageName,\n", " }\n", " }\n", ")" @@ -156,10 +156,10 @@ "metadata": {}, "outputs": [], "source": [ - "def drawBoundingBoxes (sourceImage, boxes):\n", + "def drawBoundingBoxes(sourceImage, boxes):\n", " # blue, green, red, grey\n", - " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n", - " \n", + " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n", + "\n", " # Download image locally\n", " imageLocation = temp_folder + os.path.basename(sourceImage)\n", " s3.download_file(bucket, sourceImage, imageLocation)\n", @@ -170,24 +170,24 @@ " width, height = bbImage.size\n", " col = 0\n", " maxcol = len(colors)\n", - " line= 3\n", + " line = 3\n", " for box in boxes:\n", - " x1 = int(box[1]['Left'] * width)\n", - " y1 = int(box[1]['Top'] * height)\n", - " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n", - " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n", - " \n", - " draw.text((x1,y1),box[0],colors[col])\n", + " x1 = int(box[1][\"Left\"] * width)\n", + " y1 = int(box[1][\"Top\"] * height)\n", + " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n", + " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * height)\n", + "\n", + " draw.text((x1, y1), box[0], colors[col])\n", " for l in range(line):\n", - " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n", - " col = (col+1)%maxcol\n", - " \n", + " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n", + " col = (col + 1) % maxcol\n", + "\n", " imageFormat = \"PNG\"\n", " ext = sourceImage.lower()\n", - " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n", - " imageFormat = 'JPEG'\n", + " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n", + " imageFormat = \"JPEG\"\n", "\n", - " bbImage.save(imageLocation,format=imageFormat)\n", + " bbImage.save(imageLocation, format=imageFormat)\n", "\n", " display(bbImage)" ] @@ -199,10 +199,10 @@ "outputs": [], "source": [ "boxes = []\n", - "celebrities = recognizeCelebritiesResponse['CelebrityFaces']\n", + "celebrities = recognizeCelebritiesResponse[\"CelebrityFaces\"]\n", "for celebrity in celebrities:\n", - " boxes.append ((celebrity['Name'], celebrity['Face']['BoundingBox']))\n", - " \n", + " boxes.append((celebrity[\"Name\"], 
celebrity[\"Face\"][\"BoundingBox\"]))\n", + "\n", "drawBoundingBoxes(imageName, boxes)" ] }, @@ -224,9 +224,9 @@ "metadata": {}, "outputs": [], "source": [ - "videoName = 'content-moderation/media/GrandTour720.mp4'\n", - "strDetail = 'Celebrites detected in video
<br>=======================================<br>'\n", - "strOverall = 'Celebrities in the overall video:<br>=======================================<br>
'" + "videoName = \"content-moderation/media/GrandTour720.mp4\"\n", + "strDetail = \"Celebrites detected in video
<br>=======================================<br>\"\n", + "strOverall = \"Celebrities in the overall video:<br>=======================================<br>
\"" ] }, { @@ -236,14 +236,18 @@ "outputs": [], "source": [ "s3FilePrefix = \"https://s3.amazonaws.com\"\n", - "if(not region == 'us-east-1'):\n", + "if not region == \"us-east-1\":\n", " s3FilePrefix = \"https://s3-{}.amazonaws.com\".format(region)\n", "\n", "s3VideoUrl = \"{0}/{1}/{2}\".format(s3FilePrefix, bucket, videoName)\n", "\n", - "videoTag = \"\".format(s3VideoUrl)\n", + "videoTag = \"\".format(\n", + " s3VideoUrl\n", + ")\n", "\n", - "videoui = \"
<table><tr><td style='vertical-align: top'>{}</td><td>{}</td></tr></table>\".format(videoTag, strDetail)\n", + "videoui = \"<table><tr><td style='vertical-align: top'>{}</td><td>{}</td></tr></table>
\".format(\n", + " videoTag, strDetail\n", + ")\n", "\n", "display(HTML(videoui))" ] @@ -263,14 +267,14 @@ "source": [ "startCelebrityRekognition = rekognition.start_celebrity_recognition(\n", " Video={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': videoName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": videoName,\n", " }\n", " },\n", ")\n", "\n", - "celebrityJobId = startCelebrityRekognition['JobId']\n", + "celebrityJobId = startCelebrityRekognition[\"JobId\"]\n", "display(\"Job Id: {0}\".format(celebrityJobId))" ] }, @@ -290,20 +294,15 @@ "source": [ "%%time\n", "\n", - "getCelebrityRecognition = rekognition.get_celebrity_recognition(\n", - " JobId=celebrityJobId,\n", - " SortBy='TIMESTAMP'\n", - ")\n", + "getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n", "\n", - "while(getCelebrityRecognition['JobStatus'] == 'IN_PROGRESS'):\n", + "while getCelebrityRecognition[\"JobStatus\"] == \"IN_PROGRESS\":\n", " time.sleep(5)\n", - " print('.', end='')\n", - " \n", - " getCelebrityRecognition = rekognition.get_celebrity_recognition(\n", - " JobId=celebrityJobId,\n", - " SortBy='TIMESTAMP')\n", - " \n", - "display(getCelebrityRecognition['JobStatus'])" + " print(\".\", end=\"\")\n", + "\n", + " getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n", + "\n", + "display(getCelebrityRecognition[\"JobStatus\"])" ] }, { @@ -343,16 +342,16 @@ "theCelebs = {}\n", "\n", "# Celebrities detected in each frame\n", - "for celebrity in getCelebrityRecognition['Celebrities']:\n", - " if 'Celebrity' in celebrity :\n", + "for celebrity in getCelebrityRecognition[\"Celebrities\"]:\n", + " if \"Celebrity\" in celebrity:\n", " cconfidence = celebrity[\"Celebrity\"][\"Confidence\"]\n", - " if(cconfidence > 95):\n", - " ts = celebrity [\"Timestamp\"]\n", + " if cconfidence > 95:\n", + " ts = celebrity[\"Timestamp\"]\n", " cname = celebrity[\"Celebrity\"][\"Name\"]\n", - " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence,2))\n", + " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, cname, round(cconfidence, 2))\n", " if not cname in theCelebs:\n", " theCelebs[cname] = cname\n", - " \n", + "\n", "\n", "# Unique faces detected in video\n", "for theCeleb in theCelebs:\n", @@ -376,7 +375,7 @@ "metadata": {}, "outputs": [], "source": [ - "customCelebrityImageName = 'content-moderation/media/chris-antje.png'" + "customCelebrityImageName = \"content-moderation/media/chris-antje.png\"" ] }, { @@ -385,7 +384,9 @@ "metadata": {}, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': customCelebrityImageName})))" + "display(\n", + " IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": customCelebrityImageName}))\n", + ")" ] }, { @@ -398,9 +399,9 @@ "\n", "customCelebrityResponse = rekognition.recognize_celebrities(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': customCelebrityImageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": customCelebrityImageName,\n", " }\n", " }\n", ")" @@ -441,10 +442,10 @@ "outputs": [], "source": [ "cboxes = []\n", - "faces = customCelebrityResponse['UnrecognizedFaces']\n", + "faces = customCelebrityResponse[\"UnrecognizedFaces\"]\n", "for face in faces:\n", - " cboxes.append (('Unrecognized Face', face['BoundingBox']))\n", - " \n", + " cboxes.append((\"Unrecognized Face\", face[\"BoundingBox\"]))\n", + "\n", "drawBoundingBoxes(customCelebrityImageName, cboxes)" ] }, diff --git a/02_usecases/04_Content_Moderation.ipynb b/02_usecases/04_Content_Moderation.ipynb index fd0a8152..a46282f2 100644 --- a/02_usecases/04_Content_Moderation.ipynb +++ b/02_usecases/04_Content_Moderation.ipynb @@ -49,8 +49,8 @@ "metadata": {}, "outputs": [], "source": [ - "rekognition = boto3.client('rekognition')\n", - "s3 = boto3.client('s3')" + "rekognition = boto3.client(\"rekognition\")\n", + "s3 = boto3.client(\"s3\")" ] }, { @@ -66,7 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "imageName = 'content-moderation/media/weapon.png'" + "imageName = \"content-moderation/media/weapon.png\"" ] }, { @@ -84,12 +84,12 @@ "outputs": [], "source": [ "detectModerationLabelsResponse = rekognition.detect_moderation_labels(\n", - " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': imageName,\n", - " }\n", - " }\n", + " Image={\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": imageName,\n", + " }\n", + " }\n", ")" ] }, @@ -99,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))" + "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))" ] }, { @@ -157,10 +157,10 @@ "metadata": {}, "outputs": [], "source": [ - "videoName = 'content-moderation/media/weapon.mp4'\n", + "videoName = \"content-moderation/media/weapon.mp4\"\n", "\n", - "strDetail = 'Moderation labels in video
<br>=======================================<br>'\n", - "strOverall = 'Moderation labels in the overall video:<br>=======================================<br>
'" + "strDetail = \"Moderation labels in video
<br>=======================================<br>\"\n", + "strOverall = \"Moderation labels in the overall video:<br>=======================================<br>
\"" ] }, { @@ -171,9 +171,11 @@ }, "outputs": [], "source": [ - "s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': videoName})\n", + "s3VideoUrl = s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": videoName})\n", "\n", - "videoTag = \"\".format(s3VideoUrl)\n", + "videoTag = \"\".format(\n", + " s3VideoUrl\n", + ")\n", "\n", "videoui = \"
{}
\".format(videoTag)\n", "\n", @@ -210,14 +212,14 @@ "# Start content moderation job\n", "startModerationLabelDetection = rekognition.start_content_moderation(\n", " Video={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': videoName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": videoName,\n", " }\n", " },\n", ")\n", "\n", - "moderationJobId = startModerationLabelDetection['JobId']\n", + "moderationJobId = startModerationLabelDetection[\"JobId\"]\n", "display(\"Job Id: {0}\".format(moderationJobId))" ] }, @@ -237,20 +239,15 @@ "source": [ "%%time\n", "\n", - "getContentModeration = rekognition.get_content_moderation(\n", - " JobId=moderationJobId,\n", - " SortBy='TIMESTAMP'\n", - ")\n", + "getContentModeration = rekognition.get_content_moderation(JobId=moderationJobId, SortBy=\"TIMESTAMP\")\n", "\n", - "while(getContentModeration['JobStatus'] == 'IN_PROGRESS'):\n", + "while getContentModeration[\"JobStatus\"] == \"IN_PROGRESS\":\n", " time.sleep(5)\n", - " print('.', end='')\n", - " \n", - " getContentModeration = rekognition.get_content_moderation(\n", - " JobId=moderationJobId,\n", - " SortBy='TIMESTAMP')\n", - " \n", - "display(getContentModeration['JobStatus'])" + " print(\".\", end=\"\")\n", + "\n", + " getContentModeration = rekognition.get_content_moderation(JobId=moderationJobId, SortBy=\"TIMESTAMP\")\n", + "\n", + "display(getContentModeration[\"JobStatus\"])" ] }, { @@ -289,16 +286,16 @@ "theObjects = {}\n", "\n", "# Potentially unsafe detected in each frame\n", - "for obj in getContentModeration['ModerationLabels']:\n", - " ts = obj [\"Timestamp\"]\n", - " cconfidence = obj['ModerationLabel'][\"Confidence\"]\n", - " oname = obj['ModerationLabel'][\"Name\"]\n", - " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence,2))\n", + "for obj in getContentModeration[\"ModerationLabels\"]:\n", + " ts = obj[\"Timestamp\"]\n", + " cconfidence = obj[\"ModerationLabel\"][\"Confidence\"]\n", + " oname = obj[\"ModerationLabel\"][\"Name\"]\n", + " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence, 2))\n", " if oname in theObjects:\n", " cojb = theObjects[oname]\n", - " theObjects[oname] = {\"Name\" : oname, \"Count\": 1+cojb[\"Count\"]}\n", + " theObjects[oname] = {\"Name\": oname, \"Count\": 1 + cojb[\"Count\"]}\n", " else:\n", - " theObjects[oname] = {\"Name\" : oname, \"Count\": 1}\n", + " theObjects[oname] = {\"Name\": oname, \"Count\": 1}\n", "\n", "# Unique objects detected in video\n", "for theObject in theObjects:\n", diff --git a/02_usecases/05_Inappropriate_Text_Detection.ipynb b/02_usecases/05_Inappropriate_Text_Detection.ipynb index 6b6d2cae..8273cf6c 100644 --- a/02_usecases/05_Inappropriate_Text_Detection.ipynb +++ b/02_usecases/05_Inappropriate_Text_Detection.ipynb @@ -48,8 +48,8 @@ "metadata": {}, "outputs": [], "source": [ - "rekognition = boto3.client('rekognition')\n", - "s3 = boto3.client('s3')" + "rekognition = boto3.client(\"rekognition\")\n", + "s3 = boto3.client(\"s3\")" ] }, { @@ -59,7 +59,7 @@ "outputs": [], "source": [ "!mkdir -p ./tmp\n", - "temp_folder = 'tmp/'" + "temp_folder = \"tmp/\"" ] }, { @@ -75,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "imageName = 'content-moderation/media/coffee.jpg'" + "imageName = \"content-moderation/media/coffee.jpg\"" ] }, { @@ -86,7 +86,7 @@ }, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))" + "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))" ] }, { @@ -105,16 +105,12 @@ "source": [ "detectTextResponse = rekognition.detect_text(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': imageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": imageName,\n", " }\n", - " },\n", - " Filters={\n", - " 'WordFilter': {\n", - " 'MinConfidence': 90\n", - " }\n", - " }\n", + " },\n", + " Filters={\"WordFilter\": {\"MinConfidence\": 90}},\n", ")" ] }, @@ -149,11 +145,12 @@ "outputs": [], "source": [ "import string\n", + "\n", "unsafeWords = [\"crap\", \"darn\", \"damm\"]\n", "for textDetection in detectTextResponse[\"TextDetections\"]:\n", " # strip punctuation before checking match\n", - " text = textDetection[\"DetectedText\"].translate(str.maketrans('', '', string.punctuation))\n", - " if(textDetection[\"Type\"] == \"WORD\" and text in unsafeWords):\n", + " text = textDetection[\"DetectedText\"].translate(str.maketrans(\"\", \"\", string.punctuation))\n", + " if textDetection[\"Type\"] == \"WORD\" and text in unsafeWords:\n", " print(\"Detected unsafe word: {}\".format(textDetection[\"DetectedText\"]))" ] }, @@ -170,10 +167,10 @@ "metadata": {}, "outputs": [], "source": [ - "def drawBoundingBoxes (sourceImage, boxes):\n", + "def drawBoundingBoxes(sourceImage, boxes):\n", " # blue, green, red, grey\n", - " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n", - " \n", + " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n", + "\n", " # Download image locally\n", " imageLocation = temp_folder + os.path.basename(sourceImage)\n", " s3.download_file(bucket, sourceImage, imageLocation)\n", @@ -184,24 +181,24 @@ " width, height = bbImage.size\n", " col = 0\n", " maxcol = len(colors)\n", - " line= 3\n", + " line = 3\n", " for box in boxes:\n", - " x1 = int(box[1]['Left'] * width)\n", - " y1 = int(box[1]['Top'] * height)\n", - " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n", - " y2 = int(box[1]['Top'] * height + box[1]['Height'] * 
height)\n", - " \n", - " draw.text((x1,y1),box[0],colors[col])\n", + " x1 = int(box[1][\"Left\"] * width)\n", + " y1 = int(box[1][\"Top\"] * height)\n", + " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n", + " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * height)\n", + "\n", + " draw.text((x1, y1), box[0], colors[col])\n", " for l in range(line):\n", - " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n", - " col = (col+1)%maxcol\n", - " \n", + " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n", + " col = (col + 1) % maxcol\n", + "\n", " imageFormat = \"PNG\"\n", " ext = sourceImage.lower()\n", - " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n", - " imageFormat = 'JPEG'\n", + " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n", + " imageFormat = \"JPEG\"\n", "\n", - " bbImage.save(imageLocation,format=imageFormat)\n", + " bbImage.save(imageLocation, format=imageFormat)\n", "\n", " display(bbImage)" ] @@ -215,10 +212,10 @@ "outputs": [], "source": [ "boxes = []\n", - "textDetections = detectTextResponse['TextDetections']\n", + "textDetections = detectTextResponse[\"TextDetections\"]\n", "for textDetection in textDetections:\n", - " boxes.append ((textDetection['Type'], textDetection[\"Geometry\"]['BoundingBox']))\n", - " \n", + " boxes.append((textDetection[\"Type\"], textDetection[\"Geometry\"][\"BoundingBox\"]))\n", + "\n", "drawBoundingBoxes(imageName, boxes)" ] }, @@ -235,7 +232,7 @@ "metadata": {}, "outputs": [], "source": [ - "imageName = 'content-moderation/media/coffee.jpg'" + "imageName = \"content-moderation/media/coffee.jpg\"" ] }, { @@ -244,7 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': imageName})))" + "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": imageName})))" ] }, { @@ -258,28 +255,17 @@ "\n", "detectTextResponse = rekognition.detect_text(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': imageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": imageName,\n", " }\n", " },\n", " Filters={\n", - " 'WordFilter': {\n", - " 'MinConfidence': 90,\n", - " 'MinBoundingBoxHeight': 0.05,\n", - " 'MinBoundingBoxWidth': 0.02\n", - " },\n", - " 'RegionsOfInterest': [\n", - " {\n", - " 'BoundingBox': {\n", - " 'Width': 0.1,\n", - " 'Height': 0.05,\n", - " 'Left': 0.01,\n", - " 'Top': 0.01\n", - " }\n", - " },\n", - " ]\n", - " }\n", + " \"WordFilter\": {\"MinConfidence\": 90, \"MinBoundingBoxHeight\": 0.05, \"MinBoundingBoxWidth\": 0.02},\n", + " \"RegionsOfInterest\": [\n", + " {\"BoundingBox\": {\"Width\": 0.1, \"Height\": 0.05, \"Left\": 0.01, \"Top\": 0.01}},\n", + " ],\n", + " },\n", ")" ] }, @@ -308,7 +294,7 @@ "source": [ "for textDetection in detectTextResponse[\"TextDetections\"]:\n", " text = textDetection[\"DetectedText\"]\n", - " if(textDetection[\"Type\"] == \"WORD\"):\n", + " if textDetection[\"Type\"] == \"WORD\":\n", " print(\"Word: {}\".format(textDetection[\"DetectedText\"]))" ] }, @@ -331,10 +317,10 @@ "metadata": {}, "outputs": [], "source": [ - "videoName = 'content-moderation/media/serverless-bytes.mov'\n", + "videoName = \"content-moderation/media/serverless-bytes.mov\"\n", "\n", - "strDetail = 'Text detected in video
<br>=======================================<br>'\n", - "strOverall = 'Text in the overall video:<br>=======================================<br>
'" + "strDetail = \"Text detected in video
<br>=======================================<br>\"\n", + "strOverall = \"Text in the overall video:<br>=======================================<br>
\"" ] }, { @@ -343,9 +329,11 @@ "metadata": {}, "outputs": [], "source": [ - "s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': videoName})\n", + "s3VideoUrl = s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucket, \"Key\": videoName})\n", "\n", - "videoTag = \"\".format(s3VideoUrl)\n", + "videoTag = \"\".format(\n", + " s3VideoUrl\n", + ")\n", "\n", "videoui = \"
{}
\".format(videoTag)\n", "\n", @@ -367,14 +355,14 @@ "source": [ "startTextDetection = rekognition.start_text_detection(\n", " Video={\n", - " 'S3Object': {\n", - " 'Bucket': bucket,\n", - " 'Name': videoName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucket,\n", + " \"Name\": videoName,\n", " }\n", " },\n", ")\n", "\n", - "textJobId = startTextDetection['JobId']\n", + "textJobId = startTextDetection[\"JobId\"]\n", "display(\"Job Id: {0}\".format(textJobId))" ] }, @@ -392,19 +380,15 @@ "metadata": {}, "outputs": [], "source": [ - "getTextDetection = rekognition.get_text_detection(\n", - " JobId=textJobId\n", - ")\n", + "getTextDetection = rekognition.get_text_detection(JobId=textJobId)\n", "\n", - "while(getTextDetection['JobStatus'] == 'IN_PROGRESS'):\n", + "while getTextDetection[\"JobStatus\"] == \"IN_PROGRESS\":\n", " time.sleep(5)\n", - " print('.', end='')\n", - " \n", - " getTextDetection = rekognition.get_text_detection(\n", - " JobId=textJobId\n", - " )\n", - " \n", - "display(getTextDetection['JobStatus'])" + " print(\".\", end=\"\")\n", + "\n", + " getTextDetection = rekognition.get_text_detection(JobId=textJobId)\n", + "\n", + "display(getTextDetection[\"JobStatus\"])" ] }, { @@ -444,21 +428,21 @@ "theLines = {}\n", "\n", "# Objects detected in each frame\n", - "for obj in getTextDetection['TextDetections']:\n", - " if(obj['TextDetection']['Type'] == 'WORD'):\n", - " ts = obj [\"Timestamp\"]\n", - " cconfidence = obj['TextDetection'][\"Confidence\"]\n", - " oname = obj['TextDetection'][\"DetectedText\"]\n", + "for obj in getTextDetection[\"TextDetections\"]:\n", + " if obj[\"TextDetection\"][\"Type\"] == \"WORD\":\n", + " ts = obj[\"Timestamp\"]\n", + " cconfidence = obj[\"TextDetection\"][\"Confidence\"]\n", + " oname = obj[\"TextDetection\"][\"DetectedText\"]\n", "\n", - " if(oname in flaggedTextInVideo):\n", - " print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence,2)))\n", + " if oname in flaggedTextInVideo:\n", + " print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence, 2)))\n", "\n", - " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence,2))\n", + " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence, 2))\n", " if oname in theLines:\n", " cojb = theLines[oname]\n", - " theLines[oname] = {\"Text\" : oname, \"Count\": 1+cojb[\"Count\"]}\n", + " theLines[oname] = {\"Text\": oname, \"Count\": 1 + cojb[\"Count\"]}\n", " else:\n", - " theLines[oname] = {\"Text\" : oname, \"Count\": 1}\n", + " theLines[oname] = {\"Text\": oname, \"Count\": 1}\n", "\n", "# Unique objects detected in video\n", "for theLine in theLines:\n", diff --git a/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb b/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb index 997206f8..4aaa6d8f 100644 --- a/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb +++ b/02_usecases/06_Text_Classification_Prepare_Dataset.ipynb @@ -44,7 +44,7 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -76,10 +76,12 @@ "source": [ "import csv\n", "\n", - "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df.shape" ] }, @@ -99,12 +101,13 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(kind=\"bar\", title=\"Breakdown by Star Rating\")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -122,43 +125,26 @@ "source": [ "from sklearn.utils import resample\n", "\n", - "five_star_df = df.query('star_rating == 5')\n", - "four_star_df = df.query('star_rating == 4')\n", - "three_star_df = df.query('star_rating == 3')\n", - "two_star_df = df.query('star_rating == 2')\n", - "one_star_df = df.query('star_rating == 1')\n", + "five_star_df = df.query(\"star_rating == 5\")\n", + "four_star_df = df.query(\"star_rating == 4\")\n", + "three_star_df = df.query(\"star_rating == 3\")\n", + "two_star_df = df.query(\"star_rating == 2\")\n", + "one_star_df = df.query(\"star_rating == 1\")\n", "\n", "# Check which sentiment has the least number of samples\n", - "minority_count = min(five_star_df.shape[0], \n", - " four_star_df.shape[0], \n", - " three_star_df.shape[0], \n", - " two_star_df.shape[0], \n", - " one_star_df.shape[0]) \n", + "minority_count = min(\n", + " five_star_df.shape[0], four_star_df.shape[0], three_star_df.shape[0], two_star_df.shape[0], one_star_df.shape[0]\n", + ")\n", "\n", - "five_star_df = resample(five_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "four_star_df = resample(four_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "three_star_df = resample(three_star_df,\n", - " replace = 
False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "two_star_df = resample(two_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "one_star_df = resample(one_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", "df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])\n", "df_balanced = df_balanced.reset_index(drop=True)\n", @@ -172,9 +158,11 @@ "metadata": {}, "outputs": [], "source": [ - "df_balanced[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_balanced[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -202,14 +190,10 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Split all data into 90% train and 10% holdout\n", - "df_train, df_holdout = train_test_split(df_balanced, \n", - " test_size=0.10,\n", - " stratify=df_balanced['star_rating'])\n", + "df_train, df_holdout = train_test_split(df_balanced, test_size=0.10, stratify=df_balanced[\"star_rating\"])\n", "\n", "# Split holdout data into 50% validation and 50% test\n", - "df_validation, df_test = train_test_split(df_holdout,\n", - " test_size=0.50, \n", - " stratify=df_holdout['star_rating'])\n" + "df_validation, df_test = train_test_split(df_holdout, test_size=0.50, stratify=df_holdout[\"star_rating\"])" ] }, { @@ -219,16 +203,16 @@ "outputs": [], "source": [ "# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n", - "labels = ['Train', 'Validation', 'Test']\n", + "labels = [\"Train\", \"Validation\", \"Test\"]\n", "sizes = [len(df_train.index), len(df_validation.index), len(df_test.index)]\n", - "explode = (0.1, 0, 0) \n", + "explode = (0.1, 0, 0)\n", "\n", "fig1, ax1 = plt.subplots()\n", "\n", - "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)\n", + "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90)\n", "\n", "# Equal aspect ratio ensures that pie is drawn as a circle.\n", - "ax1.axis('equal') \n", + "ax1.axis(\"equal\")\n", "\n", "plt.show()" ] @@ -255,7 +239,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_train[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='90% Train Breakdown by Star Rating')" + "df_train[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"90% Train Breakdown by Star Rating\"\n", + ")" ] }, { @@ -280,7 +266,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_validation[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Validation Breakdown by Star Rating')" + "df_validation[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"5% Validation Breakdown by Star Rating\"\n", + ")" ] }, { @@ -305,7 +293,9 @@ "metadata": {}, "outputs": [], 
"source": [ - "df_test[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Test Breakdown by Star Rating')" + "df_test[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"5% Test Breakdown by Star Rating\"\n", + ")" ] }, { @@ -321,7 +311,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_train = df_train[['star_rating', 'review_body']]\n", + "df_train = df_train[[\"star_rating\", \"review_body\"]]\n", "df_train.shape" ] }, @@ -347,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "comprehend_train_path = './amazon_reviews_us_Digital_Software_v1_00_comprehend.csv'\n", + "comprehend_train_path = \"./amazon_reviews_us_Digital_Software_v1_00_comprehend.csv\"\n", "df_train.to_csv(comprehend_train_path, index=False, header=False)" ] }, @@ -364,7 +354,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_s3_prefix = 'data'\n", + "train_s3_prefix = \"data\"\n", "comprehend_train_s3_uri = sess.upload_data(path=comprehend_train_path, key_prefix=train_s3_prefix)\n", "comprehend_train_s3_uri" ] diff --git a/02_usecases/07_Text_Classification_Train_Model.ipynb b/02_usecases/07_Text_Classification_Train_Model.ipynb index 904fb775..045f2703 100644 --- a/02_usecases/07_Text_Classification_Train_Model.ipynb +++ b/02_usecases/07_Text_Classification_Train_Model.ipynb @@ -38,22 +38,17 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", "from botocore.config import Config\n", "\n", - "config = Config(\n", - " retries = {\n", - " 'max_attempts': 10,\n", - " 'mode': 'adaptive'\n", - " }\n", - ")\n", + "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n", "\n", - "iam = boto3.client('iam', config=config)\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "iam = boto3.client(\"iam\", config=config)\n", + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -69,15 +64,28 @@ "metadata": {}, "outputs": [], "source": [ - "if region in ['ap-south-1', 'eu-west-2', 'eu-west-1', 'ap-northeast-2', 'ap-northeast-1', 'ca-central-1', 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'us-east-1', 'us-east-2', 'us-west-2']:\n", - " print(' [OK] COMPREHEND IS SUPPORTED IN {}'.format(region))\n", - " print(' [OK] Please proceed with this notebook.' )\n", + "if region in [\n", + " \"ap-south-1\",\n", + " \"eu-west-2\",\n", + " \"eu-west-1\",\n", + " \"ap-northeast-2\",\n", + " \"ap-northeast-1\",\n", + " \"ca-central-1\",\n", + " \"ap-southeast-1\",\n", + " \"ap-southeast-2\",\n", + " \"eu-central-1\",\n", + " \"us-east-1\",\n", + " \"us-east-2\",\n", + " \"us-west-2\",\n", + "]:\n", + " print(\" [OK] COMPREHEND IS SUPPORTED IN {}\".format(region))\n", + " print(\" [OK] Please proceed with this notebook.\")\n", "else:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print(' [ERROR] COMPREHEND IS NOT YET SUPPORTED IN {}.'.format(region))\n", - " print(' [INFO] This is OK. Skip this notebook and continue with the next use case.' )\n", - " print(' [INFO] This notebook is not required for the rest of this workshop.' 
)\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\" [ERROR] COMPREHEND IS NOT YET SUPPORTED IN {}.\".format(region))\n", + " print(\" [INFO] This is OK. Skip this notebook and continue with the next use case.\")\n", + " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -86,7 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "comprehend = boto3.client('comprehend')" + "comprehend = boto3.client(\"comprehend\")" ] }, { @@ -112,10 +120,10 @@ "outputs": [], "source": [ "if not comprehend_train_s3_uri:\n", - " print('****************************************************************************************')\n", - " print('**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************')\n", - " print('**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************')\n", - " print('****************************************************************************************')" + " print(\"****************************************************************************************\")\n", + " print(\"**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************\")\n", + " print(\"**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************\")\n", + " print(\"****************************************************************************************\")" ] }, { @@ -160,7 +168,7 @@ "source": [ "import csv\n", "\n", - "df = pd.read_csv('./tmp/amazon_reviews_us_Digital_Software_v1_00_comprehend.csv', header=None)\n", + "df = pd.read_csv(\"./tmp/amazon_reviews_us_Digital_Software_v1_00_comprehend.csv\", header=None)\n", "df.head()" ] }, @@ -185,17 +193,11 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"comprehend.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"comprehend.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n", + " ],\n", + "}" ] }, { @@ -211,7 +213,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_comprehend_role_name = 'DSOAWS_Comprehend'" + "iam_comprehend_role_name = \"DSOAWS_Comprehend\"" ] }, { @@ -229,15 +231,15 @@ " iam_role_comprehend = iam.create_role(\n", " RoleName=iam_comprehend_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Comprehend Role'\n", + " Description=\"DSOAWS Comprehend Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)\n", " print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)\n", - " \n", + "\n", "time.sleep(30)" ] }, @@ -250,34 +252,10 @@ "comprehend_s3_policy_doc = {\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " {\n", - " \"Action\": [\n", - " \"s3:GetObject\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:s3:::{}/*\".format(bucket)\n", - " 
],\n", - " \"Effect\": \"Allow\"\n", - " },\n", - " {\n", - " \"Action\": [\n", - " \"s3:ListBucket\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:s3:::{}\".format(bucket)\n", - " ],\n", - " \"Effect\": \"Allow\"\n", - " },\n", - " {\n", - " \"Action\": [\n", - " \"s3:PutObject\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:s3:::{}/*\".format(bucket)\n", - " ],\n", - " \"Effect\": \"Allow\"\n", - " }\n", - " ]\n", + " {\"Action\": [\"s3:GetObject\"], \"Resource\": [\"arn:aws:s3:::{}/*\".format(bucket)], \"Effect\": \"Allow\"},\n", + " {\"Action\": [\"s3:ListBucket\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)], \"Effect\": \"Allow\"},\n", + " {\"Action\": [\"s3:PutObject\"], \"Resource\": [\"arn:aws:s3:::{}/*\".format(bucket)], \"Effect\": \"Allow\"},\n", + " ],\n", "}\n", "\n", "print(comprehend_s3_policy_doc)" @@ -300,8 +278,8 @@ "\n", "response = iam.put_role_policy(\n", " RoleName=iam_comprehend_role_name,\n", - " PolicyName='DSOAWS_ComprehendPolicyToS3',\n", - " PolicyDocument=json.dumps(comprehend_s3_policy_doc)\n", + " PolicyName=\"DSOAWS_ComprehendPolicyToS3\",\n", + " PolicyDocument=json.dumps(comprehend_s3_policy_doc),\n", ")\n", "\n", "print(response)\n", @@ -322,9 +300,9 @@ "metadata": {}, "outputs": [], "source": [ - "prefix = 'models'\n", + "prefix = \"models\"\n", "\n", - "s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'comprehend/output')\n", + "s3_output_job = \"s3://{}/{}/{}\".format(bucket, prefix, \"comprehend/output\")\n", "print(s3_output_job)" ] }, @@ -334,7 +312,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_comprehend_arn = iam_role_comprehend['Role']['Arn']" + "iam_role_comprehend_arn = iam_role_comprehend[\"Role\"][\"Arn\"]" ] }, { @@ -348,7 +326,7 @@ "\n", "timestamp = str(datetime.datetime.now().strftime(\"%s\"))\n", "\n", - "comprehend_training_job_name = 'Amazon-Customer-Reviews-Classifier-{}'.format(timestamp) \n", + "comprehend_training_job_name = \"Amazon-Customer-Reviews-Classifier-{}\".format(timestamp)\n", "\n", "print(comprehend_training_job_name)" ] @@ -362,13 +340,9 @@ "training_job = comprehend.create_document_classifier(\n", " DocumentClassifierName=comprehend_training_job_name,\n", " DataAccessRoleArn=iam_role_comprehend_arn,\n", - " InputDataConfig={\n", - " 'S3Uri': comprehend_train_s3_uri\n", - " },\n", - " OutputDataConfig={\n", - " 'S3Uri': s3_output_job\n", - " },\n", - " LanguageCode='en'\n", + " InputDataConfig={\"S3Uri\": comprehend_train_s3_uri},\n", + " OutputDataConfig={\"S3Uri\": s3_output_job},\n", + " LanguageCode=\"en\",\n", ")\n", "\n", "time.sleep(30)" @@ -380,7 +354,7 @@ "metadata": {}, "outputs": [], "source": [ - "comprehend_training_job_arn = training_job['DocumentClassifierArn']\n", + "comprehend_training_job_arn = training_job[\"DocumentClassifierArn\"]\n", "\n", "print(comprehend_training_job_arn)" ] @@ -393,7 +367,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Comprehend Training Job'.format(region, comprehend_training_job_arn)))\n" + "display(\n", + " HTML(\n", + " 'Review Comprehend Training Job'.format(\n", + " region, comprehend_training_job_arn\n", + " )\n", + " )\n", + ")" ] }, { @@ -414,21 +394,21 @@ "source": [ "import time\n", "\n", - "max_time = time.time() + 3 * 60 * 60 # 3 hours\n", + "max_time = time.time() + 3 * 60 * 60 # 3 hours\n", "while time.time() < max_time:\n", " describe_custom_classifier = comprehend.describe_document_classifier(\n", - " DocumentClassifierArn = comprehend_training_job_arn\n", + " 
DocumentClassifierArn=comprehend_training_job_arn\n", " )\n", " status = describe_custom_classifier[\"DocumentClassifierProperties\"][\"Status\"]\n", " print(\"Custom classifier: {}\".format(status))\n", - " \n", + "\n", " if status == \"TRAINED\" or status == \"IN_ERROR\":\n", - " print('')\n", - " print('Status {}'.format(status))\n", - " print('')\n", + " print(\"\")\n", + " print(\"Status {}\".format(status))\n", + " print(\"\")\n", " print(describe_custom_classifier[\"DocumentClassifierProperties\"])\n", " break\n", - " \n", + "\n", " time.sleep(10)" ] }, @@ -479,11 +459,12 @@ "outputs": [], "source": [ "import os\n", - "#Retrieve the S3URI from the model output and create jobkey variable.\n", + "\n", + "# Retrieve the S3URI from the model output and create jobkey variable.\n", "job_output = describe_custom_classifier[\"DocumentClassifierProperties\"][\"OutputDataConfig\"][\"S3Uri\"]\n", "print(job_output)\n", "\n", - "path_prefix = 's3://{}/'.format(bucket)\n", + "path_prefix = \"s3://{}/\".format(bucket)\n", "\n", "job_key = os.path.relpath(job_output, path_prefix)\n", "\n", @@ -503,9 +484,9 @@ "metadata": {}, "outputs": [], "source": [ - "s3 = boto3.resource('s3')\n", + "s3 = boto3.resource(\"s3\")\n", "\n", - "s3.Bucket(bucket).download_file(job_key, './output.tar.gz')" + "s3.Bucket(bucket).download_file(job_key, \"./output.tar.gz\")" ] }, { @@ -514,7 +495,7 @@ "metadata": {}, "outputs": [], "source": [ - "#Unpack the gzip file\n", + "# Unpack the gzip file\n", "!tar xvzf ./output.tar.gz" ] }, @@ -526,7 +507,7 @@ "source": [ "import json\n", "\n", - "with open('./output/confusion_matrix.json') as json_file:\n", + "with open(\"./output/confusion_matrix.json\") as json_file:\n", " data = json.load(json_file)\n", "print(json.dumps(data, indent=2, default=str))" ] @@ -548,14 +529,52 @@ "source": [ "from IPython.display import HTML, display\n", "import tabulate\n", - "table = [['', '1', '2', '3', '4', '5', '(Predicted)'],\n", - " ['1', data['confusion_matrix'][0][0], data['confusion_matrix'][0][1], data['confusion_matrix'][0][2], data['confusion_matrix'][0][3], data['confusion_matrix'][0][4]],\n", - " ['2', data['confusion_matrix'][1][0], data['confusion_matrix'][1][1], data['confusion_matrix'][1][2], data['confusion_matrix'][1][3], data['confusion_matrix'][1][4]],\n", - " ['3', data['confusion_matrix'][2][0], data['confusion_matrix'][2][1], data['confusion_matrix'][2][2], data['confusion_matrix'][2][3], data['confusion_matrix'][2][4]],\n", - " ['4', data['confusion_matrix'][3][0], data['confusion_matrix'][3][1], data['confusion_matrix'][3][2], data['confusion_matrix'][3][3], data['confusion_matrix'][3][4]],\n", - " ['5', data['confusion_matrix'][4][0], data['confusion_matrix'][4][1], data['confusion_matrix'][4][2], data['confusion_matrix'][4][3], data['confusion_matrix'][4][4]],\n", - " ['(Actual)']]\n", - "display(HTML(tabulate.tabulate(table, tablefmt='html')))" + "\n", + "table = [\n", + " [\"\", \"1\", \"2\", \"3\", \"4\", \"5\", \"(Predicted)\"],\n", + " [\n", + " \"1\",\n", + " data[\"confusion_matrix\"][0][0],\n", + " data[\"confusion_matrix\"][0][1],\n", + " data[\"confusion_matrix\"][0][2],\n", + " data[\"confusion_matrix\"][0][3],\n", + " data[\"confusion_matrix\"][0][4],\n", + " ],\n", + " [\n", + " \"2\",\n", + " data[\"confusion_matrix\"][1][0],\n", + " data[\"confusion_matrix\"][1][1],\n", + " data[\"confusion_matrix\"][1][2],\n", + " data[\"confusion_matrix\"][1][3],\n", + " data[\"confusion_matrix\"][1][4],\n", + " ],\n", + " [\n", + " \"3\",\n", + " 
data[\"confusion_matrix\"][2][0],\n", + " data[\"confusion_matrix\"][2][1],\n", + " data[\"confusion_matrix\"][2][2],\n", + " data[\"confusion_matrix\"][2][3],\n", + " data[\"confusion_matrix\"][2][4],\n", + " ],\n", + " [\n", + " \"4\",\n", + " data[\"confusion_matrix\"][3][0],\n", + " data[\"confusion_matrix\"][3][1],\n", + " data[\"confusion_matrix\"][3][2],\n", + " data[\"confusion_matrix\"][3][3],\n", + " data[\"confusion_matrix\"][3][4],\n", + " ],\n", + " [\n", + " \"5\",\n", + " data[\"confusion_matrix\"][4][0],\n", + " data[\"confusion_matrix\"][4][1],\n", + " data[\"confusion_matrix\"][4][2],\n", + " data[\"confusion_matrix\"][4][3],\n", + " data[\"confusion_matrix\"][4][4],\n", + " ],\n", + " [\"(Actual)\"],\n", + "]\n", + "display(HTML(tabulate.tabulate(table, tablefmt=\"html\")))" ] }, { @@ -572,14 +591,13 @@ "outputs": [], "source": [ "from time import gmtime, strftime, sleep\n", - "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", "\n", - "comprehend_endpoint_name = 'comprehend-inference-ep-' + timestamp_suffix\n", + "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n", + "\n", + "comprehend_endpoint_name = \"comprehend-inference-ep-\" + timestamp_suffix\n", "\n", "inference_endpoint_response = comprehend.create_endpoint(\n", - " EndpointName=comprehend_endpoint_name,\n", - " ModelArn=model_arn,\n", - " DesiredInferenceUnits=1\n", + " EndpointName=comprehend_endpoint_name, ModelArn=model_arn, DesiredInferenceUnits=1\n", ")" ] }, diff --git a/02_usecases/08_Text_Classification_Predict.ipynb b/02_usecases/08_Text_Classification_Predict.ipynb index 0e3c0cd3..52ef9466 100644 --- a/02_usecases/08_Text_Classification_Predict.ipynb +++ b/02_usecases/08_Text_Classification_Predict.ipynb @@ -38,21 +38,16 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", "from botocore.config import Config\n", "\n", - "config = Config(\n", - " retries = {\n", - " 'max_attempts': 10,\n", - " 'mode': 'adaptive'\n", - " }\n", - ")\n", + "config = Config(retries={\"max_attempts\": 10, \"mode\": \"adaptive\"})\n", "\n", - "comprehend = boto3.Session().client(service_name='comprehend', region_name=region)" + "comprehend = boto3.Session().client(service_name=\"comprehend\", region_name=region)" ] }, { @@ -73,10 +68,10 @@ "try:\n", " comprehend_training_job_arn\n", "except NameError:\n", - " print('***************************************************************************')\n", - " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n", - " print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n", - " print('***************************************************************************')" + " print(\"***************************************************************************\")\n", + " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n", + " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n", + " print(\"***************************************************************************\")" ] }, { @@ -108,10 +103,10 @@ "try:\n", " comprehend_endpoint_arn\n", "except NameError:\n", - " print('***************************************************************************')\n", - " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n", - " 
print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n", - " print('***************************************************************************')" + " print(\"***************************************************************************\")\n", + " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n", + " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n", + " print(\"***************************************************************************\")" ] }, { @@ -140,9 +135,7 @@ }, "outputs": [], "source": [ - "describe_response = comprehend.describe_endpoint(\n", - " EndpointArn = comprehend_endpoint_arn\n", - ")\n", + "describe_response = comprehend.describe_endpoint(EndpointArn=comprehend_endpoint_arn)\n", "print(describe_response)" ] }, @@ -161,7 +154,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Comprehend Model Endpoint'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))" + "display(\n", + " HTML(\n", + " 'Review Comprehend Model Endpoint'.format(\n", + " region, comprehend_training_job_arn, comprehend_endpoint_arn\n", + " )\n", + " )\n", + ")" ] }, { @@ -172,17 +171,15 @@ "source": [ "import time\n", "\n", - "max_time = time.time() + 3*60*60 # 3 hours\n", + "max_time = time.time() + 3 * 60 * 60 # 3 hours\n", "while time.time() < max_time:\n", - " describe_response = comprehend.describe_endpoint(\n", - " EndpointArn = comprehend_endpoint_arn\n", - " )\n", + " describe_response = comprehend.describe_endpoint(EndpointArn=comprehend_endpoint_arn)\n", " status = describe_response[\"EndpointProperties\"][\"Status\"]\n", " print(\"Endpoint: {}\".format(status))\n", - " \n", + "\n", " if status == \"IN_SERVICE\" or status == \"IN_ERROR\":\n", " break\n", - " \n", + "\n", " time.sleep(5)" ] }, @@ -208,12 +205,10 @@ "source": [ "txt = \"\"\"I loved it! I will recommend this to everyone.\"\"\"\n", "\n", - "response = comprehend.classify_document(\n", - " Text= txt,\n", - " EndpointArn = comprehend_endpoint_arn\n", - ")\n", + "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n", "\n", "import json\n", + "\n", "print(json.dumps(response, indent=2, default=str))" ] }, @@ -225,12 +220,10 @@ "source": [ "txt = \"\"\"It's OK.\"\"\"\n", "\n", - "response = comprehend.classify_document(\n", - " Text= txt,\n", - " EndpointArn = comprehend_endpoint_arn\n", - ")\n", + "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n", "\n", "import json\n", + "\n", "print(json.dumps(response, indent=2, default=str))" ] }, @@ -244,12 +237,10 @@ "source": [ "txt = \"\"\"Really bad. 
I hope they don't make this anymore.\"\"\"\n", "\n", - "response = comprehend.classify_document(\n", - " Text= txt,\n", - " EndpointArn = comprehend_endpoint_arn\n", - ")\n", + "response = comprehend.classify_document(Text=txt, EndpointArn=comprehend_endpoint_arn)\n", "\n", "import json\n", + "\n", "print(json.dumps(response, indent=2, default=str))" ] }, diff --git a/02_usecases/archive/05_Celebrity_Detection.ipynb b/02_usecases/archive/05_Celebrity_Detection.ipynb index f542e985..1f9e1b63 100644 --- a/02_usecases/archive/05_Celebrity_Detection.ipynb +++ b/02_usecases/archive/05_Celebrity_Detection.ipynb @@ -68,8 +68,8 @@ "outputs": [], "source": [ "# Init clients\n", - "rekognition = boto3.client('rekognition')\n", - "s3 = boto3.client('s3')" + "rekognition = boto3.client(\"rekognition\")\n", + "s3 = boto3.client(\"s3\")" ] }, { @@ -98,7 +98,7 @@ "# around recognized celebrities to show them here in the notebook.\n", "\n", "!mkdir -p m1tmp\n", - "tempFolder = 'm1tmp/'" + "tempFolder = \"m1tmp/\"" ] }, { @@ -137,7 +137,7 @@ } ], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))" + "display(IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucketName, \"Key\": imageName})))" ] }, { @@ -172,9 +172,9 @@ "\n", "recognizeCelebritiesResponse = rekognition.recognize_celebrities(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucketName,\n", - " 'Name': imageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucketName,\n", + " \"Name\": imageName,\n", " }\n", " }\n", ")" @@ -195,7 +195,7 @@ "source": [ "# Show JSON response returned by Rekognition Celebrity Recognition API\n", "# In the JSON response below, you will see CelebrityFaces which contains information about recognized celebrities.\n", - "# For each recognized celebrity, you will see information like Name, Id, Urls and additional information about \n", + "# For each recognized celebrity, you will see information like Name, Id, Urls and additional information about\n", "# their facial attributes.\n", "\n", "display(recognizeCelebritiesResponse)" @@ -216,13 +216,14 @@ "source": [ "# Define a function that will display image with bounded boxes around recognized celebrites\n", "# We will call this function in next step\n", - " \n", - "def drawBoundingBoxes (sourceImage, boxes):\n", + "\n", + "\n", + "def drawBoundingBoxes(sourceImage, boxes):\n", " # blue, green, red, grey\n", - " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n", - " \n", + " colors = ((255, 255, 255), (255, 255, 255), (76, 182, 252), (52, 194, 123))\n", + "\n", " # Download image locally\n", - " imageLocation = tempFolder+os.path.basename(sourceImage)\n", + " imageLocation = tempFolder + os.path.basename(sourceImage)\n", " s3.download_file(bucketName, sourceImage, imageLocation)\n", "\n", " # Draws BB on Image\n", @@ -231,24 +232,24 @@ " width, height = bbImage.size\n", " col = 0\n", " maxcol = len(colors)\n", - " line= 3\n", + " line = 3\n", " for box in boxes:\n", - " x1 = int(box[1]['Left'] * width)\n", - " y1 = int(box[1]['Top'] * height)\n", - " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n", - " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n", - " \n", - " draw.text((x1,y1),box[0],colors[col])\n", + " x1 = int(box[1][\"Left\"] * width)\n", + " y1 = int(box[1][\"Top\"] * height)\n", + " x2 = int(box[1][\"Left\"] * width + box[1][\"Width\"] * width)\n", + " y2 = int(box[1][\"Top\"] * height + box[1][\"Height\"] * 
height)\n", + "\n", + " draw.text((x1, y1), box[0], colors[col])\n", " for l in range(line):\n", - " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n", - " col = (col+1)%maxcol\n", - " \n", + " draw.rectangle((x1 - l, y1 - l, x2 + l, y2 + l), outline=colors[col])\n", + " col = (col + 1) % maxcol\n", + "\n", " imageFormat = \"PNG\"\n", " ext = sourceImage.lower()\n", - " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n", - " imageFormat = 'JPEG'\n", + " if ext.endswith(\"jpg\") or ext.endswith(\"jpeg\"):\n", + " imageFormat = \"JPEG\"\n", "\n", - " bbImage.save(imageLocation,format=imageFormat)\n", + " bbImage.save(imageLocation, format=imageFormat)\n", "\n", " display(bbImage)" ] @@ -274,10 +275,10 @@ "# Extract bounding box information from JSON response above and display image with bounding boxes around celebrites.\n", "\n", "boxes = []\n", - "celebrities = recognizeCelebritiesResponse['CelebrityFaces']\n", + "celebrities = recognizeCelebritiesResponse[\"CelebrityFaces\"]\n", "for celebrity in celebrities:\n", - " boxes.append ((celebrity['Name'], celebrity['Face']['BoundingBox']))\n", - " \n", + " boxes.append((celebrity[\"Name\"], celebrity[\"Face\"][\"BoundingBox\"]))\n", + "\n", "drawBoundingBoxes(imageName, boxes)" ] }, @@ -319,14 +320,14 @@ "# Start celebrity recognition job\n", "startCelebrityRekognition = rekognition.start_celebrity_recognition(\n", " Video={\n", - " 'S3Object': {\n", - " 'Bucket': bucketName,\n", - " 'Name': videoName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucketName,\n", + " \"Name\": videoName,\n", " }\n", " },\n", ")\n", "\n", - "celebrityJobId = startCelebrityRekognition['JobId']\n", + "celebrityJobId = startCelebrityRekognition[\"JobId\"]\n", "display(\"Job Id: {0}\".format(celebrityJobId))" ] }, @@ -347,20 +348,15 @@ "\n", "# Wait for celebrity recognition job to complete\n", "# In production use cases, you would usually use StepFucntion or SNS topic to get notified when job is complete.\n", - "getCelebrityRecognition = rekognition.get_celebrity_recognition(\n", - " JobId=celebrityJobId,\n", - " SortBy='TIMESTAMP'\n", - ")\n", + "getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n", "\n", - "while(getCelebrityRecognition['JobStatus'] == 'IN_PROGRESS'):\n", + "while getCelebrityRecognition[\"JobStatus\"] == \"IN_PROGRESS\":\n", " time.sleep(5)\n", - " print('.', end='')\n", - " \n", - " getCelebrityRecognition = rekognition.get_celebrity_recognition(\n", - " JobId=celebrityJobId,\n", - " SortBy='TIMESTAMP')\n", - " \n", - "display(getCelebrityRecognition['JobStatus'])" + " print(\".\", end=\"\")\n", + "\n", + " getCelebrityRecognition = rekognition.get_celebrity_recognition(JobId=celebrityJobId, SortBy=\"TIMESTAMP\")\n", + "\n", + "display(getCelebrityRecognition[\"JobStatus\"])" ] }, { @@ -404,16 +400,16 @@ "strOverall = \"Celebrities in the overall video:
<br>=======================================<br>\"\n", "\n", "# Celebrities detected in each frame\n", - "for celebrity in getCelebrityRecognition['Celebrities']:\n", - " if 'Celebrity' in celebrity :\n", + "for celebrity in getCelebrityRecognition[\"Celebrities\"]:\n", + " if \"Celebrity\" in celebrity:\n", " cconfidence = celebrity[\"Celebrity\"][\"Confidence\"]\n", - " if(cconfidence > 95):\n", - " ts = celebrity [\"Timestamp\"]\n", + " if cconfidence > 95:\n", + " ts = celebrity[\"Timestamp\"]\n", " cname = celebrity[\"Celebrity\"][\"Name\"]\n", - " strDetail = strDetail + \"At {} ms: {} (Confidence: {})<br>\".format(ts, cname, round(cconfidence,2))\n", + " strDetail = strDetail + \"At {} ms: {} (Confidence: {})<br>
\".format(ts, cname, round(cconfidence, 2))\n", " if not cname in theCelebs:\n", " theCelebs[cname] = cname\n", - " \n", + "\n", "\n", "# Unique faces detected in video\n", "for theCeleb in theCelebs:\n", @@ -421,7 +417,7 @@ "\n", "# Display results\n", "display(HTML(strOverall))\n", - "#display(HTML(strDetail))" + "# display(HTML(strDetail))" ] }, { @@ -442,12 +438,14 @@ "# Show video in a player\n", "\n", "s3FilePrefix = \"https://s3.amazonaws.com\"\n", - "if(not awsRegion == 'us-east-1'):\n", + "if not awsRegion == \"us-east-1\":\n", " s3FilePrefix = \"https://s3-{}.amazonaws.com\".format(awsRegion)\n", "\n", "s3VideoUrl = \"{0}/{1}/{2}\".format(s3FilePrefix, bucketName, videoName)\n", "\n", - "videoTag = \"\".format(s3VideoUrl)\n", + "videoTag = \"\".format(\n", + " s3VideoUrl\n", + ")\n", "\n", "videoui = \"
<table><tr><td style='vertical-align: top'>{}{}</td></tr></table>
\".format(videoTag, strDetail)\n", "\n", @@ -479,7 +477,9 @@ "metadata": {}, "outputs": [], "source": [ - "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': customCelebrityImageName})))" + "display(\n", + " IImage(url=s3.generate_presigned_url(\"get_object\", Params={\"Bucket\": bucketName, \"Key\": customCelebrityImageName}))\n", + ")" ] }, { @@ -492,9 +492,9 @@ "\n", "customCelebrityResponse = rekognition.recognize_celebrities(\n", " Image={\n", - " 'S3Object': {\n", - " 'Bucket': bucketName,\n", - " 'Name': customCelebrityImageName,\n", + " \"S3Object\": {\n", + " \"Bucket\": bucketName,\n", + " \"Name\": customCelebrityImageName,\n", " }\n", " }\n", ")" @@ -507,7 +507,7 @@ "outputs": [], "source": [ "# Display Rekognition response\n", - "# You will see Rekognition return an empty list for CelebrityFaces and \n", + "# You will see Rekognition return an empty list for CelebrityFaces and\n", "# UnrecognizedFaces list with unrecognized faces that were detected in the image.\n", "# In the next module you will learn how to get custom-celebrity faces recognized.\n", "\n", @@ -520,14 +520,14 @@ "metadata": {}, "outputs": [], "source": [ - "#Show image and bounded boxes around detected faces\n", + "# Show image and bounded boxes around detected faces\n", "\n", "# Extract BB info from response\n", "cboxes = []\n", - "faces = customCelebrityResponse['UnrecognizedFaces']\n", + "faces = customCelebrityResponse[\"UnrecognizedFaces\"]\n", "for face in faces:\n", - " cboxes.append (('Unrecognized Face', face['BoundingBox']))\n", - " \n", + " cboxes.append((\"Unrecognized Face\", face[\"BoundingBox\"]))\n", + "\n", "drawBoundingBoxes(customCelebrityImageName, cboxes)" ] }, diff --git a/03_automl/01_Prepare_Dataset_Autopilot.ipynb b/03_automl/01_Prepare_Dataset_Autopilot.ipynb index 8a72f04b..fd1db9f2 100644 --- a/03_automl/01_Prepare_Dataset_Autopilot.ipynb +++ b/03_automl/01_Prepare_Dataset_Autopilot.ipynb @@ -59,9 +59,9 @@ "try:\n", " setup_instance_check_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -91,9 +91,9 @@ "try:\n", " setup_dependencies_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -121,11 +121,11 @@ "outputs": [], "source": [ "try:\n", - " setup_s3_bucket_passed \n", + " setup_s3_bucket_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. 
You are missing Setup S3 Bucket.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -153,11 +153,11 @@ "outputs": [], "source": [ "try:\n", - " setup_iam_roles_passed \n", + " setup_iam_roles_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -183,21 +183,21 @@ "outputs": [], "source": [ "if not setup_instance_check_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_dependencies_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_s3_bucket_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_iam_roles_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. 
You are missing Setup IAM Roles.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -210,7 +210,7 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -242,10 +242,12 @@ "source": [ "import csv\n", "\n", - "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df.shape" ] }, @@ -265,12 +267,13 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(kind=\"bar\", title=\"Breakdown by Star Rating\")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -288,43 +291,26 @@ "source": [ "from sklearn.utils import resample\n", "\n", - "five_star_df = df.query('star_rating == 5')\n", - "four_star_df = df.query('star_rating == 4')\n", - "three_star_df = df.query('star_rating == 3')\n", - "two_star_df = df.query('star_rating == 2')\n", - "one_star_df = df.query('star_rating == 1')\n", + "five_star_df = df.query(\"star_rating == 5\")\n", + "four_star_df = df.query(\"star_rating == 4\")\n", + "three_star_df = df.query(\"star_rating == 3\")\n", + "two_star_df = df.query(\"star_rating == 2\")\n", + "one_star_df = df.query(\"star_rating == 1\")\n", "\n", "# Check which sentiment has the least number of samples\n", - "minority_count = min(five_star_df.shape[0], \n", - " four_star_df.shape[0], \n", - " three_star_df.shape[0], \n", - " two_star_df.shape[0], \n", - " one_star_df.shape[0]) \n", + "minority_count = min(\n", + " five_star_df.shape[0], four_star_df.shape[0], three_star_df.shape[0], two_star_df.shape[0], one_star_df.shape[0]\n", + ")\n", "\n", - "five_star_df = resample(five_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "four_star_df = resample(four_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "three_star_df = resample(three_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "two_star_df = resample(two_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27)\n", "\n", - "one_star_df = resample(one_star_df,\n", - " replace = False,\n", - " n_samples = minority_count,\n", - " random_state = 27)\n", + "one_star_df = resample(one_star_df, replace=False, 
n_samples=minority_count, random_state=27)\n", "\n", "df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df])\n", "df_balanced = df_balanced.reset_index(drop=True)\n", @@ -338,9 +324,11 @@ "metadata": {}, "outputs": [], "source": [ - "df_balanced[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_balanced[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -368,14 +356,10 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Split all data into 90% train and 10% holdout\n", - "df_train, df_holdout = train_test_split(df_balanced, \n", - " test_size=0.10,\n", - " stratify=df_balanced['star_rating'])\n", + "df_train, df_holdout = train_test_split(df_balanced, test_size=0.10, stratify=df_balanced[\"star_rating\"])\n", "\n", "# Split holdout data into 50% validation and 50% test\n", - "df_validation, df_test = train_test_split(df_holdout,\n", - " test_size=0.50, \n", - " stratify=df_holdout['star_rating'])\n" + "df_validation, df_test = train_test_split(df_holdout, test_size=0.50, stratify=df_holdout[\"star_rating\"])" ] }, { @@ -385,16 +369,16 @@ "outputs": [], "source": [ "# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n", - "labels = ['Train', 'Validation', 'Test']\n", + "labels = [\"Train\", \"Validation\", \"Test\"]\n", "sizes = [len(df_train.index), len(df_validation.index), len(df_test.index)]\n", - "explode = (0.1, 0, 0) \n", + "explode = (0.1, 0, 0)\n", "\n", "fig1, ax1 = plt.subplots()\n", "\n", - "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)\n", + "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90)\n", "\n", "# Equal aspect ratio ensures that pie is drawn as a circle.\n", - "ax1.axis('equal') \n", + "ax1.axis(\"equal\")\n", "\n", "plt.show()" ] @@ -421,7 +405,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_train[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='90% Train Breakdown by Star Rating')" + "df_train[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"90% Train Breakdown by Star Rating\"\n", + ")" ] }, { @@ -446,7 +432,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_validation[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Validation Breakdown by Star Rating')" + "df_validation[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"5% Validation Breakdown by Star Rating\"\n", + ")" ] }, { @@ -471,7 +459,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_test[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='5% Test Breakdown by Star Rating')" + "df_test[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"5% Test Breakdown by Star Rating\"\n", + ")" ] }, { @@ -487,7 +477,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_train = df_train[['star_rating', 'review_body']]\n", + "df_train = df_train[[\"star_rating\", \"review_body\"]]\n", "df_train.shape" ] }, @@ -513,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "autopilot_train_path = 
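The plot titles above read 90%/5%/5% because the second split halves the 10% holdout: 0.10 x 0.50 = 0.05 of the balanced set each for validation and test. A quick sanity check (a sketch):

```python
# Sanity check (sketch): the two-stage split yields ~90/5/5 proportions.
total = len(df_balanced)
for name, split_df in [("train", df_train), ("validation", df_validation), ("test", df_test)]:
    print(name, round(len(split_df) / total, 3))
```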
'./amazon_reviews_us_Digital_Software_v1_00_autopilot.csv'\n", + "autopilot_train_path = \"./amazon_reviews_us_Digital_Software_v1_00_autopilot.csv\"\n", "df_train.to_csv(autopilot_train_path, index=False, header=True)" ] }, @@ -530,7 +520,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_s3_prefix = 'data'\n", + "train_s3_prefix = \"data\"\n", "autopilot_train_s3_uri = sess.upload_data(path=autopilot_train_path, key_prefix=train_s3_prefix)\n", "autopilot_train_s3_uri" ] diff --git a/03_automl/02_Train_Reviews_Autopilot.ipynb b/03_automl/02_Train_Reviews_Autopilot.ipynb index d2d7f65c..f07d5525 100644 --- a/03_automl/02_Train_Reviews_Autopilot.ipynb +++ b/03_automl/02_Train_Reviews_Autopilot.ipynb @@ -89,11 +89,11 @@ "source": [ "try:\n", " autopilot_train_s3_uri\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -112,11 +112,11 @@ "outputs": [], "source": [ "if not autopilot_train_s3_uri:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] PLEASE RUN THE PREVIOUS 01_PREPARE_DATASET_AUTOPILOT NOTEBOOK.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -130,12 +130,12 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -187,7 +187,7 @@ "source": [ "import csv\n", "\n", - "df = pd.read_csv('./tmp/amazon_reviews_us_Digital_Software_v1_00_autopilot.csv')\n", + "df = pd.read_csv(\"./tmp/amazon_reviews_us_Digital_Software_v1_00_autopilot.csv\")\n", "df.head()" ] }, @@ -205,9 +205,9 @@ "metadata": {}, "outputs": [], "source": [ - "prefix_model_output = 'models/autopilot'\n", + "prefix_model_output = \"models/autopilot\"\n", "\n", - "model_output_s3_uri = 's3://{}/{}'.format(bucket, prefix_model_output)\n", + "model_output_s3_uri = \"s3://{}/{}\".format(bucket, prefix_model_output)\n", "\n", "print(model_output_s3_uri)" ] @@ -221,27 +221,21 @@ "max_candidates = 3\n", "\n", "job_config = {\n", - " 'CompletionCriteria': {\n", - " 'MaxRuntimePerTrainingJobInSeconds': 900,\n", - " 'MaxCandidates': max_candidates,\n", - " 'MaxAutoMLJobRuntimeInSeconds': 5400\n", + " \"CompletionCriteria\": {\n", + " \"MaxRuntimePerTrainingJobInSeconds\": 900,\n", + " \"MaxCandidates\": max_candidates,\n", + " \"MaxAutoMLJobRuntimeInSeconds\": 5400,\n", " },\n", "}\n", "\n", - 
"input_data_config = [{\n", - " 'DataSource': {\n", - " 'S3DataSource': {\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3Uri': '{}'.format(autopilot_train_s3_uri)\n", - " }\n", - " },\n", - " 'TargetAttributeName': 'star_rating'\n", + "input_data_config = [\n", + " {\n", + " \"DataSource\": {\"S3DataSource\": {\"S3DataType\": \"S3Prefix\", \"S3Uri\": \"{}\".format(autopilot_train_s3_uri)}},\n", + " \"TargetAttributeName\": \"star_rating\",\n", " }\n", "]\n", "\n", - "output_data_config = {\n", - " 'S3OutputPath': '{}'.format(model_output_s3_uri)\n", - "}" + "output_data_config = {\"S3OutputPath\": \"{}\".format(model_output_s3_uri)}" ] }, { @@ -266,20 +260,20 @@ "metadata": {}, "outputs": [], "source": [ - "num_existing_jobs = 0 \n", + "num_existing_jobs = 0\n", "running_jobs = 0\n", "\n", - "if 'AutoMLJobSummaries' in existing_jobs_response.keys():\n", - " job_list = existing_jobs_response['AutoMLJobSummaries']\n", + "if \"AutoMLJobSummaries\" in existing_jobs_response.keys():\n", + " job_list = existing_jobs_response[\"AutoMLJobSummaries\"]\n", " num_existing_jobs = len(job_list)\n", " # print('[INFO] You already created {} Autopilot job(s) in this account.'.format(num_existing_jobs))\n", " for j in job_list:\n", - " if 'AutoMLJobStatus' in j.keys(): \n", - " if j['AutoMLJobStatus'] == 'InProgress':\n", + " if \"AutoMLJobStatus\" in j.keys():\n", + " if j[\"AutoMLJobStatus\"] == \"InProgress\":\n", " running_jobs = running_jobs + 1\n", - " print('[INFO] You have {} Autopilot job(s) currently running << Should be 0 jobs.'.format(running_jobs))\n", + " print(\"[INFO] You have {} Autopilot job(s) currently running << Should be 0 jobs.\".format(running_jobs))\n", "else:\n", - " print('[OK] Please continue.')" + " print(\"[OK] Please continue.\")" ] }, { @@ -308,10 +302,10 @@ "\n", "try:\n", " auto_ml_job_name\n", - "except NameError: \n", - " timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", - " auto_ml_job_name = 'automl-dm-' + timestamp_suffix\n", - " print('Created AutoMLJobName: ' + auto_ml_job_name)" + "except NameError:\n", + " timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n", + " auto_ml_job_name = \"automl-dm-\" + timestamp_suffix\n", + " print(\"Created AutoMLJobName: \" + auto_ml_job_name)" ] }, { @@ -340,19 +334,29 @@ "source": [ "max_running_jobs = 1\n", "\n", - "if running_jobs < max_running_jobs: # Limiting to max. 1 Jobs\n", + "if running_jobs < max_running_jobs: # Limiting to max. 1 Jobs\n", " try:\n", - " sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,\n", - " InputDataConfig=input_data_config,\n", - " OutputDataConfig=output_data_config,\n", - " AutoMLJobConfig=job_config,\n", - " RoleArn=role)\n", - " print('[OK] Autopilot Job {} created.'.format(auto_ml_job_name))\n", + " sm.create_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name,\n", + " InputDataConfig=input_data_config,\n", + " OutputDataConfig=output_data_config,\n", + " AutoMLJobConfig=job_config,\n", + " RoleArn=role,\n", + " )\n", + " print(\"[OK] Autopilot Job {} created.\".format(auto_ml_job_name))\n", " running_jobs = running_jobs + 1\n", " except:\n", - " print('[INFO] You have already launched an Autopilot job. Please continue see the output of this job.'.format(running_jobs))\n", + " print(\n", + " \"[INFO] You have already launched an Autopilot job. Please continue see the output of this job.\".format(\n", + " running_jobs\n", + " )\n", + " )\n", "else:\n", - " print('[INFO] You have already launched {} Autopilot running job(s). 
Please continue see the output of the running job.'.format(running_jobs))" + " print(\n", + " \"[INFO] You have already launched {} Autopilot running job(s). Please continue see the output of the running job.\".format(\n", + " running_jobs\n", + " )\n", + " )" ] }, { @@ -392,14 +396,17 @@ "source": [ "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", "\n", - "while 'AutoMLJobStatus' not in job_description_response.keys() and 'AutoMLJobSecondaryStatus' not in job_description_response.keys():\n", + "while (\n", + " \"AutoMLJobStatus\" not in job_description_response.keys()\n", + " and \"AutoMLJobSecondaryStatus\" not in job_description_response.keys()\n", + "):\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " print('[INFO] Autopilot Job has not yet started. Please wait. ')\n", + " print(\"[INFO] Autopilot Job has not yet started. Please wait. \")\n", " print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n", - " print('[INFO] Waiting for Autopilot Job to start...')\n", + " print(\"[INFO] Waiting for Autopilot Job to start...\")\n", " sleep(15)\n", "\n", - "print('[OK] AutoMLJob started.')" + "print(\"[OK] AutoMLJob started.\")" ] }, { @@ -419,7 +426,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Processing Jobs'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review Processing Jobs'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { @@ -441,18 +454,18 @@ "source": [ "%%time\n", "\n", - "job_status = job_description_response['AutoMLJobStatus']\n", - "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + "job_status = job_description_response[\"AutoMLJobStatus\"]\n", + "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", "\n", - "if job_status not in ('Stopped', 'Failed'):\n", - " while job_status in ('InProgress') and job_sec_status in ('Starting', 'AnalyzingData'):\n", + "if job_status not in (\"Stopped\", \"Failed\"):\n", + " while job_status in (\"InProgress\") and job_sec_status in (\"Starting\", \"AnalyzingData\"):\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " job_status = job_description_response['AutoMLJobStatus']\n", - " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + " job_status = job_description_response[\"AutoMLJobStatus\"]\n", + " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", " print(job_status, job_sec_status)\n", " sleep(15)\n", - " print('[OK] Data analysis phase completed.\\n')\n", - " \n", + " print(\"[OK] Data analysis phase completed.\\n\")\n", + "\n", "print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))" ] }, @@ -481,14 +494,14 @@ "source": [ "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", "\n", - "while 'AutoMLJobArtifacts' not in job_description_response.keys():\n", + "while \"AutoMLJobArtifacts\" not in job_description_response.keys():\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " print('[INFO] Autopilot Job has not yet generated the artifacts. Please wait. ')\n", + " print(\"[INFO] Autopilot Job has not yet generated the artifacts. Please wait. 
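The describe-and-sleep loop above reappears below for every phase (AnalyzingData, FeatureEngineering, ModelTuning, and final completion). A single reusable poller would cover all of them; a sketch:

```python
# Sketch: generic poller for the repeated AutoML status loops in this notebook.
from time import sleep

def wait_for_automl_phase(sm_client, job_name, phase_statuses, poll_seconds=15):
    """Poll until the job leaves the given secondary statuses or terminates."""
    while True:
        description = sm_client.describe_auto_ml_job(AutoMLJobName=job_name)
        status = description["AutoMLJobStatus"]
        secondary = description["AutoMLJobSecondaryStatus"]
        print(status, secondary)
        if status in ("Completed", "Stopped", "Failed") or secondary not in phase_statuses:
            return description
        sleep(poll_seconds)

# e.g. wait_for_automl_phase(sm, auto_ml_job_name, ("Starting", "AnalyzingData"))
```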
\")\n", " print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n", - " print('[INFO] Waiting for AutoMLJobArtifacts...')\n", + " print(\"[INFO] Waiting for AutoMLJobArtifacts...\")\n", " sleep(15)\n", "\n", - "print('[OK] AutoMLJobArtifacts generated.')" + "print(\"[OK] AutoMLJobArtifacts generated.\")" ] }, { @@ -499,14 +512,14 @@ "source": [ "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", "\n", - "while 'DataExplorationNotebookLocation' not in job_description_response['AutoMLJobArtifacts'].keys():\n", + "while \"DataExplorationNotebookLocation\" not in job_description_response[\"AutoMLJobArtifacts\"].keys():\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " print('[INFO] Autopilot Job has not yet generated the notebooks. Please wait. ')\n", + " print(\"[INFO] Autopilot Job has not yet generated the notebooks. Please wait. \")\n", " print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))\n", - " print('[INFO] Waiting for DataExplorationNotebookLocation...')\n", + " print(\"[INFO] Waiting for DataExplorationNotebookLocation...\")\n", " sleep(15)\n", "\n", - "print('[OK] DataExplorationNotebookLocation found.') " + "print(\"[OK] DataExplorationNotebookLocation found.\")" ] }, { @@ -515,9 +528,9 @@ "metadata": {}, "outputs": [], "source": [ - "generated_resources = job_description_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation']\n", - "download_path = generated_resources.rsplit('/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb')[0]\n", - "job_id = download_path.rsplit('/', 1)[-1]" + "generated_resources = job_description_response[\"AutoMLJobArtifacts\"][\"DataExplorationNotebookLocation\"]\n", + "download_path = generated_resources.rsplit(\"/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb\")[0]\n", + "job_id = download_path.rsplit(\"/\", 1)[-1]" ] }, { @@ -528,10 +541,16 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "if not job_id: \n", - " print('No AutoMLJobArtifacts found.')\n", - "else: \n", - " display(HTML('Review S3 Generated Resources'.format(bucket, prefix_model_output, auto_ml_job_name, job_id)))" + "if not job_id:\n", + " print(\"No AutoMLJobArtifacts found.\")\n", + "else:\n", + " display(\n", + " HTML(\n", + " 'Review S3 Generated Resources'.format(\n", + " bucket, prefix_model_output, auto_ml_job_name, job_id\n", + " )\n", + " )\n", + " )" ] }, { @@ -627,7 +646,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Jobs'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Jobs'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { @@ -638,7 +663,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Batch Transform Jobs'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review Batch Transform Jobs'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { @@ -661,19 +692,19 @@ "%%time\n", "\n", "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - "job_status = job_description_response['AutoMLJobStatus']\n", - "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + "job_status = job_description_response[\"AutoMLJobStatus\"]\n", + "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", "print(job_status)\n", "print(job_sec_status)\n", - "if job_status not in 
('Stopped', 'Failed'):\n", - " while job_status in ('InProgress') and job_sec_status in ('FeatureEngineering'):\n", + "if job_status not in (\"Stopped\", \"Failed\"):\n", + " while job_status in (\"InProgress\") and job_sec_status in (\"FeatureEngineering\"):\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " job_status = job_description_response['AutoMLJobStatus']\n", - " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + " job_status = job_description_response[\"AutoMLJobStatus\"]\n", + " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", " print(job_status, job_sec_status)\n", " sleep(15)\n", - " print('[OK] Feature engineering phase completed.\\n')\n", - " \n", + " print(\"[OK] Feature engineering phase completed.\\n\")\n", + "\n", "print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))" ] }, @@ -719,7 +750,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Hyperparameter Tuning Jobs'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review Hyperparameter Tuning Jobs'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { @@ -730,7 +767,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Jobs'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Jobs'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { @@ -753,19 +796,19 @@ "%%time\n", "\n", "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - "job_status = job_description_response['AutoMLJobStatus']\n", - "job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + "job_status = job_description_response[\"AutoMLJobStatus\"]\n", + "job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", "print(job_status)\n", "print(job_sec_status)\n", - "if job_status not in ('Stopped', 'Failed'):\n", - " while job_status in ('InProgress') and job_sec_status in ('ModelTuning'):\n", + "if job_status not in (\"Stopped\", \"Failed\"):\n", + " while job_status in (\"InProgress\") and job_sec_status in (\"ModelTuning\"):\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " job_status = job_description_response['AutoMLJobStatus']\n", - " job_sec_status = job_description_response['AutoMLJobSecondaryStatus']\n", + " job_status = job_description_response[\"AutoMLJobStatus\"]\n", + " job_sec_status = job_description_response[\"AutoMLJobSecondaryStatus\"]\n", " print(job_status, job_sec_status)\n", " sleep(15)\n", - " print('[OK] Model tuning phase completed.\\n')\n", - " \n", + " print(\"[OK] Model tuning phase completed.\\n\")\n", + "\n", "print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))" ] }, @@ -794,17 +837,17 @@ "%%time\n", "\n", "job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - "job_status = job_description_response['AutoMLJobStatus']\n", + "job_status = job_description_response[\"AutoMLJobStatus\"]\n", "print(job_status)\n", - "if job_status not in ('Stopped', 'Failed'):\n", - " while job_status not in ('Completed'):\n", + "if job_status not in (\"Stopped\", \"Failed\"):\n", + " while job_status not in (\"Completed\"):\n", " job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " job_status = job_description_response['AutoMLJobStatus']\n", + " job_status = 
job_description_response[\"AutoMLJobStatus\"]\n", " print(job_status)\n", " sleep(10)\n", - " print('[OK] Autopilot Job completed.\\n')\n", + " print(\"[OK] Autopilot Job completed.\\n\")\n", "else:\n", - " print(job_status)\n" + " print(job_status)" ] }, { @@ -821,8 +864,9 @@ "metadata": {}, "outputs": [], "source": [ - "candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n", - " SortBy='FinalObjectiveMetricValue')" + "candidates_response = sm.list_candidates_for_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n", + ")" ] }, { @@ -847,15 +891,16 @@ "metadata": {}, "outputs": [], "source": [ - "while 'Candidates' not in candidates_response.keys():\n", - " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n", - " SortBy='FinalObjectiveMetricValue')\n", - " print('[INFO] Autopilot Job is generating the Candidates. Please wait.')\n", + "while \"Candidates\" not in candidates_response.keys():\n", + " candidates_response = sm.list_candidates_for_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n", + " )\n", + " print(\"[INFO] Autopilot Job is generating the Candidates. Please wait.\")\n", " print(json.dumps(candidates_response, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "candidates = candidates_response['Candidates']\n", - "print('[OK] Candidates generated.') " + "candidates = candidates_response[\"Candidates\"]\n", + "print(\"[OK] Candidates generated.\")" ] }, { @@ -873,15 +918,16 @@ "metadata": {}, "outputs": [], "source": [ - "while 'CandidateName' not in candidates[0]:\n", - " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n", - " SortBy='FinalObjectiveMetricValue')\n", - " candidates = candidates_response['Candidates']\n", - " print('[INFO] Autopilot Job is generating CandidateName. Please wait. ')\n", + "while \"CandidateName\" not in candidates[0]:\n", + " candidates_response = sm.list_candidates_for_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n", + " )\n", + " candidates = candidates_response[\"Candidates\"]\n", + " print(\"[INFO] Autopilot Job is generating CandidateName. Please wait. \")\n", " print(json.dumps(candidates, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "print('[OK] CandidateName generated.')" + "print(\"[OK] CandidateName generated.\")" ] }, { @@ -890,15 +936,16 @@ "metadata": {}, "outputs": [], "source": [ - "while 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:\n", - " candidates_response = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n", - " SortBy='FinalObjectiveMetricValue')\n", - " candidates = candidates_response['Candidates']\n", - " print('[INFO] Autopilot Job is generating FinalAutoMLJobObjectiveMetric. Please wait. ')\n", + "while \"FinalAutoMLJobObjectiveMetric\" not in candidates[0]:\n", + " candidates_response = sm.list_candidates_for_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n", + " )\n", + " candidates = candidates_response[\"Candidates\"]\n", + " print(\"[INFO] Autopilot Job is generating FinalAutoMLJobObjectiveMetric. Please wait. 
\")\n", " print(json.dumps(candidates, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "print('[OK] FinalAutoMLJobObjectiveMetric generated.')" + "print(\"[OK] FinalAutoMLJobObjectiveMetric generated.\")" ] }, { @@ -919,9 +966,13 @@ "outputs": [], "source": [ "for index, candidate in enumerate(candidates):\n", - " print(str(index) + \" \" \n", - " + candidate['CandidateName'] + \" \" \n", - " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))" + " print(\n", + " str(index)\n", + " + \" \"\n", + " + candidate[\"CandidateName\"]\n", + " + \" \"\n", + " + str(candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"])\n", + " )" ] }, { @@ -942,8 +993,8 @@ "from sagemaker.analytics import ExperimentAnalytics, TrainingJobAnalytics\n", "\n", "exp = ExperimentAnalytics(\n", - " sagemaker_session=sess, \n", - " experiment_name=auto_ml_job_name + '-aws-auto-ml-job',\n", + " sagemaker_session=sess,\n", + " experiment_name=auto_ml_job_name + \"-aws-auto-ml-job\",\n", ")\n", "\n", "df = exp.dataframe()\n", @@ -989,14 +1040,14 @@ "metadata": {}, "outputs": [], "source": [ - "while 'BestCandidate' not in best_candidate_response:\n", + "while \"BestCandidate\" not in best_candidate_response:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " print('[INFO] Autopilot Job is generating BestCandidate. Please wait. ')\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate. Please wait. \")\n", " print(json.dumps(best_candidate_response, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "best_candidate = best_candidate_response['BestCandidate']\n", - "print('[OK] BestCandidate generated.') " + "best_candidate = best_candidate_response[\"BestCandidate\"]\n", + "print(\"[OK] BestCandidate generated.\")" ] }, { @@ -1025,14 +1076,14 @@ "metadata": {}, "outputs": [], "source": [ - "while 'CandidateName' not in best_candidate:\n", + "while \"CandidateName\" not in best_candidate:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. ')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. \")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "print('[OK] BestCandidate CandidateName generated.') " + "print(\"[OK] BestCandidate CandidateName generated.\")" ] }, { @@ -1041,14 +1092,14 @@ "metadata": {}, "outputs": [], "source": [ - "while 'FinalAutoMLJobObjectiveMetric' not in best_candidate:\n", + "while \"FinalAutoMLJobObjectiveMetric\" not in best_candidate:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. ')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. 
\")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "print('[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.') " + "print(\"[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.\")" ] }, { @@ -1057,10 +1108,10 @@ "metadata": {}, "outputs": [], "source": [ - "best_candidate_identifier = best_candidate['CandidateName']\n", + "best_candidate_identifier = best_candidate[\"CandidateName\"]\n", "print(\"Candidate name: \" + best_candidate_identifier)\n", - "print(\"Metric name: \" + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])\n", - "print(\"Metric value: \" + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))" + "print(\"Metric name: \" + best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"MetricName\"])\n", + "print(\"Metric value: \" + str(best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"]))" ] }, { @@ -1087,15 +1138,15 @@ "metadata": {}, "outputs": [], "source": [ - "while 'CandidateSteps' not in best_candidate:\n", + "while \"CandidateSteps\" not in best_candidate:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps. Please wait. ')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps. Please wait. \")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "best_candidate = best_candidate_response['BestCandidate']\n", - "print('[OK] BestCandidate CandidateSteps generated.')" + "best_candidate = best_candidate_response[\"BestCandidate\"]\n", + "print(\"[OK] BestCandidate CandidateSteps generated.\")" ] }, { @@ -1106,15 +1157,15 @@ }, "outputs": [], "source": [ - "while 'CandidateStepType' not in best_candidate['CandidateSteps'][0]:\n", + "while \"CandidateStepType\" not in best_candidate[\"CandidateSteps\"][0]:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepType. Please wait. ')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepType. Please wait. \")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "best_candidate = best_candidate_response['BestCandidate']\n", - "print('[OK] BestCandidate CandidateSteps CandidateStepType generated.')" + "best_candidate = best_candidate_response[\"BestCandidate\"]\n", + "print(\"[OK] BestCandidate CandidateSteps CandidateStepType generated.\")" ] }, { @@ -1123,15 +1174,15 @@ "metadata": {}, "outputs": [], "source": [ - "while 'CandidateStepName' not in best_candidate['CandidateSteps'][0]:\n", + "while \"CandidateStepName\" not in best_candidate[\"CandidateSteps\"][0]:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepName. Please wait. 
')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate CandidateSteps CandidateStepName. Please wait. \")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "best_candidate = best_candidate_response['BestCandidate']\n", - "print('[OK] BestCandidate CandidateSteps CandidateStepName generated.')" + "best_candidate = best_candidate_response[\"BestCandidate\"]\n", + "print(\"[OK] BestCandidate CandidateSteps CandidateStepName generated.\")" ] }, { @@ -1141,10 +1192,10 @@ "outputs": [], "source": [ "steps = []\n", - "for step in best_candidate['CandidateSteps']:\n", - " print('Candidate Step Type: {}'.format(step['CandidateStepType']))\n", - " print('Candidate Step Name: {}'.format(step['CandidateStepName']))\n", - " steps.append(step['CandidateStepName'])" + "for step in best_candidate[\"CandidateSteps\"]:\n", + " print(\"Candidate Step Type: {}\".format(step[\"CandidateStepType\"]))\n", + " print(\"Candidate Step Name: {}\".format(step[\"CandidateStepName\"]))\n", + " steps.append(step[\"CandidateStepName\"])" ] }, { @@ -1155,7 +1206,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Best Candidate Processing Job'.format(region, steps[0])))" + "display(\n", + " HTML(\n", + " 'Review Best Candidate Processing Job'.format(\n", + " region, steps[0]\n", + " )\n", + " )\n", + ")" ] }, { @@ -1166,7 +1223,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Best Candidate Training Job'.format(region, steps[1])))" + "display(\n", + " HTML(\n", + " 'Review Best Candidate Training Job'.format(\n", + " region, steps[1]\n", + " )\n", + " )\n", + ")" ] }, { @@ -1177,7 +1240,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Best Candidate Transform Job'.format(region, steps[2])))" + "display(\n", + " HTML(\n", + " 'Review Best Candidate Transform Job'.format(\n", + " region, steps[2]\n", + " )\n", + " )\n", + ")" ] }, { @@ -1188,7 +1257,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Best Candidate Training Job (Tuning)'.format(region, steps[3])))" + "display(\n", + " HTML(\n", + " 'Review Best Candidate Training Job (Tuning)'.format(\n", + " region, steps[3]\n", + " )\n", + " )\n", + ")" ] }, { @@ -1204,14 +1279,14 @@ "metadata": {}, "outputs": [], "source": [ - "while 'InferenceContainers' not in best_candidate:\n", + "while \"InferenceContainers\" not in best_candidate:\n", " best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", - " best_candidate = best_candidate_response['BestCandidate']\n", - " print('[INFO] Autopilot Job is generating BestCandidate InferenceContainers. Please wait. ')\n", + " best_candidate = best_candidate_response[\"BestCandidate\"]\n", + " print(\"[INFO] Autopilot Job is generating BestCandidate InferenceContainers. Please wait. 
\")\n", " print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))\n", " sleep(10)\n", "\n", - "print('[OK] BestCandidate InferenceContainers generated.') " + "print(\"[OK] BestCandidate InferenceContainers generated.\")" ] }, { @@ -1220,7 +1295,7 @@ "metadata": {}, "outputs": [], "source": [ - "best_candidate_containers = best_candidate['InferenceContainers']" + "best_candidate_containers = best_candidate[\"InferenceContainers\"]" ] }, { @@ -1230,9 +1305,9 @@ "outputs": [], "source": [ "for container in best_candidate_containers:\n", - " print(container['Image'])\n", - " print(container['ModelDataUrl'])\n", - " print('======================')" + " print(container[\"Image\"])\n", + " print(container[\"ModelDataUrl\"])\n", + " print(\"======================\")" ] }, { @@ -1249,8 +1324,8 @@ "outputs": [], "source": [ "for container in best_candidate_containers:\n", - " print(container['Environment'])\n", - " print('======================')" + " print(container[\"Environment\"])\n", + " print(\"======================\")" ] }, { @@ -1259,9 +1334,9 @@ "metadata": {}, "outputs": [], "source": [ - "best_candidate_containers[1]['Environment'].update({'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label, probability'})\n", - "best_candidate_containers[2]['Environment'].update({'SAGEMAKER_INFERENCE_INPUT': 'predicted_label, probability'})\n", - "best_candidate_containers[2]['Environment'].update({'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label, probability'})" + "best_candidate_containers[1][\"Environment\"].update({\"SAGEMAKER_INFERENCE_OUTPUT\": \"predicted_label, probability\"})\n", + "best_candidate_containers[2][\"Environment\"].update({\"SAGEMAKER_INFERENCE_INPUT\": \"predicted_label, probability\"})\n", + "best_candidate_containers[2][\"Environment\"].update({\"SAGEMAKER_INFERENCE_OUTPUT\": \"predicted_label, probability\"})" ] }, { @@ -1271,8 +1346,8 @@ "outputs": [], "source": [ "for container in best_candidate_containers:\n", - " print(container['Environment'])\n", - " print('======================')" + " print(container[\"Environment\"])\n", + " print(\"======================\")" ] }, { @@ -1298,7 +1373,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(best_candidate['InferenceContainers'])" + "print(best_candidate[\"InferenceContainers\"])" ] }, { @@ -1318,10 +1393,10 @@ "source": [ "try:\n", " autopilot_model_name\n", - "except NameError: \n", - " timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", - " autopilot_model_name = 'automl-dm-model-' + timestamp_suffix\n", - " print('[OK] Created Autopilot Model Name: ' + autopilot_model_name)" + "except NameError:\n", + " timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n", + " autopilot_model_name = \"automl-dm-model-\" + timestamp_suffix\n", + " print(\"[OK] Created Autopilot Model Name: \" + autopilot_model_name)" ] }, { @@ -1351,11 +1426,11 @@ "try:\n", " autopilot_model_arn\n", "except NameError:\n", - " create_model_response = sm.create_model(Containers=best_candidate['InferenceContainers'],\n", - " ModelName=autopilot_model_name,\n", - " ExecutionRoleArn=role)\n", - " autopilot_model_arn = create_model_response['ModelArn']\n", - " print('[OK] Created Autopilot Model: {}'.format(autopilot_model_arn))" + " create_model_response = sm.create_model(\n", + " Containers=best_candidate[\"InferenceContainers\"], ModelName=autopilot_model_name, ExecutionRoleArn=role\n", + " )\n", + " autopilot_model_arn = create_model_response[\"ModelArn\"]\n", + " print(\"[OK] Created Autopilot Model: 
{}\".format(autopilot_model_arn))" ] }, { @@ -1380,8 +1455,8 @@ "metadata": {}, "outputs": [], "source": [ - "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", - "epc_name = 'automl-dm-epc-' + timestamp_suffix\n", + "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n", + "epc_name = \"automl-dm-epc-\" + timestamp_suffix\n", "\n", "print(epc_name)" ] @@ -1408,13 +1483,13 @@ "metadata": {}, "outputs": [], "source": [ - "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", + "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n", "\n", "try:\n", " autopilot_endpoint_name\n", - "except NameError: \n", - " autopilot_endpoint_name = 'automl-dm-ep-' + timestamp_suffix\n", - " print('[OK] Created Autopilot Endpoint Name {}: '.format(autopilot_endpoint_name)) " + "except NameError:\n", + " autopilot_endpoint_name = \"automl-dm-ep-\" + timestamp_suffix\n", + " print(\"[OK] Created Autopilot Endpoint Name {}: \".format(autopilot_endpoint_name))" ] }, { @@ -1423,8 +1498,8 @@ "metadata": {}, "outputs": [], "source": [ - "variant_name = 'automl-dm-variant-' + timestamp_suffix\n", - "print('[OK] Created Endpoint Variant Name {}: '.format(variant_name))" + "variant_name = \"automl-dm-variant-\" + timestamp_suffix\n", + "print(\"[OK] Created Endpoint Variant Name {}: \".format(variant_name))" ] }, { @@ -1442,11 +1517,17 @@ "metadata": {}, "outputs": [], "source": [ - "ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,\n", - " ProductionVariants=[{'InstanceType':'ml.m5.large',\n", - " 'InitialInstanceCount': 1,\n", - " 'ModelName': autopilot_model_name,\n", - " 'VariantName': variant_name}])" + "ep_config = sm.create_endpoint_config(\n", + " EndpointConfigName=epc_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"InstanceType\": \"ml.m5.large\",\n", + " \"InitialInstanceCount\": 1,\n", + " \"ModelName\": autopilot_model_name,\n", + " \"VariantName\": variant_name,\n", + " }\n", + " ],\n", + ")" ] }, { @@ -1466,10 +1547,9 @@ "source": [ "try:\n", " autopilot_endpoint_arn\n", - "except NameError: \n", - " create_endpoint_response = sm.create_endpoint(EndpointName=autopilot_endpoint_name,\n", - " EndpointConfigName=epc_name) \n", - " autopilot_endpoint_arn = create_endpoint_response['EndpointArn']\n", + "except NameError:\n", + " create_endpoint_response = sm.create_endpoint(EndpointName=autopilot_endpoint_name, EndpointConfigName=epc_name)\n", + " autopilot_endpoint_arn = create_endpoint_response[\"EndpointArn\"]\n", " print(autopilot_endpoint_arn)" ] }, @@ -1490,7 +1570,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, autopilot_endpoint_name)))" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, autopilot_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/03_automl/03_Predict_Reviews_Autopilot.ipynb b/03_automl/03_Predict_Reviews_Autopilot.ipynb index aa9db286..926e0c24 100644 --- a/03_automl/03_Predict_Reviews_Autopilot.ipynb +++ b/03_automl/03_Predict_Reviews_Autopilot.ipynb @@ -27,12 +27,12 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -52,12 +52,12 @@ 
"source": [ "try:\n", " autopilot_endpoint_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('***************************************************************************')\n", - " print('[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************')\n", - " print('[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************')\n", - " print('***************************************************************************')" + " print(\"***************************************************************************\")\n", + " print(\"[ERROR] PLEASE WAIT FOR THE PREVIOUS NOTEBOOK TO FINISH *******************\")\n", + " print(\"[ERROR] OR THIS NOTEBOOK WILL NOT RUN PROPERLY ****************************\")\n", + " print(\"***************************************************************************\")" ] }, { @@ -83,7 +83,7 @@ "metadata": {}, "outputs": [], "source": [ - "sm.get_waiter('endpoint_in_service').wait(EndpointName=autopilot_endpoint_name)" + "sm.get_waiter(\"endpoint_in_service\").wait(EndpointName=autopilot_endpoint_name)" ] }, { @@ -93,9 +93,9 @@ "outputs": [], "source": [ "resp = sm.describe_endpoint(EndpointName=autopilot_endpoint_name)\n", - "status = resp['EndpointStatus']\n", + "status = resp[\"EndpointStatus\"]\n", "\n", - "print(\"Arn: \" + resp['EndpointArn'])\n", + "print(\"Arn: \" + resp[\"EndpointArn\"])\n", "print(\"Status: \" + status)" ] }, @@ -113,7 +113,7 @@ "metadata": {}, "outputs": [], "source": [ - "sm_runtime = boto3.client('sagemaker-runtime')" + "sm_runtime = boto3.client(\"sagemaker-runtime\")" ] }, { @@ -124,12 +124,14 @@ "source": [ "csv_line_predict_positive = \"\"\"I loved it!\"\"\"\n", "\n", - "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_positive)\n", + "response = sm_runtime.invoke_endpoint(\n", + " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_positive\n", + ")\n", "\n", - "response_body = response['Body'].read().decode('utf-8').strip()\n", + "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n", "\n", - "r = response_body.split(',')\n", - "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))" + "r = response_body.split(\",\")\n", + "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))" ] }, { @@ -140,12 +142,14 @@ "source": [ "csv_line_predict_meh = \"\"\"It's OK.\"\"\"\n", "\n", - "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_meh)\n", + "response = sm_runtime.invoke_endpoint(\n", + " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_meh\n", + ")\n", "\n", - "response_body = response['Body'].read().decode('utf-8').strip()\n", + "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n", "\n", - "r = response_body.split(',')\n", - "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))" + "r = response_body.split(\",\")\n", + "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))" ] }, { @@ -158,12 +162,14 @@ "source": [ "csv_line_predict_negative = \"\"\"It's pretty good.\"\"\"\n", "\n", - "response = sm_runtime.invoke_endpoint(EndpointName=autopilot_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_negative)\n", + 
"response = sm_runtime.invoke_endpoint(\n", + " EndpointName=autopilot_endpoint_name, ContentType=\"text/csv\", Accept=\"text/csv\", Body=csv_line_predict_negative\n", + ")\n", "\n", - "response_body = response['Body'].read().decode('utf-8').strip()\n", + "response_body = response[\"Body\"].read().decode(\"utf-8\").strip()\n", "\n", - "r = response_body.split(',')\n", - "print('Predicated Star Rating Class: {} \\nProbability: {} '.format(r[0], r[1]))" + "r = response_body.split(\",\")\n", + "print(\"Predicated Star Rating Class: {} \\nProbability: {} \".format(r[0], r[1]))" ] }, { diff --git a/03_automl/generated_module/candidate_data_processors/dpp0.py b/03_automl/generated_module/candidate_data_processors/dpp0.py index c5d57903..acd57b70 100644 --- a/03_automl/generated_module/candidate_data_processors/dpp0.py +++ b/03_automl/generated_module/candidate_data_processors/dpp0.py @@ -7,10 +7,7 @@ # Given a list of column names and target column name, Header can return the index # for given column name -HEADER = Header( - column_names=['star_rating', 'review_body'], - target_column_name='star_rating' -) +HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating") def build_feature_transform(): @@ -18,35 +15,25 @@ def build_feature_transform(): # These features can be parsed as natural language. - text = HEADER.as_feature_indices(['review_body']) + text = HEADER.as_feature_indices(["review_body"]) text_processors = Pipeline( steps=[ ( - 'multicolumntfidfvectorizer', - MultiColumnTfidfVectorizer( - max_df=0.9941, - min_df=0.0007, - analyzer='word', - max_features=10000 - ) + "multicolumntfidfvectorizer", + MultiColumnTfidfVectorizer(max_df=0.9941, min_df=0.0007, analyzer="word", max_features=10000), ) ] ) - column_transformer = ColumnTransformer( - transformers=[('text_processing', text_processors, text)] - ) + column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)]) return Pipeline( - steps=[ - ('column_transformer', column_transformer - ), ('robuststandardscaler', RobustStandardScaler()) - ] + steps=[("column_transformer", column_transformer), ("robuststandardscaler", RobustStandardScaler())] ) def build_label_transform(): """Returns the model definition representing feature processing.""" - return RobustLabelEncoder(labels=['1', '2', '3', '4', '5']) + return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"]) diff --git a/03_automl/generated_module/candidate_data_processors/dpp1.py b/03_automl/generated_module/candidate_data_processors/dpp1.py index d54b46ab..8a1186cd 100644 --- a/03_automl/generated_module/candidate_data_processors/dpp1.py +++ b/03_automl/generated_module/candidate_data_processors/dpp1.py @@ -8,10 +8,7 @@ # Given a list of column names and target column name, Header can return the index # for given column name -HEADER = Header( - column_names=['star_rating', 'review_body'], - target_column_name='star_rating' -) +HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating") def build_feature_transform(): @@ -19,31 +16,24 @@ def build_feature_transform(): # These features can be parsed as natural language. 
- text = HEADER.as_feature_indices(['review_body']) + text = HEADER.as_feature_indices(["review_body"]) text_processors = Pipeline( steps=[ ( - 'multicolumntfidfvectorizer', - MultiColumnTfidfVectorizer( - max_df=0.99, - min_df=0.0021, - analyzer='char_wb', - max_features=10000 - ) + "multicolumntfidfvectorizer", + MultiColumnTfidfVectorizer(max_df=0.99, min_df=0.0021, analyzer="char_wb", max_features=10000), ) ] ) - column_transformer = ColumnTransformer( - transformers=[('text_processing', text_processors, text)] - ) + column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)]) return Pipeline( steps=[ - ('column_transformer', - column_transformer), ('robustpca', RobustPCA(n_components=5)), - ('robuststandardscaler', RobustStandardScaler()) + ("column_transformer", column_transformer), + ("robustpca", RobustPCA(n_components=5)), + ("robuststandardscaler", RobustStandardScaler()), ] ) @@ -51,4 +41,4 @@ def build_feature_transform(): def build_label_transform(): """Returns the model definition representing feature processing.""" - return RobustLabelEncoder(labels=['1', '2', '3', '4', '5']) + return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"]) diff --git a/03_automl/generated_module/candidate_data_processors/dpp2.py b/03_automl/generated_module/candidate_data_processors/dpp2.py index 2a52da4a..ef6e3dfd 100644 --- a/03_automl/generated_module/candidate_data_processors/dpp2.py +++ b/03_automl/generated_module/candidate_data_processors/dpp2.py @@ -7,10 +7,7 @@ # Given a list of column names and target column name, Header can return the index # for given column name -HEADER = Header( - column_names=['star_rating', 'review_body'], - target_column_name='star_rating' -) +HEADER = Header(column_names=["star_rating", "review_body"], target_column_name="star_rating") def build_feature_transform(): @@ -18,35 +15,25 @@ def build_feature_transform(): # These features can be parsed as natural language. 
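dpp1 differs from dpp0 in two ways: character-level (`char_wb`) TF-IDF features and a `RobustPCA(n_components=5)` reduction before scaling. With plain scikit-learn, `TruncatedSVD` is the usual sparse-friendly stand-in for that reduction step (a sketch; the seed mirrors the notebook's):

```python
# Sketch: sparse-friendly 5-component reduction, analogous to RobustPCA(n_components=5).
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=5, random_state=27)
# reduced = svd.fit_transform(tfidf_features)  # -> (n_samples, 5) dense array
```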
- text = HEADER.as_feature_indices(['review_body']) + text = HEADER.as_feature_indices(["review_body"]) text_processors = Pipeline( steps=[ ( - 'multicolumntfidfvectorizer', - MultiColumnTfidfVectorizer( - max_df=0.9983, - min_df=0.0005, - analyzer='word', - max_features=10000 - ) + "multicolumntfidfvectorizer", + MultiColumnTfidfVectorizer(max_df=0.9983, min_df=0.0005, analyzer="word", max_features=10000), ) ] ) - column_transformer = ColumnTransformer( - transformers=[('text_processing', text_processors, text)] - ) + column_transformer = ColumnTransformer(transformers=[("text_processing", text_processors, text)]) return Pipeline( - steps=[ - ('column_transformer', column_transformer - ), ('robuststandardscaler', RobustStandardScaler()) - ] + steps=[("column_transformer", column_transformer), ("robuststandardscaler", RobustStandardScaler())] ) def build_label_transform(): """Returns the model definition representing feature processing.""" - return RobustLabelEncoder(labels=['1', '2', '3', '4', '5']) + return RobustLabelEncoder(labels=["1", "2", "3", "4", "5"]) diff --git a/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py b/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py index ae882934..a304708f 100644 --- a/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py +++ b/03_automl/generated_module/candidate_data_processors/sagemaker_serve.py @@ -16,35 +16,34 @@ def _is_inverse_label_transform(): """Returns True if if it's running in inverse label transform.""" - return os.getenv('AUTOML_TRANSFORM_MODE') == 'inverse-label-transform' + return os.getenv("AUTOML_TRANSFORM_MODE") == "inverse-label-transform" def _is_feature_transform(): """Returns True if it's running in feature transform mode.""" - return os.getenv('AUTOML_TRANSFORM_MODE') == 'feature-transform' + return os.getenv("AUTOML_TRANSFORM_MODE") == "feature-transform" def _get_selected_input_keys(): """Returns a list of ordered content keys for container's input.""" - return [key.strip().lower() for key in os.environ['SAGEMAKER_INFERENCE_INPUT'].split(',')] + return [key.strip().lower() for key in os.environ["SAGEMAKER_INFERENCE_INPUT"].split(",")] def _get_selected_output_keys(): """Returns a list of ordered content keys for container's output.""" - return [key.strip().lower() for key in os.environ['SAGEMAKER_INFERENCE_OUTPUT'].split(',')] + return [key.strip().lower() for key in os.environ["SAGEMAKER_INFERENCE_OUTPUT"].split(",")] def _sparsify_if_needed(x): """Returns a sparse matrix if the needed for encoding to sparse recordio protobuf.""" - if os.getenv('AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF') == '1' \ - and not sparse.issparse(x): + if os.getenv("AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF") == "1" and not sparse.issparse(x): return sparse.csr_matrix(x) return x def _split_features_target(x): """Returns the features and target by splitting the input array.""" - if os.getenv('AUTOML_TRANSFORM_MODE') == 'feature-transform': + if os.getenv("AUTOML_TRANSFORM_MODE") == "feature-transform": return _sparsify_if_needed(x), None if sparse.issparse(x): @@ -68,7 +67,7 @@ def model_fn(model_dir): deserialized model object that can be used for model serving """ - return load(filename=os.path.join(model_dir, 'model.joblib')) + return load(filename=os.path.join(model_dir, "model.joblib")) def predict_fn(input_object, model): @@ -101,10 +100,7 @@ def predict_fn(input_object, model): try: return model.transform(input_object) except ValueError as e: - return worker.Response( - 
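The `AUTOML_TRANSFORM_MODE` helpers near the top of `sagemaker_serve.py` switch the whole script between three serving modes: unset (full predictions), `feature-transform` (emit transformed features only), and `inverse-label-transform` (map algorithm output back to the original labels). They can be exercised locally; a sketch using the module's own helpers:

```python
# Sketch: toggling the serving mode the way the hosting environment would.
import os

for mode in (None, "feature-transform", "inverse-label-transform"):
    if mode is None:
        os.environ.pop("AUTOML_TRANSFORM_MODE", None)
    else:
        os.environ["AUTOML_TRANSFORM_MODE"] = mode
    print(mode, _is_feature_transform(), _is_inverse_label_transform())
```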
response='{}'.format(str(e) or 'Unknown error.'), - status=http_client.BAD_REQUEST - ) + return worker.Response(response="{}".format(str(e) or "Unknown error."), status=http_client.BAD_REQUEST) def _generate_post_processed_response(array, model): @@ -137,8 +133,9 @@ def _generate_post_processed_response(array, model): for output_key_idx, output_key in enumerate(output_keys): if output_key == "predicted_label" and output_key in input_keys: input_key_idx = input_keys.index(output_key) - output_array[:, output_key_idx] = model.inverse_label_transform(array[:, input_key_idx] - .ravel().astype(np.float).astype(np.int)) + output_array[:, output_key_idx] = model.inverse_label_transform( + array[:, input_key_idx].ravel().astype(np.float).astype(np.int) + ) elif output_key == "labels": output_array[:, output_key_idx][:] = str(list(model.target_transformer.get_classes())) elif output_key in input_keys: @@ -168,11 +165,10 @@ def input_fn(request_body, request_content_type): decoded data as 2D numpy array """ - content_type = request_content_type.lower( - ) if request_content_type else "text/csv" + content_type = request_content_type.lower() if request_content_type else "text/csv" content_type = content_type.split(";")[0].strip() - if content_type == 'text/csv': + if content_type == "text/csv": if isinstance(request_body, str): byte_buffer = request_body.encode() else: @@ -182,8 +178,7 @@ def input_fn(request_body, request_content_type): return val return worker.Response( - response=f"'{request_content_type}' is an unsupported content type.", - status=http_client.UNSUPPORTED_MEDIA_TYPE + response=f"'{request_content_type}' is an unsupported content type.", status=http_client.UNSUPPORTED_MEDIA_TYPE ) @@ -217,20 +212,17 @@ def output_fn(prediction, accept_type): return worker.Response( response=encoder_factory[accept_type](prediction, output_keys), status=http_client.OK, - mimetype=accept_type + mimetype=accept_type, ) except KeyError: # Selectable inference is not turned on - if accept_type == 'text/csv': + if accept_type == "text/csv": return worker.Response( - response=encoders.encode(prediction, accept_type), - status=http_client.OK, - mimetype=accept_type + response=encoders.encode(prediction, accept_type), status=http_client.OK, mimetype=accept_type ) return worker.Response( - response=f"Accept type '{accept_type}' is not supported " - f"during inverse label transformation.", - status=http_client.NOT_ACCEPTABLE + response=f"Accept type '{accept_type}' is not supported " f"during inverse label transformation.", + status=http_client.NOT_ACCEPTABLE, ) if isinstance(prediction, tuple): @@ -238,30 +230,22 @@ def output_fn(prediction, accept_type): else: X, y = _split_features_target(prediction) - if accept_type == 'application/x-recordio-protobuf': + if accept_type == "application/x-recordio-protobuf": return worker.Response( response=encoders.array_to_recordio_protobuf( - _sparsify_if_needed(X).astype('float32'), - y.astype('float32') if y is not None else y + _sparsify_if_needed(X).astype("float32"), y.astype("float32") if y is not None else y ), status=http_client.OK, - mimetype=accept_type + mimetype=accept_type, ) - if accept_type == 'text/csv': + if accept_type == "text/csv": if y is not None: - X = np.column_stack( - (np.ravel(y), X.todense() if sparse.issparse(X) else X) - ) + X = np.column_stack((np.ravel(y), X.todense() if sparse.issparse(X) else X)) - return worker.Response( - response=encoders.encode(X, accept_type), - status=http_client.OK, - mimetype=accept_type - ) + return 
worker.Response(response=encoders.encode(X, accept_type), status=http_client.OK, mimetype=accept_type) return worker.Response( - response=f"Accept type '{accept_type}' is not supported.", - status=http_client.NOT_ACCEPTABLE + response=f"Accept type '{accept_type}' is not supported.", status=http_client.NOT_ACCEPTABLE ) @@ -273,16 +257,8 @@ def execution_parameters_fn(): used during inference and defaults to 6MB otherwise. """ if _is_feature_transform(): - return worker.Response( - response='{"MaxPayloadInMB":1}', - status=http_client.OK, - mimetype="application/json" - ) - return worker.Response( - response='{"MaxPayloadInMB":6}', - status=http_client.OK, - mimetype="application/json" - ) + return worker.Response(response='{"MaxPayloadInMB":1}', status=http_client.OK, mimetype="application/json") + return worker.Response(response='{"MaxPayloadInMB":6}', status=http_client.OK, mimetype="application/json") def numpy_array_to_csv(array, output_keys): @@ -358,7 +334,7 @@ def numpy_array_to_jsonlines(array, output_keys): encoder_factory = { - 'text/csv': numpy_array_to_csv, - 'application/json': numpy_array_to_json, - 'application/jsonlines': numpy_array_to_jsonlines + "text/csv": numpy_array_to_csv, + "application/json": numpy_array_to_json, + "application/jsonlines": numpy_array_to_jsonlines, } diff --git a/03_automl/generated_module/setup.py b/03_automl/generated_module/setup.py index c437bef2..215813f9 100644 --- a/03_automl/generated_module/setup.py +++ b/03_automl/generated_module/setup.py @@ -1,13 +1,13 @@ from setuptools import setup setup( - packages=['candidate_data_processors/'], - name='candidate_data_processors', - version='1.0.0', - description='This module is auto-generated by SageMaker AutoML. ' - 'It contains candidate data processing code and the ' - 'scaffolding to run them in SageMaker.', - author='Amazon Web Services', - license='Apache License 2.0', + packages=["candidate_data_processors/"], + name="candidate_data_processors", + version="1.0.0", + description="This module is auto-generated by SageMaker AutoML. 
" + "It contains candidate data processing code and the " + "scaffolding to run them in SageMaker.", + author="Amazon Web Services", + license="Apache License 2.0", include_package_data=True, ) diff --git a/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb b/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb index 202a3edb..10606107 100644 --- a/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb +++ b/03_automl/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb @@ -92,6 +92,7 @@ "!aws s3 sync s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/sagemaker-automl-candidates/pr-1-f08b7007254e43bb8d5d10af988f5e93db4f7a6b7f194a32b39f2c0733/notebooks/sagemaker_automl automl-dm-16-23-03-55-artifacts/sagemaker_automl --only-show-errors\n", "\n", "import sys\n", + "\n", "sys.path.append(\"automl-dm-16-23-03-55-artifacts\")" ] }, @@ -114,30 +115,33 @@ "from sagemaker_automl import uid, AutoMLLocalRunConfig\n", "\n", "# Where the preprocessed data from the existing AutoML job is stored\n", - "BASE_AUTOML_JOB_NAME = 'automl-dm-16-23-03-55'\n", + "BASE_AUTOML_JOB_NAME = \"automl-dm-16-23-03-55\"\n", "BASE_AUTOML_JOB_CONFIG = {\n", - " 'automl_job_name': BASE_AUTOML_JOB_NAME,\n", - " 'automl_output_s3_base_path': 's3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55',\n", - " 'data_transformer_image_repo_version': '0.2-1-cpu-py3',\n", - " 'algo_image_repo_versions': {'xgboost': '1.0-1-cpu-py3'},\n", - " 'algo_inference_image_repo_versions': {'xgboost': '1.0-1-cpu-py3'}\n", + " \"automl_job_name\": BASE_AUTOML_JOB_NAME,\n", + " \"automl_output_s3_base_path\": \"s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55\",\n", + " \"data_transformer_image_repo_version\": \"0.2-1-cpu-py3\",\n", + " \"algo_image_repo_versions\": {\"xgboost\": \"1.0-1-cpu-py3\"},\n", + " \"algo_inference_image_repo_versions\": {\"xgboost\": \"1.0-1-cpu-py3\"},\n", "}\n", "\n", "# Path conventions of the output data storage path from the local AutoML job run of this notebook\n", - "LOCAL_AUTOML_JOB_NAME = 'automl-dm--notebook-run-{}'.format(uid())\n", + "LOCAL_AUTOML_JOB_NAME = \"automl-dm--notebook-run-{}\".format(uid())\n", "LOCAL_AUTOML_JOB_CONFIG = {\n", - " 'local_automl_job_name': LOCAL_AUTOML_JOB_NAME,\n", - " 'local_automl_job_output_s3_base_path': 's3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/{}'.format(LOCAL_AUTOML_JOB_NAME),\n", - " 'data_processing_model_dir': 'data-processor-models',\n", - " 'data_processing_transformed_output_dir': 'transformed-data',\n", - " 'multi_algo_tuning_output_dir': 'multi-algo-tuning'\n", + " \"local_automl_job_name\": LOCAL_AUTOML_JOB_NAME,\n", + " \"local_automl_job_output_s3_base_path\": \"s3://sagemaker-us-east-1-405759480474/models/autopilot/automl-dm-16-23-03-55/{}\".format(\n", + " LOCAL_AUTOML_JOB_NAME\n", + " ),\n", + " \"data_processing_model_dir\": \"data-processor-models\",\n", + " \"data_processing_transformed_output_dir\": \"transformed-data\",\n", + " \"multi_algo_tuning_output_dir\": \"multi-algo-tuning\",\n", "}\n", "\n", "AUTOML_LOCAL_RUN_CONFIG = AutoMLLocalRunConfig(\n", - " role='arn:aws:iam::405759480474:role/mod-caf61d640fbd4ba7-SageMakerExecutionRole-1U3FI8J98QOSN',\n", + " role=\"arn:aws:iam::405759480474:role/mod-caf61d640fbd4ba7-SageMakerExecutionRole-1U3FI8J98QOSN\",\n", " base_automl_job_config=BASE_AUTOML_JOB_CONFIG,\n", " local_automl_job_config=LOCAL_AUTOML_JOB_CONFIG,\n", - " 
security_config={'EnableInterContainerTrafficEncryption': False, 'VpcConfig': {}})\n", + " security_config={\"EnableInterContainerTrafficEncryption\": False, \"VpcConfig\": {}},\n", + ")\n", "\n", "AUTOML_LOCAL_RUN_CONFIG.display()" ] @@ -194,30 +198,32 @@ "metadata": {}, "outputs": [], "source": [ - "automl_interactive_runner.select_candidate({\n", - " \"data_transformer\": {\n", - " \"name\": \"dpp0\",\n", - " \"training_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", - " \"volume_size_in_gb\": 50\n", - " },\n", - " \"transform_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", + "automl_interactive_runner.select_candidate(\n", + " {\n", + " \"data_transformer\": {\n", + " \"name\": \"dpp0\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " \"volume_size_in_gb\": 50,\n", + " },\n", + " \"transform_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", + " \"transforms_label\": True,\n", + " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n", + " \"sparse_encoding\": True,\n", " },\n", - " \"transforms_label\": True,\n", - " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n", - " \"sparse_encoding\": True\n", - " },\n", - " \"algorithm\": {\n", - " \"name\": \"xgboost\",\n", - " \"training_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", + " \"algorithm\": {\n", + " \"name\": \"xgboost\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", " },\n", " }\n", - "})" + ")" ] }, { @@ -234,30 +240,32 @@ "metadata": {}, "outputs": [], "source": [ - "automl_interactive_runner.select_candidate({\n", - " \"data_transformer\": {\n", - " \"name\": \"dpp1\",\n", - " \"training_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", - " \"volume_size_in_gb\": 50\n", + "automl_interactive_runner.select_candidate(\n", + " {\n", + " \"data_transformer\": {\n", + " \"name\": \"dpp1\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " \"volume_size_in_gb\": 50,\n", + " },\n", + " \"transform_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", + " \"transforms_label\": True,\n", + " \"transformed_data_format\": \"text/csv\",\n", + " \"sparse_encoding\": False,\n", " },\n", - " \"transform_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", - " },\n", - " \"transforms_label\": True,\n", - " \"transformed_data_format\": \"text/csv\",\n", - " \"sparse_encoding\": False\n", - " },\n", - " \"algorithm\": {\n", - " \"name\": \"xgboost\",\n", - " \"training_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", + " \"algorithm\": {\n", + " \"name\": \"xgboost\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", " },\n", " }\n", - "})" + ")" ] }, { @@ -274,30 +282,32 @@ "metadata": {}, "outputs": [], "source": [ - "automl_interactive_runner.select_candidate({\n", - " \"data_transformer\": {\n", - " \"name\": \"dpp2\",\n", - " \"training_resource_config\": {\n", 
- " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", - " \"volume_size_in_gb\": 50\n", + "automl_interactive_runner.select_candidate(\n", + " {\n", + " \"data_transformer\": {\n", + " \"name\": \"dpp2\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " \"volume_size_in_gb\": 50,\n", + " },\n", + " \"transform_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", + " \"transforms_label\": True,\n", + " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n", + " \"sparse_encoding\": True,\n", " },\n", - " \"transform_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", - " },\n", - " \"transforms_label\": True,\n", - " \"transformed_data_format\": \"application/x-recordio-protobuf\",\n", - " \"sparse_encoding\": True\n", - " },\n", - " \"algorithm\": {\n", - " \"name\": \"xgboost\",\n", - " \"training_resource_config\": {\n", - " \"instance_type\": \"ml.m5.4xlarge\",\n", - " \"instance_count\": 1,\n", + " \"algorithm\": {\n", + " \"name\": \"xgboost\",\n", + " \"training_resource_config\": {\n", + " \"instance_type\": \"ml.m5.4xlarge\",\n", + " \"instance_count\": 1,\n", + " },\n", " },\n", " }\n", - "})" + ")" ] }, { @@ -399,14 +409,14 @@ "outputs": [], "source": [ "ALGORITHM_OBJECTIVE_METRICS = {\n", - " 'xgboost': 'validation:accuracy',\n", + " \"xgboost\": \"validation:accuracy\",\n", "}\n", "\n", "STATIC_HYPERPARAMETERS = {\n", - " 'xgboost': {\n", - " 'objective': 'multi:softprob',\n", - " 'save_model_on_termination': 'true',\n", - " 'num_class': 5,\n", + " \"xgboost\": {\n", + " \"objective\": \"multi:softprob\",\n", + " \"save_model_on_termination\": \"true\",\n", + " \"num_class\": 5,\n", " },\n", "}" ] @@ -427,16 +437,16 @@ "from sagemaker.parameter import CategoricalParameter, ContinuousParameter, IntegerParameter\n", "\n", "ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES = {\n", - " 'xgboost': {\n", - " 'num_round': IntegerParameter(2, 1024, scaling_type='Logarithmic'),\n", - " 'max_depth': IntegerParameter(2, 8, scaling_type='Logarithmic'),\n", - " 'eta': ContinuousParameter(1e-3, 1.0, scaling_type='Logarithmic'),\n", - " 'gamma': ContinuousParameter(1e-6, 64.0, scaling_type='Logarithmic'),\n", - " 'min_child_weight': ContinuousParameter(1e-6, 32.0, scaling_type='Logarithmic'),\n", - " 'subsample': ContinuousParameter(0.5, 1.0, scaling_type='Linear'),\n", - " 'colsample_bytree': ContinuousParameter(0.3, 1.0, scaling_type='Linear'),\n", - " 'lambda': ContinuousParameter(1e-6, 2.0, scaling_type='Logarithmic'),\n", - " 'alpha': ContinuousParameter(1e-6, 2.0, scaling_type='Logarithmic'),\n", + " \"xgboost\": {\n", + " \"num_round\": IntegerParameter(2, 1024, scaling_type=\"Logarithmic\"),\n", + " \"max_depth\": IntegerParameter(2, 8, scaling_type=\"Logarithmic\"),\n", + " \"eta\": ContinuousParameter(1e-3, 1.0, scaling_type=\"Logarithmic\"),\n", + " \"gamma\": ContinuousParameter(1e-6, 64.0, scaling_type=\"Logarithmic\"),\n", + " \"min_child_weight\": ContinuousParameter(1e-6, 32.0, scaling_type=\"Logarithmic\"),\n", + " \"subsample\": ContinuousParameter(0.5, 1.0, scaling_type=\"Linear\"),\n", + " \"colsample_bytree\": ContinuousParameter(0.3, 1.0, scaling_type=\"Linear\"),\n", + " \"lambda\": ContinuousParameter(1e-6, 2.0, scaling_type=\"Logarithmic\"),\n", + " \"alpha\": ContinuousParameter(1e-6, 2.0, scaling_type=\"Logarithmic\"),\n", " },\n", "}" ] @@ -463,7 +473,8 @@ 
"multi_algo_tuning_parameters = automl_interactive_runner.prepare_multi_algo_parameters(\n", " objective_metrics=ALGORITHM_OBJECTIVE_METRICS,\n", " static_hyperparameters=STATIC_HYPERPARAMETERS,\n", - " hyperparameters_search_ranges=ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES)" + " hyperparameters_search_ranges=ALGORITHM_TUNABLE_HYPERPARAMETER_RANGES,\n", + ")" ] }, { @@ -515,8 +526,8 @@ "\n", "tuner = HyperparameterTuner.create(\n", " base_tuning_job_name=base_tuning_job_name,\n", - " strategy='Bayesian',\n", - " objective_type='Maximize',\n", + " strategy=\"Bayesian\",\n", + " objective_type=\"Maximize\",\n", " max_parallel_jobs=7,\n", " max_jobs=250,\n", " **multi_algo_tuning_parameters,\n", @@ -546,7 +557,10 @@ "tuning_job_name = tuner.latest_tuning_job.name\n", "\n", "display(\n", - " Markdown(f\"Tuning Job {tuning_job_name} started, please track the progress from [here](https://{AUTOML_LOCAL_RUN_CONFIG.region}.console.aws.amazon.com/sagemaker/home?region={AUTOML_LOCAL_RUN_CONFIG.region}#/hyper-tuning-jobs/{tuning_job_name})\"))\n", + " Markdown(\n", + " f\"Tuning Job {tuning_job_name} started, please track the progress from [here](https://{AUTOML_LOCAL_RUN_CONFIG.region}.console.aws.amazon.com/sagemaker/home?region={AUTOML_LOCAL_RUN_CONFIG.region}#/hyper-tuning-jobs/{tuning_job_name})\"\n", + " )\n", + ")\n", "\n", "# Wait for tuning job to finish\n", "tuner.wait()" @@ -588,16 +602,14 @@ "SAGEMAKER_SESSION = AUTOML_LOCAL_RUN_CONFIG.sagemaker_session\n", "SAGEMAKER_ROLE = AUTOML_LOCAL_RUN_CONFIG.role\n", "\n", - "tuner_analytics = HyperparameterTuningJobAnalytics(\n", - " tuner.latest_tuning_job.name, sagemaker_session=SAGEMAKER_SESSION)\n", + "tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=SAGEMAKER_SESSION)\n", "\n", "df_tuning_job_analytics = tuner_analytics.dataframe()\n", "\n", "# Sort the tuning job analytics by the final metrics value\n", "df_tuning_job_analytics.sort_values(\n", - " by=['FinalObjectiveValue'],\n", - " inplace=True,\n", - " ascending=False if tuner.objective_type == \"Maximize\" else True)\n", + " by=[\"FinalObjectiveValue\"], inplace=True, ascending=False if tuner.objective_type == \"Maximize\" else True\n", + ")\n", "\n", "# Show detailed analytics for the top 20 models\n", "df_tuning_job_analytics.head(20)" @@ -661,13 +673,15 @@ "\n", "# Get a data transformation model from chosen candidate\n", "best_candidate = automl_interactive_runner.choose_candidate(df_tuning_job_analytics, best_training_job)\n", - "best_data_transformer_model = best_candidate.get_data_transformer_model(role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION)\n", + "best_data_transformer_model = best_candidate.get_data_transformer_model(\n", + " role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION\n", + ")\n", "\n", "# Our first data transformation container will always return recordio-protobuf format\n", - "best_data_transformer_model.env[\"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT\"] = 'application/x-recordio-protobuf'\n", + "best_data_transformer_model.env[\"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT\"] = \"application/x-recordio-protobuf\"\n", "# Add environment variable for sparse encoding\n", "if best_candidate.data_transformer_step.sparse_encoding:\n", - " best_data_transformer_model.env[\"AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF\"] = '1'\n", + " best_data_transformer_model.env[\"AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF\"] = \"1\"\n", "\n", "# Get a algo model from chosen training job of the candidate\n", "algo_estimator = 
Estimator.attach(best_training_job)\n", @@ -677,22 +691,26 @@ "# inverse label transform model if we need to transform the intermediates back to non-numerical value\n", "model_containers = [best_data_transformer_model, best_algo_model]\n", "if best_candidate.transforms_label:\n", - " model_containers.append(best_candidate.get_data_transformer_model(\n", - " transform_mode=\"inverse-label-transform\",\n", - " role=SAGEMAKER_ROLE,\n", - " sagemaker_session=SAGEMAKER_SESSION))\n", + " model_containers.append(\n", + " best_candidate.get_data_transformer_model(\n", + " transform_mode=\"inverse-label-transform\", role=SAGEMAKER_ROLE, sagemaker_session=SAGEMAKER_SESSION\n", + " )\n", + " )\n", "\n", "# This model can emit response ['predicted_label', 'probability', 'labels', 'probabilities']. To enable the model to emit one or more\n", "# of the response content, pass the keys to `output_key` keyword argument in the select_inference_output method.\n", "\n", - "model_containers = select_inference_output(\"MulticlassClassification\", model_containers, output_keys=['predicted_label'])\n", + "model_containers = select_inference_output(\n", + " \"MulticlassClassification\", model_containers, output_keys=[\"predicted_label\"]\n", + ")\n", "\n", "\n", "pipeline_model = PipelineModel(\n", " name=\"AutoML-{}\".format(AUTOML_LOCAL_RUN_CONFIG.local_automl_job_name),\n", " role=SAGEMAKER_ROLE,\n", " models=model_containers,\n", - " vpc_config=AUTOML_LOCAL_RUN_CONFIG.vpc_config)" + " vpc_config=AUTOML_LOCAL_RUN_CONFIG.vpc_config,\n", + ")" ] }, { @@ -717,10 +735,9 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_model.deploy(initial_instance_count=1,\n", - " instance_type='ml.m5.2xlarge',\n", - " endpoint_name=pipeline_model.name,\n", - " wait=True)" + "pipeline_model.deploy(\n", + " initial_instance_count=1, instance_type=\"ml.m5.2xlarge\", endpoint_name=pipeline_model.name, wait=True\n", + ")" ] }, { diff --git a/03_automl/notebooks/sagemaker_automl/common.py b/03_automl/notebooks/sagemaker_automl/common.py index ecc21808..f53f81ad 100644 --- a/03_automl/notebooks/sagemaker_automl/common.py +++ b/03_automl/notebooks/sagemaker_automl/common.py @@ -20,8 +20,7 @@ def uid(): class AutoMLLocalCandidateStep: - """Helper class to execute a callable which is decorated with some metadata like name action. 
- """ + """Helper class to execute a callable which is decorated with some metadata like name action.""" def __init__(self, name, action, description=""): self.name = name @@ -56,22 +55,15 @@ def execute_steps(execution_name, steps, context, start_jitter_seconds=5): for step in steps: sleep(start_jitter_seconds) thread_name = threading.current_thread().name - logging.info( - "[{}:{}]Executing step: {}".format(thread_name, execution_name, step.name) - ) + logging.info("[{}:{}]Executing step: {}".format(thread_name, execution_name, step.name)) while True: try: step.run(context) break except ClientError as e: - if ( - e.response["Error"]["Code"] == "ThrottlingException" - and wait_seconds < max_wait_seconds - ): - logging.info( - "We are getting throttled, retrying in {}s".format(wait_seconds) - ) + if e.response["Error"]["Code"] == "ThrottlingException" and wait_seconds < max_wait_seconds: + logging.info("We are getting throttled, retrying in {}s".format(wait_seconds)) sleep(wait_seconds) wait_seconds = wait_seconds * 2 continue @@ -101,22 +93,22 @@ def select_inference_output(problem_type, model_containers, output_keys): Returns: List of model_containers updated to emit the response """ ALLOWED_INVERSE_TRANSFORM_KEYS = { - 'BinaryClassification': ['predicted_label', 'probability', 'probabilities', 'labels'], - 'MulticlassClassification': ['predicted_label', 'probability', 'probabilities', 'labels'] + "BinaryClassification": ["predicted_label", "probability", "probabilities", "labels"], + "MulticlassClassification": ["predicted_label", "probability", "probabilities", "labels"], } ALLOWED_ALGO_KEYS = { - 'BinaryClassification': ['predicted_label', 'probability', 'probabilities'], - 'MulticlassClassification': ['predicted_label', 'probability', 'probabilities'] + "BinaryClassification": ["predicted_label", "probability", "probabilities"], + "MulticlassClassification": ["predicted_label", "probability", "probabilities"], } try: ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type] except KeyError: - raise ValueError(f'{problem_type} does not support selective inference output.') + raise ValueError(f"{problem_type} does not support selective inference output.") # Either multiclass or binary classification, so the default should be 'predicted_label' - output_keys = output_keys or ['predicted_label'] + output_keys = output_keys or ["predicted_label"] bad_keys = [] algo_keys = [] @@ -130,32 +122,37 @@ def select_inference_output(problem_type, model_containers, output_keys): algo_keys.append(key.strip()) if len(bad_keys): - raise ValueError('Requested inference output keys [{}] are unsupported. ' - 'The supported inference keys are [{}]'.format( - ', '.join(bad_keys), ', '.format(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]))) - - model_containers[1].env.update({ - 'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv', - 'SAGEMAKER_INFERENCE_OUTPUT': ','.join(algo_keys), - 'SAGEMAKER_INFERENCE_SUPPORTED': ','.join(ALLOWED_ALGO_KEYS[problem_type]) - }) - model_containers[2].env.update({ - 'SAGEMAKER_INFERENCE_OUTPUT': ','.join(transform_keys), - 'SAGEMAKER_INFERENCE_INPUT': ','.join(algo_keys), - 'SAGEMAKER_INFERENCE_SUPPORTED': ','.join(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]) - }) + raise ValueError( + "Requested inference output keys [{}] are unsupported. 
" + "The supported inference keys are [{}]".format( + ", ".join(bad_keys), ", ".format(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]) + ) + ) + + model_containers[1].env.update( + { + "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv", + "SAGEMAKER_INFERENCE_OUTPUT": ",".join(algo_keys), + "SAGEMAKER_INFERENCE_SUPPORTED": ",".join(ALLOWED_ALGO_KEYS[problem_type]), + } + ) + model_containers[2].env.update( + { + "SAGEMAKER_INFERENCE_OUTPUT": ",".join(transform_keys), + "SAGEMAKER_INFERENCE_INPUT": ",".join(algo_keys), + "SAGEMAKER_INFERENCE_SUPPORTED": ",".join(ALLOWED_INVERSE_TRANSFORM_KEYS[problem_type]), + } + ) return model_containers def get_algo_image_uri(algo_name, region, repo_version): if algo_name == "xgboost": - return image_uris.retrieve(algo_name, region=region, version='1.0-1') + return image_uris.retrieve(algo_name, region=region, version="1.0-1") elif algo_name == "mlp": mlp_image_uri = image_uris.retrieve("linear-learner", region=region, version=repo_version) - last_slash_index = mlp_image_uri.rfind('/') - return "{}/{}:{}".format( - mlp_image_uri[:last_slash_index], "mxnet-algorithms", repo_version - ) + last_slash_index = mlp_image_uri.rfind("/") + return "{}/{}:{}".format(mlp_image_uri[:last_slash_index], "mxnet-algorithms", repo_version) else: return image_uris.retrieve(algo_name, region=region, version=repo_version) diff --git a/03_automl/notebooks/sagemaker_automl/config.py b/03_automl/notebooks/sagemaker_automl/config.py index 20796511..3f7f97d6 100644 --- a/03_automl/notebooks/sagemaker_automl/config.py +++ b/03_automl/notebooks/sagemaker_automl/config.py @@ -56,14 +56,10 @@ def __init__( self.automl_job_name = base_automl_job_config["automl_job_name"] # the base s3 path where the managed AutoML job stores the intermediates (e.g. 
data transformation pipeline # candidate) - self.automl_output_s3_base_path = base_automl_job_config[ - "automl_output_s3_base_path" - ] + self.automl_output_s3_base_path = base_automl_job_config["automl_output_s3_base_path"] # Auto ML output job path convention - self.automl_job_processed_data_path = join( - self.automl_output_s3_base_path, self.PRE_PROCESSED_DATA_ROOT - ) + self.automl_job_processed_data_path = join(self.automl_output_s3_base_path, self.PRE_PROCESSED_DATA_ROOT) self.automl_job_processed_training_data_path = join( self.automl_job_processed_data_path, self.PRE_PROCESSED_TRAINING_DATA_PATH ) @@ -73,17 +69,11 @@ def __init__( # Auto ML local job config self.local_automl_job_name = local_automl_job_config["local_automl_job_name"] - self.local_automl_job_output_s3_base_path = local_automl_job_config[ - "local_automl_job_output_s3_base_path" - ] + self.local_automl_job_output_s3_base_path = local_automl_job_config["local_automl_job_output_s3_base_path"] # data transformer docker image repo version - self.data_transformer_image_repo_version = base_automl_job_config[ - "data_transformer_image_repo_version" - ] - self.algo_image_repo_versions = base_automl_job_config[ - "algo_image_repo_versions" - ] + self.data_transformer_image_repo_version = base_automl_job_config["data_transformer_image_repo_version"] + self.algo_image_repo_versions = base_automl_job_config["algo_image_repo_versions"] self.algo_inference_image_repo_versions = base_automl_job_config["algo_inference_image_repo_versions"] @@ -110,19 +100,11 @@ def vpc_config(self): @property def subnets(self): - return ( - self.vpc_config.get("Subnets", None) - if self.vpc_config is not None - else None - ) + return self.vpc_config.get("Subnets", None) if self.vpc_config is not None else None @property def security_group_ids(self): - return ( - self.vpc_config.get("SecurityGroupIds", None) - if self.vpc_config is not None - else None - ) + return self.vpc_config.get("SecurityGroupIds", None) if self.vpc_config is not None else None @property def encrypt_inter_container_traffic(self): @@ -187,9 +169,4 @@ def to_html_table(self): def display(self): from IPython.display import display, Markdown - display( - Markdown( - "This notebook is initialized to use the following configuration: " - + self.to_html_table() - ) - ) + display(Markdown("This notebook is initialized to use the following configuration: " + self.to_html_table())) diff --git a/03_automl/notebooks/sagemaker_automl/interactive_runner.py b/03_automl/notebooks/sagemaker_automl/interactive_runner.py index d8221603..857f3376 100644 --- a/03_automl/notebooks/sagemaker_automl/interactive_runner.py +++ b/03_automl/notebooks/sagemaker_automl/interactive_runner.py @@ -22,9 +22,9 @@ class AutoMLInteractiveRunner: """AutoMLInteractiveRunner is an orchestrator that manages the AutoML local run. This includes the following: - 1. Manages the state of local candidates selection - 2. Orchestrate multi-algo tuning operations that requires inputs from all candidates. - 3. Model selection and export of trained estimator to deployable model + 1. Manages the state of local candidates selection + 2. Orchestrate multi-algo tuning operations that requires inputs from all candidates. + 3. 
Model selection and export of trained estimator to deployable model """ def __init__(self, local_run_config, candidates=None): @@ -74,9 +74,7 @@ def select_candidate(self, candidate_definition): if candidate_pipeline_name in self.candidates: logging.info( - "Warning: pipeline candidate {} has already been selected, replacing".format( - candidate_pipeline_name - ) + "Warning: pipeline candidate {} has already been selected, replacing".format(candidate_pipeline_name) ) # create candidate @@ -96,9 +94,7 @@ def fit_data_transformers(self, parallel_jobs=2, start_jitter_seconds=10): execution_future = {} - with ThreadPoolExecutor( - max_workers=parallel_jobs, thread_name_prefix="Worker" - ) as executor: + with ThreadPoolExecutor(max_workers=parallel_jobs, thread_name_prefix="Worker") as executor: for candidate_pipeline_name, candidate in self.candidates.items(): candidate.prepare_data_transformers_for_training() @@ -125,27 +121,19 @@ def fit_data_transformers(self, parallel_jobs=2, start_jitter_seconds=10): while True: future = next(iterator) candidate_pipeline_name = execution_future[future] - success = self._process_data_transformer_future( - candidate_pipeline_name, future - ) + success = self._process_data_transformer_future(candidate_pipeline_name, future) if success: success_count += 1 except StopIteration: - logging.info( - "Successfully fit {} data transformers".format(success_count) - ) + logging.info("Successfully fit {} data transformers".format(success_count)) def _process_data_transformer_future(self, candidate_pipeline_name, future): try: future.result() - logging.info( - "Successfully fit data transformer for {}".format( - candidate_pipeline_name - ) - ) + logging.info("Successfully fit data transformer for {}".format(candidate_pipeline_name)) self.candidates[candidate_pipeline_name].set_transformer_trained() return True except Exception: @@ -178,14 +166,10 @@ def prepare_multi_algo_parameters( """ # Create Estimators - estimator_kwargs[ - "encrypt_inter_container_traffic" - ] = self.local_run_config.encrypt_inter_container_traffic + estimator_kwargs["encrypt_inter_container_traffic"] = self.local_run_config.encrypt_inter_container_traffic estimator_kwargs["subnets"] = self.local_run_config.subnets - estimator_kwargs[ - "security_group_ids" - ] = self.local_run_config.security_group_ids + estimator_kwargs["security_group_ids"] = self.local_run_config.security_group_ids estimator_kwargs["output_kms_key"] = self.local_run_config.output_kms_key estimator_kwargs["enable_network_isolation"] = True @@ -253,15 +237,9 @@ def choose_candidate(self, tuner_analytics_dataframe, multi_algo_training_job_na tuner_analytics_dataframe["TrainingJobName"] == multi_algo_training_job_name ] # The TrainingJobDefinitionName is mapped to candidate name - best_data_processing_pipeline_name = training_job_analytics.iloc[0][ - "TrainingJobDefinitionName" - ] + best_data_processing_pipeline_name = training_job_analytics.iloc[0]["TrainingJobDefinitionName"] - logging.info( - "Chosen Data Processing pipeline candidate name is {}".format( - best_data_processing_pipeline_name - ) - ) + logging.info("Chosen Data Processing pipeline candidate name is {}".format(best_data_processing_pipeline_name)) best_candidate = self.candidates[best_data_processing_pipeline_name] return best_candidate diff --git a/03_automl/notebooks/sagemaker_automl/local_candidate.py b/03_automl/notebooks/sagemaker_automl/local_candidate.py index ff86cd16..9ff4d754 100644 --- a/03_automl/notebooks/sagemaker_automl/local_candidate.py +++ 
b/03_automl/notebooks/sagemaker_automl/local_candidate.py @@ -17,12 +17,9 @@ class AutoMLLocalCandidate: - """AutoMLLocalCandidate models an AutoML pipeline consist of data transformer and algo steps - """ + """AutoMLLocalCandidate models an AutoML pipeline consist of data transformer and algo steps""" - def __init__( - self, candidate_name, data_transformer_step, algo_step, local_run_config - ): + def __init__(self, candidate_name, data_transformer_step, algo_step, local_run_config): """ Args: candidate_name (str): name of the candidate, e.g. `dpp0-xgboost` @@ -83,7 +80,8 @@ def create(cls, candidate_name, candidate_definition, local_run_config): repo_version=local_run_config.data_transformer_image_repo_version, source_module_path=os.path.join( f"{local_run_config.automl_job_name}-artifacts", - AutoMLCandidateDataTransformerStep.DEFAULT_SOURCE_MODULE) + AutoMLCandidateDataTransformerStep.DEFAULT_SOURCE_MODULE, + ), ) algo_name = candidate_definition["algorithm"]["name"] @@ -91,12 +89,10 @@ def create(cls, candidate_name, candidate_definition, local_run_config): **candidate_definition["algorithm"], region=local_run_config.region, repo_version=local_run_config.algo_image_repo_versions[algo_name], - inference_repo_version=local_run_config.algo_inference_image_repo_versions[algo_name] + inference_repo_version=local_run_config.algo_inference_image_repo_versions[algo_name], ) - return AutoMLLocalCandidate( - candidate_name, data_transformer_step, algo_step, local_run_config - ) + return AutoMLLocalCandidate(candidate_name, data_transformer_step, algo_step, local_run_config) @property def content_type(self): @@ -111,9 +107,7 @@ def data_transformer_transformed_data_path(self): self._check_data_transformer_prepared() return self._state["data_transformer"]["transform_output_path"] - def prepare_data_transformers_for_training( - self, training_job_name=None, transform_job_name=None, **kwargs - ): + def prepare_data_transformers_for_training(self, training_job_name=None, transform_job_name=None, **kwargs): """This prepare the data transformers for training: 1. create SKlearn trainer 2. 
create steps to be executed by runner @@ -127,9 +121,7 @@ def prepare_data_transformers_for_training( """ # add network & security features - kwargs[ - "encrypt_inter_container_traffic" - ] = self.local_run_config.encrypt_inter_container_traffic + kwargs["encrypt_inter_container_traffic"] = self.local_run_config.encrypt_inter_container_traffic kwargs["subnets"] = self.local_run_config.subnets kwargs["security_group_ids"] = self.local_run_config.security_group_ids @@ -140,25 +132,19 @@ def prepare_data_transformers_for_training( output_path=self.local_run_config.data_processing_model_s3_root, role=self.local_run_config.role, sagemaker_session=self.local_run_config.sagemaker_session, - **kwargs + **kwargs, ) - training_job_name = ( - training_job_name - or "{prefix}-{dpp_name}-train-{suffix}".format( - prefix=self.local_run_config.local_automl_job_name, - dpp_name=self.data_transformer_step.name, - suffix=uid(), - ) + training_job_name = training_job_name or "{prefix}-{dpp_name}-train-{suffix}".format( + prefix=self.local_run_config.local_automl_job_name, + dpp_name=self.data_transformer_step.name, + suffix=uid(), ) - transform_job_name = ( - transform_job_name - or "{prefix}-{dpp_name}-transform-{suffix}".format( - prefix=self.local_run_config.local_automl_job_name, - dpp_name=self.data_transformer_step.name, - suffix=uid(), - ) + transform_job_name = transform_job_name or "{prefix}-{dpp_name}-transform-{suffix}".format( + prefix=self.local_run_config.local_automl_job_name, + dpp_name=self.data_transformer_step.name, + suffix=uid(), ) transform_output_path = "{prefix}/{dpp_name}/{transformed_data_format}".format( @@ -207,14 +193,9 @@ def set_transformer_trained(self): self._state["data_transformer"]["trained"] = True def data_transformer_is_trained(self): - return ( - "data_transformer" in self._state - and self._state["data_transformer"]["trained"] - ) + return "data_transformer" in self._state and self._state["data_transformer"]["trained"] - def get_data_transformer_model( - self, role, sagemaker_session, transform_mode=None, **kwargs - ): + def get_data_transformer_model(self, role, sagemaker_session, transform_mode=None, **kwargs): """ Args: @@ -230,25 +211,18 @@ def get_data_transformer_model( self._check_data_transformer_prepared() if not self.data_transformer_is_trained: - raise AutoMLLocalCandidateNotTrained( - "AutoML Candidate data transformers has not been trained yet" - ) + raise AutoMLLocalCandidateNotTrained("AutoML Candidate data transformers has not been trained yet") data_transformer_state = self._state["data_transformer"] trainer = data_transformer_state["trainer"] training_job_name = data_transformer_state["training_job_name"] - data_transformer_estimator = trainer.attach( - training_job_name, sagemaker_session=sagemaker_session - ) + data_transformer_estimator = trainer.attach(training_job_name, sagemaker_session=sagemaker_session) security_config = self.local_run_config.security_config - if ( - self.local_run_config.security_config is not None - and "VpcConfig" not in kwargs - ): + if self.local_run_config.security_config is not None and "VpcConfig" not in kwargs: kwargs.update({"vpc_config": security_config["VpcConfig"]}) return self.data_transformer_step.create_model( @@ -256,27 +230,21 @@ def get_data_transformer_model( role=role, sagemaker_session=sagemaker_session, transform_mode=transform_mode, - **kwargs + **kwargs, ) def to_dict(self): base_dict = { "pipeline_name": self.candidate_name, - "data_transformer": { - "data_processing_module_name": 
self.data_transformer_step.name - }, + "data_transformer": {"data_processing_module_name": self.data_transformer_step.name}, "algorithm": {"algo_name": self.algo_step.algo_name}, } if "data_transformer" in self._state: base_dict["data_transformer"].update( { - "training_job_name": self._state["data_transformer"][ - "training_job_name" - ], - "transform_job_name": self._state["data_transformer"][ - "transform_job_name" - ], + "training_job_name": self._state["data_transformer"]["training_job_name"], + "transform_job_name": self._state["data_transformer"]["transform_job_name"], } ) diff --git a/03_automl/notebooks/sagemaker_automl/steps.py b/03_automl/notebooks/sagemaker_automl/steps.py index ec243529..3128fce1 100644 --- a/03_automl/notebooks/sagemaker_automl/steps.py +++ b/03_automl/notebooks/sagemaker_automl/steps.py @@ -15,21 +15,27 @@ class AutoMLCandidateAlgoStep: and `mlp`. """ - def __init__(self, name, training_resource_config, region, repo_version, inference_repo_version, - candidate_specific_static_hyperparameters=None): + def __init__( + self, + name, + training_resource_config, + region, + repo_version, + inference_repo_version, + candidate_specific_static_hyperparameters=None, + ): self.algo_name = name self.training_resource_config = training_resource_config - self.candidate_specific_static_hps = candidate_specific_static_hyperparameters \ - if candidate_specific_static_hyperparameters else {} + self.candidate_specific_static_hps = ( + candidate_specific_static_hyperparameters if candidate_specific_static_hyperparameters else {} + ) self.region = region self.repo_version = repo_version self.algo_image_uri = get_algo_image_uri(self.algo_name, region, repo_version) self.algo_inference_image_uri = get_algo_image_uri(self.algo_name, region, inference_repo_version) - def create_estimator( - self, role, output_path, hyperparameters, sagemaker_session, **kwargs - ): + def create_estimator(self, role, output_path, hyperparameters, sagemaker_session, **kwargs): estimator = Estimator( self.algo_image_uri, @@ -47,13 +53,11 @@ def create_estimator( def get_inference_container_config(self): config = { - 'env': { - 'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv' - }, - 'image_uri': self.algo_inference_image_uri + "env": {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"}, + "image_uri": self.algo_inference_image_uri, } - if self.algo_name == 'mlp': - config['env']['ML_APPLICATION'] = 'mlp' + if self.algo_name == "mlp": + config["env"]["ML_APPLICATION"] = "mlp" return config @@ -107,7 +111,7 @@ def __init__( # We share registry account id with all framework container xgb_image_uri = image_uris.retrieve("xgboost", region=region, version="1.0-1") - last_slash_index = xgb_image_uri.rfind('/') + last_slash_index = xgb_image_uri.rfind("/") self.transformer_image_uri = "{}/{}:{}".format( xgb_image_uri[:last_slash_index], "sagemaker-sklearn-automl", repo_version ) @@ -198,13 +202,9 @@ def create_steps( def _train_transform(context): _trainer = context.get("trainer") - training_data_input_path = ( - local_run_config.automl_job_processed_training_data_path - ) + training_data_input_path = local_run_config.automl_job_processed_training_data_path return _trainer.fit( - { - AutoMLCandidateDataTransformerStep.TRAIN_CHANNEL_NAME: training_data_input_path - }, + {AutoMLCandidateDataTransformerStep.TRAIN_CHANNEL_NAME: training_data_input_path}, job_name=training_job_name, wait=True, logs=False, @@ -227,7 +227,7 @@ def _create_transformer(context): accept=self.content_type, env=transform_env, 
volume_kms_key=local_run_config.volume_kms_key, - output_kms_key=local_run_config.output_kms_key + output_kms_key=local_run_config.output_kms_key, ) context["transformer"] = transformer @@ -262,9 +262,7 @@ def _transform_data(context): ), ] - def create_model( - self, estimator, role, sagemaker_session, transform_mode, **kwargs - ): + def create_model(self, estimator, role, sagemaker_session, transform_mode, **kwargs): """Create a deployable data transformer model Args: estimator: an estimator attached from trainer diff --git a/04_ingest/01_Copy_TSV_To_S3.ipynb b/04_ingest/01_Copy_TSV_To_S3.ipynb index b7b05a62..ae42c5e4 100644 --- a/04_ingest/01_Copy_TSV_To_S3.ipynb +++ b/04_ingest/01_Copy_TSV_To_S3.ipynb @@ -87,9 +87,9 @@ "try:\n", " setup_instance_check_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -119,9 +119,9 @@ "try:\n", " setup_dependencies_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -149,11 +149,11 @@ "outputs": [], "source": [ "try:\n", - " setup_s3_bucket_passed \n", + " setup_s3_bucket_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -181,11 +181,11 @@ "outputs": [], "source": [ "try:\n", - " setup_iam_roles_passed \n", + " setup_iam_roles_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -204,21 +204,21 @@ "outputs": [], "source": [ "if not setup_instance_check_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Instance Check.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. 
You are missing Instance Check.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_dependencies_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_s3_bucket_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "if not setup_iam_roles_passed:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup IAM Roles.') \n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. 
You are missing Setup IAM Roles.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -231,13 +231,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -253,7 +253,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'" + "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\"" ] }, { @@ -278,7 +278,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n", + "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n", "print(s3_private_path_tsv)" ] }, @@ -350,7 +350,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Bucket'.format(region, account_id, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Bucket'.format(\n", + " region, account_id, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -427,7 +433,7 @@ "# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Digital_Software_v1_00.tsv.gz\"\n", "# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\"\n", "# !aws s3 cp --recursive $s3_public_path_tsv/ s3://dsoaws/$step_prefix/ --exclude \"*\" --include \"amazon_reviews_us_Gift_Card_v1_00.tsv.gz\"\n", - "# !aws s3 ls --recursive s3://dsoaws/$step_prefix/\n" + "# !aws s3 ls --recursive s3://dsoaws/$step_prefix/" ] } ], diff --git a/04_ingest/02_Create_Athena_Database.ipynb b/04_ingest/02_Create_Athena_Database.ipynb index 1d60b09e..08948b79 100644 --- a/04_ingest/02_Create_Athena_Database.ipynb +++ b/04_ingest/02_Create_Athena_Database.ipynb @@ -29,7 +29,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -62,10 +62,10 @@ "try:\n", " s3_public_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. 
********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -95,10 +95,10 @@ "try:\n", " s3_private_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -141,7 +141,7 @@ "metadata": {}, "outputs": [], "source": [ - "database_name = 'dsoaws'" + "database_name = \"dsoaws\"" ] }, { @@ -160,7 +160,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'CREATE DATABASE IF NOT EXISTS {}'.format(database_name)\n", + "statement = \"CREATE DATABASE IF NOT EXISTS {}\".format(database_name)\n", "print(statement)" ] }, @@ -189,6 +189,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "pd.read_sql(statement, conn)" ] }, @@ -205,7 +206,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW DATABASES'\n", + "statement = \"SHOW DATABASES\"\n", "\n", "df_show = pd.read_sql(statement, conn)\n", "df_show.head(5)" diff --git a/04_ingest/03_Register_S3_TSV_With_Athena.ipynb b/04_ingest/03_Register_S3_TSV_With_Athena.ipynb index 55b04ebc..6d3eab6f 100644 --- a/04_ingest/03_Register_S3_TSV_With_Athena.ipynb +++ b/04_ingest/03_Register_S3_TSV_With_Athena.ipynb @@ -31,7 +31,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -64,9 +64,9 @@ "try:\n", " ingest_create_athena_db_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -85,11 +85,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_db_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not create the Athena Database.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not create the Athena Database.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]') " + " print(\"[OK]\")" ] }, { @@ -110,10 +110,10 @@ "try:\n", " s3_private_path_tsv\n", "except NameError:\n", - " print('*****************************************************************************')\n", - " print('[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************')\n", - " print('[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************')\n", - " print('*****************************************************************************')" + " print(\"*****************************************************************************\")\n", + " print(\"[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************\")\n", + " print(\"[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************\")\n", + " print(\"*****************************************************************************\")" ] }, { @@ -179,7 +179,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -189,8 +189,8 @@ "outputs": [], "source": [ "# Set Athena parameters\n", - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'" + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"" ] }, { @@ -226,7 +226,9 @@ " review_body string,\n", " review_date string\n", ") ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\\\t' LINES TERMINATED BY '\\\\n' LOCATION '{}'\n", - "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(database_name, table_name_tsv, s3_private_path_tsv)\n", + "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')\"\"\".format(\n", + " database_name, table_name_tsv, s3_private_path_tsv\n", + ")\n", "\n", "print(statement)" ] @@ -238,6 +240,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "pd.read_sql(statement, conn)" ] }, @@ -254,7 +257,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW TABLES in {}'.format(database_name)\n", + "statement = \"SHOW TABLES in {}\".format(database_name)\n", "\n", "df_show = pd.read_sql(statement, conn)\n", "df_show.head(5)" @@ -292,10 +295,12 @@ "metadata": {}, "outputs": [], "source": [ - "product_category = 'Digital_Software'\n", + "product_category = \"Digital_Software\"\n", "\n", "statement = \"\"\"SELECT * FROM {}.{}\n", - " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_tsv, product_category)\n", + " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n", + " database_name, table_name_tsv, product_category\n", + ")\n", "\n", "print(statement)" ] @@ -317,11 +322,11 @@ "outputs": [], "source": [ "if not df.empty:\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "else:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. 
LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -339,7 +344,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review AWS Glue Catalog'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review AWS Glue Catalog'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb b/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb index edb39de0..799d3123 100644 --- a/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb +++ b/04_ingest/04_Convert_S3_TSV_To_Parquet_With_Athena.ipynb @@ -30,7 +30,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -63,9 +63,9 @@ "try:\n", " ingest_create_athena_table_tsv_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -84,11 +84,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_tsv_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -123,12 +123,12 @@ "outputs": [], "source": [ "# Set S3 path to Parquet data\n", - "s3_path_parquet = 's3://{}/amazon-reviews-pds/parquet'.format(bucket)\n", + "s3_path_parquet = \"s3://{}/amazon-reviews-pds/parquet\".format(bucket)\n", "\n", "# Set Athena parameters\n", - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'\n", - "table_name_parquet = 'amazon_reviews_parquet'" + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"\n", + "table_name_parquet = \"amazon_reviews_parquet\"" ] }, { @@ -138,7 +138,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -185,7 +185,9 @@ " CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,\n", " DATE(review_date) AS review_date,\n", " product_category\n", - "FROM {}.{}\"\"\".format(database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv)\n", + "FROM {}.{}\"\"\".format(\n", + " database_name, table_name_parquet, s3_path_parquet, database_name, table_name_tsv\n", + ")\n", "\n", "print(statement)" ] @@ -221,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'MSCK REPAIR TABLE {}.{}'.format(database_name, table_name_parquet)\n", + "statement = \"MSCK REPAIR TABLE {}.{}\".format(database_name, table_name_parquet)\n", "\n", "print(statement)" ] @@ -233,6 +235,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "df = pd.read_sql(statement, conn)\n", "df.head(5)" ] @@ -250,7 +253,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW PARTITIONS {}.{}'.format(database_name, table_name_parquet)\n", + "statement = \"SHOW PARTITIONS {}.{}\".format(database_name, table_name_parquet)\n", "\n", "print(statement)" ] @@ -278,7 +281,7 @@ "metadata": {}, "outputs": [], "source": [ - "statement = 'SHOW TABLES in {}'.format(database_name)" + "statement = \"SHOW TABLES in {}\".format(database_name)" ] }, { @@ -323,10 +326,12 @@ "metadata": {}, "outputs": [], "source": [ - "product_category = 'Digital_Software'\n", + "product_category = \"Digital_Software\"\n", "\n", "statement = \"\"\"SELECT * FROM {}.{}\n", - " WHERE product_category = '{}' LIMIT 100\"\"\".format(database_name, table_name_parquet, product_category)\n", + " WHERE product_category = '{}' LIMIT 100\"\"\".format(\n", + " database_name, table_name_parquet, product_category\n", + ")\n", "\n", "print(statement)" ] @@ -348,11 +353,11 @@ "outputs": [], "source": [ "if not df.empty:\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "else:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. 
LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -370,7 +375,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review AWS Glue Catalog'.format(region)))\n" + "display(\n", + " HTML(\n", + " 'Review AWS Glue Catalog'.format(\n", + " region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb b/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb index 960053f1..f7d2794a 100644 --- a/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb +++ b/04_ingest/05_Query_Data_With_AWS_DataWrangler.ipynb @@ -46,9 +46,9 @@ "try:\n", " ingest_create_athena_table_tsv_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -67,11 +67,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_tsv_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -92,9 +92,9 @@ "try:\n", " ingest_create_athena_table_parquet_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -113,11 +113,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_parquet_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. 
You did not convert into Parquet data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -136,12 +136,12 @@ "import sagemaker\n", "import boto3\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -181,11 +181,10 @@ "metadata": {}, "outputs": [], "source": [ - "path = 's3://{}/amazon-reviews-pds/parquet/'.format(bucket)\n", - "df_parquet_results = wr.s3.read_parquet(path,\n", - " columns=['star_rating', 'product_category', 'review_body'],\n", - " partition_filter=p_filter,\n", - " dataset=True)\n", + "path = \"s3://{}/amazon-reviews-pds/parquet/\".format(bucket)\n", + "df_parquet_results = wr.s3.read_parquet(\n", + " path, columns=[\"star_rating\", \"product_category\", \"review_body\"], partition_filter=p_filter, dataset=True\n", + ")\n", "df_parquet_results.shape" ] }, @@ -226,13 +225,15 @@ "metadata": {}, "outputs": [], "source": [ - "path = 's3://{}/amazon-reviews-pds/parquet/'.format(bucket)\n", - "chunk_iter = wr.s3.read_parquet(path,\n", - " columns=['star_rating', 'product_category', 'review_body'],\n", - " # filters=[(\"product_category\", \"=\", \"Digital_Software\")],\n", - " partition_filter=p_filter,\n", - " dataset=True,\n", - " chunked=True)" + "path = \"s3://{}/amazon-reviews-pds/parquet/\".format(bucket)\n", + "chunk_iter = wr.s3.read_parquet(\n", + " path,\n", + " columns=[\"star_rating\", \"product_category\", \"review_body\"],\n", + " # filters=[(\"product_category\", \"=\", \"Digital_Software\")],\n", + " partition_filter=p_filter,\n", + " dataset=True,\n", + " chunked=True,\n", + ")" ] }, { @@ -260,9 +261,9 @@ "metadata": {}, "outputs": [], "source": [ - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'\n", - "table_name_parquet = 'amazon_reviews_parquet'" + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"\n", + "table_name_parquet = \"amazon_reviews_parquet\"" ] }, { @@ -272,7 +273,7 @@ "outputs": [], "source": [ "for table in wr.catalog.get_tables(database=\"dsoaws\"):\n", - " print(table['Name'])" + " print(table[\"Name\"])" ] }, { @@ -290,10 +291,7 @@ "outputs": [], "source": [ "%%time\n", - "df = wr.athena.read_sql_query(\n", - " sql='SELECT * FROM {} LIMIT 5000'.format(table_name_parquet),\n", - " database=database_name\n", - ")" + "df = wr.athena.read_sql_query(sql=\"SELECT * FROM {} LIMIT 5000\".format(table_name_parquet), database=database_name)" ] }, { @@ -324,9 +322,9 @@ "%%time\n", "\n", "chunk_iter = wr.athena.read_sql_query(\n", - " sql='SELECT * FROM {} LIMIT 5000'.format(table_name_parquet),\n", - " database='{}'.format(database_name),\n", - " chunksize=64_000 # 64 KB Chunks\n", + " sql=\"SELECT * FROM {} LIMIT 5000\".format(table_name_parquet),\n", + " database=\"{}\".format(database_name),\n", + " chunksize=64_000, # 64 KB Chunks\n", ")" ] }, diff --git a/05_explore/01_Visualize_Reviews_Dataset.ipynb b/05_explore/01_Visualize_Reviews_Dataset.ipynb index 868bc788..a2b93906 100644 --- a/05_explore/01_Visualize_Reviews_Dataset.ipynb +++ b/05_explore/01_Visualize_Reviews_Dataset.ipynb @@ -25,9 +25,9 @@ "try:\n", " ingest_create_athena_table_parquet_passed\n", "except NameError:\n", - " 
print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
-    "    print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.')\n",
-    "    print('++++++++++++++++++++++++++++++++++++++++++++++')"
+    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+    "    print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
    ]
   },
   {
@@ -46,11 +46,11 @@
    "outputs": [],
    "source": [
     "if not ingest_create_athena_table_parquet_passed:\n",
-    "    print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
-    "    print('[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.') \n",
-    "    print('++++++++++++++++++++++++++++++++++++++++++++++')\n",
+    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
+    "    print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS. You did not convert into Parquet data.\")\n",
+    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
     "else:\n",
-    "    print('[OK]')"
+    "    print(\"[OK]\")"
    ]
   },
   {
@@ -97,6 +97,7 @@
    "import seaborn as sns\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
+    "\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'"
    ]
   },
   {
@@ -110,7 +111,7 @@
    "import sagemaker\n",
    "import boto3\n",
    "\n",
-    "sess = sagemaker.Session()\n",
+    "sess = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name"
    ]
   },
   {
@@ -122,9 +123,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Set Athena database & table \n",
-    "database_name = 'dsoaws'\n",
-    "table_name = 'amazon_reviews_parquet'"
+    "# Set Athena database & table\n",
+    "database_name = \"dsoaws\"\n",
+    "table_name = \"amazon_reviews_parquet\""
    ]
   },
   {
@@ -143,7 +144,7 @@
    "outputs": [],
    "source": [
     "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
-    "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
+    "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
    ]
   },
   {
@@ -168,23 +169,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.set_style = 'seaborn-whitegrid'\n",
-    "\n",
-    "sns.set(rc={\"font.style\":\"normal\",\n",
-    "        \"axes.facecolor\":\"white\",\n",
-    "        'grid.color': '.8',\n",
-    "        'grid.linestyle': '-',\n",
-    "        \"figure.facecolor\":\"white\",\n",
-    "        \"figure.titlesize\":20,\n",
-    "        \"text.color\":\"black\",\n",
-    "        \"xtick.color\":\"black\",\n",
-    "        \"ytick.color\":\"black\",\n",
-    "        \"axes.labelcolor\":\"black\",\n",
-    "        \"axes.grid\":True,\n",
-    "        'axes.labelsize':10,\n",
-    "        'xtick.labelsize':10,\n",
-    "        'font.size':10,\n",
-    "        'ytick.labelsize':10})"
+    "sns.set_style(\"whitegrid\")\n",
+    "\n",
+    "sns.set(\n",
+    "    rc={\n",
+    "        \"font.style\": \"normal\",\n",
+    "        \"axes.facecolor\": \"white\",\n",
+    "        \"grid.color\": \".8\",\n",
+    "        \"grid.linestyle\": \"-\",\n",
+    "        \"figure.facecolor\": \"white\",\n",
+    "        \"figure.titlesize\": 20,\n",
+    "        \"text.color\": \"black\",\n",
+    "        \"xtick.color\": \"black\",\n",
+    "        \"ytick.color\": \"black\",\n",
+    "        \"axes.labelcolor\": \"black\",\n",
+    "        \"axes.grid\": True,\n",
+    "        \"axes.labelsize\": 10,\n",
+    "        \"xtick.labelsize\": 10,\n",
+    "        \"font.size\": 10,\n",
+    "        \"ytick.labelsize\": 10,\n",
+    "    }\n",
+    ")"
    ]
   },
   {
@@ -205,7 +210,7 @@
     "    for p in ax.patches:\n",
     "        _x = p.get_x() + p.get_width() + float(space)\n",
     "        _y = p.get_y() + p.get_height()\n",
-    "        value = round(float(p.get_width()),2)\n",
+    "        value = round(float(p.get_width()), 2)\n",
     "        
ax.text(_x, _y, value, ha=\"left\")\n",
     "\n",
     "    if isinstance(axs, np.ndarray):\n",
@@ -234,7 +239,9 @@
     "FROM {}.{} \n",
     "GROUP BY product_category \n",
     "ORDER BY avg_star_rating DESC\n",
-    "\"\"\".format(database_name, table_name)\n",
+    "\"\"\".format(\n",
+    "    database_name, table_name\n",
+    ")\n",
     "\n",
     "print(statement)"
    ]
   },
   {
@@ -281,16 +288,16 @@
    "outputs": [],
    "source": [
     "# Create plot\n",
-    "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df, saturation=1)\n",
+    "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df, saturation=1)\n",
     "\n",
     "if num_categories < 10:\n",
-    "    sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
-    "    \n",
-    "# Set title and x-axis ticks \n",
-    "plt.title('Average Rating by Product Category')\n",
-    "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n",
+    "    sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
     "\n",
+    "# Set title and x-axis ticks\n",
+    "plt.title(\"Average Rating by Product Category\")\n",
+    "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
     "\n",
-    "# Helper code to show actual values afters bars \n",
+    "# Helper code to show actual values after bars\n",
     "show_values_barplot(barplot, 0.1)\n",
     "\n",
     "plt.xlabel(\"Average Rating\")\n",
@@ -333,7 +340,9 @@
     "FROM {}.{}\n",
     "GROUP BY product_category \n",
     "ORDER BY count_star_rating DESC\n",
-    "\"\"\".format(database_name, table_name)\n",
+    "\"\"\".format(\n",
+    "    database_name, table_name\n",
+    ")\n",
     "\n",
     "print(statement)"
    ]
   },
   {
@@ -355,10 +364,10 @@
    "outputs": [],
    "source": [
     "# Store counts\n",
-    "count_ratings = df['count_star_rating']\n",
+    "count_ratings = df[\"count_star_rating\"]\n",
     "\n",
     "# Store max ratings\n",
-    "max_ratings = df['count_star_rating'].max()\n",
+    "max_ratings = df[\"count_star_rating\"].max()\n",
     "print(max_ratings)"
    ]
   },
   {
@@ -376,20 +385,20 @@
    "outputs": [],
    "source": [
     "# Create Seaborn barplot\n",
-    "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
+    "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
     "\n",
     "if num_categories < 10:\n",
-    "    sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
+    "    sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
     "\n",
     "# Set title\n",
     "plt.title(\"Number of Ratings per Product Category for Subset of Product Categories\")\n",
     "\n",
-    "# Set x-axis ticks to match scale \n",
+    "# Set x-axis ticks to match scale\n",
     "if max_ratings > 200000:\n",
-    "    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
+    "    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
     "    plt.xlim(0, 20000000)\n",
     "elif max_ratings <= 200000:\n",
-    "    plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '150K', '200K'])\n",
+    "    plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"150K\", \"200K\"])\n",
     "    plt.xlim(0, 200000)\n",
     "\n",
     "plt.xlabel(\"Number of Ratings\")\n",
@@ -427,13 +436,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# SQL statement \n",
+    "# SQL statement\n",
     "statement = \"\"\"\n",
     "SELECT product_category, MIN(review_date) AS first_review_date\n",
     "FROM {}.{}\n",
     "GROUP BY product_category\n",
     "ORDER BY first_review_date \n",
-    "\"\"\".format(database_name, table_name)\n",
+    "\"\"\".format(\n",
+    "    database_name, table_name\n",
+    ")\n",
     "\n",
     "print(statement)"
    ]
   },
@@ -456,7 +467,8 @@
    "source": [
"# Convert date strings (e.g. 2014-10-18) to datetime\n", "import datetime as datetime\n", - "dates = pd.to_datetime(df['first_review_date'])\n" + "\n", + "dates = pd.to_datetime(df[\"first_review_date\"])" ] }, { @@ -467,16 +479,18 @@ "source": [ "# See: https://stackoverflow.com/questions/60761410/how-to-graph-events-on-a-timeline\n", "\n", + "\n", "def modify_dataframe(df):\n", " \"\"\" Modify dataframe to include new columns \"\"\"\n", - " df['year'] = pd.to_datetime(df['first_review_date'], format='%Y-%m-%d').dt.year\n", + " df[\"year\"] = pd.to_datetime(df[\"first_review_date\"], format=\"%Y-%m-%d\").dt.year\n", " return df\n", "\n", + "\n", "def get_x_y(df):\n", " \"\"\" Get X and Y coordinates; return tuple \"\"\"\n", - " series = df['year'].value_counts().sort_index()\n", + " series = df[\"year\"].value_counts().sort_index()\n", " # new_series = series.reindex(range(1,21)).fillna(0).astype(int)\n", - " return series.index, series.values\n" + " return series.index, series.values" ] }, { @@ -504,20 +518,20 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plt.figure(figsize=(12,5))\n", + "fig = plt.figure(figsize=(12, 5))\n", "ax = plt.gca()\n", "\n", - "ax.set_title('Number Of First Product Category Reviews Per Year for Subset of Categories')\n", - "ax.set_xlabel('Year')\n", - "ax.set_ylabel('Count')\n", + "ax.set_title(\"Number Of First Product Category Reviews Per Year for Subset of Categories\")\n", + "ax.set_xlabel(\"Year\")\n", + "ax.set_ylabel(\"Count\")\n", "\n", "ax.plot(X, Y, color=\"black\", linewidth=2, marker=\"o\")\n", - "ax.fill_between(X, [0]*len(X), Y, facecolor='lightblue')\n", + "ax.fill_between(X, [0] * len(X), Y, facecolor=\"lightblue\")\n", "\n", "ax.locator_params(integer=True)\n", "\n", "ax.set_xticks(range(1995, 2016, 1))\n", - "ax.set_yticks(range(0, max(Y)+2, 1))\n", + "ax.set_yticks(range(0, max(Y) + 2, 1))\n", "\n", "plt.xticks(rotation=45)\n", "\n", @@ -548,7 +562,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_category,\n", " star_rating,\n", @@ -556,7 +570,9 @@ "FROM {}.{}\n", "GROUP BY product_category, star_rating\n", "ORDER BY product_category ASC, star_rating DESC, count_reviews\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -585,14 +601,14 @@ "outputs": [], "source": [ "# Create grouped DataFrames by category and by star rating\n", - "grouped_category = df.groupby('product_category')\n", - "grouped_star = df.groupby('star_rating')\n", + "grouped_category = df.groupby(\"product_category\")\n", + "grouped_star = df.groupby(\"star_rating\")\n", "\n", "# Create sum of ratings per star rating\n", - "df_sum = df.groupby(['star_rating']).sum()\n", + "df_sum = df.groupby([\"star_rating\"]).sum()\n", "\n", "# Calculate total number of star ratings\n", - "total = df_sum['count_reviews'].sum()\n", + "total = df_sum[\"count_reviews\"].sum()\n", "print(total)" ] }, @@ -605,17 +621,17 @@ "# Create dictionary of product categories and array of star rating distribution per category\n", "distribution = {}\n", "count_reviews_per_star = []\n", - "i=0\n", - " \n", + "i = 0\n", + "\n", "for category, ratings in grouped_category:\n", " count_reviews_per_star = []\n", - " for star in ratings['star_rating']:\n", - " count_reviews_per_star.append(ratings.at[i, 'count_reviews'])\n", - " i=i+1;\n", + " for star in ratings[\"star_rating\"]:\n", + " 
count_reviews_per_star.append(ratings.at[i, \"count_reviews\"])\n",
+    "    i = i + 1\n",
     "    distribution[category] = count_reviews_per_star\n",
     "\n",
     "# Check if distribution has been created successfully\n",
-    "print(distribution)\n"
+    "print(distribution)"
    ]
   },
@@ -654,8 +670,8 @@
    "# Sort distribution by highest average rating per category\n",
    "sorted_distribution = {}\n",
    "\n",
-    "average_star_ratings.iloc[:,0]\n",
-    "for index, value in average_star_ratings.iloc[:,0].items():\n",
+    "average_star_ratings.iloc[:, 0]\n",
+    "for index, value in average_star_ratings.iloc[:, 0].items():\n",
     "    sorted_distribution[value] = distribution[value]"
    ]
   },
@@ -716,7 +732,7 @@
     "proportion_star5 = np.true_divide(star5, total) * 100\n",
     "\n",
     "# Add colors\n",
-    "colors = ['red', 'purple','blue','orange','green']\n",
+    "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n",
     "\n",
     "# The position of the bars on the x-axis\n",
     "r = range(len(categories))\n",
@@ -724,21 +740,53 @@
     "\n",
     "# Plot bars\n",
     "if num_categories > 10:\n",
-    "    plt.figure(figsize=(10,10))\n",
-    "else: \n",
-    "    plt.figure(figsize=(10,5))\n",
-    "\n",
-    "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n",
-    "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n",
-    "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n",
-    "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n",
-    "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n",
+    "    plt.figure(figsize=(10, 10))\n",
+    "else:\n",
+    "    plt.figure(figsize=(10, 5))\n",
+    "\n",
+    "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n",
+    "ax4 = plt.barh(\n",
+    "    r,\n",
+    "    proportion_star4,\n",
+    "    left=proportion_star5,\n",
+    "    color=colors[3],\n",
+    "    edgecolor=\"white\",\n",
+    "    height=barHeight,\n",
+    "    label=\"4-Star Ratings\",\n",
+    ")\n",
+    "ax3 = plt.barh(\n",
+    "    r,\n",
+    "    proportion_star3,\n",
+    "    left=proportion_star5 + proportion_star4,\n",
+    "    color=colors[2],\n",
+    "    edgecolor=\"white\",\n",
+    "    height=barHeight,\n",
+    "    label=\"3-Star Ratings\",\n",
+    ")\n",
+    "ax2 = plt.barh(\n",
+    "    r,\n",
+    "    proportion_star2,\n",
+    "    left=proportion_star5 + proportion_star4 + proportion_star3,\n",
+    "    color=colors[1],\n",
+    "    edgecolor=\"white\",\n",
+    "    height=barHeight,\n",
+    "    label=\"2-Star Ratings\",\n",
+    ")\n",
+    "ax1 = plt.barh(\n",
+    "    r,\n",
+    "    proportion_star1,\n",
+    "    left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n",
+    "    color=colors[0],\n",
+    "    edgecolor=\"white\",\n",
+    "    height=barHeight,\n",
+    "    label=\"1-Star Ratings\",\n",
+    ")\n",
     "\n",
-    "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n",
-    "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n",
-    "plt.yticks(r, categories, fontweight='regular')\n",
+    "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n",
+    "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n",
+    "plt.yticks(r, categories, fontweight=\"regular\")\n",
     "\n",
-
"plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n", + "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n", "plt.gca().invert_yaxis()\n", "plt.tight_layout()\n", "\n", @@ -769,14 +817,16 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT star_rating,\n", " COUNT(*) AS count_reviews\n", "FROM dsoaws.amazon_reviews_parquet\n", "GROUP BY star_rating\n", "ORDER BY star_rating DESC, count_reviews \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -807,15 +857,12 @@ "metadata": {}, "outputs": [], "source": [ - "chart = df.plot.bar(x='star_rating', \n", - " y='count_reviews', \n", - " rot='0',\n", - " figsize=(10,5), \n", - " title='Review Count by Star Ratings', \n", - " legend=False)\n", + "chart = df.plot.bar(\n", + " x=\"star_rating\", y=\"count_reviews\", rot=\"0\", figsize=(10, 5), title=\"Review Count by Star Ratings\", legend=False\n", + ")\n", "\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")\n", "\n", "plt.show(chart)" ] @@ -852,13 +899,15 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT year, ROUND(AVG(star_rating),4) AS avg_rating\n", "FROM {}.{}\n", "GROUP BY year\n", "ORDER BY year\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -879,7 +928,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year\n" + "df[\"year\"] = pd.to_datetime(df[\"year\"], format=\"%Y\").dt.year" ] }, { @@ -896,21 +945,21 @@ "outputs": [], "source": [ "fig = plt.gcf()\n", - "fig.set_size_inches(12,5)\n", + "fig.set_size_inches(12, 5)\n", "\n", - "fig.suptitle('Average Star Rating Over Time (Across Subset of Product Categories)')\n", + "fig.suptitle(\"Average Star Rating Over Time (Across Subset of Product Categories)\")\n", "\n", "ax = plt.gca()\n", - "#ax = plt.gca().set_xticks(df['year'])\n", + "# ax = plt.gca().set_xticks(df['year'])\n", "ax.locator_params(integer=True)\n", - "ax.set_xticks(df['year'].unique())\n", + "ax.set_xticks(df[\"year\"].unique())\n", "\n", - "df.plot(kind='line',x='year',y='avg_rating', color='red', ax=ax)\n", + "df.plot(kind=\"line\", x=\"year\", y=\"avg_rating\", color=\"red\", ax=ax)\n", "\n", - "#plt.xticks(range(1995, 2016, 1))\n", - "#plt.yticks(range(0,6,1))\n", - "plt.xlabel('Years')\n", - "plt.ylabel('Average Star Rating')\n", + "# plt.xticks(range(1995, 2016, 1))\n", + "# plt.yticks(range(0,6,1))\n", + "plt.xlabel(\"Years\")\n", + "plt.ylabel(\"Average Star Rating\")\n", "plt.xticks(rotation=45)\n", "\n", "# fig.savefig('average-rating.png', dpi=300)\n", @@ -940,13 +989,15 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_category, year, ROUND(AVG(star_rating), 4) AS avg_rating_category\n", "FROM {}.{}\n", "GROUP BY product_category, year\n", "ORDER BY year \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -975,11 +1026,20 @@ "outputs": [], "source": [ "def plot_categories(df):\n", - " df_categories = df['product_category'].unique()\n", + " df_categories = 
df[\"product_category\"].unique()\n", " for category in df_categories:\n", " # print(category)\n", - " df_plot = df.loc[df['product_category'] == category]\n", - " df_plot.plot(kind='line',x='year',y='avg_rating_category', c=np.random.rand(3,), ax=ax, label=category)" + " df_plot = df.loc[df[\"product_category\"] == category]\n", + " df_plot.plot(\n", + " kind=\"line\",\n", + " x=\"year\",\n", + " y=\"avg_rating_category\",\n", + " c=np.random.rand(\n", + " 3,\n", + " ),\n", + " ax=ax,\n", + " label=category,\n", + " )" ] }, { @@ -989,19 +1049,19 @@ "outputs": [], "source": [ "fig = plt.gcf()\n", - "fig.set_size_inches(12,5)\n", + "fig.set_size_inches(12, 5)\n", + "\n", + "fig.suptitle(\"Average Star Rating Over Time Across Subset Of Categories\")\n", "\n", - "fig.suptitle('Average Star Rating Over Time Across Subset Of Categories')\n", - " \n", "ax = plt.gca()\n", "\n", "ax.locator_params(integer=True)\n", - "ax.set_xticks(df['year'].unique())\n", + "ax.set_xticks(df[\"year\"].unique())\n", "\n", "plot_categories(df)\n", "\n", - "plt.xlabel('Year')\n", - "plt.ylabel('Average Star Rating')\n", + "plt.xlabel(\"Year\")\n", + "plt.ylabel(\"Average Star Rating\")\n", "plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n", "\n", "# fig.savefig('average_rating_category_all_data.png', dpi=300)\n", @@ -1031,14 +1091,16 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT star_rating,\n", " AVG(helpful_votes) AS avg_helpful_votes\n", "FROM {}.{}\n", "GROUP BY star_rating\n", "ORDER BY star_rating ASC\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1076,10 +1138,12 @@ "metadata": {}, "outputs": [], "source": [ - "chart = df.plot.bar(x='star_rating', y='avg_helpful_votes', rot='0', figsize=(10,5), title='Helpfulness Of Star Ratings', legend=False )\n", + "chart = df.plot.bar(\n", + " x=\"star_rating\", y=\"avg_helpful_votes\", rot=\"0\", figsize=(10, 5), title=\"Helpfulness Of Star Ratings\", legend=False\n", + ")\n", "\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Average Helpful Votes')\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Average Helpful Votes\")\n", "\n", "# chart.get_figure().savefig('helpful-votes.png', dpi=300)\n", "plt.show(chart)" @@ -1108,7 +1172,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT product_title,\n", " helpful_votes,\n", @@ -1117,7 +1181,9 @@ " SUBSTR(review_body, 1, 100) AS review_body_substr\n", "FROM {}.{}\n", "ORDER BY helpful_votes DESC LIMIT 10 \n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1155,7 +1221,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", "SELECT (CAST(positive_review_count AS DOUBLE) / CAST(negative_review_count AS DOUBLE)) AS positive_to_negative_sentiment_ratio\n", "FROM (\n", @@ -1167,7 +1233,9 @@ " FROM {}.{}\n", " WHERE star_rating < 4\n", ")\n", - "\"\"\".format(database_name, table_name, database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name, database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1205,7 +1273,7 @@ "metadata": {}, "outputs": [], "source": [ - "# SQL statement \n", + "# SQL statement\n", "statement = \"\"\"\n", 
"SELECT customer_id, product_category, product_title, \n", "ROUND(AVG(star_rating),4) AS avg_star_rating, COUNT(*) AS review_count \n", @@ -1214,7 +1282,9 @@ "HAVING COUNT(*) > 1 \n", "ORDER BY review_count DESC\n", "LIMIT 5\n", - "\"\"\".format(database_name, table_name)\n", + "\"\"\".format(\n", + " database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -1275,7 +1345,7 @@ "metadata": {}, "outputs": [], "source": [ - "summary = df['num_words'].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n", + "summary = df[\"num_words\"].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n", "summary" ] }, @@ -1285,9 +1355,9 @@ "metadata": {}, "outputs": [], "source": [ - "df['num_words'].plot.hist(xticks=[0, 16, 32, 64, 128, 256], \n", - " bins=100,\n", - " range=[0, 256]).axvline(x=summary['80%'], c='red')" + "df[\"num_words\"].plot.hist(xticks=[0, 16, 32, 64, 128, 256], bins=100, range=[0, 256]).axvline(\n", + " x=summary[\"80%\"], c=\"red\"\n", + ")" ] }, { diff --git a/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb b/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb index c1d4cf89..d206f883 100644 --- a/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb +++ b/05_explore/02_Prepare_Dataset_Bias_Analysis.ipynb @@ -44,7 +44,7 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -85,10 +85,12 @@ "source": [ "import csv\n", "\n", - "df_giftcards = pd.read_csv('./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_giftcards = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Gift_Card_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_giftcards.shape" ] }, @@ -109,10 +111,12 @@ "source": [ "import csv\n", "\n", - "df_software = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_software = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_software.shape" ] }, @@ -133,10 +137,12 @@ "source": [ "import csv\n", "\n", - "df_videogames = pd.read_csv('./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_videogames = pd.read_csv(\n", + " \"./data-clarify/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df_videogames.shape" ] }, @@ -163,12 +169,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_giftcards[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_giftcards[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + 
"plt.ylabel(\"Review Count\")" ] }, { @@ -178,12 +187,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_software[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_software[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -193,12 +205,15 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "df_videogames[['star_rating', 'review_id']].groupby('star_rating').count().plot(kind='bar', title='Breakdown by Star Rating')\n", - "plt.xlabel('Star Rating')\n", - "plt.ylabel('Review Count')" + "df_videogames[[\"star_rating\", \"review_id\"]].groupby(\"star_rating\").count().plot(\n", + " kind=\"bar\", title=\"Breakdown by Star Rating\"\n", + ")\n", + "plt.xlabel(\"Star Rating\")\n", + "plt.ylabel(\"Review Count\")" ] }, { @@ -270,7 +285,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df, x='star_rating', hue='product_category')" + "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -286,7 +301,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n", + "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n", "df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n", "df_balanced.shape" ] @@ -299,7 +314,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')" + "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -331,7 +346,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = './data-clarify/amazon_reviews_us_giftcards_software_videogames.csv'\n", + "path = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\"\n", "df.to_csv(path, index=False, header=True)" ] }, @@ -357,7 +372,7 @@ "metadata": {}, "outputs": [], "source": [ - "path_balanced = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv'\n", + "path_balanced = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv\"\n", "df_balanced.to_csv(path_balanced, index=False, header=True)" ] }, @@ -374,8 +389,8 @@ "metadata": {}, "outputs": [], "source": [ - "path_jsonlines = './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl'\n", - "df_balanced.to_json(path_or_buf=path_jsonlines, orient='records', lines=True)" + "path_jsonlines = \"./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl\"\n", + "df_balanced.to_json(path_or_buf=path_jsonlines, orient=\"records\", lines=True)" ] }, { @@ -392,9 +407,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path)\n", + "bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path)\n", "bias_data_s3_uri" ] }, @@ -413,7 
+429,9 @@ "metadata": {}, "outputs": [], "source": [ - "balanced_bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_balanced)\n", + "balanced_bias_data_s3_uri = sess.upload_data(\n", + " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_balanced\n", + ")\n", "balanced_bias_data_s3_uri" ] }, @@ -432,7 +450,9 @@ "metadata": {}, "outputs": [], "source": [ - "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(bucket=bucket, key_prefix='bias-detection-{}'.format(timestamp), path=path_jsonlines)\n", + "balanced_bias_data_jsonlines_s3_uri = sess.upload_data(\n", + " bucket=bucket, key_prefix=\"bias-detection-{}\".format(timestamp), path=path_jsonlines\n", + ")\n", "balanced_bias_data_jsonlines_s3_uri" ] }, diff --git a/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb b/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb index cb27a9c5..03d6d536 100644 --- a/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb +++ b/05_explore/03_Run_Data_Bias_Analysis_AdHoc.ipynb @@ -114,7 +114,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n", + "df = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n", "df.shape" ] }, @@ -132,7 +132,7 @@ }, "outputs": [], "source": [ - "sns.countplot(data=df, x='star_rating', hue='product_category')" + "sns.countplot(data=df, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -166,11 +166,9 @@ }, "outputs": [], "source": [ - "facet_column = report.FacetColumn(name='product_category')\n", - "label_column = report.LabelColumn(name='star_rating', \n", - " data=df['star_rating'], \n", - " positive_label_values=[5, 4])\n", - "group_variable = df['product_category']" + "facet_column = report.FacetColumn(name=\"product_category\")\n", + "label_column = report.LabelColumn(name=\"star_rating\", data=df[\"star_rating\"], positive_label_values=[5, 4])\n", + "group_variable = df[\"product_category\"]" ] }, { @@ -194,11 +192,9 @@ }, "outputs": [], "source": [ - "report.bias_report(df, \n", - " facet_column, \n", - " label_column, \n", - " stage_type=report.StageType.PRE_TRAINING, \n", - " group_variable=group_variable)" + "report.bias_report(\n", + " df, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n", + ")" ] }, { @@ -214,7 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_grouped_by = df.groupby(['product_category', 'star_rating'])[['product_category', 'star_rating']]\n", + "df_grouped_by = df.groupby([\"product_category\", \"star_rating\"])[[\"product_category\", \"star_rating\"]]\n", "df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))\n", "df_balanced.shape" ] @@ -227,7 +223,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=df_balanced, x='star_rating', hue='product_category')" + "sns.countplot(data=df_balanced, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -255,12 +251,10 @@ "source": [ "from smclarify.bias import report\n", "\n", - "facet_column = report.FacetColumn(name='product_category')\n", - "label_column = report.LabelColumn(name='star_rating',\n", - " data=df_balanced['star_rating'],\n", - " positive_label_values=[5, 4])\n", + "facet_column = report.FacetColumn(name=\"product_category\")\n", + "label_column = report.LabelColumn(name=\"star_rating\", data=df_balanced[\"star_rating\"], positive_label_values=[5, 4])\n", 
"\n", - "group_variable = df_balanced['product_category']" + "group_variable = df_balanced[\"product_category\"]" ] }, { @@ -276,11 +270,9 @@ "metadata": {}, "outputs": [], "source": [ - "report.bias_report(df_balanced,\n", - " facet_column,\n", - " label_column,\n", - " stage_type=report.StageType.PRE_TRAINING,\n", - " group_variable=group_variable)" + "report.bias_report(\n", + " df_balanced, facet_column, label_column, stage_type=report.StageType.PRE_TRAINING, group_variable=group_variable\n", + ")" ] }, { diff --git a/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb b/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb index 17dd4b4a..bcd02945 100644 --- a/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb +++ b/05_explore/04_Run_Data_Bias_Analysis_ProcessingJob.ipynb @@ -20,12 +20,12 @@ "import pandas as pd\n", "import numpy as np\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -72,7 +72,7 @@ "source": [ "import pandas as pd\n", "\n", - "data = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n", + "data = pd.read_csv(\"./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv\")\n", "data.head()" ] }, @@ -101,7 +101,7 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.countplot(data=data, x='star_rating', hue='product_category')" + "sns.countplot(data=data, x=\"star_rating\", hue=\"product_category\")" ] }, { @@ -121,10 +121,9 @@ "source": [ "from sagemaker import clarify\n", "\n", - "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n", - " instance_count=1,\n", - " instance_type='ml.c5.2xlarge',\n", - " sagemaker_session=sess)" + "clarify_processor = clarify.SageMakerClarifyProcessor(\n", + " role=role, instance_count=1, instance_type=\"ml.c5.2xlarge\", sagemaker_session=sess\n", + ")" ] }, { @@ -151,13 +150,15 @@ "metadata": {}, "outputs": [], "source": [ - "bias_report_output_path = 's3://{}/clarify'.format(bucket)\n", + "bias_report_output_path = \"s3://{}/clarify\".format(bucket)\n", "\n", - "bias_data_config = clarify.DataConfig(s3_data_input_path=bias_data_s3_uri,\n", - " s3_output_path=bias_report_output_path,\n", - " label='star_rating',\n", - " headers=data.columns.to_list(),\n", - " dataset_type='text/csv')" + "bias_data_config = clarify.DataConfig(\n", + " s3_data_input_path=bias_data_s3_uri,\n", + " s3_output_path=bias_report_output_path,\n", + " label=\"star_rating\",\n", + " headers=data.columns.to_list(),\n", + " dataset_type=\"text/csv\",\n", + ")" ] }, { @@ -177,10 +178,12 @@ "metadata": {}, "outputs": [], "source": [ - "bias_config = clarify.BiasConfig(label_values_or_threshold=[5, 4],\n", - " facet_name='product_category',\n", - " facet_values_or_threshold=['Gift Card'],\n", - " group_name='product_category')" + "bias_config = clarify.BiasConfig(\n", + " label_values_or_threshold=[5, 4],\n", + " facet_name=\"product_category\",\n", + " facet_values_or_threshold=[\"Gift Card\"],\n", + " group_name=\"product_category\",\n", + ")" ] }, { @@ -197,11 +200,8 @@ "outputs": [], "source": [ "clarify_processor.run_pre_training_bias(\n", - " data_config=bias_data_config,\n", - " data_bias_config=bias_config,\n", - " methods='all',\n", - " wait=False,\n", - " logs=False)" + " 
data_config=bias_data_config, data_bias_config=bias_config, methods=\"all\", wait=False, logs=False\n", + ")" ] }, { @@ -222,7 +222,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Processing Job'.format(region, run_pre_training_bias_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Processing Job'.format(\n", + " region, run_pre_training_bias_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -233,7 +239,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, run_pre_training_bias_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, run_pre_training_bias_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -244,7 +256,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, run_pre_training_bias_processing_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n", + " bucket, run_pre_training_bias_processing_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -253,8 +271,9 @@ "metadata": {}, "outputs": [], "source": [ - "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=run_pre_training_bias_processing_job_name,\n", - " sagemaker_session=sess)\n", + "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n", + " processing_job_name=run_pre_training_bias_processing_job_name, sagemaker_session=sess\n", + ")\n", "\n", "processing_job_description = running_processor.describe()\n", "\n", @@ -304,7 +323,7 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Bias Report'))\n" + "display(HTML('Review Bias Report'))" ] }, { diff --git a/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb b/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb index 14506f1d..2ee7f780 100644 --- a/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb +++ b/05_explore/05_Analyze_Data_Quality_ProcessingJob_PySpark.ipynb @@ -76,9 +76,9 @@ "try:\n", " ingest_create_athena_table_tsv_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -97,11 +97,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_tsv_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. You did not register the TSV Data.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN THE NOTEBOOKS IN THE INGEST FOLDER FIRST. 
You did not register the TSV Data.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -121,7 +121,7 @@ "import sagemaker\n", "import boto3\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -151,12 +151,14 @@ "source": [ "from sagemaker.spark.processing import PySparkProcessor\n", "\n", - "processor = PySparkProcessor(base_job_name='spark-amazon-reviews-analyzer',\n", - " role=role,\n", - " framework_version='2.4',\n", - " instance_count=1,\n", - " instance_type='ml.r5.2xlarge',\n", - " max_runtime_in_seconds=300)" + "processor = PySparkProcessor(\n", + " base_job_name=\"spark-amazon-reviews-analyzer\",\n", + " role=role,\n", + " framework_version=\"2.4\",\n", + " instance_count=1,\n", + " instance_type=\"ml.r5.2xlarge\",\n", + " max_runtime_in_seconds=300,\n", + ")" ] }, { @@ -165,7 +167,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_input_data = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n", + "s3_input_data = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n", "print(s3_input_data)" ] }, @@ -192,12 +194,13 @@ "outputs": [], "source": [ "from time import gmtime, strftime\n", + "\n", "timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", "\n", - "output_prefix = 'amazon-reviews-spark-analyzer-{}'.format(timestamp_prefix)\n", - "processing_job_name = 'amazon-reviews-spark-analyzer-{}'.format(timestamp_prefix)\n", + "output_prefix = \"amazon-reviews-spark-analyzer-{}\".format(timestamp_prefix)\n", + "processing_job_name = \"amazon-reviews-spark-analyzer-{}\".format(timestamp_prefix)\n", "\n", - "print('Processing job name: {}'.format(processing_job_name))" + "print(\"Processing job name: {}\".format(processing_job_name))" ] }, { @@ -206,7 +209,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_output_analyze_data = 's3://{}/{}/output'.format(bucket, output_prefix)\n", + "s3_output_analyze_data = \"s3://{}/{}/output\".format(bucket, output_prefix)\n", "\n", "print(s3_output_analyze_data)" ] @@ -239,13 +242,17 @@ "source": [ "from sagemaker.processing import ProcessingOutput\n", "\n", - "processor.run(submit_app='preprocess-deequ-pyspark.py',\n", - " submit_jars=['deequ-1.0.3-rc2.jar'],\n", - " arguments=['s3_input_data', s3_input_data,\n", - " 's3_output_analyze_data', s3_output_analyze_data,\n", - " ],\n", - " logs=True,\n", - " wait=False\n", + "processor.run(\n", + " submit_app=\"preprocess-deequ-pyspark.py\",\n", + " submit_jars=[\"deequ-1.0.3-rc2.jar\"],\n", + " arguments=[\n", + " \"s3_input_data\",\n", + " s3_input_data,\n", + " \"s3_output_analyze_data\",\n", + " s3_output_analyze_data,\n", + " ],\n", + " logs=True,\n", + " wait=False,\n", ")" ] }, @@ -257,9 +264,15 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n", + "processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n", "\n", - "display(HTML('Review Processing Job'.format(region, processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Processing Job'.format(\n", + " region, processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -270,9 +283,15 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n", + "processing_job_name = 
processor.jobs[-1].describe()[\"ProcessingJobName\"]\n", "\n", - "display(HTML('Review CloudWatch Logs After a Few Minutes'.format(region, processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After a Few Minutes'.format(\n", + " region, processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -285,7 +304,13 @@ "\n", "s3_job_output_prefix = output_prefix\n", "\n", - "display(HTML('Review S3 Output Data After The Spark Job Has Completed'.format(bucket, s3_job_output_prefix, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Spark Job Has Completed'.format(\n", + " bucket, s3_job_output_prefix, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -303,8 +328,9 @@ }, "outputs": [], "source": [ - "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=processing_job_name,\n", - " sagemaker_session=sess)\n", + "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n", + " processing_job_name=processing_job_name, sagemaker_session=sess\n", + ")\n", "\n", "processing_job_description = running_processor.describe()\n", "\n", @@ -388,8 +414,11 @@ "import pandas as pd\n", "import os\n", "\n", + "\n", "def load_dataset(path, sep, header):\n", - " data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\n", + " data = pd.concat(\n", + " [pd.read_csv(f, sep=sep, header=header) for f in glob.glob(\"{}/*.csv\".format(path))], ignore_index=True\n", + " )\n", "\n", " return data" ] @@ -402,8 +431,8 @@ }, "outputs": [], "source": [ - "df_constraint_checks = load_dataset(path='./amazon-reviews-spark-analyzer/constraint-checks/', sep='\\t', header=0)\n", - "df_constraint_checks[['check', 'constraint', 'constraint_status', 'constraint_message']]" + "df_constraint_checks = load_dataset(path=\"./amazon-reviews-spark-analyzer/constraint-checks/\", sep=\"\\t\", header=0)\n", + "df_constraint_checks[[\"check\", \"constraint\", \"constraint_status\", \"constraint_message\"]]" ] }, { @@ -419,7 +448,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_dataset_metrics = load_dataset(path='./amazon-reviews-spark-analyzer/dataset-metrics/', sep='\\t', header=0)\n", + "df_dataset_metrics = load_dataset(path=\"./amazon-reviews-spark-analyzer/dataset-metrics/\", sep=\"\\t\", header=0)\n", "df_dataset_metrics" ] }, @@ -438,7 +467,7 @@ }, "outputs": [], "source": [ - "df_success_metrics = load_dataset(path='./amazon-reviews-spark-analyzer/success-metrics/', sep='\\t', header=0)\n", + "df_success_metrics = load_dataset(path=\"./amazon-reviews-spark-analyzer/success-metrics/\", sep=\"\\t\", header=0)\n", "df_success_metrics" ] }, diff --git a/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb b/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb index ffd4df51..a37ddf12 100644 --- a/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb +++ b/05_explore/99_GENERATED_Data_Wrangler_Job_Notebook.ipynb @@ -29,10 +29,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.20.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -159,6 +158,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " 
\"InputName\": name,\n", @@ -170,6 +170,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -187,6 +188,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -202,6 +204,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -218,29 +221,24 @@ " source_type = data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", " return processing_inputs\n", "\n", + "\n", "def create_container_arguments(output_name, output_content_type):\n", - " output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - " }\n", + " output_config = {output_name: {\"content_type\": output_content_type}}\n", " return [f\"--output-config '{json.dumps(output_config)}'\"]\n", "\n", + "\n", "# Create Processing Job Arguments\n", "processing_job_arguments = {\n", " \"AppSpecification\": {\n", @@ -256,7 +254,7 @@ " \"S3Uri\": output_path,\n", " \"LocalPath\": os.path.join(processing_dir, \"output\"),\n", " \"S3UploadMode\": \"EndOfJob\",\n", - " }\n", + " },\n", " },\n", " ],\n", " },\n", @@ -357,14 +355,11 @@ "region = boto3.Session().region_name\n", "container = sagemaker.image_uris.retrieve(\"xgboost\", region, \"1.2-1\")\n", "hyperparameters = {\n", - " \"max_depth\":\"5\",\n", + " \"max_depth\": \"5\",\n", " \"objective\": \"reg:squarederror\",\n", " \"num_round\": \"10\",\n", "}\n", - "train_content_type = (\n", - " \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\"\n", - " else \"text/csv\"\n", - ")\n", + "train_content_type = \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\" else \"text/csv\"\n", "train_input = sagemaker.inputs.TrainingInput(\n", " s3_data=f\"s3://{bucket}/{training_path}\",\n", " content_type=train_content_type,\n", diff --git a/05_explore/99_GENERATED_Python_Code.py b/05_explore/99_GENERATED_Python_Code.py index 377fbc5b..c91f9927 100644 --- a/05_explore/99_GENERATED_Python_Code.py +++ b/05_explore/99_GENERATED_Python_Code.py @@ -1,10 +1,12 @@ from pyspark.sql.session import SparkSession from pyspark.sql.dataframe import DataFrame + # You may want to configure the Spark Context with the right credentials provider. 
-spark = SparkSession.builder.master('local').getOrCreate() +spark = SparkSession.builder.master("local").getOrCreate() mode = None + def capture_stdout(func, *args, **kwargs): """Capture standard output to a string buffer""" @@ -54,7 +56,7 @@ def default_spark_with_trained_parameters_and_state(df, trained_parameters, stat def dispatch(key_name, args, kwargs, funcs): """ - Dispatches to another operator based on a key in the passed parameters. + Dispatches to another operator based on a key in the passed parameters. This also slices out any parameters using the parameter_name passed in, and will reassemble the trained_parameters correctly after invocation. @@ -98,7 +100,9 @@ def dispatch(key_name, args, kwargs, funcs): updated_trained_parameters = result["trained_parameters"] if existing_trained_parameters is not None or updated_trained_parameters is not None: - existing_trained_parameters = existing_trained_parameters if existing_trained_parameters is not None else {} + existing_trained_parameters = ( + existing_trained_parameters if existing_trained_parameters is not None else {} + ) existing_trained_parameters[parameter_name] = result["trained_parameters"] # Update the result trained_parameters so they are part of the original structure. @@ -153,7 +157,9 @@ def process_numeric_standard_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) @@ -207,7 +213,9 @@ def process_numeric_robust_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) @@ -263,14 +271,21 @@ def process_numeric_min_max_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) temp_normalized_vector_col = temp_col_name(assembled) trained_parameters = load_trained_parameters( - trained_parameters, {"input_column": input_column, "min": min, "max": max,} + trained_parameters, + { + "input_column": input_column, + "min": min, + "max": max, + }, ) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( @@ -308,13 +323,20 @@ def process_numeric_max_absolute_scaler(df, input_column=None, output_column=Non process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], 
outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) temp_normalized_vector_col = temp_col_name(assembled) - trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,}) + trained_parameters = load_trained_parameters( + trained_parameters, + { + "input_column": input_column, + }, + ) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( trained_parameters, MinMaxScalerModel, "scaler_model" @@ -411,7 +433,9 @@ def athena_start_query_execution_core(client, request): try: result = client.start_query_execution(**request) except Exception as e: - raise RuntimeError(f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}") + raise RuntimeError( + f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}" + ) return result @@ -499,7 +523,10 @@ def athena_start_query_execution(dataset_definition, client): query_request = { "QueryString": ctas_query, - "QueryExecutionContext": {"Database": database_name, "Catalog": catalog_name,}, + "QueryExecutionContext": { + "Database": database_name, + "Catalog": catalog_name, + }, "ResultConfiguration": {"OutputLocation": metadata_s3_output_location}, } logging.debug("Query request is: %s", query_request) @@ -671,8 +698,13 @@ def cast_single_column_type( # | 2|None| bar | # | 3| 1 | | # +---+----+------------------+ - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) - df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) + df = df.withColumn( + non_castable_column, + f.when(df[temp_column].isNotNull(), "").otherwise(df[column]), + ) elif invalid_data_handling_method == NonCastableDataHandlingMethod.REPLACE_WITH_FIXED_VALUE: # Replace non-castable data to a value in the same column # Original dataframe @@ -693,7 +725,9 @@ def cast_single_column_type( # +---+----+ value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type) - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise( f.to_date(f.lit(value), date_formatting) @@ -726,8 +760,13 @@ def cast_single_column_type( # +---+----+------------------+ value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type) - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) - df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) + df = df.withColumn( + non_castable_column, + f.when(df[temp_column].isNotNull(), "").otherwise(df[column]), + ) replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise( f.to_date(f.lit(value), date_formatting) @@ -779,8 +818,7 @@ class OperatorSparkOperatorCustomerError(Exception): def temp_col_name(df, *illegal_names): - 
"""Generates a temporary column name that is unused. - """ + """Generates a temporary column name that is unused.""" name = "temp_col" idx = 0 name_set = set(list(df.columns) + list(illegal_names)) @@ -792,8 +830,7 @@ def temp_col_name(df, *illegal_names): def get_temp_col_if_not_set(df, col_name): - """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name. - """ + """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name.""" if col_name: return col_name, False else: @@ -803,7 +840,7 @@ def get_temp_col_if_not_set(df, col_name): def replace_input_if_output_is_temp(df, input_column, output_column, output_is_temp): """Replaces the input column in the dataframe if the output was not set - This is used with get_temp_col_if_not_set to enable the behavior where a + This is used with get_temp_col_if_not_set to enable the behavior where a transformer will replace its input column if an output is not specified. """ if output_is_temp: @@ -843,7 +880,9 @@ def expects_valid_column_name(value, key, nullable=False): return if value is None or len(str(value).strip()) == 0: - raise OperatorSparkOperatorCustomerError(f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}") + raise OperatorSparkOperatorCustomerError( + f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}" + ) def expects_parameter(value, key, condition=None): @@ -855,12 +894,16 @@ def expects_parameter(value, key, condition=None): def expects_column(df, value, key): if not value or value not in df.columns: - raise OperatorSparkOperatorCustomerError(f"Expected column in dataframe for '{key}' however received '{value}'") + raise OperatorSparkOperatorCustomerError( + f"Expected column in dataframe for '{key}' however received '{value}'" + ) def expects_parameter_value_in_list(key, value, items): if value not in items: - raise OperatorSparkOperatorCustomerError(f"Illegal parameter value. {key} expected to be in {items}, but given {value}") + raise OperatorSparkOperatorCustomerError( + f"Illegal parameter value. {key} expected to be in {items}, but given {value}" + ) def encode_pyspark_model(model): @@ -963,7 +1006,6 @@ def transform_using_trained_model(model, df, loaded): ) - def type_inference(df): # noqa: C901 # pylint: disable=R0912 """Core type inference logic @@ -1234,7 +1276,9 @@ def athena_source(spark, mode, dataset_definition, trained_parameters=None): # trained_parameters["ctas_table_name"] = "" try: return default_spark_with_trained_parameters_and_state( - df=spark.read.parquet(path), trained_parameters=trained_parameters, state=get_execution_state(state), + df=spark.read.parquet(path), + trained_parameters=trained_parameters, + state=get_execution_state(state), ) except Exception as e: raise RuntimeError( @@ -1288,12 +1332,17 @@ def infer_and_cast_type(df, spark, inference_data_sample_size=1000, trained_para def process_numeric(df, spark, **kwargs): return dispatch( - "operator", [df], kwargs, {"Scale values": (process_numeric_scale_values, "scale_values_parameters"),}, + "operator", + [df], + kwargs, + { + "Scale values": (process_numeric_scale_values, "scale_values_parameters"), + }, ) def custom_pyspark(df, spark, code): - """ Apply custom pyspark operation on the input dataframe + """Apply custom pyspark operation on the input dataframe Example: The custom code expects the user to provide an output df. 
@@ -1326,14 +1375,50 @@ def custom_pyspark(df, spark, code): return default_spark_with_stdout(output_df, stdout) -op_1_output = athena_source(spark=spark, mode=mode, **{'dataset_definition': {'datasetSourceType': 'Athena', 'name': 'amazon_reviews', 'catalogName': 'AwsDataCatalog', 'databaseName': 'dsoaws', 'queryString': 'select * from amazon_reviews_parquet', 's3OutputLocation': 's3://sagemaker-us-east-1-835319576252/athena/', 'outputFormat': 'parquet'}}) -op_2_output = infer_and_cast_type(op_1_output['default'], spark=spark, **{}) -op_5_output = process_numeric(op_2_output['default'], spark=spark, **{'operator': 'Scale values', 'scale_values_parameters': {'scaler': 'Min-max scaler', 'min_max_scaler_parameters': {'min': -1, 'max': 1, 'input_column': 'star_rating', 'output_column': 'star_rating_min_max_scaled_builtin'}, 'standard_scaler_parameters': {}}}) -op_6_output = custom_pyspark(op_5_output['default'], spark=spark, **{'code': '# Table is available as variable `df`\nfrom pyspark.sql.functions import stddev, mean, col, floor\ndf = df.withColumn("sentiment", (floor(col("star_rating_min_max_scaled_builtin"))))'}) +op_1_output = athena_source( + spark=spark, + mode=mode, + **{ + "dataset_definition": { + "datasetSourceType": "Athena", + "name": "amazon_reviews", + "catalogName": "AwsDataCatalog", + "databaseName": "dsoaws", + "queryString": "select * from amazon_reviews_parquet", + "s3OutputLocation": "s3://sagemaker-us-east-1-835319576252/athena/", + "outputFormat": "parquet", + } + }, +) +op_2_output = infer_and_cast_type(op_1_output["default"], spark=spark, **{}) +op_5_output = process_numeric( + op_2_output["default"], + spark=spark, + **{ + "operator": "Scale values", + "scale_values_parameters": { + "scaler": "Min-max scaler", + "min_max_scaler_parameters": { + "min": -1, + "max": 1, + "input_column": "star_rating", + "output_column": "star_rating_min_max_scaled_builtin", + }, + "standard_scaler_parameters": {}, + }, + }, +) +op_6_output = custom_pyspark( + op_5_output["default"], + spark=spark, + **{ + "code": '# Table is available as variable `df`\nfrom pyspark.sql.functions import stddev, mean, col, floor\ndf = df.withColumn("sentiment", (floor(col("star_rating_min_max_scaled_builtin"))))' + }, +) # Glossary: variable name to node_id # # op_1_output: 14039109-2da9-49b4-8eee-df39306c9c47 # op_2_output: 98b4c198-d379-42ab-af96-165dcd1a01d8 # op_5_output: 93919dab-601b-4aa2-93d3-d223b6f46e25 -# op_6_output: 019d2a9b-9601-4cca-8395-9a976db0ada5 \ No newline at end of file +# op_6_output: 019d2a9b-9601-4cca-8395-9a976db0ada5 diff --git a/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb b/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb index 39c08037..baff3016 100644 --- a/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb +++ b/05_explore/99_GENERATED_SageMaker_Feature_Store_Notebook.ipynb @@ -50,10 +50,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.20.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -149,8 +148,8 @@ "metadata": {}, "outputs": [], "source": [ - "feature_group_name = f'FG-{flow_name}'\n", - "print(f\"Feature Group Name: {feature_group_name}\")\n" + "feature_group_name = f\"FG-{flow_name}\"\n", + "print(f\"Feature Group Name: 
{feature_group_name}\")" ] }, { @@ -169,15 +168,12 @@ "metadata": {}, "outputs": [], "source": [ - "datawrangler_FG_type_mapping = {\n", - " 'float': 'Fractional',\n", - " 'long': 'Integral'\n", - "}\n", + "datawrangler_FG_type_mapping = {\"float\": \"Fractional\", \"long\": \"Integral\"}\n", "\n", "# Some schema types in Data Wrangler are not supported by Feature Store.\n", "# Feature store supports String, Integral, and Fractional types.\n", "# The following will create a default_FG_type set to String for these types.\n", - "default_FG_type = \"String\"\n" + "default_FG_type = \"String\"" ] }, { @@ -195,71 +191,23 @@ "outputs": [], "source": [ "column_schema = [\n", - " {\n", - " \"name\": \"marketplace\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"customer_id\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"review_id\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"product_id\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"product_parent\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"product_title\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"star_rating\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"helpful_votes\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"total_votes\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"vine\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"verified_purchase\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"review_headline\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"review_body\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"year\",\n", - " \"type\": \"object\"\n", - " },\n", - " {\n", - " \"name\": \"review_date\",\n", - " \"type\": \"date\"\n", - " },\n", - " {\n", - " \"name\": \"product_category\",\n", - " \"type\": \"string\"\n", - " }\n", - "]\n" + " {\"name\": \"marketplace\", \"type\": \"string\"},\n", + " {\"name\": \"customer_id\", \"type\": \"long\"},\n", + " {\"name\": \"review_id\", \"type\": \"string\"},\n", + " {\"name\": \"product_id\", \"type\": \"string\"},\n", + " {\"name\": \"product_parent\", \"type\": \"long\"},\n", + " {\"name\": \"product_title\", \"type\": \"string\"},\n", + " {\"name\": \"star_rating\", \"type\": \"long\"},\n", + " {\"name\": \"helpful_votes\", \"type\": \"long\"},\n", + " {\"name\": \"total_votes\", \"type\": \"long\"},\n", + " {\"name\": \"vine\", \"type\": \"string\"},\n", + " {\"name\": \"verified_purchase\", \"type\": \"string\"},\n", + " {\"name\": \"review_headline\", \"type\": \"string\"},\n", + " {\"name\": \"review_body\", \"type\": \"string\"},\n", + " {\"name\": \"year\", \"type\": \"object\"},\n", + " {\"name\": \"review_date\", \"type\": \"date\"},\n", + " {\"name\": \"product_category\", \"type\": \"string\"},\n", + "]" ] }, { @@ -283,23 +231,18 @@ "source": [ "record_identifier_name = None\n", "if record_identifier_name is None:\n", - " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n", + " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n", "\n", "event_time_feature_name = None\n", "if event_time_feature_name is None:\n", - " raise RuntimeError(\"Select a column name as the event time feature name.\")\n", + " raise RuntimeError(\"Select a column name as the event time feature name.\")\n", "\n", "# Below you map the schema 
detected from Data Wrangler to Feature Group Types.\n", "feature_definitions = [\n", - " {\n", - " \"FeatureName\": schema['name'],\n", - " \"FeatureType\": datawrangler_FG_type_mapping.get(\n", - " schema['type'],\n", - " default_FG_type\n", - " )\n", - " } for schema in column_schema\n", + " {\"FeatureName\": schema[\"name\"], \"FeatureType\": datawrangler_FG_type_mapping.get(schema[\"type\"], default_FG_type)}\n", + " for schema in column_schema\n", "]\n", - "print(feature_definitions)\n" + "print(feature_definitions)" ] }, { @@ -321,38 +264,33 @@ "sagemaker_client = boto3.client(\"sagemaker\", endpoint_url=sagemaker_endpoint_url)\n", "\n", "# Online Store Configuration\n", - "online_store_config = {\n", - " \"EnableOnlineStore\": True\n", - "}\n", + "online_store_config = {\"EnableOnlineStore\": True}\n", "\n", "# Offline Store Configuration\n", - "s3_uri = 's3://' + bucket # this is the default bucket defined in previous cells\n", - "offline_store_config = {\n", - " \"S3StorageConfig\": {\n", - " \"S3Uri\": s3_uri\n", - " }\n", - "}\n", + "s3_uri = \"s3://\" + bucket # this is the default bucket defined in previous cells\n", + "offline_store_config = {\"S3StorageConfig\": {\"S3Uri\": s3_uri}}\n", "\n", "# Create Feature Group\n", "create_fg_response = sagemaker_client.create_feature_group(\n", - " FeatureGroupName = feature_group_name,\n", - " EventTimeFeatureName = event_time_feature_name,\n", - " RecordIdentifierFeatureName = record_identifier_name,\n", - " FeatureDefinitions = feature_definitions,\n", - " OnlineStoreConfig = online_store_config,\n", - " OfflineStoreConfig = offline_store_config,\n", - " RoleArn = iam_role)\n", + " FeatureGroupName=feature_group_name,\n", + " EventTimeFeatureName=event_time_feature_name,\n", + " RecordIdentifierFeatureName=record_identifier_name,\n", + " FeatureDefinitions=feature_definitions,\n", + " OnlineStoreConfig=online_store_config,\n", + " OfflineStoreConfig=offline_store_config,\n", + " RoleArn=iam_role,\n", + ")\n", "\n", "# Describe Feature Group\n", "status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n", - "while status['FeatureGroupStatus'] != 'Created':\n", - " if status['FeatureGroupStatus'] == 'CreateFailed':\n", + "while status[\"FeatureGroupStatus\"] != \"Created\":\n", + " if status[\"FeatureGroupStatus\"] == \"CreateFailed\":\n", " raise RuntimeError(f\"Feature Group Creation Failed: {status}\")\n", " status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n", - " print(\"Feature Group Status: \" + status['FeatureGroupStatus'])\n", + " print(\"Feature Group Status: \" + status[\"FeatureGroupStatus\"])\n", " time.sleep(3)\n", "\n", - "print(status)\n" + "print(status)" ] }, { @@ -380,6 +318,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -391,6 +330,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -408,6 +348,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -423,6 +364,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -439,16 +381,13 @@ " source_type = 
data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", @@ -471,48 +410,40 @@ "outputs": [], "source": [ "# Processing job name\n", - "print(f'Processing Job Name: {processing_job_name}')\n", - "\n", - "processingResources = {\n", - " 'ClusterConfig': {\n", - " 'InstanceCount': 1,\n", - " 'InstanceType': 'ml.m5.4xlarge',\n", - " 'VolumeSizeInGB': 30\n", - " }\n", - " }\n", + "print(f\"Processing Job Name: {processing_job_name}\")\n", "\n", - "appSpecification = {'ImageUri': container_uri}\n", + "processingResources = {\"ClusterConfig\": {\"InstanceCount\": 1, \"InstanceType\": \"ml.m5.4xlarge\", \"VolumeSizeInGB\": 30}}\n", + "\n", + "appSpecification = {\"ImageUri\": container_uri}\n", "\n", "sagemaker_client.create_processing_job(\n", - " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n", - " ProcessingOutputConfig={\n", - " 'Outputs': [\n", - " {\n", - " 'OutputName': '14039109-2da9-49b4-8eee-df39306c9c47.default',\n", - " 'FeatureStoreOutput': {\n", - " 'FeatureGroupName': feature_group_name\n", - " },\n", - " 'AppManaged': True\n", - " }\n", - " ],\n", - " },\n", - " ProcessingJobName=processing_job_name,\n", - " ProcessingResources=processingResources,\n", - " AppSpecification=appSpecification,\n", - " RoleArn=iam_role\n", - " )\n", + " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n", + " ProcessingOutputConfig={\n", + " \"Outputs\": [\n", + " {\n", + " \"OutputName\": \"14039109-2da9-49b4-8eee-df39306c9c47.default\",\n", + " \"FeatureStoreOutput\": {\"FeatureGroupName\": feature_group_name},\n", + " \"AppManaged\": True,\n", + " }\n", + " ],\n", + " },\n", + " ProcessingJobName=processing_job_name,\n", + " ProcessingResources=processingResources,\n", + " AppSpecification=appSpecification,\n", + " RoleArn=iam_role,\n", + ")\n", "\n", "\n", "status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n", "\n", - "while status['ProcessingJobStatus'] in ('InProgress', 'Failed'):\n", - " if status['ProcessingJobStatus'] == 'Failed':\n", + "while status[\"ProcessingJobStatus\"] in (\"InProgress\", \"Failed\"):\n", + " if status[\"ProcessingJobStatus\"] == \"Failed\":\n", " raise RuntimeError(f\"Processing Job failed: {status}\")\n", " status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n", - " print(status['ProcessingJobStatus'])\n", + " print(status[\"ProcessingJobStatus\"])\n", " time.sleep(60)\n", "\n", - "print(status)\n" + "print(status)" ] }, { diff --git a/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb 
b/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb index 8879d1f1..3791bb2e 100644 --- a/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb +++ b/05_explore/99_GENERATED_SageMaker_Pipeline_Notebook.ipynb @@ -46,10 +46,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.20.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -184,6 +183,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -195,6 +195,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -212,6 +213,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -227,6 +229,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -243,29 +246,24 @@ " source_type = data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", " return processing_inputs\n", "\n", + "\n", "def create_container_arguments(output_name, output_content_type):\n", - " output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - " }\n", + " output_config = {output_name: {\"content_type\": output_content_type}}\n", " return [f\"--output-config '{json.dumps(output_config)}'\"]\n", "\n", + "\n", "# Create Processing Job Arguments\n", "processing_job_arguments = {\n", " \"AppSpecification\": {\n", @@ -281,7 +279,7 @@ " \"S3Uri\": output_path,\n", " \"LocalPath\": os.path.join(processing_dir, \"output\"),\n", " \"S3UploadMode\": \"EndOfJob\",\n", - " }\n", + " },\n", " },\n", " ],\n", " },\n", @@ -317,14 +315,11 @@ "from sagemaker.workflow.steps import ProcessingStep, Step, StepTypeEnum\n", "\n", "processor = Processor(\n", - " role=iam_role,\n", - " image_uri=container_uri,\n", - " instance_count=instance_count,\n", - " instance_type=instance_type\n", + " role=iam_role, image_uri=container_uri, instance_count=instance_count, instance_type=instance_type\n", ")\n", "\n", - "class DataWranglerStep(ProcessingStep):\n", "\n", + 
"class DataWranglerStep(ProcessingStep):\n", " def __init__(self, name, processor, step_args):\n", " super(NaiveStep, self).__init__(name, processor)\n", " self.step_args = step_args\n", @@ -333,10 +328,9 @@ " def arguments(self):\n", " return self.step_args\n", "\n", + "\n", "step_process = DataWranglerStep(\n", - " name=\"DataWranglerProcessingStep\",\n", - " processor=processor,\n", - " step_args=processing_job_arguments\n", + " name=\"DataWranglerProcessingStep\", processor=processor, step_args=processing_job_arguments\n", ")" ] }, @@ -386,7 +380,7 @@ " name=pipeline_name,\n", " parameters=[instance_type, instance_count],\n", " steps=[step_process],\n", - " sagemaker_session=sagemaker_session\n", + " sagemaker_session=sagemaker_session,\n", ")" ] }, @@ -517,35 +511,36 @@ "source": [ "import botocore.waiter\n", "\n", + "\n", "def get_waiter(pipeline, delay=24, max_attempts=60):\n", " waiter_id = \"PipelineExecutionComplete\"\n", - " model = botocore.waiter.WaiterModel({\n", - " \"version\": 2,\n", - " \"waiters\": {\n", - " waiter_id: {\n", - " \"delay\": delay,\n", - " \"maxAttempts\": max_attempts,\n", - " \"operation\": 'DescribePipelineExecution',\n", - " \"acceptors\": [\n", - " {\n", - " \"expected\": \"Succeeded\",\n", - " \"matcher\": \"path\",\n", - " \"state\": \"success\",\n", - " \"argument\": \"PipelineExecutionStatus\"\n", - " },\n", - " {\n", - " \"expected\": \"Failed\",\n", - " \"matcher\": \"path\",\n", - " \"state\": \"failure\",\n", - " \"argument\": \"PipelineExecutionStatus\"\n", - " },\n", - " ]\n", - " }\n", + " model = botocore.waiter.WaiterModel(\n", + " {\n", + " \"version\": 2,\n", + " \"waiters\": {\n", + " waiter_id: {\n", + " \"delay\": delay,\n", + " \"maxAttempts\": max_attempts,\n", + " \"operation\": \"DescribePipelineExecution\",\n", + " \"acceptors\": [\n", + " {\n", + " \"expected\": \"Succeeded\",\n", + " \"matcher\": \"path\",\n", + " \"state\": \"success\",\n", + " \"argument\": \"PipelineExecutionStatus\",\n", + " },\n", + " {\n", + " \"expected\": \"Failed\",\n", + " \"matcher\": \"path\",\n", + " \"state\": \"failure\",\n", + " \"argument\": \"PipelineExecutionStatus\",\n", + " },\n", + " ],\n", + " }\n", + " },\n", " }\n", - " })\n", - " return botocore.waiter.create_waiter_with_client(\n", - " waiter_id, model, sagemaker_session.sagemaker_client\n", - " )" + " )\n", + " return botocore.waiter.create_waiter_with_client(waiter_id, model, sagemaker_session.sagemaker_client)" ] }, { diff --git a/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb b/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb index 7539a1f2..e94646aa 100644 --- a/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb +++ b/05_explore/archive/01_Visualize_Reviews_Dataset.ipynb @@ -62,10 +62,11 @@ "import seaborn as sns\n", "\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", - "# Get region \n", + "# Get region\n", "session = boto3.session.Session()\n", "region_name = session.region_name\n", "\n", @@ -73,9 +74,9 @@ "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket()\n", "\n", - "# Set Athena database & table \n", - "database_name = 'dsoaws'\n", - "table_name = 'amazon_reviews_parquet'\n" + "# Set Athena database & table\n", + "database_name = \"dsoaws\"\n", + "table_name = \"amazon_reviews_parquet\"" ] }, { @@ -104,7 +105,7 @@ "outputs": [], "source": [ "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n", - 
"s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)" ] }, { @@ -116,10 +117,13 @@ "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT DISTINCT product_category \\\n", + "cursor.execute(\n", + " \"SELECT DISTINCT product_category \\\n", " FROM {0}.{1} \\\n", - " ORDER BY product_category'\n", - " .format(database_name, table_name))\n", + " ORDER BY product_category\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_categories = as_pandas(cursor)\n", @@ -153,12 +157,15 @@ "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT product_category, \\\n", + "cursor.execute(\n", + " \"SELECT product_category, \\\n", " COUNT(star_rating) AS count_star_rating \\\n", " FROM {0}.{1} \\\n", " GROUP BY product_category \\\n", - " ORDER BY count_star_rating DESC'\n", - " .format(database_name, table_name))\n", + " ORDER BY count_star_rating DESC\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_star_ratings = as_pandas(cursor)\n", @@ -172,7 +179,7 @@ "outputs": [], "source": [ "# Store max ratings\n", - "max_ratings = df_star_ratings['count_star_rating'].max()\n", + "max_ratings = df_star_ratings[\"count_star_rating\"].max()\n", "print(max_ratings)" ] }, @@ -184,24 +191,24 @@ "source": [ "# Set size and style to use\n", "if num_categories > 10:\n", - " plt.figure(figsize=(10,10))\n", - "else: \n", - " plt.figure(figsize=(10,5))\n", + " plt.figure(figsize=(10, 10))\n", + "else:\n", + " plt.figure(figsize=(10, 5))\n", "\n", - "plt.style.use('seaborn-whitegrid')\n", + "plt.style.use(\"seaborn-whitegrid\")\n", "\n", "# Create Seaborn barplot\n", - "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df_star_ratings, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df_star_ratings, saturation=1)\n", "\n", "# Set title\n", "plt.title(\"Number of Ratings per Product Category\")\n", "\n", - "# Set x-axis ticks to match scale \n", + "# Set x-axis ticks to match scale\n", "if max_ratings > 200000:\n", - " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n", + " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n", " plt.xlim(0, 20000000)\n", "elif max_ratings <= 200000:\n", - " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n", + " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n", " plt.xlim(0, 200000)\n", "\n", "plt.xlabel(\"Number of Ratings\")\n", @@ -232,12 +239,15 @@ "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT product_category, \\\n", + "cursor.execute(\n", + " \"SELECT product_category, \\\n", " AVG(star_rating) AS avg_star_rating \\\n", " FROM {0}.{1} \\\n", " GROUP BY product_category \\\n", - " ORDER BY avg_star_rating DESC'\n", - " .format(database_name, table_name))\n", + " ORDER BY avg_star_rating DESC\".format(\n", + " database_name, 
table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_average_ratings = as_pandas(cursor)\n", @@ -251,22 +261,26 @@ "outputs": [], "source": [ "# Set some Seaborn parameters in advance\n", - "sns.set_style = 'seaborn-whitegrid'\n", - "\n", - "sns.set(rc={\"font.style\":\"normal\",\n", - "# \"axes.facecolor\":\"white\",\n", - " \"figure.facecolor\":\"white\",\n", - " \"figure.titlesize\":20,\n", - " \"text.color\":\"black\",\n", - " \"xtick.color\":\"black\",\n", - " \"ytick.color\":\"black\",\n", - " \"axes.labelcolor\":\"black\",\n", - " \"axes.grid\":True,\n", - " 'axes.labelsize':10,\n", - "# 'figure.figsize':(10.0, 10.0),\n", - " 'xtick.labelsize':10,\n", - " 'font.size':10,\n", - " 'ytick.labelsize':10})" + "sns.set_style = \"seaborn-whitegrid\"\n", + "\n", + "sns.set(\n", + " rc={\n", + " \"font.style\": \"normal\",\n", + " # \"axes.facecolor\":\"white\",\n", + " \"figure.facecolor\": \"white\",\n", + " \"figure.titlesize\": 20,\n", + " \"text.color\": \"black\",\n", + " \"xtick.color\": \"black\",\n", + " \"ytick.color\": \"black\",\n", + " \"axes.labelcolor\": \"black\",\n", + " \"axes.grid\": True,\n", + " \"axes.labelsize\": 10,\n", + " # 'figure.figsize':(10.0, 10.0),\n", + " \"xtick.labelsize\": 10,\n", + " \"font.size\": 10,\n", + " \"ytick.labelsize\": 10,\n", + " }\n", + ")" ] }, { @@ -277,12 +291,13 @@ "source": [ "# Helper code to display values on bars\n", "\n", + "\n", "def show_values_barplot(axs, space):\n", " def _show_on_plot(ax):\n", " for p in ax.patches:\n", " _x = p.get_x() + p.get_width() + float(space)\n", " _y = p.get_y() + p.get_height()\n", - " value = round(float(p.get_width()),2)\n", + " value = round(float(p.get_width()), 2)\n", " ax.text(_x, _y, value, ha=\"left\")\n", "\n", " if isinstance(axs, np.ndarray):\n", @@ -301,13 +316,13 @@ "# Plot average ratings per category\n", "\n", "# Create plot\n", - "barplot = sns.barplot(y='product_category', x='avg_star_rating', data = df_average_ratings, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df_average_ratings, saturation=1)\n", "\n", - "# Set title and x-axis ticks \n", - "plt.title('Average Rating by Product Category')\n", - "plt.xticks([1, 2, 3, 4, 5], ['1-Star', '2-Star', '3-Star','4-Star','5-Star'])\n", + "# Set title and x-axis ticks\n", + "plt.title(\"Average Rating by Product Category\")\n", + "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n", "\n", - "# Helper code to show actual values afters bars \n", + "# Helper code to show actual values afters bars\n", "show_values_barplot(barplot, 0.1)\n", "\n", "plt.xlabel(\"Average Rating\")\n", @@ -344,14 +359,17 @@ "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT product_category, \\\n", + "cursor.execute(\n", + " \"SELECT product_category, \\\n", " AVG(star_rating) AS avg_star_rating, \\\n", " STDDEV(star_rating) AS stddev_star_rating, \\\n", " SQRT(COUNT(*)) AS sqrt_count \\\n", " FROM {}.{} \\\n", " GROUP BY product_category \\\n", - " ORDER BY avg_star_rating DESC'\n", - " .format(database_name, table_name))\n", + " ORDER BY avg_star_rating DESC\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_avg_stddev_sqrt = as_pandas(cursor)\n", @@ -374,13 +392,16 @@ "# Execute query using connection cursor\n", 
"cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT product_category, \\\n", + "cursor.execute(\n", + " \"SELECT product_category, \\\n", " AVG(star_rating) AS avg_star_rating, \\\n", " (STDDEV(star_rating) / SQRT(COUNT(*))) AS sd_mean \\\n", " FROM {}.{} \\\n", " GROUP BY product_category \\\n", - " ORDER BY avg_star_rating DESC'\n", - " .format(database_name, table_name))\n", + " ORDER BY avg_star_rating DESC\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_breakdown_category_avg = as_pandas(cursor)\n", @@ -403,13 +424,16 @@ "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT product_category, \\\n", + "cursor.execute(\n", + " \"SELECT product_category, \\\n", " star_rating, \\\n", " COUNT(*) AS count_reviews \\\n", " FROM {}.{} \\\n", " GROUP BY product_category, star_rating \\\n", - " ORDER BY product_category, star_rating ASC, count_reviews DESC'\n", - " .format(database_name, table_name))\n", + " ORDER BY product_category, star_rating ASC, count_reviews DESC\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_breakdown_category = as_pandas(cursor)\n", @@ -423,11 +447,11 @@ "outputs": [], "source": [ "# Create grouped DataFrames by category and by star rating\n", - "grouped_category = df_breakdown_category.groupby('product_category')\n", - "grouped_star = df_breakdown_category.groupby('star_rating')\n", + "grouped_category = df_breakdown_category.groupby(\"product_category\")\n", + "grouped_star = df_breakdown_category.groupby(\"star_rating\")\n", "\n", "# Create sum of ratings per star rating\n", - "df_sum = df_breakdown_category.groupby(['star_rating']).sum()\n", + "df_sum = df_breakdown_category.groupby([\"star_rating\"]).sum()\n", "df_sum.head(10)" ] }, @@ -438,7 +462,7 @@ "outputs": [], "source": [ "# Calculate total number of star ratings\n", - "total = df_sum['count_reviews'].sum()\n", + "total = df_sum[\"count_reviews\"].sum()\n", "print(total)" ] }, @@ -452,17 +476,17 @@ "\n", "distribution = {}\n", "count_reviews_per_star = []\n", - "i=0\n", - " \n", + "i = 0\n", + "\n", "for category, ratings in grouped_category:\n", " count_reviews_per_star = []\n", - " for star in ratings['star_rating']:\n", - " count_reviews_per_star.append(ratings.get_value(i, 'count_reviews'))\n", - " i=i+1;\n", + " for star in ratings[\"star_rating\"]:\n", + " count_reviews_per_star.append(ratings.get_value(i, \"count_reviews\"))\n", + " i = i + 1\n", " distribution[category] = count_reviews_per_star\n", "\n", "# Check if distribution has been created succesfully\n", - "print(distribution)\n" + "print(distribution)" ] }, { @@ -501,8 +525,8 @@ "# Sort distribution by highest average rating per category\n", "sorted_distribution = {}\n", "\n", - "df_average_ratings.iloc[:,0]\n", - "for index, value in df_average_ratings.iloc[:,0].items():\n", + "df_average_ratings.iloc[:, 0]\n", + "for index, value in df_average_ratings.iloc[:, 0].items():\n", " sorted_distribution[value] = distribution[value]" ] }, @@ -525,8 +549,7 @@ " star2.append(stars[1])\n", " star3.append(stars[2])\n", " star4.append(stars[3])\n", - " star5.append(stars[4])\n", - " " + " star5.append(stars[4])" ] }, { @@ -548,7 +571,7 @@ "proportion_star5 = np.true_divide(star5, total) * 
100\n", "\n", "# Add colors\n", - "colors = ['red', 'purple','blue','orange','green']\n", + "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n", "\n", "# The position of the bars on the x-axis\n", "r = range(len(categories))\n", @@ -556,25 +579,57 @@ "\n", "# Plot bars\n", "if num_categories > 10:\n", - " plt.figure(figsize=(10,10))\n", - "else: \n", - " plt.figure(figsize=(10,5))\n", - "\n", - "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n", - "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n", - "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n", - "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n", - "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n", - "\n", - "plt.title(\"Distribution of Reviews Per Rating Per Category\",fontsize='16')\n", - "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", - "plt.yticks(r, categories, fontweight='bold')\n", - "\n", - "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n", + " plt.figure(figsize=(10, 10))\n", + "else:\n", + " plt.figure(figsize=(10, 5))\n", + "\n", + "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n", + "ax4 = plt.barh(\n", + " r,\n", + " proportion_star4,\n", + " left=proportion_star5,\n", + " color=colors[3],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"4-Star Ratings\",\n", + ")\n", + "ax3 = plt.barh(\n", + " r,\n", + " proportion_star3,\n", + " left=proportion_star5 + proportion_star4,\n", + " color=colors[2],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"3-Star Ratings\",\n", + ")\n", + "ax2 = plt.barh(\n", + " r,\n", + " proportion_star2,\n", + " left=proportion_star5 + proportion_star4 + proportion_star3,\n", + " color=colors[1],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"2-Star Ratings\",\n", + ")\n", + "ax1 = plt.barh(\n", + " r,\n", + " proportion_star1,\n", + " left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n", + " color=colors[0],\n", + " edgecolor=\"white\",\n", + " height=barHeight,\n", + " label=\"1-Star Ratings\",\n", + ")\n", + "\n", + "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n", + "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n", + "plt.yticks(r, categories, fontweight=\"bold\")\n", + "\n", + "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n", "plt.gca().invert_yaxis()\n", "\n", "plt.tight_layout()\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -597,11 +652,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Execute query using connection cursor \n", + "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", "# If rating > 3, sentiment = 1 (positive), else 0 (negative)\n", - "cursor.execute('SELECT customer_id, \\\n", + "cursor.execute(\n", + " \"SELECT customer_id, \\\n", " product_id, \\\n", " star_rating, \\\n", " CASE \\\n", @@ 
-611,8 +667,10 @@ " AS is_positive_sentiment \\\n", " FROM {}.{} \\\n", " ORDER BY review_id \\\n", - " LIMIT 10000'\n", - " .format(database_name, table_name))\n", + " LIMIT 10000\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "# Load query results into Pandas DataFrame and show results\n", "df_sentiment = as_pandas(cursor)\n", @@ -646,10 +704,11 @@ }, "outputs": [], "source": [ - "# Execute query using connection cursor \n", + "# Execute query using connection cursor\n", "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n", "\n", - "cursor.execute('SELECT review_body, \\\n", + "cursor.execute(\n", + " \"SELECT review_body, \\\n", " CASE \\\n", " WHEN star_rating > 3 THEN 1 \\\n", " ELSE 0 \\\n", @@ -657,8 +716,10 @@ " AS is_positive_sentiment \\\n", " FROM {}.{} \\\n", " ORDER BY review_id \\\n", - " LIMIT 10000'\n", - " .format(database_name, table_name))\n", + " LIMIT 10000\".format(\n", + " database_name, table_name\n", + " )\n", + ")\n", "\n", "df_reviews = as_pandas(cursor)\n", "df_reviews.head(10)" @@ -678,7 +739,8 @@ "outputs": [], "source": [ "import bs4\n", - "df_reviews['review_body'] = df_reviews['review_body'].apply(lambda x: bs4.BeautifulSoup(x, 'lxml').get_text())\n", + "\n", + "df_reviews[\"review_body\"] = df_reviews[\"review_body\"].apply(lambda x: bs4.BeautifulSoup(x, \"lxml\").get_text())\n", "df_reviews" ] }, @@ -690,32 +752,41 @@ "source": [ "from wordcloud import WordCloud, STOPWORDS\n", "\n", - "def plot_wordcloud(text, mask=None, max_words=200, max_font_size=150, figure_size=(20.0,15.0), \n", - " title = None, title_size=40, image_color=False):\n", + "\n", + "def plot_wordcloud(\n", + " text,\n", + " mask=None,\n", + " max_words=200,\n", + " max_font_size=150,\n", + " figure_size=(20.0, 15.0),\n", + " title=None,\n", + " title_size=40,\n", + " image_color=False,\n", + "):\n", " stopwords = set(STOPWORDS)\n", "\n", - " wordcloud = WordCloud(background_color='gray',\n", - " stopwords = stopwords,\n", - " max_words = max_words,\n", - " max_font_size = max_font_size, \n", - " random_state = 50,\n", - " width=800, \n", - " height=400,\n", - " mask = mask)\n", + " wordcloud = WordCloud(\n", + " background_color=\"gray\",\n", + " stopwords=stopwords,\n", + " max_words=max_words,\n", + " max_font_size=max_font_size,\n", + " random_state=50,\n", + " width=800,\n", + " height=400,\n", + " mask=mask,\n", + " )\n", " wordcloud.generate(str(text))\n", - " \n", + "\n", " plt.figure(figsize=figure_size)\n", " if image_color:\n", - " image_colors = ImageColorGenerator(mask);\n", - " plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation=\"bilinear\");\n", - " plt.title(title, fontdict={'size': title_size, \n", - " 'verticalalignment': 'bottom'})\n", + " image_colors = ImageColorGenerator(mask)\n", + " plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation=\"bilinear\")\n", + " plt.title(title, fontdict={\"size\": title_size, \"verticalalignment\": \"bottom\"})\n", " else:\n", - " plt.imshow(wordcloud);\n", - " plt.title(title, fontdict={'size': title_size, 'color': 'black', \n", - " 'verticalalignment': 'bottom'})\n", - " plt.axis('off');\n", - " plt.tight_layout() " + " plt.imshow(wordcloud)\n", + " plt.title(title, fontdict={\"size\": title_size, \"color\": \"black\", \"verticalalignment\": \"bottom\"})\n", + " plt.axis(\"off\")\n", + " plt.tight_layout()" ] }, { @@ -724,7 +795,9 @@ "metadata": {}, "outputs": [], "source": [ - "plot_wordcloud(df_reviews.query('is_positive_sentiment == 
0')['review_body'], title=\"Word Cloud of Negative Amazon Reviews\")" + "plot_wordcloud(\n", + " df_reviews.query(\"is_positive_sentiment == 0\")[\"review_body\"], title=\"Word Cloud of Negative Amazon Reviews\"\n", + ")" ] }, { @@ -733,7 +806,9 @@ "metadata": {}, "outputs": [], "source": [ - "plot_wordcloud(df_reviews.query('is_positive_sentiment == 1')['review_body'], title=\"Word Cloud of Positive Amazon Reviews\")" + "plot_wordcloud(\n", + " df_reviews.query(\"is_positive_sentiment == 1\")[\"review_body\"], title=\"Word Cloud of Positive Amazon Reviews\"\n", + ")" ] }, { @@ -757,17 +832,21 @@ "source": [ "import string\n", "\n", - "df_reviews['num_words'] = df_reviews['review_body'].apply(lambda x: len(str(x).split()))\n", + "df_reviews[\"num_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x).split()))\n", "\n", - "df_reviews['num_unique_words'] = df_reviews['review_body'].apply(lambda x: len(set(str(x).split())))\n", + "df_reviews[\"num_unique_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(set(str(x).split())))\n", "\n", - "df_reviews['num_chars'] = df_reviews['review_body'].apply(lambda x: len(str(x)))\n", + "df_reviews[\"num_chars\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x)))\n", "\n", - "df_reviews['num_stopwords'] = df_reviews['review_body'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))\n", + "df_reviews[\"num_stopwords\"] = df_reviews[\"review_body\"].apply(\n", + " lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])\n", + ")\n", "\n", - "df_reviews['num_punctuations'] = df_reviews['review_body'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )\n", + "df_reviews[\"num_punctuations\"] = df_reviews[\"review_body\"].apply(\n", + " lambda x: len([c for c in str(x) if c in string.punctuation])\n", + ")\n", "\n", - "df_reviews['mean_word_len'] = df_reviews['review_body'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))" + "df_reviews[\"mean_word_len\"] = df_reviews[\"review_body\"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))" ] }, { @@ -803,7 +882,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_reviews = df_reviews.query('num_words <= 500 and num_punctuations < 500')" + "df_reviews = df_reviews.query(\"num_words <= 500 and num_punctuations < 500\")" ] }, { @@ -819,21 +898,21 @@ "metadata": {}, "outputs": [], "source": [ - "f, axes = plt.subplots(3, 1, figsize=(10,20))\n", + "f, axes = plt.subplots(3, 1, figsize=(10, 20))\n", "\n", - "sns.violinplot(x='is_positive_sentiment', y='num_words', data=df_reviews, ax=axes[0])\n", - "axes[0].set_xlabel('Sentiment', fontsize=12)\n", - "axes[0].set_ylabel('Number Of Words', fontsize=12)\n", + "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_words\", data=df_reviews, ax=axes[0])\n", + "axes[0].set_xlabel(\"Sentiment\", fontsize=12)\n", + "axes[0].set_ylabel(\"Number Of Words\", fontsize=12)\n", "axes[0].set_title(\"Number Of Words In Each Class\", fontsize=15)\n", "\n", - "sns.violinplot(x='is_positive_sentiment', y='num_chars', data=df_reviews, ax=axes[1])\n", - "axes[1].set_xlabel('Sentiment', fontsize=12)\n", - "axes[1].set_ylabel('Number Of Characters', fontsize=12)\n", + "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_chars\", data=df_reviews, ax=axes[1])\n", + "axes[1].set_xlabel(\"Sentiment\", fontsize=12)\n", + "axes[1].set_ylabel(\"Number Of Characters\", fontsize=12)\n", "axes[1].set_title(\"Number Of Characters In Each Class\", fontsize=15)\n", "\n", - 
"sns.violinplot(x='is_positive_sentiment', y='num_punctuations', data=df_reviews, ax=axes[2])\n", - "axes[2].set_xlabel('Sentiment', fontsize=12)\n", - "axes[2].set_ylabel('Number Of Punctutations', fontsize=12)\n", + "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_punctuations\", data=df_reviews, ax=axes[2])\n", + "axes[2].set_xlabel(\"Sentiment\", fontsize=12)\n", + "axes[2].set_ylabel(\"Number Of Punctutations\", fontsize=12)\n", "axes[2].set_title(\"Number Of Punctuations In Each Class\", fontsize=15)\n", "plt.show()" ] @@ -852,10 +931,10 @@ "outputs": [], "source": [ "# Count number of reviews per sentiment class\n", - "print(df_reviews['is_positive_sentiment'].value_counts())\n", + "print(df_reviews[\"is_positive_sentiment\"].value_counts())\n", "\n", "# Create Plot\n", - "plot = sns.countplot(x='is_positive_sentiment', data=df_reviews)\n", + "plot = sns.countplot(x=\"is_positive_sentiment\", data=df_reviews)\n", "plt.xlabel(\"Sentiment\", fontsize=16)\n", "plt.ylabel(\"Number Of Reviews\", fontsize=16)\n", "plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n", @@ -880,22 +959,21 @@ "source": [ "from sklearn.utils import resample\n", "\n", - "positive = df_reviews[df_reviews['is_positive_sentiment']==1]\n", - "negative = df_reviews[df_reviews['is_positive_sentiment']==0]\n", + "positive = df_reviews[df_reviews[\"is_positive_sentiment\"] == 1]\n", + "negative = df_reviews[df_reviews[\"is_positive_sentiment\"] == 0]\n", "\n", - "positive_downsampled = resample(positive,\n", - " replace = False, # sample without replacement\n", - " n_samples = len(negative), # match minority n\n", - " random_state = 27) # reproducible results\n", + "positive_downsampled = resample(\n", + " positive, replace=False, n_samples=len(negative), random_state=27 # sample without replacement # match minority n\n", + ") # reproducible results\n", "\n", "# combine minority and downsampled majority\n", "downsampled = pd.concat([positive_downsampled, negative])\n", "\n", "# checking counts\n", - "print(downsampled['is_positive_sentiment'].value_counts())\n", + "print(downsampled[\"is_positive_sentiment\"].value_counts())\n", "\n", "# Create Plot\n", - "plot = sns.countplot(x='is_positive_sentiment', data=downsampled)\n", + "plot = sns.countplot(x=\"is_positive_sentiment\", data=downsampled)\n", "plt.xlabel(\"Sentiment\", fontsize=16)\n", "plt.ylabel(\"Number Of Reviews\", fontsize=16)\n", "plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n", @@ -923,9 +1001,9 @@ "train, test = train_test_split(downsampled, test_size=0.2, random_state=0)\n", "test, validate = train_test_split(test, test_size=0.5, random_state=0)\n", "\n", - "print(f'Number of training examples: {len(train.index)}')\n", - "print(f'Number of testing examples: {len(test.index)}')\n", - "print(f'Number of validation examples: {len(validate.index)}')\n" + "print(f\"Number of training examples: {len(train.index)}\")\n", + "print(f\"Number of testing examples: {len(test.index)}\")\n", + "print(f\"Number of validation examples: {len(validate.index)}\")" ] }, { @@ -943,17 +1021,17 @@ "source": [ "# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n", "\n", - "labels = ['Train', 'Validation', 'Test']\n", + "labels = [\"Train\", \"Validation\", \"Test\"]\n", "sizes = [len(train.index), len(validate.index), len(test.index)]\n", - "explode = (0.1, 0, 0) \n", + "explode = (0.1, 0, 0)\n", "\n", "fig1, ax1 = plt.subplots()\n", "\n", "ax1.set_title(\"Split Of Train, Validatin And Test Data\", 
fontsize=16)\n", - "ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 12})\n", + "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90, textprops={\"fontsize\": 12})\n", "\n", "# Equal aspect ratio ensures that pie is drawn as a circle.\n", - "ax1.axis('equal') \n", + "ax1.axis(\"equal\")\n", "plt.show()" ] }, @@ -980,18 +1058,16 @@ "\n", "\n", "def query_athena(sql, region_name, s3_staging_dir):\n", - " cursor = pyathena.connect(\n", - " region_name=region_name,\n", - " s3_staging_dir=\"{}\".format(s3_staging_dir)).cursor()\n", + " cursor = pyathena.connect(region_name=region_name, s3_staging_dir=\"{}\".format(s3_staging_dir)).cursor()\n", " cursor.execute(sql)\n", " return cursor\n", "\n", + "\n", "@magics_class\n", "class AthenaMagics(Magics):\n", " s3_staging_dir = None\n", " region_name = None\n", "\n", - " \n", " def parse_args(self, line):\n", " args = magic_arguments.parse_argstring(self.athena, line)\n", "\n", @@ -1000,27 +1076,27 @@ " raise ValueError(\"s3_staging_dir for Athena should be set\")\n", " if args.s3_staging_dir is not None:\n", " self.s3_staging_dir = args.s3_staging_dir\n", - " \n", + "\n", " # region name\n", " if args.region_name is None and self.region_name is None:\n", " raise ValueError(\"region_name for Athena should be set\")\n", " if args.region_name is not None:\n", " self.region_name = args.region_name\n", - " \n", + "\n", " @cell_magic\n", " @magic_arguments.magic_arguments()\n", - " @magic_arguments.argument('--s3_staging_dir', '-s',\n", - " help='s3 path required by athena for writing query results (e.g. s3://your/staging/dir)'\n", - " )\n", - " @magic_arguments.argument('--region_name', '-r',\n", - " help='aws region name (e.g. us-west-2)'\n", + " @magic_arguments.argument(\n", + " \"--s3_staging_dir\",\n", + " \"-s\",\n", + " help=\"s3 path required by athena for writing query results (e.g. s3://your/staging/dir)\",\n", " )\n", - " def athena(self, line='', cell=None):\n", + " @magic_arguments.argument(\"--region_name\", \"-r\", help=\"aws region name (e.g. 
us-west-2)\")\n", + " def athena(self, line=\"\", cell=None):\n", " self.parse_args(line)\n", " cursor = query_athena(cell, self.region_name, self.s3_staging_dir)\n", " return as_pandas(cursor)\n", "\n", - " \n", + "\n", "ip = get_ipython()\n", "ip.register_magics(AthenaMagics)" ] diff --git a/05_explore/archive/02_Explore_Redshift_Data.ipynb b/05_explore/archive/02_Explore_Redshift_Data.ipynb index b4433b08..ea44e7bc 100644 --- a/05_explore/archive/02_Explore_Redshift_Data.ipynb +++ b/05_explore/archive/02_Explore_Redshift_Data.ipynb @@ -30,13 +30,13 @@ "metadata": {}, "outputs": [], "source": [ - "redshift_schema = 'redshift'\n", - "redshift_cluster_identifier = 'dsoaws'\n", - "redshift_host = 'dsoaws'\n", - "redshift_database = 'dsoaws'\n", - "redshift_port = '5439'\n", - "redshift_table_2015 = 'amazon_reviews_tsv_2015'\n", - "redshift_table_2014 = 'amazon_reviews_tsv_2014'" + "redshift_schema = \"redshift\"\n", + "redshift_cluster_identifier = \"dsoaws\"\n", + "redshift_host = \"dsoaws\"\n", + "redshift_database = \"dsoaws\"\n", + "redshift_port = \"5439\"\n", + "redshift_table_2015 = \"amazon_reviews_tsv_2015\"\n", + "redshift_table_2014 = \"amazon_reviews_tsv_2014\"" ] }, { @@ -55,13 +55,13 @@ "import json\n", "import boto3\n", "\n", - "secretsmanager = boto3.client('secretsmanager')\n", + "secretsmanager = boto3.client(\"secretsmanager\")\n", "\n", - "secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')\n", - "cred = json.loads(secret['SecretString'])\n", + "secret = secretsmanager.get_secret_value(SecretId=\"dsoaws_redshift_login\")\n", + "cred = json.loads(secret[\"SecretString\"])\n", "\n", - "redshift_username = cred[0]['username']\n", - "redshift_pw = cred[1]['password']" + "redshift_username = cred[0][\"username\"]\n", + "redshift_pw = cred[1][\"password\"]" ] }, { @@ -70,11 +70,11 @@ "metadata": {}, "outputs": [], "source": [ - "redshift = boto3.client('redshift')\n", + "redshift = boto3.client(\"redshift\")\n", "\n", "response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)\n", "\n", - "redshift_endpoint_address = response['Clusters'][0]['Endpoint']['Address']\n", + "redshift_endpoint_address = response[\"Clusters\"][0][\"Endpoint\"][\"Address\"]\n", "\n", "print(redshift_endpoint_address)" ] @@ -94,7 +94,11 @@ "source": [ "from sqlalchemy import create_engine\n", "\n", - "engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(redshift_username, redshift_pw, redshift_endpoint_address, redshift_port, redshift_database))" + "engine = create_engine(\n", + " \"postgresql://{}:{}@{}:{}/{}\".format(\n", + " redshift_username, redshift_pw, redshift_endpoint_address, redshift_port, redshift_database\n", + " )\n", + ")" ] }, { @@ -124,9 +128,14 @@ "outputs": [], "source": [ "%%time\n", - "df = pd.read_sql_query(\"\"\"SELECT approximate count(distinct customer_id)\n", + "df = pd.read_sql_query(\n", + " \"\"\"SELECT approximate count(distinct customer_id)\n", " FROM {}.{}\n", - " GROUP BY product_category\"\"\".format(redshift_schema, redshift_table_2015), engine)" + " GROUP BY product_category\"\"\".format(\n", + " redshift_schema, redshift_table_2015\n", + " ),\n", + " engine,\n", + ")" ] }, { @@ -136,9 +145,14 @@ "outputs": [], "source": [ "%%time\n", - "df = pd.read_sql_query(\"\"\"SELECT count(distinct customer_id)\n", + "df = pd.read_sql_query(\n", + " \"\"\"SELECT count(distinct customer_id)\n", " FROM {}.{}\n", - " GROUP BY product_category\"\"\".format(redshift_schema, redshift_table_2015), engine)" + " GROUP BY 
product_category\"\"\".format(\n", + " redshift_schema, redshift_table_2015\n", + " ),\n", + " engine,\n", + ")" ] }, { @@ -158,8 +172,9 @@ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format='retina'\n" + "%config InlineBackend.figure_format='retina'" ] }, { @@ -174,7 +189,9 @@ "FROM {}.{}\n", "GROUP BY product_category\n", "ORDER BY count_star_rating DESC\n", - "\"\"\".format(redshift_schema, redshift_table_2015)\n", + "\"\"\".format(\n", + " redshift_schema, redshift_table_2015\n", + ")\n", "\n", "print(statement)" ] @@ -215,7 +232,7 @@ "outputs": [], "source": [ "# Store max ratings\n", - "max_ratings = df['count_star_rating'].max()\n", + "max_ratings = df[\"count_star_rating\"].max()\n", "print(max_ratings)" ] }, @@ -227,27 +244,30 @@ "source": [ "# Set size and style to use\n", "if num_categories > 10:\n", - " plt.figure(figsize=(10,10))\n", - "else: \n", - " plt.figure(figsize=(10,5))\n", - " \n", - "plt.style.use('seaborn-whitegrid')\n", + " plt.figure(figsize=(10, 10))\n", + "else:\n", + " plt.figure(figsize=(10, 5))\n", + "\n", + "plt.style.use(\"seaborn-whitegrid\")\n", "\n", "# Create Seaborn barplot\n", - "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n", "\n", "# Set title\n", "plt.title(\"Number of Ratings per Product Category (Redshift)\")\n", "\n", "# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n", "if max_ratings <= 8000:\n", - " plt.xticks([10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000], ['10K', '20K', '30K', '40K', '50K', '60K','70K', '80K' ])\n", + " plt.xticks(\n", + " [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],\n", + " [\"10K\", \"20K\", \"30K\", \"40K\", \"50K\", \"60K\", \"70K\", \"80K\"],\n", + " )\n", " plt.xlim(0, 80000)\n", "elif max_ratings <= 200000:\n", - " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n", - " plt.xlim(0, 200000) \n", + " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n", + " plt.xlim(0, 200000)\n", "elif max_ratings > 200000:\n", - " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n", + " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n", " plt.xlim(0, 20000000)\n", "\n", "plt.xlabel(\"Number of Ratings\")\n", @@ -275,8 +295,8 @@ "metadata": {}, "outputs": [], "source": [ - "athena_schema = 'athena'\n", - "athena_table_name = 'amazon_reviews_tsv'\n" + "athena_schema = \"athena\"\n", + "athena_table_name = \"amazon_reviews_tsv\"" ] }, { @@ -290,7 +310,9 @@ "FROM {}.{}\n", "GROUP BY product_category\n", "ORDER BY count_star_rating DESC\n", - "\"\"\".format(athena_schema, athena_table_name)\n", + "\"\"\".format(\n", + " athena_schema, athena_table_name\n", + ")\n", "\n", "print(statement)" ] @@ -313,14 +335,14 @@ "source": [ "# Set size and style to use\n", "if num_categories > 10:\n", - " plt.figure(figsize=(10,10))\n", - "else: \n", - " plt.figure(figsize=(10,5))\n", + " plt.figure(figsize=(10, 10))\n", + "else:\n", + " plt.figure(figsize=(10, 5))\n", "\n", - "plt.style.use('seaborn-whitegrid')\n", + "plt.style.use(\"seaborn-whitegrid\")\n", "\n", "# Create Seaborn barplot\n", - "barplot = sns.barplot(y='product_category', 
x='count_star_rating', data = df, saturation=1)\n", + "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n", "\n", "# Set title\n", "plt.title(\"Number of Ratings per Product Category (Athena via Redshift Spectrum)\")\n", @@ -328,13 +350,16 @@ "# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n", "# Set x-axis ticks to match scale from 10mio reviews to 20mio reviews\n", "if max_ratings <= 8000:\n", - " plt.xticks([10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000], ['10K', '20K', '30K', '40K', '50K', '60K','70K', '80K' ])\n", + " plt.xticks(\n", + " [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],\n", + " [\"10K\", \"20K\", \"30K\", \"40K\", \"50K\", \"60K\", \"70K\", \"80K\"],\n", + " )\n", " plt.xlim(0, 80000)\n", "elif max_ratings <= 200000:\n", - " plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '1500K', '200K'])\n", - " plt.xlim(0, 200000) \n", + " plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n", + " plt.xlim(0, 200000)\n", "elif max_ratings > 200000:\n", - " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n", + " plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n", " plt.xlim(0, 20000000)\n", "\n", "plt.xlabel(\"Number of Ratings\")\n", diff --git a/05_explore/preprocess-deequ-pyspark.py b/05_explore/preprocess-deequ-pyspark.py index b345ad04..42eee609 100644 --- a/05_explore/preprocess-deequ-pyspark.py +++ b/05_explore/preprocess-deequ-pyspark.py @@ -7,8 +7,9 @@ import shutil import csv import subprocess -subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-deps', 'pydeequ==0.1.5']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas==1.1.4']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "pydeequ==0.1.5"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas==1.1.4"]) import pyspark from pyspark.sql import SparkSession @@ -22,123 +23,106 @@ # PySpark Deequ GitHub Repo: https://github.com/awslabs/python-deequ + def main(): args_iter = iter(sys.argv[1:]) args = dict(zip(args_iter, args_iter)) # Retrieve the args and replace 's3://' with 's3a://' (used by Spark) - s3_input_data = args['s3_input_data'].replace('s3://', 's3a://') + s3_input_data = args["s3_input_data"].replace("s3://", "s3a://") print(s3_input_data) - s3_output_analyze_data = args['s3_output_analyze_data'].replace('s3://', 's3a://') + s3_output_analyze_data = args["s3_output_analyze_data"].replace("s3://", "s3a://") print(s3_output_analyze_data) - spark = SparkSession \ - .builder \ - .appName("PySparkAmazonReviewsAnalyzer") \ - .getOrCreate() - - schema = StructType([ - StructField("marketplace", StringType(), True), - StructField("customer_id", StringType(), True), - StructField("review_id", StringType(), True), - StructField("product_id", StringType(), True), - StructField("product_parent", StringType(), True), - StructField("product_title", StringType(), True), - StructField("product_category", StringType(), True), - StructField("star_rating", IntegerType(), True), - StructField("helpful_votes", IntegerType(), True), - StructField("total_votes", IntegerType(), True), - StructField("vine", StringType(), True), - StructField("verified_purchase", StringType(), True), - StructField("review_headline", StringType(), True), - StructField("review_body", StringType(), True), 
- StructField("review_date", StringType(), True) - ]) - - dataset = spark.read.csv(s3_input_data, - header=True, - schema=schema, - sep="\t", - quote="") + spark = SparkSession.builder.appName("PySparkAmazonReviewsAnalyzer").getOrCreate() + + schema = StructType( + [ + StructField("marketplace", StringType(), True), + StructField("customer_id", StringType(), True), + StructField("review_id", StringType(), True), + StructField("product_id", StringType(), True), + StructField("product_parent", StringType(), True), + StructField("product_title", StringType(), True), + StructField("product_category", StringType(), True), + StructField("star_rating", IntegerType(), True), + StructField("helpful_votes", IntegerType(), True), + StructField("total_votes", IntegerType(), True), + StructField("vine", StringType(), True), + StructField("verified_purchase", StringType(), True), + StructField("review_headline", StringType(), True), + StructField("review_body", StringType(), True), + StructField("review_date", StringType(), True), + ] + ) + + dataset = spark.read.csv(s3_input_data, header=True, schema=schema, sep="\t", quote="") # Calculate statistics on the dataset - analysisResult = AnalysisRunner(spark) \ - .onData(dataset) \ - .addAnalyzer(Size()) \ - .addAnalyzer(Completeness("review_id")) \ - .addAnalyzer(ApproxCountDistinct("review_id")) \ - .addAnalyzer(Mean("star_rating")) \ - .addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0")) \ - .addAnalyzer(Correlation("total_votes", "star_rating")) \ - .addAnalyzer(Correlation("total_votes", "helpful_votes")) \ - .run() + analysisResult = ( + AnalysisRunner(spark) + .onData(dataset) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("review_id")) + .addAnalyzer(ApproxCountDistinct("review_id")) + .addAnalyzer(Mean("star_rating")) + .addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0")) + .addAnalyzer(Correlation("total_votes", "star_rating")) + .addAnalyzer(Correlation("total_votes", "helpful_votes")) + .run() + ) metrics = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult) metrics.show(truncate=False) - metrics \ - .repartition(1) \ - .write.format('csv') \ - .mode('overwrite') \ - .option('header',True) \ - .option('sep','\t') \ - .save('{}/dataset-metrics'.format(s3_output_analyze_data)) + metrics.repartition(1).write.format("csv").mode("overwrite").option("header", True).option("sep", "\t").save( + "{}/dataset-metrics".format(s3_output_analyze_data) + ) # Check data quality - verificationResult = VerificationSuite(spark) \ - .onData(dataset) \ + verificationResult = ( + VerificationSuite(spark) + .onData(dataset) .addCheck( - Check(spark, CheckLevel.Error, "Review Check") \ - .hasSize(lambda x: x >= 200000) \ - .hasMin("star_rating", lambda x: x == 1.0) \ - .hasMax("star_rating", lambda x: x == 5.0) \ - .isComplete("review_id") \ - .isUnique("review_id") \ - .isComplete("marketplace") \ - .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"])) \ + Check(spark, CheckLevel.Error, "Review Check") + .hasSize(lambda x: x >= 200000) + .hasMin("star_rating", lambda x: x == 1.0) + .hasMax("star_rating", lambda x: x == 5.0) + .isComplete("review_id") + .isUnique("review_id") + .isComplete("marketplace") + .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"]) + ) .run() + ) print(f"Verification Run Status: {verificationResult.status}") resultsDataFrame = VerificationResult.checkResultsAsDataFrame(spark, verificationResult) - resultsDataFrame.show(truncate=False) - resultsDataFrame \ - .repartition(1) \ - 
.write.format('csv') \ - .mode('overwrite') \ - .option('header', True) \ - .option('sep', '\t') \ - .save('{}/constraint-checks'.format(s3_output_analyze_data)) - - verificationSuccessMetricsDataFrame = VerificationResult.successMetricsAsDataFrame(spark, verificationResult) + resultsDataFrame.show(truncate=False) + resultsDataFrame.repartition(1).write.format("csv").mode("overwrite").option("header", True).option( + "sep", "\t" + ).save("{}/constraint-checks".format(s3_output_analyze_data)) + + verificationSuccessMetricsDataFrame = VerificationResult.successMetricsAsDataFrame(spark, verificationResult) verificationSuccessMetricsDataFrame.show(truncate=False) - verificationSuccessMetricsDataFrame \ - .repartition(1) \ - .write.format('csv') \ - .mode('overwrite') \ - .option('header', True) \ - .option('sep', '\t') \ - .save('{}/success-metrics'.format(s3_output_analyze_data)) + verificationSuccessMetricsDataFrame.repartition(1).write.format("csv").mode("overwrite").option( + "header", True + ).option("sep", "\t").save("{}/success-metrics".format(s3_output_analyze_data)) # Suggest new checks and constraints - suggestionsResult = ConstraintSuggestionRunner(spark) \ - .onData(dataset) \ - .addConstraintRule(DEFAULT()) \ - .run() + suggestionsResult = ConstraintSuggestionRunner(spark).onData(dataset).addConstraintRule(DEFAULT()).run() suggestions = suggestionsResult["constraint_suggestions"] parallelizedSuggestions = spark.sparkContext.parallelize(suggestions) - + suggestionsResultsDataFrame = spark.createDataFrame(parallelizedSuggestions) suggestionsResultsDataFrame.show(truncate=False) - suggestionsResultsDataFrame \ - .repartition(1) \ - .write.format('csv') \ - .mode('overwrite') \ - .option('header', True) \ - .option('sep', '\t') \ - .save('{}/constraint-suggestions'.format(s3_output_analyze_data)) - + suggestionsResultsDataFrame.repartition(1).write.format("csv").mode("overwrite").option("header", True).option( + "sep", "\t" + ).save("{}/constraint-suggestions".format(s3_output_analyze_data)) + + # spark.stop() - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb b/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb index 8ebd63b1..0d64c8ed 100644 --- a/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb +++ b/06_prepare/01_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb @@ -26,13 +26,13 @@ "import sagemaker\n", "import boto3\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)" ] }, { @@ -56,32 +56,25 @@ "import csv\n", "from transformers import DistilBertTokenizer\n", "\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - "REVIEW_BODY_COLUMN = 'review_body'\n", - "REVIEW_ID_COLUMN = 'review_id'\n", + "REVIEW_BODY_COLUMN = \"review_body\"\n", + "REVIEW_ID_COLUMN = \"review_id\"\n", "# DATE_COLUMN = 'date'\n", "\n", - "LABEL_COLUMN = 'star_rating'\n", + "LABEL_COLUMN = 
\"star_rating\"\n", "LABEL_VALUES = [1, 2, 3, 4, 5]\n", "\n", "label_map = {}\n", "for (i, label) in enumerate(LABEL_VALUES):\n", " label_map[label] = i\n", "\n", - " \n", + "\n", "class InputFeatures(object):\n", - " \"\"\"BERT feature vectors.\"\"\"\n", - "\n", - " def __init__(self,\n", - " input_ids,\n", - " input_mask,\n", - " segment_ids,\n", - " label_id,\n", - " review_id,\n", - " date,\n", - " label):\n", - "# review_body):\n", + " \"\"\"BERT feature vectors.\"\"\"\n", + "\n", + " def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):\n", + " # review_body):\n", " self.input_ids = input_ids\n", " self.input_mask = input_mask\n", " self.segment_ids = segment_ids\n", @@ -89,48 +82,51 @@ " self.review_id = review_id\n", " self.date = date\n", " self.label = label\n", + "\n", + "\n", "# self.review_body = review_body\n", "\n", - " \n", + "\n", "class Input(object):\n", - " \"\"\"A single training/test input for sequence classification.\"\"\"\n", - "\n", - " def __init__(self, text, review_id, date, label=None):\n", - " \"\"\"Constructs an Input.\n", - " Args:\n", - " text: string. The untokenized text of the first sequence. For single\n", - " sequence tasks, only this sequence must be specified.\n", - " label: (Optional) string. The label of the example. This should be\n", - " specified for train and dev examples, but not for test examples.\n", - " \"\"\"\n", - " self.text = text\n", - " self.review_id = review_id\n", - " self.date = date\n", - " self.label = label\n", - " \n", + " \"\"\"A single training/test input for sequence classification.\"\"\"\n", + "\n", + " def __init__(self, text, review_id, date, label=None):\n", + " \"\"\"Constructs an Input.\n", + " Args:\n", + " text: string. The untokenized text of the first sequence. For single\n", + " sequence tasks, only this sequence must be specified.\n", + " label: (Optional) string. The label of the example. This should be\n", + " specified for train and dev examples, but not for test examples.\n", + " \"\"\"\n", + " self.text = text\n", + " self.review_id = review_id\n", + " self.date = date\n", + " self.label = label\n", + "\n", "\n", "def convert_input(the_input, max_seq_length):\n", " # First, we need to preprocess our data so that it matches the data BERT was trained on:\n", " # 1. Lowercase our text (if we're using a BERT lowercase model)\n", " # 2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n", " # 3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n", - " # \n", + " #\n", " # Fortunately, the Transformers tokenizer does this for us!\n", "\n", " tokens = tokenizer.tokenize(the_input.text)\n", - " print('**tokens**\\n{}\\n'.format(tokens))\n", + " print(\"**tokens**\\n{}\\n\".format(tokens))\n", "\n", - " encode_plus_tokens = tokenizer.encode_plus(the_input.text,\n", - " pad_to_max_length=True,\n", - " max_length=max_seq_length,\n", - "# truncation=True\n", - " )\n", + " encode_plus_tokens = tokenizer.encode_plus(\n", + " the_input.text,\n", + " pad_to_max_length=True,\n", + " max_length=max_seq_length,\n", + " # truncation=True\n", + " )\n", "\n", " # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n", - " input_ids = encode_plus_tokens['input_ids']\n", - " \n", - " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. 
\n", - " input_mask = encode_plus_tokens['attention_mask']\n", + " input_ids = encode_plus_tokens[\"input_ids\"]\n", + "\n", + " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.\n", + " input_mask = encode_plus_tokens[\"attention_mask\"]\n", "\n", " # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n", " segment_ids = [0] * max_seq_length\n", @@ -145,23 +141,24 @@ " label_id=label_id,\n", " review_id=the_input.review_id,\n", " date=the_input.date,\n", - " label=the_input.label)\n", - "# review_body=the_input.text)\n", - "\n", - " print('**input_ids**\\n{}\\n'.format(features.input_ids))\n", - " print('**input_mask**\\n{}\\n'.format(features.input_mask))\n", - " print('**segment_ids**\\n{}\\n'.format(features.segment_ids))\n", - " print('**label_id**\\n{}\\n'.format(features.label_id))\n", - " print('**review_id**\\n{}\\n'.format(features.review_id))\n", - " print('**date**\\n{}\\n'.format(features.date))\n", - " print('**label**\\n{}\\n'.format(features.label))\n", - "# print('**review_body**\\n{}\\n'.format(features.review_body))\n", + " label=the_input.label,\n", + " )\n", + " # review_body=the_input.text)\n", + "\n", + " print(\"**input_ids**\\n{}\\n\".format(features.input_ids))\n", + " print(\"**input_mask**\\n{}\\n\".format(features.input_mask))\n", + " print(\"**segment_ids**\\n{}\\n\".format(features.segment_ids))\n", + " print(\"**label_id**\\n{}\\n\".format(features.label_id))\n", + " print(\"**review_id**\\n{}\\n\".format(features.review_id))\n", + " print(\"**date**\\n{}\\n\".format(features.date))\n", + " print(\"**label**\\n{}\\n\".format(features.label))\n", + " # print('**review_body**\\n{}\\n'.format(features.review_body))\n", "\n", " return features\n", "\n", "\n", "# We'll need to transform our data into a format that BERT understands.\n", - "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
\n", + "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe.\n", "# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data\n", "def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):\n", " records = []\n", @@ -169,33 +166,35 @@ "\n", " for (input_idx, the_input) in enumerate(inputs):\n", " if input_idx % 10000 == 0:\n", - " print('Writing input {} of {}\\n'.format(input_idx, len(inputs)))\n", + " print(\"Writing input {} of {}\\n\".format(input_idx, len(inputs)))\n", "\n", " features = convert_input(the_input, max_seq_length)\n", "\n", " all_features = collections.OrderedDict()\n", - " \n", - " # Create TFRecord With input_ids, input_mask, segment_ids, and label_ids \n", - " all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n", - " all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n", - " all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n", - " all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n", + "\n", + " # Create TFRecord With input_ids, input_mask, segment_ids, and label_ids\n", + " all_features[\"input_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n", + " all_features[\"input_mask\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n", + " all_features[\"segment_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n", + " all_features[\"label_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n", "\n", " tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n", " tf_record_writer.write(tf_record.SerializeToString())\n", "\n", " # Create Record For Feature Store With All Features\n", - " records.append({#'tf_record': tf_record.SerializeToString(),\n", - " 'input_ids': features.input_ids,\n", - " 'input_mask': features.input_mask,\n", - " 'segment_ids': features.segment_ids,\n", - " 'label_id': features.label_id,\n", - " 'review_id': the_input.review_id,\n", - " 'date': the_input.date,\n", - " 'label': features.label,\n", - "# 'review_body': features.review_body\n", - " })\n", - " \n", + " records.append(\n", + " { #'tf_record': tf_record.SerializeToString(),\n", + " \"input_ids\": features.input_ids,\n", + " \"input_mask\": features.input_mask,\n", + " \"segment_ids\": features.segment_ids,\n", + " \"label_id\": features.label_id,\n", + " \"review_id\": the_input.review_id,\n", + " \"date\": the_input.date,\n", + " \"label\": features.label,\n", + " # 'review_body': features.review_body\n", + " }\n", + " )\n", + "\n", " tf_record_writer.close()\n", "\n", " return records" @@ -246,7 +245,7 @@ "from datetime import datetime\n", "from time import strftime\n", "\n", - "#timestamp = datetime.now().replace(microsecond=0).isoformat()\n", + "# timestamp = datetime.now().replace(microsecond=0).isoformat()\n", "timestamp = datetime.now().strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n", "print(timestamp)" ] @@ -262,21 +261,30 @@ "import pandas as pd\n", "\n", "data = [\n", - " [5, 'ABCD12345', \"\"\"I needed an \"antivirus\" application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\"],\n", - " [3, 'EFGH12345', \"\"\"The problem with ElephantDrive is that it requires the use of Java. 
Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\"],\n", - " [1, 'IJKL2345', \"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\"]\n", - " ]\n", - "\n", - "df = pd.DataFrame(data, columns=['star_rating', 'review_id', 'review_body'])\n", + " [\n", + " 5,\n", + " \"ABCD12345\",\n", + " \"\"\"I needed an \"antivirus\" application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\",\n", + " ],\n", + " [\n", + " 3,\n", + " \"EFGH12345\",\n", + " \"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\",\n", + " ],\n", + " [\n", + " 1,\n", + " \"IJKL2345\",\n", + " \"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\",\n", + " ],\n", + "]\n", + "\n", + "df = pd.DataFrame(data, columns=[\"star_rating\", \"review_id\", \"review_body\"])\n", "\n", "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n", - "inputs = df.apply(lambda x: Input(\n", - " label = x[LABEL_COLUMN],\n", - " text = x[REVIEW_BODY_COLUMN],\n", - " review_id = x[REVIEW_ID_COLUMN],\n", - " date = timestamp\n", - " ),\n", - " axis = 1)" + "inputs = df.apply(\n", + " lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),\n", + " axis=1,\n", + ")" ] }, { @@ -307,7 +315,7 @@ "metadata": {}, "outputs": [], "source": [ - "output_file='./data-tfrecord-featurestore/data.tfrecord'" + "output_file = \"./data-tfrecord-featurestore/data.tfrecord\"" ] }, { @@ -334,7 +342,7 @@ "metadata": {}, "outputs": [], "source": [ - "featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)" + "featurestore_runtime = boto3.Session().client(service_name=\"sagemaker-featurestore-runtime\", region_name=region)" ] }, { @@ -356,7 +364,7 @@ "source": [ "from time import gmtime, strftime, sleep\n", "\n", - "feature_group_name = 'reviews-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n", + "feature_group_name = \"reviews-feature-group-\" + strftime(\"%d-%H-%M-%S\", gmtime())\n", "print(feature_group_name)" ] }, @@ -371,16 +379,16 @@ " FeatureTypeEnum,\n", ")\n", "\n", - "feature_definitions= [\n", - " FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),\n", - " FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),\n", - "# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) \n", + "feature_definitions = [\n", + " FeatureDefinition(feature_name=\"input_ids\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"input_mask\", 
feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"segment_ids\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"label_id\", feature_type=FeatureTypeEnum.INTEGRAL),\n", + " FeatureDefinition(feature_name=\"review_id\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"date\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"label\", feature_type=FeatureTypeEnum.INTEGRAL),\n", + " # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"split_type\", feature_type=FeatureTypeEnum.STRING),\n", "]" ] }, @@ -392,9 +400,7 @@ "source": [ "from sagemaker.feature_store.feature_group import FeatureGroup\n", "\n", - "feature_group = FeatureGroup(name=feature_group_name, \n", - " feature_definitions=feature_definitions,\n", - " sagemaker_session=sess)\n", + "feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)\n", "print(feature_group)" ] }, @@ -428,7 +434,7 @@ "metadata": {}, "outputs": [], "source": [ - "prefix = 'reviews-feature-store-' + timestamp\n", + "prefix = \"reviews-feature-store-\" + timestamp\n", "print(prefix)" ] }, @@ -452,7 +458,7 @@ " record_identifier_name=record_identifier_feature_name,\n", " event_time_feature_name=event_time_feature_name,\n", " role_arn=role,\n", - " enable_online_store=True\n", + " enable_online_store=True,\n", ")" ] }, @@ -487,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "#sm.list_feature_groups()" + "# sm.list_feature_groups()" ] }, { @@ -507,6 +513,7 @@ "source": [ "import time\n", "\n", + "\n", "def wait_for_feature_group_creation_complete(feature_group):\n", " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", " while status == \"Creating\":\n", @@ -515,7 +522,7 @@ " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", " if status != \"Created\":\n", " raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n", - " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n" + " print(f\"FeatureGroup {feature_group.name} successfully created.\")" ] }, { @@ -524,7 +531,7 @@ "metadata": {}, "outputs": [], "source": [ - "wait_for_feature_group_creation_complete(feature_group=feature_group)\n" + "wait_for_feature_group_creation_complete(feature_group=feature_group)" ] }, { @@ -568,8 +575,9 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "df_records = pd.DataFrame.from_dict(records)\n", - "df_records['split_type']='train'\n", + "df_records[\"split_type\"] = \"train\"\n", "df_records" ] }, @@ -588,7 +596,7 @@ "source": [ "def cast_object_to_string(data_frame):\n", " for label in data_frame.columns:\n", - " if data_frame.dtypes[label] == 'object':\n", + " if data_frame.dtypes[label] == \"object\":\n", " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")" ] }, @@ -616,9 +624,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_group.ingest(\n", - " data_frame=df_records, max_workers=3, wait=True\n", - ")" + "feature_group.ingest(data_frame=df_records, max_workers=3, wait=True)" ] }, { @@ -638,16 +644,15 @@ "source": [ "offline_store_contents = None\n", "\n", - "while (offline_store_contents is None):\n", - " objects_in_bucket = s3.list_objects(Bucket=bucket,\n", - " Prefix=prefix)\n", - " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n", - " offline_store_contents = 
objects_in_bucket['Contents']\n", + "while offline_store_contents is None:\n", + " objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=prefix)\n", + " if \"Contents\" in objects_in_bucket and len(objects_in_bucket[\"Contents\"]) > 1:\n", + " offline_store_contents = objects_in_bucket[\"Contents\"]\n", " else:\n", - " print('Waiting for data in offline store...\\n')\n", + " print(\"Waiting for data in offline store...\\n\")\n", " sleep(60)\n", - " \n", - "print('Data available.')" + "\n", + "print(\"Data available.\")" ] }, { @@ -674,10 +679,11 @@ }, "outputs": [], "source": [ - "record_identifier_value = 'IJKL2345'\n", + "record_identifier_value = \"IJKL2345\"\n", "\n", - "featurestore_runtime.get_record(FeatureGroupName=feature_group_name, \n", - " RecordIdentifierValueAsString=record_identifier_value)" + "featurestore_runtime.get_record(\n", + " FeatureGroupName=feature_group_name, RecordIdentifierValueAsString=record_identifier_value\n", + ")" ] }, { @@ -751,9 +757,11 @@ "source": [ "query_string = \"\"\"\n", "SELECT input_ids, input_mask, segment_ids, label_id, split_type FROM \"{}\" WHERE split_type='train' LIMIT 5\n", - "\"\"\".format(feature_store_table)\n", + "\"\"\".format(\n", + " feature_store_table\n", + ")\n", "\n", - "print('Running ' + query_string)" + "print(\"Running \" + query_string)" ] }, { @@ -770,7 +778,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_store_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+prefix+'/query_results/')\n", + "feature_store_query.run(query_string=query_string, output_location=\"s3://\" + bucket + \"/\" + prefix + \"/query_results/\")\n", "\n", "feature_store_query.wait()" ] diff --git a/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb b/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb index 03efc6c9..b77a0a84 100644 --- a/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb +++ b/06_prepare/02_Prepare_Dataset_BERT_Scikit_ScriptMode_FeatureStore.ipynb @@ -72,8 +72,8 @@ "bucket = sess.default_bucket()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)" ] }, { @@ -103,9 +103,9 @@ "try:\n", " s3_public_path_tsv\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the INGEST section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the INGEST section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -139,9 +139,9 @@ "try:\n", " s3_private_path_tsv\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the INGEST section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the INGEST section before you continue.\")\n", + " 
print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -181,7 +181,7 @@ }, "outputs": [], "source": [ - "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n", + "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n", "print(raw_input_data_s3_uri)" ] }, @@ -278,12 +278,13 @@ "timestamp = int(time.time())\n", "\n", "experiment = Experiment.create(\n", - " experiment_name='Amazon-Customer-Reviews-BERT-Experiment-{}'.format(timestamp),\n", - " description='Amazon Customer Reviews BERT Experiment', \n", - " sagemaker_boto_client=sm)\n", + " experiment_name=\"Amazon-Customer-Reviews-BERT-Experiment-{}\".format(timestamp),\n", + " description=\"Amazon Customer Reviews BERT Experiment\",\n", + " sagemaker_boto_client=sm,\n", + ")\n", "\n", "experiment_name = experiment.experiment_name\n", - "print('Experiment name: {}'.format(experiment_name))" + "print(\"Experiment name: {}\".format(experiment_name))" ] }, { @@ -304,12 +305,12 @@ "\n", "timestamp = int(time.time())\n", "\n", - "trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n", - " experiment_name=experiment_name,\n", - " sagemaker_boto_client=sm)\n", + "trial = Trial.create(\n", + " trial_name=\"trial-{}\".format(timestamp), experiment_name=experiment_name, sagemaker_boto_client=sm\n", + ")\n", "\n", "trial_name = trial.trial_name\n", - "print('Trial name: {}'.format(trial_name))" + "print(\"Trial name: {}\".format(trial_name))" ] }, { @@ -326,9 +327,9 @@ "outputs": [], "source": [ "experiment_config = {\n", - " 'ExperimentName': experiment_name,\n", - " 'TrialName': trial_name,\n", - " 'TrialComponentDisplayName': 'prepare'\n", + " \"ExperimentName\": experiment_name,\n", + " \"TrialName\": trial_name,\n", + " \"TrialComponentDisplayName\": \"prepare\",\n", "}" ] }, @@ -381,7 +382,7 @@ "metadata": {}, "outputs": [], "source": [ - "featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)" + "featurestore_runtime = boto3.Session().client(service_name=\"sagemaker-featurestore-runtime\", region_name=region)" ] }, { @@ -392,7 +393,7 @@ "source": [ "timestamp = int(time.time())\n", "\n", - "feature_store_offline_prefix = 'reviews-feature-store-' + str(timestamp)\n", + "feature_store_offline_prefix = \"reviews-feature-store-\" + str(timestamp)\n", "\n", "print(feature_store_offline_prefix)" ] @@ -403,7 +404,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_group_name = 'reviews-feature-group-' + str(timestamp)\n", + "feature_group_name = \"reviews-feature-group-\" + str(timestamp)\n", "\n", "print(feature_group_name)" ] @@ -419,15 +420,15 @@ " FeatureTypeEnum,\n", ")\n", "\n", - "feature_definitions= [\n", - " FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL),\n", - " FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING),\n", - " FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL),\n", - "# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING)\n", + "feature_definitions = [\n", + " FeatureDefinition(feature_name=\"input_ids\", 
feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"input_mask\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"segment_ids\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"label_id\", feature_type=FeatureTypeEnum.INTEGRAL),\n", + " FeatureDefinition(feature_name=\"review_id\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"date\", feature_type=FeatureTypeEnum.STRING),\n", + " FeatureDefinition(feature_name=\"label\", feature_type=FeatureTypeEnum.INTEGRAL),\n", + " # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING)\n", "]" ] }, @@ -439,10 +440,7 @@ "source": [ "from sagemaker.feature_store.feature_group import FeatureGroup\n", "\n", - "feature_group = FeatureGroup(\n", - " name=feature_group_name, \n", - " feature_definitions=feature_definitions,\n", - " sagemaker_session=sess)\n", + "feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)\n", "\n", "print(feature_group)" ] @@ -462,13 +460,13 @@ }, "outputs": [], "source": [ - "processing_instance_type='ml.c5.2xlarge'\n", - "processing_instance_count=2\n", - "train_split_percentage=0.90\n", - "validation_split_percentage=0.05\n", - "test_split_percentage=0.05\n", - "balance_dataset=True\n", - "max_seq_length=64" + "processing_instance_type = \"ml.c5.2xlarge\"\n", + "processing_instance_count = 2\n", + "train_split_percentage = 0.90\n", + "validation_split_percentage = 0.05\n", + "test_split_percentage = 0.05\n", + "balance_dataset = True\n", + "max_seq_length = 64" ] }, { @@ -512,12 +510,14 @@ "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " env={'AWS_DEFAULT_REGION': region},\n", - " max_runtime_in_seconds=7200)" + "processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " env={\"AWS_DEFAULT_REGION\": region},\n", + " max_runtime_in_seconds=7200,\n", + ")" ] }, { @@ -528,35 +528,49 @@ "source": [ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", - "processor.run(code='preprocess-scikit-text-to-bert-feature-store.py',\n", - " inputs=[\n", - " ProcessingInput(input_name='raw-input-data',\n", - " source=raw_input_data_s3_uri,\n", - " destination='/opt/ml/processing/input/data/',\n", - " s3_data_distribution_type='ShardedByS3Key')\n", - " ],\n", - " outputs=[\n", - " ProcessingOutput(output_name='bert-train',\n", - " s3_upload_mode='EndOfJob', \n", - " source='/opt/ml/processing/output/bert/train'),\n", - " ProcessingOutput(output_name='bert-validation',\n", - " s3_upload_mode='EndOfJob', \n", - " source='/opt/ml/processing/output/bert/validation'),\n", - " ProcessingOutput(output_name='bert-test',\n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/bert/test'),\n", - " ],\n", - " arguments=['--train-split-percentage', str(train_split_percentage),\n", - " '--validation-split-percentage', str(validation_split_percentage),\n", - " '--test-split-percentage', str(test_split_percentage),\n", - " '--max-seq-length', str(max_seq_length),\n", - " '--balance-dataset', str(balance_dataset),\n", - " '--feature-store-offline-prefix', 
str(feature_store_offline_prefix),\n", - " '--feature-group-name', str(feature_group_name)\n", - " ],\n", - " experiment_config=experiment_config,\n", - " logs=True,\n", - " wait=False)" + "processor.run(\n", + " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n", + " inputs=[\n", + " ProcessingInput(\n", + " input_name=\"raw-input-data\",\n", + " source=raw_input_data_s3_uri,\n", + " destination=\"/opt/ml/processing/input/data/\",\n", + " s3_data_distribution_type=\"ShardedByS3Key\",\n", + " )\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(\n", + " output_name=\"bert-train\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/bert/train\"\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-validation\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/validation\",\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-test\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/bert/test\"\n", + " ),\n", + " ],\n", + " arguments=[\n", + " \"--train-split-percentage\",\n", + " str(train_split_percentage),\n", + " \"--validation-split-percentage\",\n", + " str(validation_split_percentage),\n", + " \"--test-split-percentage\",\n", + " str(test_split_percentage),\n", + " \"--max-seq-length\",\n", + " str(max_seq_length),\n", + " \"--balance-dataset\",\n", + " str(balance_dataset),\n", + " \"--feature-store-offline-prefix\",\n", + " str(feature_store_offline_prefix),\n", + " \"--feature-group-name\",\n", + " str(feature_group_name),\n", + " ],\n", + " experiment_config=experiment_config,\n", + " logs=True,\n", + " wait=False,\n", + ")" ] }, { @@ -567,7 +581,7 @@ }, "outputs": [], "source": [ - "scikit_processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n", + "scikit_processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n", "print(scikit_processing_job_name)" ] }, @@ -581,7 +595,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}\">Processing Job</a></b>'.format(region, scikit_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}\">Processing Job</a></b>'.format(\n", + " region, scikit_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -594,7 +614,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, scikit_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(\n", + " region, scikit_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -607,7 +633,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview\">S3 Output Data</a> After The Processing Job Has Completed</b>'.format(bucket, scikit_processing_job_name, region)))\n" + "display(\n", + " HTML(\n", + " '<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview\">S3 Output Data</a> After The Processing Job Has Completed</b>'.format(\n", + " bucket, scikit_processing_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -625,8 +657,9 @@ }, "outputs": [], "source": [ - "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=scikit_processing_job_name,\n", - " sagemaker_session=sess)\n", + "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n", + " processing_job_name=scikit_processing_job_name, sagemaker_session=sess\n", + ")\n", "\n", "processing_job_description = running_processor.describe()\n", "\n", @@ -668,15 +701,15 @@ "source": [ 
"processing_job_description = running_processor.describe()\n", "\n", - "output_config = processing_job_description['ProcessingOutputConfig']\n", - "for output in output_config['Outputs']:\n", - " if output['OutputName'] == 'bert-train':\n", - " processed_train_data_s3_uri = output['S3Output']['S3Uri']\n", - " if output['OutputName'] == 'bert-validation':\n", - " processed_validation_data_s3_uri = output['S3Output']['S3Uri']\n", - " if output['OutputName'] == 'bert-test':\n", - " processed_test_data_s3_uri = output['S3Output']['S3Uri']\n", - " \n", + "output_config = processing_job_description[\"ProcessingOutputConfig\"]\n", + "for output in output_config[\"Outputs\"]:\n", + " if output[\"OutputName\"] == \"bert-train\":\n", + " processed_train_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n", + " if output[\"OutputName\"] == \"bert-validation\":\n", + " processed_validation_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n", + " if output[\"OutputName\"] == \"bert-test\":\n", + " processed_test_data_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n", + "\n", "print(processed_train_data_s3_uri)\n", "print(processed_validation_data_s3_uri)\n", "print(processed_test_data_s3_uri)" @@ -879,9 +912,11 @@ "source": [ "query_string = \"\"\"\n", "SELECT input_ids, input_mask, segment_ids, label_id, split_type FROM \"{}\" WHERE split_type='train' LIMIT 5\n", - "\"\"\".format(feature_store_table)\n", + "\"\"\".format(\n", + " feature_store_table\n", + ")\n", "\n", - "print('Running ' + query_string)" + "print(\"Running \" + query_string)" ] }, { @@ -890,7 +925,10 @@ "metadata": {}, "outputs": [], "source": [ - "feature_store_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+feature_store_offline_prefix+'/query_results/')\n", + "feature_store_query.run(\n", + " query_string=query_string,\n", + " output_location=\"s3://\" + bucket + \"/\" + feature_store_offline_prefix + \"/query_results/\",\n", + ")\n", "\n", "feature_store_query.wait()" ] @@ -901,10 +939,10 @@ "metadata": {}, "outputs": [], "source": [ - "#import pandas as pd\n", - "#dataset = pd.DataFrame()\n", - "#dataset = feature_store_query.as_dataframe()\n", - "#dataset\n", + "# import pandas as pd\n", + "# dataset = pd.DataFrame()\n", + "# dataset = feature_store_query.as_dataframe()\n", + "# dataset\n", "\n", "feature_store_query.as_dataframe()" ] @@ -925,14 +963,12 @@ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", - "#pd.set_option(\"max_rows\", 100)\n", + "# pd.set_option(\"max_rows\", 100)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", - " sagemaker_session=sess,\n", - " experiment_name=experiment_name,\n", - " sort_by=\"CreationTime\",\n", - " sort_order=\"Descending\"\n", + " sagemaker_session=sess, experiment_name=experiment_name, sort_by=\"CreationTime\", sort_order=\"Descending\"\n", ")\n", "\n", "experiment_analytics_df = experiment_analytics.dataframe()\n", @@ -945,7 +981,7 @@ "metadata": {}, "outputs": [], "source": [ - "trial_component_name=experiment_analytics_df.TrialComponentName[0]\n", + "trial_component_name = experiment_analytics_df.TrialComponentName[0]\n", "print(trial_component_name)" ] }, @@ -955,7 +991,7 @@ "metadata": {}, "outputs": [], "source": [ - "trial_component_description=sm.describe_trial_component(TrialComponentName=trial_component_name)\n", + "trial_component_description = sm.describe_trial_component(TrialComponentName=trial_component_name)\n", "trial_component_description" ] }, diff --git 
a/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb b/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb index 33e1f4b5..13a63207 100644 --- a/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb +++ b/06_prepare/data-wrangler/DataWranglerJob_Antje.ipynb @@ -29,10 +29,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.17.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -159,6 +158,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -170,6 +170,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -187,6 +188,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -202,6 +204,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -218,29 +221,24 @@ " source_type = data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", " return processing_inputs\n", "\n", + "\n", "def create_container_arguments(output_name, output_content_type):\n", - " output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - " }\n", + " output_config = {output_name: {\"content_type\": output_content_type}}\n", " return [f\"--output-config '{json.dumps(output_config)}'\"]\n", "\n", + "\n", "# Create Processing Job Arguments\n", "processing_job_arguments = {\n", " \"AppSpecification\": {\n", @@ -256,7 +254,7 @@ " \"S3Uri\": output_path,\n", " \"LocalPath\": os.path.join(processing_dir, \"output\"),\n", " \"S3UploadMode\": \"EndOfJob\",\n", - " }\n", + " },\n", " },\n", " ],\n", " },\n", @@ -357,14 +355,11 @@ "region = boto3.Session().region_name\n", "container = sagemaker.image_uris.retrieve(\"xgboost\", region, \"1.2-1\")\n", "hyperparameters = {\n", - " \"max_depth\":\"5\",\n", + " \"max_depth\": \"5\",\n", " \"objective\": \"reg:squarederror\",\n", " \"num_round\": \"10\",\n", "}\n", - "train_content_type = (\n", - " \"application/x-parquet\" if output_content_type.upper() == 
\"PARQUET\"\n", - " else \"text/csv\"\n", - ")\n", + "train_content_type = \"application/x-parquet\" if output_content_type.upper() == \"PARQUET\" else \"text/csv\"\n", "train_input = sagemaker.inputs.TrainingInput(\n", " s3_data=f\"s3://{bucket}/{training_path}\",\n", " content_type=train_content_type,\n", diff --git a/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb b/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb index 101c34b6..ce009003 100644 --- a/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb +++ b/06_prepare/data-wrangler/DataWrangler_To_FeatureStore_Antje.ipynb @@ -50,10 +50,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.17.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -165,8 +164,8 @@ } ], "source": [ - "feature_group_name = f'FG-{flow_name}'\n", - "print(f\"Feature Group Name: {feature_group_name}\")\n" + "feature_group_name = f\"FG-{flow_name}\"\n", + "print(f\"Feature Group Name: {feature_group_name}\")" ] }, { @@ -185,15 +184,12 @@ "metadata": {}, "outputs": [], "source": [ - "datawrangler_FG_type_mapping = {\n", - " 'float': 'Fractional',\n", - " 'long': 'Integral'\n", - "}\n", + "datawrangler_FG_type_mapping = {\"float\": \"Fractional\", \"long\": \"Integral\"}\n", "\n", "# Some schema types in Data Wrangler are not supported by Feature Store.\n", "# Feature store supports String, Integral, and Fractional types.\n", "# The following will create a default_FG_type set to String for these types.\n", - "default_FG_type = \"String\"\n" + "default_FG_type = \"String\"" ] }, { @@ -211,71 +207,23 @@ "outputs": [], "source": [ "column_schema = [\n", - " {\n", - " \"name\": \"marketplace\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"customer_id\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"review_id\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"product_id\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"product_parent\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"product_title\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"product_category\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"vine\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"verified_purchase\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"review_headline\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"review_body\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"review_date\",\n", - " \"type\": \"date\"\n", - " },\n", - " {\n", - " \"name\": \"star_rating\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"helpful_votes\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"total_votes\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"star_rating_scaled\",\n", - " \"type\": \"float\"\n", - " }\n", - "]\n" + " {\"name\": \"marketplace\", \"type\": \"string\"},\n", + " {\"name\": \"customer_id\", \"type\": \"long\"},\n", + " {\"name\": \"review_id\", \"type\": \"string\"},\n", + " {\"name\": \"product_id\", \"type\": \"string\"},\n", + " {\"name\": 
\"product_parent\", \"type\": \"long\"},\n", + " {\"name\": \"product_title\", \"type\": \"string\"},\n", + " {\"name\": \"product_category\", \"type\": \"string\"},\n", + " {\"name\": \"vine\", \"type\": \"string\"},\n", + " {\"name\": \"verified_purchase\", \"type\": \"string\"},\n", + " {\"name\": \"review_headline\", \"type\": \"string\"},\n", + " {\"name\": \"review_body\", \"type\": \"string\"},\n", + " {\"name\": \"review_date\", \"type\": \"date\"},\n", + " {\"name\": \"star_rating\", \"type\": \"long\"},\n", + " {\"name\": \"helpful_votes\", \"type\": \"long\"},\n", + " {\"name\": \"total_votes\", \"type\": \"long\"},\n", + " {\"name\": \"star_rating_scaled\", \"type\": \"float\"},\n", + "]" ] }, { @@ -305,25 +253,20 @@ } ], "source": [ - "record_identifier_name = 'review_id'\n", + "record_identifier_name = \"review_id\"\n", "if record_identifier_name is None:\n", - " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n", + " raise RuntimeError(\"Select a column name as the feature group identifier.\")\n", "\n", - "event_time_feature_name = 'review_date'\n", + "event_time_feature_name = \"review_date\"\n", "if event_time_feature_name is None:\n", - " raise RuntimeError(\"Select a column name as the event time feature name.\")\n", + " raise RuntimeError(\"Select a column name as the event time feature name.\")\n", "\n", "# Below you map the schema detected from Data Wrangler to Feature Group Types.\n", "feature_definitions = [\n", - " {\n", - " \"FeatureName\": schema['name'],\n", - " \"FeatureType\": datawrangler_FG_type_mapping.get(\n", - " schema['type'],\n", - " default_FG_type\n", - " )\n", - " } for schema in column_schema\n", + " {\"FeatureName\": schema[\"name\"], \"FeatureType\": datawrangler_FG_type_mapping.get(schema[\"type\"], default_FG_type)}\n", + " for schema in column_schema\n", "]\n", - "print(feature_definitions)\n" + "print(feature_definitions)" ] }, { @@ -358,38 +301,33 @@ "sagemaker_client = boto3.client(\"sagemaker\", endpoint_url=sagemaker_endpoint_url)\n", "\n", "# Online Store Configuration\n", - "online_store_config = {\n", - " \"EnableOnlineStore\": True\n", - "}\n", + "online_store_config = {\"EnableOnlineStore\": True}\n", "\n", "# Offline Store Configuration\n", - "s3_uri = 's3://' + bucket # this is the default bucket defined in previous cells\n", - "offline_store_config = {\n", - " \"S3StorageConfig\": {\n", - " \"S3Uri\": s3_uri\n", - " }\n", - "}\n", + "s3_uri = \"s3://\" + bucket # this is the default bucket defined in previous cells\n", + "offline_store_config = {\"S3StorageConfig\": {\"S3Uri\": s3_uri}}\n", "\n", "# Create Feature Group\n", "create_fg_response = sagemaker_client.create_feature_group(\n", - " FeatureGroupName = feature_group_name,\n", - " EventTimeFeatureName = event_time_feature_name,\n", - " RecordIdentifierFeatureName = record_identifier_name,\n", - " FeatureDefinitions = feature_definitions,\n", - " OnlineStoreConfig = online_store_config,\n", - " OfflineStoreConfig = offline_store_config,\n", - " RoleArn = iam_role)\n", + " FeatureGroupName=feature_group_name,\n", + " EventTimeFeatureName=event_time_feature_name,\n", + " RecordIdentifierFeatureName=record_identifier_name,\n", + " FeatureDefinitions=feature_definitions,\n", + " OnlineStoreConfig=online_store_config,\n", + " OfflineStoreConfig=offline_store_config,\n", + " RoleArn=iam_role,\n", + ")\n", "\n", "# Describe Feature Group\n", "status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n", - "while 
status['FeatureGroupStatus'] != 'Created':\n", - " if status['FeatureGroupStatus'] == 'CreateFailed':\n", + "while status[\"FeatureGroupStatus\"] != \"Created\":\n", + " if status[\"FeatureGroupStatus\"] == \"CreateFailed\":\n", " raise RuntimeError(f\"Feature Group Creation Failed: {status}\")\n", " status = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)\n", - " print(\"Feature Group Status: \" + status['FeatureGroupStatus'])\n", + " print(\"Feature Group Status: \" + status[\"FeatureGroupStatus\"])\n", " time.sleep(3)\n", "\n", - "print(status)\n" + "print(status)" ] }, { @@ -417,6 +355,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -428,6 +367,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -445,6 +385,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -460,6 +401,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -476,16 +418,13 @@ " source_type = data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + " redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", @@ -533,48 +472,40 @@ ], "source": [ "# Processing job name\n", - "print(f'Processing Job Name: {processing_job_name}')\n", - "\n", - "processingResources = {\n", - " 'ClusterConfig': {\n", - " 'InstanceCount': 1,\n", - " 'InstanceType': 'ml.m5.4xlarge',\n", - " 'VolumeSizeInGB': 30\n", - " }\n", - " }\n", + "print(f\"Processing Job Name: {processing_job_name}\")\n", "\n", - "appSpecification = {'ImageUri': container_uri}\n", + "processingResources = {\"ClusterConfig\": {\"InstanceCount\": 1, \"InstanceType\": \"ml.m5.4xlarge\", \"VolumeSizeInGB\": 30}}\n", + "\n", + "appSpecification = {\"ImageUri\": container_uri}\n", "\n", "sagemaker_client.create_processing_job(\n", - " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n", - " ProcessingOutputConfig={\n", - " 'Outputs': [\n", - " {\n", - " 'OutputName': 'e880c72f-910c-4554-9a28-a66ce9d3b35f.default',\n", - " 'FeatureStoreOutput': {\n", - " 'FeatureGroupName': feature_group_name\n", - " },\n", - " 'AppManaged': True\n", - " }\n", - " ],\n", - " },\n", - " ProcessingJobName=processing_job_name,\n", - " ProcessingResources=processingResources,\n", - " 
AppSpecification=appSpecification,\n", - " RoleArn=iam_role\n", - " )\n", + " ProcessingInputs=create_processing_inputs(processing_dir, flow, flow_uri),\n", + " ProcessingOutputConfig={\n", + " \"Outputs\": [\n", + " {\n", + " \"OutputName\": \"e880c72f-910c-4554-9a28-a66ce9d3b35f.default\",\n", + " \"FeatureStoreOutput\": {\"FeatureGroupName\": feature_group_name},\n", + " \"AppManaged\": True,\n", + " }\n", + " ],\n", + " },\n", + " ProcessingJobName=processing_job_name,\n", + " ProcessingResources=processingResources,\n", + " AppSpecification=appSpecification,\n", + " RoleArn=iam_role,\n", + ")\n", "\n", "\n", "status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n", "\n", - "while status['ProcessingJobStatus'] in ('InProgress', 'Failed'):\n", - " if status['ProcessingJobStatus'] == 'Failed':\n", + "while status[\"ProcessingJobStatus\"] in (\"InProgress\", \"Failed\"):\n", + " if status[\"ProcessingJobStatus\"] == \"Failed\":\n", " raise RuntimeError(f\"Processing Job failed: {status}\")\n", " status = sagemaker_client.describe_processing_job(ProcessingJobName=processing_job_name)\n", - " print(status['ProcessingJobStatus'])\n", + " print(status[\"ProcessingJobStatus\"])\n", " time.sleep(60)\n", "\n", - "print(status)\n" + "print(status)" ] }, { diff --git a/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb b/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb index 9d5b8a76..d79a7219 100644 --- a/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb +++ b/06_prepare/data-wrangler/DataWrangler_To_Pipeline_Antje.ipynb @@ -46,10 +46,9 @@ "\n", "original_version = sagemaker.__version__\n", "if sagemaker.__version__ != \"2.17.0\":\n", - " subprocess.check_call(\n", - " [sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"]\n", - " )\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.17.0\"])\n", " import importlib\n", + "\n", " importlib.reload(sagemaker)" ] }, @@ -184,6 +183,7 @@ " },\n", " }\n", "\n", + "\n", "def create_s3_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -195,6 +195,7 @@ " },\n", " }\n", "\n", + "\n", "def create_redshift_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -212,6 +213,7 @@ " },\n", " }\n", "\n", + "\n", "def create_athena_processing_input(base_dir, name, dataset_definition):\n", " return {\n", " \"InputName\": name,\n", @@ -227,6 +229,7 @@ " },\n", " }\n", "\n", + "\n", "def create_processing_inputs(processing_dir, flow, flow_uri):\n", " \"\"\"Helper function for creating processing inputs\n", " :param flow: loaded data wrangler flow notebook\n", @@ -243,29 +246,24 @@ " source_type = data_def[\"datasetSourceType\"]\n", "\n", " if source_type == \"S3\":\n", - " s3_processing_input = create_s3_processing_input(\n", - " processing_dir, name, data_def)\n", + " s3_processing_input = create_s3_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(s3_processing_input)\n", " elif source_type == \"Athena\":\n", - " athena_processing_input = create_athena_processing_input(\n", - " processing_dir, name, data_def)\n", + " athena_processing_input = create_athena_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(athena_processing_input)\n", " elif source_type == \"Redshift\":\n", - " redshift_processing_input = create_redshift_processing_input(\n", - " processing_dir, name, data_def)\n", + 
" redshift_processing_input = create_redshift_processing_input(processing_dir, name, data_def)\n", " processing_inputs.append(redshift_processing_input)\n", " else:\n", " raise ValueError(f\"{source_type} is not supported for Data Wrangler Processing.\")\n", " return processing_inputs\n", "\n", + "\n", "def create_container_arguments(output_name, output_content_type):\n", - " output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - " }\n", + " output_config = {output_name: {\"content_type\": output_content_type}}\n", " return [f\"--output-config '{json.dumps(output_config)}'\"]\n", "\n", + "\n", "# Create Processing Job Arguments\n", "processing_job_arguments = {\n", " \"AppSpecification\": {\n", @@ -281,7 +279,7 @@ " \"S3Uri\": output_path,\n", " \"LocalPath\": os.path.join(processing_dir, \"output\"),\n", " \"S3UploadMode\": \"EndOfJob\",\n", - " }\n", + " },\n", " },\n", " ],\n", " },\n", @@ -315,8 +313,8 @@ "source": [ "from sagemaker.workflow.steps import ProcessingStep, Step, StepTypeEnum\n", "\n", - "class NaiveStep(Step):\n", "\n", + "class NaiveStep(Step):\n", " def __init__(self, name, step_type: StepTypeEnum, step_args):\n", " self.name = name\n", " self.step_type = step_type\n", @@ -329,18 +327,12 @@ " raise NotImplementedError()\n", "\n", " def to_request(self):\n", - " return {\n", - " 'Name': self.name,\n", - " 'Type': self.step_type.value,\n", - " 'Arguments': self.step_args\n", - " }\n", + " return {\"Name\": self.name, \"Type\": self.step_type.value, \"Arguments\": self.step_args}\n", "\n", "\n", "step_process = NaiveStep(\n", - " name=\"DataWranglerProcessingStep\",\n", - " step_type=StepTypeEnum.PROCESSING,\n", - " step_args=processing_job_arguments\n", - ")\n" + " name=\"DataWranglerProcessingStep\", step_type=StepTypeEnum.PROCESSING, step_args=processing_job_arguments\n", + ")" ] }, { @@ -390,8 +382,8 @@ " name=pipeline_name,\n", " parameters=[instance_type, instance_count],\n", " steps=[step_process],\n", - " sagemaker_session=sagemaker_session\n", - ")\n" + " sagemaker_session=sagemaker_session,\n", + ")" ] }, { @@ -411,7 +403,7 @@ "\n", "\n", "definition = json.loads(pipeline.definition())\n", - "definition\n" + "definition" ] }, { @@ -443,7 +435,7 @@ " raise\n", "\n", "pipeline_arn = response[\"PipelineArn\"]\n", - "print(pipeline_arn)\n" + "print(pipeline_arn)" ] }, { @@ -467,7 +459,7 @@ "source": [ "start_response = pipeline.start()\n", "pipeline_execution_arn = start_response.arn\n", - "print(pipeline_execution_arn)\n" + "print(pipeline_execution_arn)" ] }, { @@ -503,7 +495,7 @@ ")\n", "execution_steps = execution_steps_response[\"PipelineExecutionSteps\"]\n", "print(\"Execution steps:\")\n", - "pprint(execution_steps)\n" + "pprint(execution_steps)" ] }, { @@ -525,33 +517,33 @@ "\n", "def get_waiter(pipeline, delay=24, max_attempts=60):\n", " waiter_id = \"PipelineExecutionComplete\"\n", - " model = botocore.waiter.WaiterModel({\n", - " \"version\": 2,\n", - " \"waiters\": {\n", - " waiter_id: {\n", - " \"delay\": delay,\n", - " \"maxAttempts\": max_attempts,\n", - " \"operation\": 'DescribePipelineExecution',\n", - " \"acceptors\": [\n", - " {\n", - " \"expected\": \"Succeeded\",\n", - " \"matcher\": \"path\",\n", - " \"state\": \"success\",\n", - " \"argument\": \"PipelineExecutionStatus\"\n", - " },\n", - " {\n", - " \"expected\": \"Failed\",\n", - " \"matcher\": \"path\",\n", - " \"state\": \"failure\",\n", - " \"argument\": \"PipelineExecutionStatus\"\n", - " },\n", - " ]\n", - " }\n", + " model = 
botocore.waiter.WaiterModel(\n", + " {\n", + " \"version\": 2,\n", + " \"waiters\": {\n", + " waiter_id: {\n", + " \"delay\": delay,\n", + " \"maxAttempts\": max_attempts,\n", + " \"operation\": \"DescribePipelineExecution\",\n", + " \"acceptors\": [\n", + " {\n", + " \"expected\": \"Succeeded\",\n", + " \"matcher\": \"path\",\n", + " \"state\": \"success\",\n", + " \"argument\": \"PipelineExecutionStatus\",\n", + " },\n", + " {\n", + " \"expected\": \"Failed\",\n", + " \"matcher\": \"path\",\n", + " \"state\": \"failure\",\n", + " \"argument\": \"PipelineExecutionStatus\",\n", + " },\n", + " ],\n", + " }\n", + " },\n", " }\n", - " })\n", - " return botocore.waiter.create_waiter_with_client(\n", - " waiter_id, model, sagemaker_session.sagemaker_client\n", - " )\n" + " )\n", + " return botocore.waiter.create_waiter_with_client(waiter_id, model, sagemaker_session.sagemaker_client)" ] }, { @@ -561,7 +553,7 @@ "outputs": [], "source": [ "waiter = get_waiter(pipeline)\n", - "waiter.wait(PipelineExecutionArn=pipeline_execution_arn)\n" + "waiter.wait(PipelineExecutionArn=pipeline_execution_arn)" ] }, { @@ -575,7 +567,7 @@ ")\n", "execution_steps = execution_steps_response[\"PipelineExecutionSteps\"]\n", "print(\"Execution steps:\")\n", - "pprint(execution_steps)\n" + "pprint(execution_steps)" ] }, { diff --git a/06_prepare/data-wrangler/data_wrangler_antje.py b/06_prepare/data-wrangler/data_wrangler_antje.py index 81c56f8c..506f3cff 100644 --- a/06_prepare/data-wrangler/data_wrangler_antje.py +++ b/06_prepare/data-wrangler/data_wrangler_antje.py @@ -1,10 +1,12 @@ from pyspark.sql.session import SparkSession from pyspark.sql.dataframe import DataFrame + # You may want to configure the Spark Context with the right credentials provider. -spark = SparkSession.builder.master('local').getOrCreate() +spark = SparkSession.builder.master("local").getOrCreate() mode = None + def capture_stdout(func, *args, **kwargs): """Capture standard output to a string buffer""" @@ -54,7 +56,7 @@ def default_spark_with_trained_parameters_and_state(df, trained_parameters, stat def dispatch(key_name, args, kwargs, funcs): """ - Dispatches to another operator based on a key in the passed parameters. + Dispatches to another operator based on a key in the passed parameters. This also slices out any parameters using the parameter_name passed in, and will reassemble the trained_parameters correctly after invocation. @@ -98,7 +100,9 @@ def dispatch(key_name, args, kwargs, funcs): updated_trained_parameters = result["trained_parameters"] if existing_trained_parameters is not None or updated_trained_parameters is not None: - existing_trained_parameters = existing_trained_parameters if existing_trained_parameters is not None else {} + existing_trained_parameters = ( + existing_trained_parameters if existing_trained_parameters is not None else {} + ) existing_trained_parameters[parameter_name] = result["trained_parameters"] # Update the result trained_parameters so they are part of the original structure. 
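# A minimal sketch of the dispatch pattern reformatted in the hunk above, with
# illustrative names rather than the module's exact internals: route on a key in
# kwargs, hand the chosen operator its own parameter bucket, and re-nest any
# updated trained_parameters into the original structure for the caller.
def dispatch_sketch(key_name, args, kwargs, funcs):
    operator = kwargs[key_name]  # e.g. "Scale values"
    func, parameter_name = funcs[operator]  # implementation and its kwarg bucket
    func_params = dict(kwargs.get(parameter_name) or {})
    trained = kwargs.get("trained_parameters") or {}
    func_params["trained_parameters"] = trained.get(parameter_name)
    result = func(*args, **func_params)  # e.g. {"default": df, "trained_parameters": {...}}
    if result.get("trained_parameters") is not None or trained:
        trained[parameter_name] = result.get("trained_parameters")
        result["trained_parameters"] = trained  # reassembled into the original structure
    return result
# Usage mirrors the generated calls further below, e.g.:
# dispatch_sketch("operator", [df], kwargs,
#                 {"Scale values": (process_numeric_scale_values, "scale_values_parameters")})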
@@ -132,6 +136,7 @@ class OperatorCustomerError(Exception): from sagemaker_dataprep.compute.operators.utils import ( dispatch, default_spark_with_trained_parameters, +) from pyspark.ml.feature import ( VectorAssembler, StandardScaler, @@ -156,7 +161,9 @@ def process_numeric_standard_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) @@ -210,7 +217,9 @@ def process_numeric_robust_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) @@ -266,14 +275,21 @@ def process_numeric_min_max_scaler( process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) temp_normalized_vector_col = temp_col_name(assembled) trained_parameters = load_trained_parameters( - trained_parameters, {"input_column": input_column, "min": min, "max": max,} + trained_parameters, + { + "input_column": input_column, + "min": min, + "max": max, + }, ) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( @@ -311,13 +327,20 @@ def process_numeric_max_absolute_scaler(df, input_column=None, output_column=Non process_numeric_expects_numeric_column(df, input_column) temp_vector_col = temp_col_name(df) - assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform(df) + assembled = VectorAssembler(inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="keep").transform( + df + ) assembled_wo_nans = VectorAssembler( inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) temp_normalized_vector_col = temp_col_name(assembled) - trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,}) + trained_parameters = load_trained_parameters( + trained_parameters, + { + "input_column": input_column, + }, + ) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( trained_parameters, MinMaxScalerModel, "scaler_model" @@ -414,7 +437,9 @@ def athena_start_query_execution_core(client, request): try: result = client.start_query_execution(**request) except Exception as e: - raise RuntimeError(f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}") + raise RuntimeError( + f"An error ({type(e).__name__}) occurred when trying to invoke `start_query_execution`: {e}" + ) return result @@ -502,7 +527,10 @@ def 
athena_start_query_execution(dataset_definition, client): query_request = { "QueryString": ctas_query, - "QueryExecutionContext": {"Database": database_name, "Catalog": catalog_name,}, + "QueryExecutionContext": { + "Database": database_name, + "Catalog": catalog_name, + }, "ResultConfiguration": {"OutputLocation": metadata_s3_output_location}, } logging.debug("Query request is: %s", query_request) @@ -674,8 +702,13 @@ def cast_single_column_type( # | 2|None| bar | # | 3| 1 | | # +---+----+------------------+ - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) - df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) + df = df.withColumn( + non_castable_column, + f.when(df[temp_column].isNotNull(), "").otherwise(df[column]), + ) elif invalid_data_handling_method == NonCastableDataHandlingMethod.REPLACE_WITH_FIXED_VALUE: # Replace non-castable data to a value in the same column # Original dataframe @@ -696,7 +729,9 @@ def cast_single_column_type( # +---+----+ value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type) - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise( f.to_date(f.lit(value), date_formatting) @@ -729,8 +764,13 @@ def cast_single_column_type( # +---+----+------------------+ value = _validate_and_cast_value(value=replace_value, mohave_data_type=mohave_data_type) - df = df.withColumn(temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date) - df = df.withColumn(non_castable_column, f.when(df[temp_column].isNotNull(), "").otherwise(df[column]),) + df = df.withColumn( + temp_column, cast_to_date if (mohave_data_type == MohaveDataType.DATE) else cast_to_non_date + ) + df = df.withColumn( + non_castable_column, + f.when(df[temp_column].isNotNull(), "").otherwise(df[column]), + ) replace_date_value = f.when(df[temp_column].isNotNull(), df[temp_column]).otherwise( f.to_date(f.lit(value), date_formatting) @@ -782,8 +822,7 @@ class OperatorSparkOperatorCustomerError(Exception): def temp_col_name(df, *illegal_names): - """Generates a temporary column name that is unused. - """ + """Generates a temporary column name that is unused.""" name = "temp_col" idx = 0 name_set = set(list(df.columns) + list(illegal_names)) @@ -795,8 +834,7 @@ def temp_col_name(df, *illegal_names): def get_temp_col_if_not_set(df, col_name): - """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name. - """ + """Extracts the column name from the parameters if it exists, otherwise generates a temporary column name.""" if col_name: return col_name, False else: @@ -806,7 +844,7 @@ def get_temp_col_if_not_set(df, col_name): def replace_input_if_output_is_temp(df, input_column, output_column, output_is_temp): """Replaces the input column in the dataframe if the output was not set - This is used with get_temp_col_if_not_set to enable the behavior where a + This is used with get_temp_col_if_not_set to enable the behavior where a transformer will replace its input column if an output is not specified. 
""" if output_is_temp: @@ -846,7 +884,9 @@ def expects_valid_column_name(value, key, nullable=False): return if value is None or len(str(value).strip()) == 0: - raise OperatorSparkOperatorCustomerError(f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}") + raise OperatorSparkOperatorCustomerError( + f"Column name cannot be null, empty, or whitespace for parameter '{key}': {value}" + ) def expects_parameter(value, key, condition=None): @@ -858,12 +898,16 @@ def expects_parameter(value, key, condition=None): def expects_column(df, value, key): if not value or value not in df.columns: - raise OperatorSparkOperatorCustomerError(f"Expected column in dataframe for '{key}' however received '{value}'") + raise OperatorSparkOperatorCustomerError( + f"Expected column in dataframe for '{key}' however received '{value}'" + ) def expects_parameter_value_in_list(key, value, items): if value not in items: - raise OperatorSparkOperatorCustomerError(f"Illegal parameter value. {key} expected to be in {items}, but given {value}") + raise OperatorSparkOperatorCustomerError( + f"Illegal parameter value. {key} expected to be in {items}, but given {value}" + ) def encode_pyspark_model(model): @@ -966,7 +1010,6 @@ def transform_using_trained_model(model, df, loaded): ) - def type_inference(df): # noqa: C901 # pylint: disable=R0912 """Core type inference logic @@ -1237,7 +1280,9 @@ def athena_source(spark, mode, dataset_definition, trained_parameters=None): # trained_parameters["ctas_table_name"] = "" try: return default_spark_with_trained_parameters_and_state( - df=spark.read.parquet(path), trained_parameters=trained_parameters, state=get_execution_state(state), + df=spark.read.parquet(path), + trained_parameters=trained_parameters, + state=get_execution_state(state), ) except Exception as e: raise RuntimeError( @@ -1291,7 +1336,12 @@ def infer_and_cast_type(df, spark, inference_data_sample_size=1000, trained_para def process_numeric(df, spark, **kwargs): return dispatch( - "operator", [df], kwargs, {"Scale values": (process_numeric_scale_values, "scale_values_parameters"),}, + "operator", + [df], + kwargs, + { + "Scale values": (process_numeric_scale_values, "scale_values_parameters"), + }, ) @@ -1303,14 +1353,48 @@ def custom_formula(df, spark, formula, output_column=None): return default_spark(output_df) -op_1_output = athena_source(spark=spark, mode=mode, **{'dataset_definition': {'datasetSourceType': 'Athena', 'name': 'amazon-reviews-pds-tsv', 'catalogName': 'AwsDataCatalog', 'databaseName': 'dsoaws', 'queryString': 'select * from amazon_reviews_tsv', 's3OutputLocation': 's3://sagemaker-us-east-1-806570384721/athena/', 'outputFormat': 'parquet'}}) -op_2_output = infer_and_cast_type(op_1_output['default'], spark=spark, **{}) -op_3_output = process_numeric(op_2_output['default'], spark=spark, **{'operator': 'Scale values', 'scale_values_parameters': {'scaler': 'Min-max scaler', 'min_max_scaler_parameters': {'min': -1, 'max': 1, 'input_column': 'star_rating', 'output_column': 'star_rating_scaled'}, 'standard_scaler_parameters': {}}}) -op_4_output = custom_formula(op_3_output['default'], spark=spark, **{'output_column': 'star_rating_scaled_floored', 'formula': 'floor(star_rating_scaled)'}) +op_1_output = athena_source( + spark=spark, + mode=mode, + **{ + "dataset_definition": { + "datasetSourceType": "Athena", + "name": "amazon-reviews-pds-tsv", + "catalogName": "AwsDataCatalog", + "databaseName": "dsoaws", + "queryString": "select * from amazon_reviews_tsv", + "s3OutputLocation": 
"s3://sagemaker-us-east-1-806570384721/athena/", + "outputFormat": "parquet", + } + }, +) +op_2_output = infer_and_cast_type(op_1_output["default"], spark=spark, **{}) +op_3_output = process_numeric( + op_2_output["default"], + spark=spark, + **{ + "operator": "Scale values", + "scale_values_parameters": { + "scaler": "Min-max scaler", + "min_max_scaler_parameters": { + "min": -1, + "max": 1, + "input_column": "star_rating", + "output_column": "star_rating_scaled", + }, + "standard_scaler_parameters": {}, + }, + }, +) +op_4_output = custom_formula( + op_3_output["default"], + spark=spark, + **{"output_column": "star_rating_scaled_floored", "formula": "floor(star_rating_scaled)"}, +) # Glossary: variable name to node_id # # op_1_output: d46ffe0e-f774-4ecc-bdbf-40a708832774 # op_2_output: b1cdf334-0f01-40e6-819b-5806e59d41e6 # op_3_output: e880c72f-910c-4554-9a28-a66ce9d3b35f -# op_4_output: 969f0c55-dbfe-4658-88fc-15d4de6762e0 \ No newline at end of file +# op_4_output: 969f0c55-dbfe-4658-88fc-15d4de6762e0 diff --git a/06_prepare/preprocess-scikit-text-to-bert-feature-store.py b/06_prepare/preprocess-scikit-text-to-bert-feature-store.py index 1211ba85..7e1cd385 100644 --- a/06_prepare/preprocess-scikit-text-to-bert-feature-store.py +++ b/06_prepare/preprocess-scikit-text-to-bert-feature-store.py @@ -20,16 +20,18 @@ import subprocess ## PIP INSTALLS ## -# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to +# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to # use anaconda and anaconda only supports 2.3.0 at this time -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"]) import pandas as pd import re import sagemaker @@ -40,51 +42,55 @@ FeatureTypeEnum, ) -region = os.environ['AWS_DEFAULT_REGION'] -print('Region: {}'.format(region)) +region = os.environ["AWS_DEFAULT_REGION"] +print("Region: {}".format(region)) ############################# ## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc. ## Role and Bucket are malformed if we do this later. 
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region) +sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region) caller_identity = sts.get_caller_identity() -print('caller_identity: {}'.format(caller_identity)) +print("caller_identity: {}".format(caller_identity)) -assumed_role_arn = caller_identity['Arn'] -print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn)) +assumed_role_arn = caller_identity["Arn"] +print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn)) -assumed_role_name = assumed_role_arn.split('/')[-2] +assumed_role_name = assumed_role_arn.split("/")[-2] -iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region) -get_role_response = iam.get_role(RoleName=assumed_role_name) -print('get_role_response {}'.format(get_role_response)) -role = get_role_response['Role']['Arn'] -print('role {}'.format(role)) +iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region) +get_role_response = iam.get_role(RoleName=assumed_role_name) +print("get_role_response {}".format(get_role_response)) +role = get_role_response["Role"]["Arn"] +print("role {}".format(role)) bucket = sagemaker.Session().default_bucket() -print('The DEFAULT BUCKET is {}'.format(bucket)) +print("The DEFAULT BUCKET is {}".format(bucket)) ############################# -sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region) +sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region) -featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region) +featurestore_runtime = boto3.Session(region_name=region).client( + service_name="sagemaker-featurestore-runtime", region_name=region +) -s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region) +s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region) -sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region), - sagemaker_client=sm, - sagemaker_featurestore_runtime_client=featurestore_runtime) +sagemaker_session = sagemaker.Session( + boto_session=boto3.Session(region_name=region), + sagemaker_client=sm, + sagemaker_featurestore_runtime_client=featurestore_runtime, +) -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -REVIEW_BODY_COLUMN = 'review_body' -REVIEW_ID_COLUMN = 'review_id' +REVIEW_BODY_COLUMN = "review_body" +REVIEW_ID_COLUMN = "review_id" # DATE_COLUMN = 'date' -LABEL_COLUMN = 'star_rating' +LABEL_COLUMN = "star_rating" LABEL_VALUES = [1, 2, 3, 4, 5] - + label_map = {} for (i, label) in enumerate(LABEL_VALUES): label_map[label] = i @@ -92,94 +98,88 @@ def cast_object_to_string(data_frame): for label in data_frame.columns: - if data_frame.dtypes[label] == 'object': + if data_frame.dtypes[label] == "object": data_frame[label] = data_frame[label].astype("str").astype("string") return data_frame - + def wait_for_feature_group_creation_complete(feature_group): try: status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: 
{}'.format(status)) + print("Feature Group status: {}".format(status)) if status != "Created": - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.") except: - print('No feature group created yet.') - - + print("No feature group created yet.") + + def create_or_load_feature_group(prefix, feature_group_name): # Feature Definitions for our records - feature_definitions= [ - FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL), -# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) + feature_definitions = [ + FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL), + # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING), ] - + feature_group = FeatureGroup( - name=feature_group_name, - feature_definitions=feature_definitions, - sagemaker_session=sagemaker_session) - - print('Feature Group: {}'.format(feature_group)) - - try: - print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...') + name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session + ) + + print("Feature Group: {}".format(feature_group)) + + try: + print( + "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..." + ) wait_for_feature_group_creation_complete(feature_group) except Exception as e: - print('Before CREATE FG wait exeption: {}'.format(e)) -# pass - + print("Before CREATE FG wait exeption: {}".format(e)) + # pass + try: record_identifier_feature_name = "review_id" event_time_feature_name = "date" - - print('Creating Feature Group with role {}...'.format(role)) + + print("Creating Feature Group with role {}...".format(role)) feature_group.create( s3_uri=f"s3://{bucket}/{prefix}", record_identifier_name=record_identifier_feature_name, event_time_feature_name=event_time_feature_name, role_arn=role, - enable_online_store=True + enable_online_store=True, ) - print('Creating Feature Group. Completed.') - - print('Waiting for new Feature Group to become available...') + print("Creating Feature Group. 
Completed.") + + print("Waiting for new Feature Group to become available...") wait_for_feature_group_creation_complete(feature_group) - print('Feature Group available.') + print("Feature Group available.") feature_group.describe() - + except Exception as e: - print('Exception: {}'.format(e)) - + print("Exception: {}".format(e)) + return feature_group - + class InputFeatures(object): - """BERT feature vectors.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - review_id, - date, - label): -# review_body): + """BERT feature vectors.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label): + # review_body): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids @@ -187,36 +187,38 @@ def __init__(self, self.review_id = review_id self.date = date self.label = label + + # self.review_body = review_body - - + + class Input(object): - """A single training/test input for sequence classification.""" - - def __init__(self, text, review_id, date, label=None): - """Constructs an Input. - Args: - text: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.text = text - self.review_id = review_id - self.date = date - self.label = label - - + """A single training/test input for sequence classification.""" + + def __init__(self, text, review_id, date, label=None): + """Constructs an Input. + Args: + text: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.text = text + self.review_id = review_id + self.date = date + self.label = label + + def convert_input(the_input, max_seq_length): # First, we need to preprocess our data so that it matches the data BERT was trained on: # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) - # + # # Fortunately, the Transformers tokenizer does this for us! # - tokens = tokenizer.tokenize(the_input.text) + tokens = tokenizer.tokenize(the_input.text) # Next, we need to do the following: # @@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length): # # Again, the Transformers tokenizer does this for us! # - encode_plus_tokens = tokenizer.encode_plus(the_input.text, - pad_to_max_length=True, - max_length=max_seq_length, -# truncation=True - ) + encode_plus_tokens = tokenizer.encode_plus( + the_input.text, + pad_to_max_length=True, + max_length=max_seq_length, + # truncation=True + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] - - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + input_ids = encode_plus_tokens["input_ids"] + + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. 
+ input_mask = encode_plus_tokens["attention_mask"] # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction. segment_ids = [0] * max_seq_length @@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length): label_id=label_id, review_id=the_input.review_id, date=the_input.date, - label=the_input.label) -# review_body=the_input.text) - -# print('**input_ids**\n{}\n'.format(features.input_ids)) -# print('**input_mask**\n{}\n'.format(features.input_mask)) -# print('**segment_ids**\n{}\n'.format(features.segment_ids)) -# print('**label_id**\n{}\n'.format(features.label_id)) -# print('**review_id**\n{}\n'.format(features.review_id)) -# print('**date**\n{}\n'.format(features.date)) -# print('**label**\n{}\n'.format(features.label)) -# print('**review_body**\n{}\n'.format(features.review_body)) + label=the_input.label, + ) + # review_body=the_input.text) + + # print('**input_ids**\n{}\n'.format(features.input_ids)) + # print('**input_mask**\n{}\n'.format(features.input_mask)) + # print('**segment_ids**\n{}\n'.format(features.segment_ids)) + # print('**label_id**\n{}\n'.format(features.label_id)) + # print('**review_id**\n{}\n'.format(features.review_id)) + # print('**date**\n{}\n'.format(features.date)) + # print('**label**\n{}\n'.format(features.label)) + # print('**review_body**\n{}\n'.format(features.review_body)) return features -def transform_inputs_to_tfrecord(inputs, - output_file, - max_seq_length): +def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length): """Convert a set of `Input`s to a TFRecord file.""" records = [] tf_record_writer = tf.io.TFRecordWriter(output_file) - + for (input_idx, the_input) in enumerate(inputs): if input_idx % 10000 == 0: - print('Writing input {} of {}\n'.format(input_idx, len(inputs))) + print("Writing input {} of {}\n".format(input_idx, len(inputs))) features = convert_input(the_input, max_seq_length) all_features = collections.OrderedDict() - all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) - all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) - all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) - all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) + all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) + all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) + all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) + all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) tf_record = tf.train.Example(features=tf.train.Features(feature=all_features)) tf_record_writer.write(tf_record.SerializeToString()) - records.append({#'tf_record': tf_record.SerializeToString(), - 'input_ids': features.input_ids, - 'input_mask': features.input_mask, - 'segment_ids': features.segment_ids, - 'label_id': features.label_id, - 'review_id': the_input.review_id, - 'date': the_input.date, - 'label': features.label, -# 'review_body': features.review_body - }) + records.append( + { #'tf_record': tf_record.SerializeToString(), + "input_ids": features.input_ids, + "input_mask": features.input_mask, + "segment_ids": features.segment_ids, + "label_id": 
features.label_id, + "review_id": the_input.review_id, + "date": the_input.date, + "label": features.label, + # 'review_body': features.review_body + } + ) ##################################### ####### TODO: REMOVE THIS BREAK ####### - ##################################### + ##################################### # break - + tf_record_writer.close() - + return records - + def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs to need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--train-split-percentage', type=float, + parser.add_argument( + "--train-split-percentage", + type=float, default=0.90, ) - parser.add_argument('--validation-split-percentage', type=float, - default=0.05, - ) - parser.add_argument('--test-split-percentage', type=float, + parser.add_argument( + "--validation-split-percentage", + type=float, default=0.05, ) - parser.add_argument('--balance-dataset', type=eval, - default=True + parser.add_argument( + "--test-split-percentage", + type=float, + default=0.05, ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument("--balance-dataset", type=eval, default=True) + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - parser.add_argument('--feature-store-offline-prefix', type=str, + ) + parser.add_argument( + "--feature-store-offline-prefix", + type=str, default=None, - ) - parser.add_argument('--feature-group-name', type=str, + ) + parser.add_argument( + "--feature-group-name", + type=str, default=None, - ) - + ) + return parser.parse_args() - -def _transform_tsv_to_tfrecord(file, - max_seq_length, - balance_dataset, - prefix, - feature_group_name): - print('file {}'.format(file)) - print('max_seq_length {}'.format(max_seq_length)) - print('balance_dataset {}'.format(balance_dataset)) - print('prefix 
{}'.format(prefix)) - print('feature_group_name {}'.format(feature_group_name)) + +def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name): + print("file {}".format(file)) + print("max_seq_length {}".format(max_seq_length)) + print("balance_dataset {}".format(balance_dataset)) + print("prefix {}".format(prefix)) + print("feature_group_name {}".format(feature_group_name)) # need to re-load since we can't pass feature_group object in _partial functions for some reason feature_group = create_or_load_feature_group(prefix, feature_group_name) - + filename_without_extension = Path(Path(file).stem).stem - df = pd.read_csv(file, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip') + df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip") df.isna().values.any() df = df.dropna() df = df.reset_index(drop=True) - print('Shape of dataframe {}'.format(df.shape)) + print("Shape of dataframe {}".format(df.shape)) - if balance_dataset: + if balance_dataset: # Balance the dataset down to the minority class from sklearn.utils import resample - five_star_df = df.query('star_rating == 5') - four_star_df = df.query('star_rating == 4') - three_star_df = df.query('star_rating == 3') - two_star_df = df.query('star_rating == 2') - one_star_df = df.query('star_rating == 1') - - minority_count = min(five_star_df.shape[0], - four_star_df.shape[0], - three_star_df.shape[0], - two_star_df.shape[0], - one_star_df.shape[0]) - - five_star_df = resample(five_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - four_star_df = resample(four_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - three_star_df = resample(three_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - two_star_df = resample(two_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - one_star_df = resample(one_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) + five_star_df = df.query("star_rating == 5") + four_star_df = df.query("star_rating == 4") + three_star_df = df.query("star_rating == 3") + two_star_df = df.query("star_rating == 2") + one_star_df = df.query("star_rating == 1") + + minority_count = min( + five_star_df.shape[0], + four_star_df.shape[0], + three_star_df.shape[0], + two_star_df.shape[0], + one_star_df.shape[0], + ) + + five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27) + + four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27) + + three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27) + + two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27) + + one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27) df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df]) - df_balanced = df_balanced.reset_index(drop=True) - print('Shape of balanced dataframe {}'.format(df_balanced.shape)) - print(df_balanced['star_rating'].head(100)) + df_balanced = df_balanced.reset_index(drop=True) + print("Shape of balanced dataframe {}".format(df_balanced.shape)) + print(df_balanced["star_rating"].head(100)) df = df_balanced - - print('Shape of dataframe before splitting {}'.format(df.shape)) - - print('train split percentage {}'.format(args.train_split_percentage)) - print('validation split 
percentage {}'.format(args.validation_split_percentage)) - print('test split percentage {}'.format(args.test_split_percentage)) - + + print("Shape of dataframe before splitting {}".format(df.shape)) + + print("train split percentage {}".format(args.train_split_percentage)) + print("validation split percentage {}".format(args.validation_split_percentage)) + print("test split percentage {}".format(args.test_split_percentage)) + holdout_percentage = 1.00 - args.train_split_percentage - print('holdout percentage {}'.format(holdout_percentage)) - df_train, df_holdout = train_test_split(df, - test_size=holdout_percentage, - stratify=df['star_rating']) + print("holdout percentage {}".format(holdout_percentage)) + df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"]) test_holdout_percentage = args.test_split_percentage / holdout_percentage - print('test holdout percentage {}'.format(test_holdout_percentage)) - df_validation, df_test = train_test_split(df_holdout, - test_size=test_holdout_percentage, - stratify=df_holdout['star_rating']) - + print("test holdout percentage {}".format(test_holdout_percentage)) + df_validation, df_test = train_test_split( + df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"] + ) + df_train = df_train.reset_index(drop=True) df_validation = df_validation.reset_index(drop=True) df_test = df_test.reset_index(drop=True) - print('Shape of train dataframe {}'.format(df_train.shape)) - print('Shape of validation dataframe {}'.format(df_validation.shape)) - print('Shape of test dataframe {}'.format(df_test.shape)) + print("Shape of train dataframe {}".format(df_train.shape)) + print("Shape of validation dataframe {}".format(df_validation.shape)) + print("Shape of test dataframe {}".format(df_test.shape)) timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) - train_inputs = df_train.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - validation_inputs = df_validation.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - test_inputs = df_test.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) + train_inputs = df_train.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + validation_inputs = df_validation.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + test_inputs = df_test.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): - # - # + # + # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) # 4. Map our words to indexes using a vocab file that BERT provides # 5. 
Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert)) # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)) - # + # # We don't have to worry about these details. The Transformers tokenizer does this for us. - # - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) + # + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow. - train_records = transform_inputs_to_tfrecord(train_inputs, - '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension), - max_seq_length) - - validation_records = transform_inputs_to_tfrecord(validation_inputs, - '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension), - max_seq_length) - - test_records = transform_inputs_to_tfrecord(test_inputs, - '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension), - max_seq_length) - + train_records = transform_inputs_to_tfrecord( + train_inputs, + "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + validation_records = transform_inputs_to_tfrecord( + validation_inputs, + "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + test_records = transform_inputs_to_tfrecord( + test_inputs, + "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension), + max_seq_length, + ) + df_train_records = pd.DataFrame.from_dict(train_records) - df_train_records['split_type'] = 'train' - df_train_records.head() - + df_train_records["split_type"] = "train" + df_train_records.head() + df_validation_records = pd.DataFrame.from_dict(validation_records) - df_validation_records['split_type'] = 'validation' - df_validation_records.head() + df_validation_records["split_type"] = "validation" + df_validation_records.head() df_test_records = pd.DataFrame.from_dict(test_records) - df_test_records['split_type'] = 'test' - df_test_records.head() - - # Add record to feature store + df_test_records["split_type"] = "test" + df_test_records.head() + + # Add record to feature store df_fs_train_records = cast_object_to_string(df_train_records) df_fs_validation_records = cast_object_to_string(df_validation_records) df_fs_test_records = cast_object_to_string(df_test_records) - print('Ingesting Features...') - feature_group.ingest( - data_frame=df_fs_train_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_validation_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_test_records, max_workers=3, wait=True - ) - print('Feature ingest completed.') + print("Ingesting Features...") + feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True) + print("Feature ingest completed.") def process(args): - print('Current host: {}'.format(args.current_host)) - - feature_group = 
create_or_load_feature_group(prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) + print("Current host: {}".format(args.current_host)) + + feature_group = create_or_load_feature_group( + prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name + ) feature_group.describe() - + print(feature_group.as_hive_ddl()) - - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) - - transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord, - max_seq_length=args.max_seq_length, - balance_dataset=args.balance_dataset, - prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) - - input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data)) + + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) + + transform_tsv_to_tfrecord = functools.partial( + _transform_tsv_to_tfrecord, + max_seq_length=args.max_seq_length, + balance_dataset=args.balance_dataset, + prefix=args.feature_store_offline_prefix, + feature_group_name=args.feature_group_name, + ) + + input_files = glob.glob("{}/*.tsv.gz".format(args.input_data)) num_cpus = multiprocessing.cpu_count() - print('num_cpus {}'.format(num_cpus)) + print("num_cpus {}".format(num_cpus)) p = multiprocessing.Pool(num_cpus) p.map(transform_tsv_to_tfrecord, input_files) - print('Listing contents of {}'.format(args.output_data)) + print("Listing contents of {}".format(args.output_data)) dirs_output = os.listdir(args.output_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(train_data)) + print("Listing contents of {}".format(train_data)) dirs_output = os.listdir(train_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(validation_data)) + print("Listing contents of {}".format(validation_data)) dirs_output = os.listdir(validation_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(test_data)) + print("Listing contents of {}".format(test_data)) dirs_output = os.listdir(test_data) for file in dirs_output: print(file) - + offline_store_contents = None - while (offline_store_contents is None): - objects_in_bucket = s3.list_objects(Bucket=bucket, - Prefix=args.feature_store_offline_prefix) - if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1): - offline_store_contents = objects_in_bucket['Contents'] + while offline_store_contents is None: + objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix) + if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1: + offline_store_contents = objects_in_bucket["Contents"] else: - print('Waiting for data in offline store...\n') + print("Waiting for data in offline store...\n") sleep(60) - print('Data available.') - - print('Complete') - - + print("Data available.") + + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) process(args) diff --git a/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb b/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb index e4f3233b..a97b640b 100644 --- 
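
`Pool.map` passes exactly one argument to the mapped callable, which is why the constant parameters are bound with `functools.partial` before the pool is created. A stand-alone illustration of the pattern (`_work` and the file names are stand-ins, not functions from this script):

    import functools
    import multiprocessing

    def _work(file, max_seq_length, balance_dataset):
        # Stand-in for _transform_tsv_to_tfrecord: one call per input file.
        return "{} (max_seq_length={}, balance={})".format(file, max_seq_length, balance_dataset)

    if __name__ == "__main__":
        # Bind the per-job constants once; map then supplies only the filename.
        work = functools.partial(_work, max_seq_length=64, balance_dataset=True)
        with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
            print(p.map(work, ["a.tsv.gz", "b.tsv.gz"]))

This pattern also suggests why the worker above re-loads the feature group rather than receiving it through the partial: arguments bound into the partial must be picklable for the worker processes, and the `FeatureGroup` object likely is not.
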
a/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb +++ b/07_train/01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb @@ -57,9 +57,9 @@ "try:\n", " max_seq_length\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -79,11 +79,11 @@ "source": [ "def select_data_and_label_from_record(record):\n", " x = {\n", - " 'input_ids': record['input_ids'],\n", - " 'input_mask': record['input_mask'],\n", - "# 'segment_ids': record['segment_ids']\n", + " \"input_ids\": record[\"input_ids\"],\n", + " \"input_mask\": record[\"input_mask\"],\n", + " # 'segment_ids': record['segment_ids']\n", " }\n", - " y = record['label_ids']\n", + " y = record[\"label_ids\"]\n", "\n", " return (x, y)" ] @@ -94,51 +94,47 @@ "metadata": {}, "outputs": [], "source": [ - "def file_based_input_dataset_builder(channel,\n", - " input_filenames,\n", - " pipe_mode,\n", - " is_training,\n", - " drop_remainder):\n", + "def file_based_input_dataset_builder(channel, input_filenames, pipe_mode, is_training, drop_remainder):\n", "\n", " # For training, we want a lot of parallel reading and shuffling.\n", " # For eval, we want no shuffling and parallel reading doesn't matter.\n", "\n", " if pipe_mode:\n", - " print('***** Using pipe_mode with channel {}'.format(channel))\n", + " print(\"***** Using pipe_mode with channel {}\".format(channel))\n", " from sagemaker_tensorflow import PipeModeDataset\n", - " dataset = PipeModeDataset(channel=channel,\n", - " record_format='TFRecord')\n", + "\n", + " dataset = PipeModeDataset(channel=channel, record_format=\"TFRecord\")\n", " else:\n", - " print('***** Using input_filenames {}'.format(input_filenames))\n", + " print(\"***** Using input_filenames {}\".format(input_filenames))\n", " dataset = tf.data.TFRecordDataset(input_filenames)\n", "\n", " dataset = dataset.repeat(100)\n", " dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)\n", "\n", " name_to_features = {\n", - " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - "# \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n", + " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " # \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n", " }\n", "\n", " def _decode_record(record, name_to_features):\n", " \"\"\"Decodes a record to a TensorFlow example.\"\"\"\n", " return tf.io.parse_single_example(record, name_to_features)\n", - " \n", + "\n", " dataset = dataset.apply(\n", " tf.data.experimental.map_and_batch(\n", - " lambda record: _decode_record(record, name_to_features),\n", - " batch_size=8,\n", - " drop_remainder=drop_remainder,\n", - " num_parallel_calls=tf.data.experimental.AUTOTUNE))\n", + " lambda record: 
_decode_record(record, name_to_features),\n", + " batch_size=8,\n", + " drop_remainder=drop_remainder,\n", + " num_parallel_calls=tf.data.experimental.AUTOTUNE,\n", + " )\n", + " )\n", "\n", " dataset.cache()\n", "\n", " if is_training:\n", - " dataset = dataset.shuffle(seed=42,\n", - " buffer_size=10,\n", - " reshuffle_each_iteration=True)\n", + " dataset = dataset.shuffle(seed=42, buffer_size=10, reshuffle_each_iteration=True)\n", "\n", " return dataset" ] @@ -149,16 +145,13 @@ "metadata": {}, "outputs": [], "source": [ - "train_data = './data-tfrecord/bert-train'\n", - "train_data_filenames = glob('{}/*.tfrecord'.format(train_data))\n", - "print('train_data_filenames {}'.format(train_data_filenames))\n", + "train_data = \"./data-tfrecord/bert-train\"\n", + "train_data_filenames = glob(\"{}/*.tfrecord\".format(train_data))\n", + "print(\"train_data_filenames {}\".format(train_data_filenames))\n", "\n", "train_dataset = file_based_input_dataset_builder(\n", - " channel='train',\n", - " input_filenames=train_data_filenames,\n", - " pipe_mode=False,\n", - " is_training=True,\n", - " drop_remainder=False).map(select_data_and_label_from_record)" + " channel=\"train\", input_filenames=train_data_filenames, pipe_mode=False, is_training=True, drop_remainder=False\n", + ").map(select_data_and_label_from_record)" ] }, { @@ -167,16 +160,17 @@ "metadata": {}, "outputs": [], "source": [ - "validation_data = './data-tfrecord/bert-validation'\n", - "validation_data_filenames = glob('{}/*.tfrecord'.format(validation_data))\n", - "print('validation_data_filenames {}'.format(validation_data_filenames))\n", + "validation_data = \"./data-tfrecord/bert-validation\"\n", + "validation_data_filenames = glob(\"{}/*.tfrecord\".format(validation_data))\n", + "print(\"validation_data_filenames {}\".format(validation_data_filenames))\n", "\n", "validation_dataset = file_based_input_dataset_builder(\n", - " channel='validation',\n", + " channel=\"validation\",\n", " input_filenames=validation_data_filenames,\n", " pipe_mode=False,\n", " is_training=False,\n", - " drop_remainder=False).map(select_data_and_label_from_record)" + " drop_remainder=False,\n", + ").map(select_data_and_label_from_record)" ] }, { @@ -185,16 +179,13 @@ "metadata": {}, "outputs": [], "source": [ - "test_data = './data-tfrecord/bert-test'\n", - "test_data_filenames = glob('{}/*.tfrecord'.format(test_data))\n", + "test_data = \"./data-tfrecord/bert-test\"\n", + "test_data_filenames = glob(\"{}/*.tfrecord\".format(test_data))\n", "print(test_data_filenames)\n", "\n", "test_dataset = file_based_input_dataset_builder(\n", - " channel='test',\n", - " input_filenames=test_data_filenames,\n", - " pipe_mode=False,\n", - " is_training=False,\n", - " drop_remainder=False).map(select_data_and_label_from_record)" + " channel=\"test\", input_filenames=test_data_filenames, pipe_mode=False, is_training=False, drop_remainder=False\n", + ").map(select_data_and_label_from_record)" ] }, { @@ -210,13 +201,13 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "steps_per_epoch=50\n", - "validation_steps=50\n", - "test_steps=150\n", - "freeze_bert_layer=True\n", - "learning_rate=3e-5\n", - "epsilon=1e-08" + "epochs = 1\n", + "steps_per_epoch = 50\n", + "validation_steps = 50\n", + "test_steps = 150\n", + "freeze_bert_layer = True\n", + "learning_rate = 3e-5\n", + "epsilon = 1e-08" ] }, { @@ -235,24 +226,14 @@ }, "outputs": [], "source": [ - "CLASSES=[1, 2, 3, 4, 5]\n", - "\n", - "config = DistilBertConfig.from_pretrained('distilbert-base-uncased',\n", - " 
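
`tf.data.experimental.map_and_batch`, used above, is a fused op that newer TensorFlow releases deprecate in favor of chaining `.map()` and `.batch()`. A roughly equivalent reader, continuing the toy record from the earlier serialization sketch (so `max_seq_length` is 4 here, purely for illustration):

    import tensorflow as tf

    max_seq_length = 4  # must match the length used when the records were written

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def decode(record):
        parsed = tf.io.parse_single_example(record, name_to_features)
        x = {"input_ids": parsed["input_ids"], "input_mask": parsed["input_mask"]}
        return x, parsed["label_ids"]

    dataset = (
        tf.data.TFRecordDataset(["part-demo.tfrecord"])
        .map(decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(8, drop_remainder=False)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    for x, y in dataset.take(1):
        print(x["input_ids"].shape, y.shape)
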
num_labels=len(CLASSES),\n", - " id2label={\n", - " 0: 1,\n", - " 1: 2,\n", - " 2: 3,\n", - " 3: 4,\n", - " 4: 5\n", - " },\n", - " label2id={\n", - " 1: 0,\n", - " 2: 1,\n", - " 3: 2,\n", - " 4: 3,\n", - " 5: 4\n", - " })\n", + "CLASSES = [1, 2, 3, 4, 5]\n", + "\n", + "config = DistilBertConfig.from_pretrained(\n", + " \"distilbert-base-uncased\",\n", + " num_labels=len(CLASSES),\n", + " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n", + " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n", + ")\n", "print(config)" ] }, @@ -264,20 +245,21 @@ "source": [ "from transformers import TFDistilBertModel\n", "\n", - "transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', \n", - " config=config)\n", + "transformer_model = TFDistilBertModel.from_pretrained(\"distilbert-base-uncased\", config=config)\n", "\n", - "input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')\n", - "input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') \n", + "input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name=\"input_ids\", dtype=\"int32\")\n", + "input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name=\"input_mask\", dtype=\"int32\")\n", "\n", "embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]\n", - "X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)\n", + "X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(\n", + " embedding_layer\n", + ")\n", "X = tf.keras.layers.GlobalMaxPool1D()(X)\n", - "X = tf.keras.layers.Dense(50, activation='relu')(X)\n", + "X = tf.keras.layers.Dense(50, activation=\"relu\")(X)\n", "X = tf.keras.layers.Dropout(0.2)(X)\n", - "X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)\n", + "X = tf.keras.layers.Dense(len(CLASSES), activation=\"sigmoid\")(X)\n", "\n", - "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)\n", + "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)\n", "\n", "for layer in model.layers[:3]:\n", " layer.trainable = not freeze_bert_layer" @@ -296,10 +278,10 @@ "metadata": {}, "outputs": [], "source": [ - "loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", - "metric=tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", + "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", + "metric = tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")\n", "\n", - "optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n", "\n", "model.compile(optimizer=optimizer, loss=loss, metrics=[metric])\n", "\n", @@ -314,7 +296,7 @@ "source": [ "callbacks = []\n", "\n", - "log_dir = './tmp/tensorboard/'\n", + "log_dir = \"./tmp/tensorboard/\"\n", "tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)\n", "callbacks.append(tensorboard_callback)" ] @@ -327,13 +309,15 @@ }, "outputs": [], "source": [ - "history = model.fit(train_dataset,\n", - " shuffle=True,\n", - " epochs=epochs,\n", - " steps_per_epoch=steps_per_epoch,\n", - " validation_data=validation_dataset,\n", - " validation_steps=validation_steps,\n", - " callbacks=callbacks)" + "history = model.fit(\n", + " train_dataset,\n", + " shuffle=True,\n", + " epochs=epochs,\n", + " steps_per_epoch=steps_per_epoch,\n", + " 
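
`freeze_bert_layer` takes effect through the loop above that flips `trainable` on the first layers of the functional model (`model.layers[:3]` covers the two inputs and the DistilBERT layer), so only the LSTM-based head is updated during fine-tuning. A toy stand-in, with dense layers in place of DistilBERT, showing the mechanism and its effect on the trainable-parameter count:

    import tensorflow as tf

    inputs = tf.keras.layers.Input(shape=(4,), name="input_ids", dtype="float32")
    hidden = tf.keras.layers.Dense(8, activation="relu", name="encoder")(inputs)
    outputs = tf.keras.layers.Dense(5, name="classifier")(hidden)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    freeze_encoder = True
    for layer in model.layers[:2]:  # input + encoder, mirroring model.layers[:3] above
        layer.trainable = not freeze_encoder

    for layer in model.layers:
        print(layer.name, "trainable =", layer.trainable)
    model.summary()  # compare trainable vs. non-trainable parameter counts
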
validation_data=validation_dataset,\n", + " validation_steps=validation_steps,\n", + " callbacks=callbacks,\n", + ")" ] }, { @@ -342,7 +326,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Trained model {}'.format(model))" + "print(\"Trained model {}\".format(model))" ] }, { @@ -358,9 +342,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_history = model.evaluate(test_dataset,\n", - " steps=test_steps, \n", - " callbacks=callbacks)\n", + "test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)\n", "print(test_history)" ] }, @@ -377,7 +359,7 @@ "metadata": {}, "outputs": [], "source": [ - "tensorflow_model_dir = './tmp/tensorflow/'" + "tensorflow_model_dir = \"./tmp/tensorflow/\"" ] }, { @@ -446,21 +428,19 @@ "\n", "from transformers import DistilBertTokenizer\n", "\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - "sample_review_body = 'This product is terrible.'\n", + "sample_review_body = \"This product is terrible.\"\n", "\n", - "encode_plus_tokens = tokenizer.encode_plus(sample_review_body,\n", - " padding=True,\n", - " max_length=max_seq_length,\n", - " truncation=True,\n", - " return_tensors='tf')\n", + "encode_plus_tokens = tokenizer.encode_plus(\n", + " sample_review_body, padding=True, max_length=max_seq_length, truncation=True, return_tensors=\"tf\"\n", + ")\n", "\n", "# The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n", - "input_ids = encode_plus_tokens['input_ids']\n", + "input_ids = encode_plus_tokens[\"input_ids\"]\n", "\n", - "# Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. \n", - "input_mask = encode_plus_tokens['attention_mask']\n", + "# Specifies which tokens BERT should pay attention to (0 or 1). 
Padded `input_ids` will have 0 in each of these vector elements.\n", + "input_mask = encode_plus_tokens[\"attention_mask\"]\n", "\n", "outputs = model.predict(x=(input_ids, input_mask))\n", "\n", @@ -468,8 +448,7 @@ "\n", "prediction = [{\"label\": config.id2label[item.argmax()], \"score\": item.max().item()} for item in scores]\n", "\n", - "print('Predicted star_rating \"{}\" for review_body \"{}\"'.format(prediction[0]['label'], sample_review_body))\n", - " " + "print('Predicted star_rating \"{}\" for review_body \"{}\"'.format(prediction[0][\"label\"], sample_review_body))" ] }, { diff --git a/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb b/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb index 908ed9ce..6d414288 100644 --- a/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb +++ b/07_train/02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb @@ -34,12 +34,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -67,9 +67,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -99,9 +99,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -131,9 +131,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -163,9 +163,9 @@ "try:\n", " max_seq_length\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the 
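
The prediction cell above maps the argmax of the softmax scores back to a star rating through `config.id2label`. The same mapping in isolation, with an assumed score row standing in for the model output:

    import numpy as np

    # Mirrors the DistilBertConfig above: output index 0..4 -> star rating 1..5.
    id2label = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}

    scores = np.array([[0.02, 0.08, 0.10, 0.30, 0.50]])  # assumed softmax output for one review
    prediction = [{"label": id2label[row.argmax()], "score": row.max().item()} for row in scores]
    print(prediction)  # [{'label': 5, 'score': 0.5}]
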
notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -195,9 +195,9 @@ "try:\n", " experiment_name\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -229,9 +229,9 @@ "try:\n", " trial_name\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -303,12 +303,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -337,28 +334,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=1\n", - "validation_steps=1\n", - "test_steps=1\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=True\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 1\n", + "validation_steps = 1\n", + "test_steps = 
1\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = True\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -397,10 +394,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -424,70 +421,79 @@ "from sagemaker.debugger import CollectionConfig\n", "from sagemaker.debugger import DebuggerHookConfig\n", "\n", - "actions=rule_configs.ActionList(\n", - "# rule_configs.StopTraining(),\n", - "# rule_configs.Email(\"\")\n", + "actions = rule_configs.ActionList(\n", + " # rule_configs.StopTraining(),\n", + " # rule_configs.Email(\"\")\n", ")\n", "\n", - "rules=[\n", - " Rule.sagemaker(\n", - " base_config=rule_configs.loss_not_decreasing(),\n", - " rule_parameters={\n", - " 'collection_names': 'losses,metrics',\n", - " 'use_losses_collection': 'true',\n", - " 'num_steps': '10',\n", - " 'diff_percent': '50'\n", - " },\n", - " collections_to_save=[\n", - " CollectionConfig(name='losses',\n", - " parameters={\n", - " 'save_interval': '10',\n", - " }),\n", - " CollectionConfig(name='metrics',\n", - " parameters={\n", - " 'save_interval': '10',\n", - " })\n", - " ],\n", - " actions=actions \n", - " ),\n", - " Rule.sagemaker(\n", - " base_config=rule_configs.overtraining(),\n", - " rule_parameters={\n", - " 'collection_names': 'losses,metrics',\n", - " 'patience_train': '10',\n", - " 'patience_validation': '10',\n", - " 'delta': '0.5'\n", - " },\n", - " collections_to_save=[\n", - " CollectionConfig(name='losses',\n", - " parameters={\n", - " 'save_interval': '10',\n", - " }),\n", - " CollectionConfig(name='metrics',\n", - " parameters={\n", - " 'save_interval': '10',\n", - " })\n", - " ],\n", - " actions=actions \n", - " ),\n", - " ProfilerRule.sagemaker(rule_configs.ProfilerReport()),\n", - " ProfilerRule.sagemaker(rule_configs.BatchSize()),\n", - " ProfilerRule.sagemaker(rule_configs.CPUBottleneck()),\n", - " ProfilerRule.sagemaker(rule_configs.GPUMemoryIncrease()),\n", - " ProfilerRule.sagemaker(rule_configs.IOBottleneck()),\n", - " ProfilerRule.sagemaker(rule_configs.LoadBalancing()),\n", - " ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),\n", - " ProfilerRule.sagemaker(rule_configs.OverallSystemUsage()),\n", - "# ProfilerRule.sagemaker(rule_configs.OverallFrameworkMetrics()),\n", - " ProfilerRule.sagemaker(rule_configs.StepOutlier()) \n", - " ]\n", + "rules = [\n", + " Rule.sagemaker(\n", + " base_config=rule_configs.loss_not_decreasing(),\n", + " rule_parameters={\n", + " \"collection_names\": \"losses,metrics\",\n", + " \"use_losses_collection\": \"true\",\n", + " \"num_steps\": \"10\",\n", + " \"diff_percent\": \"50\",\n", + " },\n", + " 
collections_to_save=[\n", + " CollectionConfig(\n", + " name=\"losses\",\n", + " parameters={\n", + " \"save_interval\": \"10\",\n", + " },\n", + " ),\n", + " CollectionConfig(\n", + " name=\"metrics\",\n", + " parameters={\n", + " \"save_interval\": \"10\",\n", + " },\n", + " ),\n", + " ],\n", + " actions=actions,\n", + " ),\n", + " Rule.sagemaker(\n", + " base_config=rule_configs.overtraining(),\n", + " rule_parameters={\n", + " \"collection_names\": \"losses,metrics\",\n", + " \"patience_train\": \"10\",\n", + " \"patience_validation\": \"10\",\n", + " \"delta\": \"0.5\",\n", + " },\n", + " collections_to_save=[\n", + " CollectionConfig(\n", + " name=\"losses\",\n", + " parameters={\n", + " \"save_interval\": \"10\",\n", + " },\n", + " ),\n", + " CollectionConfig(\n", + " name=\"metrics\",\n", + " parameters={\n", + " \"save_interval\": \"10\",\n", + " },\n", + " ),\n", + " ],\n", + " actions=actions,\n", + " ),\n", + " ProfilerRule.sagemaker(rule_configs.ProfilerReport()),\n", + " ProfilerRule.sagemaker(rule_configs.BatchSize()),\n", + " ProfilerRule.sagemaker(rule_configs.CPUBottleneck()),\n", + " ProfilerRule.sagemaker(rule_configs.GPUMemoryIncrease()),\n", + " ProfilerRule.sagemaker(rule_configs.IOBottleneck()),\n", + " ProfilerRule.sagemaker(rule_configs.LoadBalancing()),\n", + " ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),\n", + " ProfilerRule.sagemaker(rule_configs.OverallSystemUsage()),\n", + " # ProfilerRule.sagemaker(rule_configs.OverallFrameworkMetrics()),\n", + " ProfilerRule.sagemaker(rule_configs.StepOutlier()),\n", + "]\n", "\n", "hook_config = DebuggerHookConfig(\n", " hook_parameters={\n", - " 'save_interval': '10', # number of steps\n", - " 'export_tensorboard': 'true',\n", - " 'tensorboard_dir': 'hook_tensorboard/',\n", - " })" + " \"save_interval\": \"10\", # number of steps\n", + " \"export_tensorboard\": \"true\",\n", + " \"tensorboard_dir\": \"hook_tensorboard/\",\n", + " }\n", + ")" ] }, { @@ -511,7 +517,7 @@ "\n", "profiler_config = ProfilerConfig(\n", " system_monitor_interval_millis=500,\n", - " framework_profile_params=FrameworkProfile(local_path=\"/opt/ml/output/profiler/\", start_step=5, num_steps=10)\n", + " framework_profile_params=FrameworkProfile(local_path=\"/opt/ml/output/profiler/\", start_step=5, num_steps=10),\n", ")" ] }, @@ -531,8 +537,8 @@ "source": [ "import uuid\n", "\n", - "checkpoint_s3_prefix = 'checkpoints/{}'.format(str(uuid.uuid4()))\n", - "checkpoint_s3_uri = 's3://{}/{}/'.format(bucket, checkpoint_s3_prefix)\n", + "checkpoint_s3_prefix = \"checkpoints/{}\".format(str(uuid.uuid4()))\n", + "checkpoint_s3_uri = \"s3://{}/{}/\".format(bucket, checkpoint_s3_prefix)\n", "\n", "print(checkpoint_s3_uri)" ] @@ -564,43 +570,46 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " checkpoint_s3_uri=checkpoint_s3_uri,\n", - " py_version='py37',\n", - " framework_version='2.3.1',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': 
train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " metric_definitions=metrics_definitions,\n", - " rules=rules,\n", - " debugger_hook_config=hook_config,\n", - " profiler_config=profiler_config,\n", - "# max_run=7200, # number of seconds\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " checkpoint_s3_uri=checkpoint_s3_uri,\n", + " py_version=\"py37\",\n", + " framework_version=\"2.3.1\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " metric_definitions=metrics_definitions,\n", + " rules=rules,\n", + " debugger_hook_config=hook_config,\n", + " profiler_config=profiler_config,\n", + " # max_run=7200, # number of seconds\n", + ")" ] }, { @@ -616,11 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "experiment_config = {\n", - " 'ExperimentName': experiment_name,\n", - " 'TrialName': trial_name,\n", - " 'TrialComponentDisplayName': 'train'\n", - "}" + "experiment_config = {\"ExperimentName\": experiment_name, \"TrialName\": trial_name, \"TrialComponentDisplayName\": \"train\"}" ] }, { @@ -636,12 +641,11 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " experiment_config=experiment_config, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " experiment_config=experiment_config,\n", + " wait=False,\n", + ")" ] }, { @@ -651,7 +655,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -662,7 +666,13 @@ "source": [ "from 
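
Because `fit()` is called with `wait=False` above, the notebook returns immediately and the job runs asynchronously. One way to block until completion is `estimator.latest_training_job.wait()`; a lower-level sketch that polls the status via boto3 (the job name below is a placeholder):

    import time
    import boto3

    sm = boto3.Session().client("sagemaker")
    training_job_name = "tensorflow-training-YYYY-MM-DD-HH-MM-SS-XXX"  # placeholder; use estimator.latest_training_job.name

    while True:
        status = sm.describe_training_job(TrainingJobName=training_job_name)["TrainingJobStatus"]
        print(status)
        if status in ("Completed", "Failed", "Stopped"):
            break
        time.sleep(60)
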
IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -673,7 +683,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -684,7 +700,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -695,7 +717,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Checkpoint Data After The Training Job Has Completed'.format(bucket, checkpoint_s3_prefix, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Checkpoint Data After The Training Job Has Completed'.format(\n", + " bucket, checkpoint_s3_prefix, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -824,12 +852,13 @@ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", " sagemaker_session=sess,\n", " experiment_name=experiment_name,\n", - " metric_names=['validation:accuracy'],\n", + " metric_names=[\"validation:accuracy\"],\n", " sort_by=\"CreationTime\",\n", " sort_order=\"Descending\",\n", ")\n", @@ -953,7 +982,7 @@ "metadata": {}, "outputs": [], "source": [ - "#Internal - DO NOT RUN\n", + "# Internal - DO NOT RUN\n", "\n", "# step_prefix = '07_train'\n", "# !aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz s3://dsoaws/$step_prefix/tensorflow/ --acl public-read-write --acl bucket-owner-full-control\n", diff --git a/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb b/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb index c97a4028..45ec7104 100644 --- a/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb +++ b/07_train/03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb @@ -17,12 +17,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -43,9 +43,9 @@ "try:\n", " training_job_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please wait for the Training notebook to finish.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please wait for the Training notebook to finish.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -54,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Previous training_job_name: 
{}'.format(training_job_name))" + "print(\"Previous training_job_name: {}\".format(training_job_name))" ] }, { @@ -63,7 +63,7 @@ "metadata": {}, "outputs": [], "source": [ - "training_job_name = 'tensorflow-training-2021-01-06-21-36-03-293'" + "training_job_name = \"tensorflow-training-2021-01-06-21-36-03-293\"" ] }, { @@ -79,7 +79,7 @@ "metadata": {}, "outputs": [], "source": [ - "models_dir = './models'" + "models_dir = \"./models\"" ] }, { @@ -101,11 +101,11 @@ "import pickle as pkl\n", "\n", "try:\n", - " tar = tarfile.open('{}/model.tar.gz'.format(models_dir))\n", + " tar = tarfile.open(\"{}/model.tar.gz\".format(models_dir))\n", " tar.extractall(path=models_dir)\n", " tar.close()\n", "except Exception as e:\n", - " print('[ERROR] in tar operation: {}'.format(e))" + " print(\"[ERROR] in tar operation: {}\".format(e))" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "transformer_model_dir = '{}/transformers/fine-tuned/'.format(models_dir)\n", + "transformer_model_dir = \"{}/transformers/fine-tuned/\".format(models_dir)\n", "\n", "!ls -al $transformer_model_dir" ] @@ -152,27 +152,17 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import DistilBertForSequenceClassification # PyTorch version\n", + "from transformers import DistilBertForSequenceClassification # PyTorch version\n", "\n", "try:\n", - " loaded_pytorch_model = DistilBertForSequenceClassification.from_pretrained(transformer_model_dir,\n", - " id2label={\n", - " 0: 1,\n", - " 1: 2,\n", - " 2: 3,\n", - " 3: 4,\n", - " 4: 5\n", - " },\n", - " label2id={\n", - " 1: 0,\n", - " 2: 1,\n", - " 3: 2,\n", - " 4: 3,\n", - " 5: 4\n", - " },\n", - " from_tf=True)\n", + " loaded_pytorch_model = DistilBertForSequenceClassification.from_pretrained(\n", + " transformer_model_dir,\n", + " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n", + " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n", + " from_tf=True,\n", + " )\n", "except Exception as e:\n", - " print('[ERROR] in loading model {}: '.format(e))" + " print(\"[ERROR] in loading model {}: \".format(e))" ] }, { @@ -201,7 +191,7 @@ "metadata": {}, "outputs": [], "source": [ - "pytorch_models_dir = './models/transformers/pytorch'" + "pytorch_models_dir = \"./models/transformers/pytorch\"" ] }, { @@ -246,7 +236,7 @@ "source": [ "from transformers import DistilBertTokenizer\n", "\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')" + "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")" ] }, { @@ -277,12 +267,12 @@ "from transformers import DistilBertForSequenceClassification\n", "from transformers import DistilBertConfig\n", "\n", - "config = DistilBertConfig.from_json_file('{}/config.json'.format(pytorch_models_dir))\n", + "config = DistilBertConfig.from_json_file(\"{}/config.json\".format(pytorch_models_dir))\n", "\n", - "model_path = '{}/{}'.format(pytorch_models_dir, 'model.pth') \n", + "model_path = \"{}/{}\".format(pytorch_models_dir, \"model.pth\")\n", "model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)\n", "\n", - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)" ] }, @@ -295,24 +285,24 @@ "import json\n", "\n", "max_seq_length = 64\n", - "classes=[1, 2, 3, 4, 5]\n", + "classes = [1, 2, 3, 4, 5]\n", "\n", "model.eval()\n", "\n", "input_data = '[{\"features\": [\"This is great!\"]}, \\\n", " {\"features\": [\"This is bad.\"]}]'\n", - 
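
After the `from_tf=True` conversion above succeeds, the PyTorch model can be persisted either in the standard Transformers layout or as a raw `state_dict`, matching the `config.json` plus `model.pth` pair this notebook loads back later. A sketch, assuming `loaded_pytorch_model` is the converted model from the cell above (a stand-in is instantiated here only so the snippet runs on its own):

    import os
    import torch
    from transformers import DistilBertForSequenceClassification

    # Stand-in; in the notebook this is the TF->PyTorch model converted above.
    loaded_pytorch_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    pytorch_models_dir = "./models/transformers/pytorch"  # same directory used above
    os.makedirs(pytorch_models_dir, exist_ok=True)

    # Transformers layout: config.json + model weights in one directory.
    loaded_pytorch_model.save_pretrained(pytorch_models_dir)

    # Raw checkpoint, loadable later together with the saved config.
    torch.save(loaded_pytorch_model.state_dict(), "{}/model.pth".format(pytorch_models_dir))
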
"print('input_data: {}'.format(input_data))\n", + "print(\"input_data: {}\".format(input_data))\n", "\n", "data_json = json.loads(input_data)\n", - "print('data_json: {}'.format(data_json))\n", + "print(\"data_json: {}\".format(data_json))\n", "\n", "predicted_classes = []\n", "\n", "for data_json_line in data_json:\n", - " print('data_json_line: {}'.format(data_json_line))\n", - " print('type(data_json_line): {}'.format(type(data_json_line)))\n", + " print(\"data_json_line: {}\".format(data_json_line))\n", + " print(\"type(data_json_line): {}\".format(type(data_json_line)))\n", "\n", - " review_body = data_json_line['features'][0]\n", + " review_body = data_json_line[\"features\"][0]\n", " print(\"\"\"review_body: {}\"\"\".format(review_body))\n", "\n", " encode_plus_token = tokenizer.encode_plus(\n", @@ -322,40 +312,41 @@ " return_token_type_ids=False,\n", " pad_to_max_length=True,\n", " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True)\n", + " return_tensors=\"pt\",\n", + " truncation=True,\n", + " )\n", "\n", - " input_ids = encode_plus_token['input_ids']\n", - " attention_mask = encode_plus_token['attention_mask']\n", + " input_ids = encode_plus_token[\"input_ids\"]\n", + " attention_mask = encode_plus_token[\"attention_mask\"]\n", "\n", " output = model(input_ids, attention_mask)\n", - " print('output: {}'.format(output))\n", + " print(\"output: {}\".format(output))\n", "\n", - " # output is a tuple: \n", + " # output is a tuple:\n", " # output: (tensor([[-1.9840, -0.9870, 2.8947]], grad_fn=),\n", - " # for torch.max() you need to pass in the tensor, output[0] \n", + " # for torch.max() you need to pass in the tensor, output[0]\n", " _, prediction = torch.max(output[0], dim=1)\n", "\n", " predicted_class_idx = prediction.item()\n", " predicted_class = classes[predicted_class_idx]\n", - " print('predicted_class: {}'.format(predicted_class))\n", + " print(\"predicted_class: {}\".format(predicted_class))\n", "\n", " prediction_dict = {}\n", - " prediction_dict['predicted_label'] = predicted_class\n", + " prediction_dict[\"predicted_label\"] = predicted_class\n", "\n", " jsonline = json.dumps(prediction_dict)\n", - " print('jsonline: {}'.format(jsonline))\n", + " print(\"jsonline: {}\".format(jsonline))\n", "\n", " predicted_classes.append(jsonline)\n", - " print('predicted_classes in the loop: {}'.format(predicted_classes))\n", + " print(\"predicted_classes in the loop: {}\".format(predicted_classes))\n", "\n", - "predicted_classes_jsonlines = '\\n'.join(predicted_classes) \n", - "print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))\n", - "print('type(predicted_classes_jsonlines): {}'.format(type(predicted_classes_jsonlines)))\n", + "predicted_classes_jsonlines = \"\\n\".join(predicted_classes)\n", + "print(\"predicted_classes_jsonlines: {}\".format(predicted_classes_jsonlines))\n", + "print(\"type(predicted_classes_jsonlines): {}\".format(type(predicted_classes_jsonlines)))\n", "\n", "predicted_classes_jsonlines_dump = json.dumps(predicted_classes_jsonlines)\n", - "print('predicted_classes_jsonlines_dump: {}'.format(predicted_classes_jsonlines_dump))\n", - "print('type(predicted_classes_jsonlines_dump): {}'.format(type(predicted_classes_jsonlines_dump)))" + "print(\"predicted_classes_jsonlines_dump: {}\".format(predicted_classes_jsonlines_dump))\n", + "print(\"type(predicted_classes_jsonlines_dump): {}\".format(type(predicted_classes_jsonlines_dump)))" ] }, { @@ -371,7 +362,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"transformer_pytorch_model_dir_s3_uri = 's3://{}/models/{}/transformer-pytorch/'.format(bucket, training_job_name)\n", + "transformer_pytorch_model_dir_s3_uri = \"s3://{}/models/{}/transformer-pytorch/\".format(bucket, training_job_name)\n", "print(transformer_pytorch_model_dir_s3_uri)" ] }, diff --git a/07_train/04_Evaluate_Model_Metrics.ipynb b/07_train/04_Evaluate_Model_Metrics.ipynb index 41002425..001e7ee4 100644 --- a/07_train/04_Evaluate_Model_Metrics.ipynb +++ b/07_train/04_Evaluate_Model_Metrics.ipynb @@ -70,7 +70,7 @@ "bucket = sess.default_bucket()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -90,11 +90,11 @@ "source": [ "try:\n", " training_job_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -103,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "#training_job_name='tensorflow-training-2021-01-02-06-07-04-440'" + "# training_job_name='tensorflow-training-2021-01-02-06-07-04-440'" ] }, { @@ -133,9 +133,9 @@ "try:\n", " raw_input_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -164,11 +164,11 @@ "source": [ "try:\n", " max_seq_length\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -197,11 +197,11 @@ "source": [ "try:\n", " experiment_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -230,11 +230,11 @@ "source": [ "try:\n", " trial_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " 
print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -275,7 +275,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_dir_s3_uri = describe_training_job_response['ModelArtifacts']['S3ModelArtifacts'].replace('model.tar.gz', '')\n", + "model_dir_s3_uri = describe_training_job_response[\"ModelArtifacts\"][\"S3ModelArtifacts\"].replace(\"model.tar.gz\", \"\")\n", "model_dir_s3_uri" ] }, @@ -302,9 +302,9 @@ "outputs": [], "source": [ "experiment_config = {\n", - " 'ExperimentName': experiment_name,\n", - " 'TrialName': trial_name,\n", - " 'TrialComponentDisplayName': 'evaluate'\n", + " \"ExperimentName\": experiment_name,\n", + " \"TrialName\": trial_name,\n", + " \"TrialComponentDisplayName\": \"evaluate\",\n", "}" ] }, @@ -323,8 +323,8 @@ }, "outputs": [], "source": [ - "processing_instance_type='ml.m5.xlarge'\n", - "processing_instance_count=1" + "processing_instance_type = \"ml.m5.xlarge\"\n", + "processing_instance_count = 1" ] }, { @@ -368,11 +368,13 @@ "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " max_runtime_in_seconds=7200)" + "processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " max_runtime_in_seconds=7200,\n", + ")" ] }, { @@ -383,26 +385,26 @@ "source": [ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", - "processor.run(code='evaluate_model_metrics.py',\n", - " inputs=[\n", - " ProcessingInput(input_name='model-tar-s3-uri',\n", - " source=model_dir_s3_uri,\n", - " destination='/opt/ml/processing/input/model/'),\n", - " ProcessingInput(input_name='evaluation-data-s3-uri',\n", - " source=raw_input_data_s3_uri,\n", - " destination='/opt/ml/processing/input/data/')\n", - " ],\n", - " outputs=[\n", - " ProcessingOutput(s3_upload_mode='EndOfJob',\n", - " output_name='metrics',\n", - " source='/opt/ml/processing/output/metrics'),\n", - " ],\n", - " arguments=[\n", - " '--max-seq-length', str(max_seq_length)\n", - " ],\n", - " experiment_config=experiment_config,\n", - " logs=True,\n", - " wait=False)" + "processor.run(\n", + " code=\"evaluate_model_metrics.py\",\n", + " inputs=[\n", + " ProcessingInput(\n", + " input_name=\"model-tar-s3-uri\", source=model_dir_s3_uri, destination=\"/opt/ml/processing/input/model/\"\n", + " ),\n", + " ProcessingInput(\n", + " input_name=\"evaluation-data-s3-uri\",\n", + " source=raw_input_data_s3_uri,\n", + " destination=\"/opt/ml/processing/input/data/\",\n", + " ),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(s3_upload_mode=\"EndOfJob\", output_name=\"metrics\", source=\"/opt/ml/processing/output/metrics\"),\n", + " ],\n", + " arguments=[\"--max-seq-length\", str(max_seq_length)],\n", + " experiment_config=experiment_config,\n", + " logs=True,\n", + " wait=False,\n", + ")" ] }, { @@ -413,7 +415,7 @@ }, "outputs": [], "source": [ - "scikit_processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']\n", + "scikit_processing_job_name = processor.jobs[-1].describe()[\"ProcessingJobName\"]\n", "print(scikit_processing_job_name)" ] }, @@ -427,7 +429,13 @@ 
"source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Processing Job'.format(region, scikit_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Processing Job'.format(\n", + " region, scikit_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -440,7 +448,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, scikit_processing_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, scikit_processing_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -453,7 +467,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Processing Job Has Completed'.format(bucket, scikit_processing_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Processing Job Has Completed'.format(\n", + " bucket, scikit_processing_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -469,8 +489,9 @@ "metadata": {}, "outputs": [], "source": [ - "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(processing_job_name=scikit_processing_job_name,\n", - " sagemaker_session=sess)\n", + "running_processor = sagemaker.processing.ProcessingJob.from_processing_name(\n", + " processing_job_name=scikit_processing_job_name, sagemaker_session=sess\n", + ")\n", "\n", "processing_job_description = running_processor.describe()\n", "\n", @@ -483,7 +504,7 @@ "metadata": {}, "outputs": [], "source": [ - "processing_evaluation_metrics_job_name = processing_job_description['ProcessingJobName']\n", + "processing_evaluation_metrics_job_name = processing_job_description[\"ProcessingJobName\"]\n", "print(processing_evaluation_metrics_job_name)" ] }, @@ -522,11 +543,11 @@ "source": [ "processing_job_description = running_processor.describe()\n", "\n", - "output_config = processing_job_description['ProcessingOutputConfig']\n", - "for output in output_config['Outputs']:\n", - " if output['OutputName'] == 'metrics':\n", - " processed_metrics_s3_uri = output['S3Output']['S3Uri']\n", - " \n", + "output_config = processing_job_description[\"ProcessingOutputConfig\"]\n", + "for output in output_config[\"Outputs\"]:\n", + " if output[\"OutputName\"] == \"metrics\":\n", + " processed_metrics_s3_uri = output[\"S3Output\"][\"S3Uri\"]\n", + "\n", "print(processed_metrics_s3_uri)" ] }, @@ -593,14 +614,12 @@ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", - "#pd.set_option(\"max_rows\", 100)\n", + "# pd.set_option(\"max_rows\", 100)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", - " sagemaker_session=sess,\n", - " experiment_name=experiment_name,\n", - " sort_by=\"CreationTime\",\n", - " sort_order=\"Descending\"\n", + " sagemaker_session=sess, experiment_name=experiment_name, sort_by=\"CreationTime\", sort_order=\"Descending\"\n", ")\n", "\n", "experiment_analytics_df = experiment_analytics.dataframe()\n", @@ -613,7 +632,7 @@ "metadata": {}, "outputs": [], "source": [ - "trial_component_name=experiment_analytics_df.TrialComponentName[0]\n", + "trial_component_name = experiment_analytics_df.TrialComponentName[0]\n", "print(trial_component_name)" ] }, @@ -623,7 +642,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"trial_component_description=sm.describe_trial_component(TrialComponentName=trial_component_name)\n", + "trial_component_description = sm.describe_trial_component(TrialComponentName=trial_component_name)\n", "trial_component_description" ] }, diff --git a/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb b/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb index aadcfc90..e28c896e 100644 --- a/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb +++ b/07_train/container-demo/00_Prepare_Dataset_BERT.ipynb @@ -83,68 +83,62 @@ "import csv\n", "from transformers import DistilBertTokenizer\n", "\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - "DATA_COLUMN = 'review_body'\n", - "LABEL_COLUMN = 'star_rating'\n", + "DATA_COLUMN = \"review_body\"\n", + "LABEL_COLUMN = \"star_rating\"\n", "LABEL_VALUES = [1, 2, 3, 4, 5]\n", "\n", "label_map = {}\n", "for (i, label) in enumerate(LABEL_VALUES):\n", " label_map[label] = i\n", "\n", - " \n", + "\n", "class InputFeatures(object):\n", - " \"\"\"BERT feature vectors.\"\"\"\n", - "\n", - " def __init__(self,\n", - " input_ids,\n", - " input_mask,\n", - " segment_ids,\n", - " label_id):\n", - " self.input_ids = input_ids\n", - " self.input_mask = input_mask\n", - " self.segment_ids = segment_ids\n", - " self.label_id = label_id\n", - " \n", - " \n", + " \"\"\"BERT feature vectors.\"\"\"\n", + "\n", + " def __init__(self, input_ids, input_mask, segment_ids, label_id):\n", + " self.input_ids = input_ids\n", + " self.input_mask = input_mask\n", + " self.segment_ids = segment_ids\n", + " self.label_id = label_id\n", + "\n", + "\n", "class Input(object):\n", - " \"\"\"A single training/test input for sequence classification.\"\"\"\n", - "\n", - " def __init__(self, text, label=None):\n", - " \"\"\"Constructs an Input.\n", - " Args:\n", - " text: string. The untokenized text of the first sequence. For single\n", - " sequence tasks, only this sequence must be specified.\n", - " label: (Optional) string. The label of the example. This should be\n", - " specified for train and dev examples, but not for test examples.\n", - " \"\"\"\n", - " self.text = text\n", - " self.label = label\n", - " \n", + " \"\"\"A single training/test input for sequence classification.\"\"\"\n", + "\n", + " def __init__(self, text, label=None):\n", + " \"\"\"Constructs an Input.\n", + " Args:\n", + " text: string. The untokenized text of the first sequence. For single\n", + " sequence tasks, only this sequence must be specified.\n", + " label: (Optional) string. The label of the example. This should be\n", + " specified for train and dev examples, but not for test examples.\n", + " \"\"\"\n", + " self.text = text\n", + " self.label = label\n", + "\n", "\n", "def convert_input(text_input, max_seq_length):\n", " # First, we need to preprocess our data so that it matches the data BERT was trained on:\n", " # 1. Lowercase our text (if we're using a BERT lowercase model)\n", " # 2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n", " # 3. Break words into WordPieces (i.e. 
\"calling\" -> [\"call\", \"##ing\"])\n", - " # \n", + " #\n", " # Fortunately, the Transformers tokenizer does this for us!\n", "\n", " tokens = tokenizer.tokenize(text_input.text)\n", - " print('**tokens**\\n{}\\n'.format(tokens))\n", + " print(\"**tokens**\\n{}\\n\".format(tokens))\n", "\n", - " encode_plus_tokens = tokenizer.encode_plus(text_input.text,\n", - " pad_to_max_length=True,\n", - " max_length=max_seq_length,\n", - " truncation=True\n", - " )\n", + " encode_plus_tokens = tokenizer.encode_plus(\n", + " text_input.text, pad_to_max_length=True, max_length=max_seq_length, truncation=True\n", + " )\n", "\n", " # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)\n", - " input_ids = encode_plus_tokens['input_ids']\n", - " \n", - " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. \n", - " input_mask = encode_plus_tokens['attention_mask']\n", + " input_ids = encode_plus_tokens[\"input_ids\"]\n", + "\n", + " # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements.\n", + " input_mask = encode_plus_tokens[\"attention_mask\"]\n", "\n", " # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.\n", " segment_ids = [0] * max_seq_length\n", @@ -152,41 +146,37 @@ " # Label for each training row (`star_rating` 1 through 5)\n", " label_id = label_map[text_input.label]\n", "\n", - " features = InputFeatures(\n", - " input_ids=input_ids,\n", - " input_mask=input_mask,\n", - " segment_ids=segment_ids,\n", - " label_id=label_id)\n", + " features = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id)\n", "\n", - " print('**input_ids**\\n{}\\n'.format(features.input_ids))\n", - " print('**input_mask**\\n{}\\n'.format(features.input_mask))\n", - " print('**segment_ids**\\n{}\\n'.format(features.segment_ids))\n", - " print('**label_id**\\n{}\\n'.format(features.label_id))\n", + " print(\"**input_ids**\\n{}\\n\".format(features.input_ids))\n", + " print(\"**input_mask**\\n{}\\n\".format(features.input_mask))\n", + " print(\"**segment_ids**\\n{}\\n\".format(features.segment_ids))\n", + " print(\"**label_id**\\n{}\\n\".format(features.label_id))\n", "\n", " return features\n", "\n", "\n", "# We'll need to transform our data into a format that BERT understands.\n", - "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
\n", + "# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe.\n", "# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data\n", "def transform_inputs_to_tfrecord(inputs, max_seq_length):\n", " tf_records = []\n", " for (input_idx, text_input) in enumerate(inputs):\n", - " if input_idx % 10000 == 0:\n", - " print('Writing input {} of {}\\n'.format(input_idx, len(inputs)))\n", + " if input_idx % 10000 == 0:\n", + " print(\"Writing input {} of {}\\n\".format(input_idx, len(inputs)))\n", + "\n", + " features = convert_input(text_input, max_seq_length)\n", "\n", - " features = convert_input(text_input, max_seq_length)\n", - " \n", - " all_features = collections.OrderedDict()\n", - " all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n", - " all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n", - " all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n", - " all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n", + " all_features = collections.OrderedDict()\n", + " all_features[\"input_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))\n", + " all_features[\"input_mask\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))\n", + " all_features[\"segment_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))\n", + " all_features[\"label_ids\"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))\n", "\n", - " tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n", - " tf_records.append(tf_record.SerializeToString())\n", + " tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))\n", + " tf_records.append(tf_record.SerializeToString())\n", "\n", - " return tf_records\n" + " return tf_records" ] }, { @@ -223,17 +213,24 @@ "import pandas as pd\n", "\n", "data = [\n", - " [5,\"\"\"I needed an antivirus application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\"],\n", - " [3,\"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\"],\n", - " [1,\"\"\"Terrible, none of my codes worked, and I can't uninstall it. I think this product IS malware and viruses\"\"\"]\n", - " ]\n", - "\n", - "df = pd.DataFrame(data, columns=['star_rating','review_body'])\n", + " [\n", + " 5,\n", + " \"\"\"I needed an antivirus application and know the quality of Norton products. This was a no brainer for me and I am glad it was so simple to get.\"\"\",\n", + " ],\n", + " [\n", + " 3,\n", + " \"\"\"The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.\"\"\",\n", + " ],\n", + " [\n", + " 1,\n", + " \"\"\"Terrible, none of my codes worked, and I can't uninstall it. 
I think this product IS malware and viruses\"\"\",\n", + " ],\n", + "]\n", + "\n", + "df = pd.DataFrame(data, columns=[\"star_rating\", \"review_body\"])\n", "\n", "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n", - "inputs = df.apply(lambda x: Input(text = x[DATA_COLUMN], \n", - " label = x[LABEL_COLUMN]), \n", - " axis = 1)\n", + "inputs = df.apply(lambda x: Input(text=x[DATA_COLUMN], label=x[LABEL_COLUMN]), axis=1)\n", "\n", "max_seq_length = 64\n", "tf_records = transform_inputs_to_tfrecord(inputs, max_seq_length)" @@ -255,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('**tf_records**')\n", + "print(\"**tf_records**\")\n", "\n", "for tf_record in tf_records:\n", " print(tf_record)" diff --git a/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb b/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb index b57c2041..87f2ce7f 100644 --- a/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb +++ b/07_train/container-demo/00_setup_eks/00_01_Setup_EKS.ipynb @@ -18,7 +18,8 @@ "outputs": [], "source": [ "import sys\n", - "print('Python Version %s' % sys.version)" + "\n", + "print(\"Python Version %s\" % sys.version)" ] }, { diff --git a/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb b/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb index d94d081f..63dc0ec7 100644 --- a/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb +++ b/07_train/container-demo/00_setup_eks/00_04_Setup_FSX.ipynb @@ -24,13 +24,13 @@ "import json\n", "from botocore.exceptions import ClientError\n", "\n", - "iam = boto3.client('iam')\n", - "sts = boto3.client('sts')\n", - "cfn = boto3.client('cloudformation')\n", - "eks = boto3.client('eks')\n", + "iam = boto3.client(\"iam\")\n", + "sts = boto3.client(\"sts\")\n", + "cfn = boto3.client(\"cloudformation\")\n", + "eks = boto3.client(\"eks\")\n", "\n", "region = boto3.Session().region_name\n", - "cluster_name = 'workshop'" + "cluster_name = \"workshop\"" ] }, { @@ -77,19 +77,16 @@ "metadata": {}, "outputs": [], "source": [ - "with open('fsx/fsx-csi-driver.json') as json_file:\n", + "with open(\"fsx/fsx-csi-driver.json\") as json_file:\n", " data = json.load(json_file)\n", " policy = json.dumps(data)\n", "\n", "try:\n", - " response = iam.create_policy(\n", - " PolicyName='Amazon_FSx_Lustre_CSI_Driver',\n", - " PolicyDocument=policy\n", - " )\n", + " response = iam.create_policy(PolicyName=\"Amazon_FSx_Lustre_CSI_Driver\", PolicyDocument=policy)\n", " print(\"[OK] Policy created.\")\n", "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " print(\"[OK] Policy already exists.\")\n", " else:\n", " print(\"Error: %s\" % e)" @@ -101,8 +98,8 @@ "metadata": {}, "outputs": [], "source": [ - "account_id = sts.get_caller_identity()['Account']\n", - "csi_policy_arn = 'arn:aws:iam::{}:policy/Amazon_FSx_Lustre_CSI_Driver'.format(account_id)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", + "csi_policy_arn = \"arn:aws:iam::{}:policy/Amazon_FSx_Lustre_CSI_Driver\".format(account_id)\n", "print(csi_policy_arn)" ] }, @@ -145,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "cf_stack_name = 'eksctl-{}-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa'.format(cluster_name)\n", + "cf_stack_name = \"eksctl-{}-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa\".format(cluster_name)\n", "print(cf_stack_name)" ] }, @@ -155,9 +152,7 
@@ "metadata": {}, "outputs": [], "source": [ - "response = cfn.list_stack_resources(\n", - " StackName=cf_stack_name\n", - ")\n", + "response = cfn.list_stack_resources(StackName=cf_stack_name)\n", "print(response)" ] }, @@ -167,7 +162,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_name = response['StackResourceSummaries'][0]['PhysicalResourceId']\n", + "iam_role_name = response[\"StackResourceSummaries\"][0][\"PhysicalResourceId\"]\n", "print(iam_role_name)" ] }, @@ -177,7 +172,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_arn = iam.get_role(RoleName=iam_role_name)['Role']['Arn']\n", + "iam_role_arn = iam.get_role(RoleName=iam_role_name)[\"Role\"][\"Arn\"]\n", "print(iam_role_arn)" ] }, @@ -194,7 +189,7 @@ "metadata": {}, "outputs": [], "source": [ - "!kubectl apply -k \"github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=master\"\n" + "!kubectl apply -k \"github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=master\"" ] }, { @@ -227,7 +222,7 @@ "metadata": {}, "outputs": [], "source": [ - "bucket = 's3://fsx-container-demo'" + "bucket = \"s3://fsx-container-demo\"" ] }, { @@ -270,8 +265,7 @@ "metadata": {}, "outputs": [], "source": [ - "!curl -o storageclass.yaml https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/examples/kubernetes/dynamic_provisioning_s3/specs/storageclass.yaml\n", - " " + "!curl -o storageclass.yaml https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/examples/kubernetes/dynamic_provisioning_s3/specs/storageclass.yaml" ] }, { @@ -472,7 +466,7 @@ "metadata": {}, "outputs": [], "source": [ - "fsx = boto3.client('fsx')" + "fsx = boto3.client(\"fsx\")" ] }, { @@ -482,7 +476,7 @@ "outputs": [], "source": [ "response = fsx.describe_file_systems()\n", - "fsx_id = response['FileSystems'][0]['FileSystemId']\n", + "fsx_id = response[\"FileSystems\"][0][\"FileSystemId\"]\n", "print(fsx_id)" ] }, @@ -492,12 +486,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = fsx.update_file_system(\n", - " FileSystemId=fsx_id,\n", - " LustreConfiguration={\n", - " 'AutoImportPolicy': 'NEW_CHANGED'\n", - " }\n", - ")\n", + "response = fsx.update_file_system(FileSystemId=fsx_id, LustreConfiguration={\"AutoImportPolicy\": \"NEW_CHANGED\"})\n", "print(response)" ] } diff --git a/07_train/container-demo/01_Develop_Code_Notebook.ipynb b/07_train/container-demo/01_Develop_Code_Notebook.ipynb index 5641c4d7..181a5adf 100644 --- a/07_train/container-demo/01_Develop_Code_Notebook.ipynb +++ b/07_train/container-demo/01_Develop_Code_Notebook.ipynb @@ -56,12 +56,12 @@ "metadata": {}, "outputs": [], "source": [ - "train_data='./input/data/train'\n", - "validation_data='./input/data/validation'\n", - "test_data='./input/data/test'\n", - "local_model_dir='./model/'\n", - "num_gpus=0\n", - "input_data_config='File'" + "train_data = \"./input/data/train\"\n", + "validation_data = \"./input/data/validation\"\n", + "test_data = \"./input/data/test\"\n", + "local_model_dir = \"./model/\"\n", + "num_gpus = 0\n", + "input_data_config = \"File\"" ] }, { @@ -70,22 +70,22 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=8\n", - "validation_batch_size=8\n", - "test_batch_size=8\n", - "train_steps_per_epoch=1\n", - "validation_steps=1\n", - "test_steps=1\n", - "use_xla=True\n", - "use_amp=False\n", - "max_seq_length=64\n", - "freeze_bert_layer=True\n", - "run_validation=True\n", - 
"run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 8\n", + "validation_batch_size = 8\n", + "test_batch_size = 8\n", + "train_steps_per_epoch = 1\n", + "validation_steps = 1\n", + "test_steps = 1\n", + "use_xla = True\n", + "use_amp = False\n", + "max_seq_length = 64\n", + "freeze_bert_layer = True\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -116,66 +116,66 @@ "\n", "CLASSES = [1, 2, 3, 4, 5]\n", "\n", + "\n", "def select_data_and_label_from_record(record):\n", - " x = {\n", - " 'input_ids': record['input_ids'],\n", - " 'input_mask': record['input_mask'],\n", - " 'segment_ids': record['segment_ids']\n", - " }\n", + " x = {\"input_ids\": record[\"input_ids\"], \"input_mask\": record[\"input_mask\"], \"segment_ids\": record[\"segment_ids\"]}\n", "\n", - " y = record['label_ids']\n", + " y = record[\"label_ids\"]\n", "\n", " return (x, y)\n", "\n", "\n", - "def file_based_input_dataset_builder(channel,\n", - " input_filenames,\n", - " pipe_mode,\n", - " is_training,\n", - " drop_remainder,\n", - " batch_size,\n", - " epochs,\n", - " steps_per_epoch,\n", - " max_seq_length):\n", + "def file_based_input_dataset_builder(\n", + " channel,\n", + " input_filenames,\n", + " pipe_mode,\n", + " is_training,\n", + " drop_remainder,\n", + " batch_size,\n", + " epochs,\n", + " steps_per_epoch,\n", + " max_seq_length,\n", + "):\n", "\n", " # For training, we want a lot of parallel reading and shuffling.\n", " # For eval, we want no shuffling and parallel reading doesn't matter.\n", "\n", " if pipe_mode:\n", - " print('***** Using pipe_mode with channel {}'.format(channel))\n", - " from sagemaker_tensorflow import PipeModeDataset\n", - " dataset = PipeModeDataset(channel=channel,\n", - " record_format='TFRecord')\n", + " print(\"***** Using pipe_mode with channel {}\".format(channel))\n", + " from sagemaker_tensorflow import PipeModeDataset\n", + "\n", + " dataset = PipeModeDataset(channel=channel, record_format=\"TFRecord\")\n", " else:\n", - " print('***** Using input_filenames {}'.format(input_filenames))\n", + " print(\"***** Using input_filenames {}\".format(input_filenames))\n", " dataset = tf.data.TFRecordDataset(input_filenames)\n", - " \n", + "\n", " dataset = dataset.repeat(epochs * steps_per_epoch * 100)\n", "\n", " name_to_features = {\n", - " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - " \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", - " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n", + " \"input_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " \"input_mask\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " \"segment_ids\": tf.io.FixedLenFeature([max_seq_length], tf.int64),\n", + " \"label_ids\": tf.io.FixedLenFeature([], tf.int64),\n", " }\n", "\n", " def _decode_record(record, name_to_features):\n", " \"\"\"Decodes a record to a TensorFlow example.\"\"\"\n", " record = tf.io.parse_single_example(record, name_to_features)\n", " return record\n", - " \n", + "\n", " dataset = dataset.apply(\n", " tf.data.experimental.map_and_batch(\n", - " lambda record: _decode_record(record, name_to_features),\n", - " batch_size=batch_size,\n", - " drop_remainder=drop_remainder,\n", - " num_parallel_calls=tf.data.experimental.AUTOTUNE))\n", - "\n", - " dataset = 
dataset.shuffle(buffer_size=1000,\n", - " reshuffle_each_iteration=True)\n", - " \n", + " lambda record: _decode_record(record, name_to_features),\n", + " batch_size=batch_size,\n", + " drop_remainder=drop_remainder,\n", + " num_parallel_calls=tf.data.experimental.AUTOTUNE,\n", + " )\n", + " )\n", + "\n", + " dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)\n", + "\n", " row_count = 0\n", - " print('**************** {} *****************'.format(channel))\n", + " print(\"**************** {} *****************\".format(channel))\n", " for row in dataset.as_numpy_iterator():\n", " if row_count == 1:\n", " break\n", @@ -184,105 +184,106 @@ " return dataset\n", "\n", "\n", - "if __name__ == '__main__':\n", - "\n", - " args=easydict.EasyDict({\n", - " 'train_data': train_data,\n", - " 'validation_data': validation_data,\n", - " 'test_data': test_data,\n", - " 'local_model_dir': local_model_dir,\n", - " 'num_gpus': num_gpus,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp,\n", - " 'max_seq_length': max_seq_length,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size,\n", - " 'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions,\n", - " 'input_data_config': input_data_config\n", - " })\n", - " \n", - " \n", - " env_var = os.environ \n", - " print(\"Environment Variables:\") \n", - " pprint.pprint(dict(env_var), width = 1) \n", - " \n", + "if __name__ == \"__main__\":\n", + "\n", + " args = easydict.EasyDict(\n", + " {\n", + " \"train_data\": train_data,\n", + " \"validation_data\": validation_data,\n", + " \"test_data\": test_data,\n", + " \"local_model_dir\": local_model_dir,\n", + " \"num_gpus\": num_gpus,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " \"input_data_config\": input_data_config,\n", + " }\n", + " )\n", + "\n", + " env_var = os.environ\n", + " print(\"Environment Variables:\")\n", + " pprint.pprint(dict(env_var), width=1)\n", + "\n", " train_data = args.train_data\n", - " print('train_data {}'.format(train_data))\n", + " print(\"train_data {}\".format(train_data))\n", " validation_data = args.validation_data\n", - " print('validation_data {}'.format(validation_data))\n", + " print(\"validation_data {}\".format(validation_data))\n", " test_data = args.test_data\n", - " print('test_data {}'.format(test_data)) \n", + " print(\"test_data {}\".format(test_data))\n", " local_model_dir = args.local_model_dir\n", - " print('local_model_dir {}'.format(local_model_dir)) \n", + " 
print(\"local_model_dir {}\".format(local_model_dir))\n", " num_gpus = args.num_gpus\n", - " print('num_gpus {}'.format(num_gpus)) \n", + " print(\"num_gpus {}\".format(num_gpus))\n", " use_xla = args.use_xla\n", - " print('use_xla {}'.format(use_xla)) \n", + " print(\"use_xla {}\".format(use_xla))\n", " use_amp = args.use_amp\n", - " print('use_amp {}'.format(use_amp)) \n", + " print(\"use_amp {}\".format(use_amp))\n", " max_seq_length = args.max_seq_length\n", - " print('max_seq_length {}'.format(max_seq_length)) \n", + " print(\"max_seq_length {}\".format(max_seq_length))\n", " train_batch_size = args.train_batch_size\n", - " print('train_batch_size {}'.format(train_batch_size)) \n", + " print(\"train_batch_size {}\".format(train_batch_size))\n", " validation_batch_size = args.validation_batch_size\n", - " print('validation_batch_size {}'.format(validation_batch_size)) \n", + " print(\"validation_batch_size {}\".format(validation_batch_size))\n", " test_batch_size = args.test_batch_size\n", - " print('test_batch_size {}'.format(test_batch_size)) \n", + " print(\"test_batch_size {}\".format(test_batch_size))\n", " epochs = args.epochs\n", - " print('epochs {}'.format(epochs)) \n", + " print(\"epochs {}\".format(epochs))\n", " learning_rate = args.learning_rate\n", - " print('learning_rate {}'.format(learning_rate)) \n", + " print(\"learning_rate {}\".format(learning_rate))\n", " epsilon = args.epsilon\n", - " print('epsilon {}'.format(epsilon)) \n", + " print(\"epsilon {}\".format(epsilon))\n", " train_steps_per_epoch = args.train_steps_per_epoch\n", - " print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) \n", + " print(\"train_steps_per_epoch {}\".format(train_steps_per_epoch))\n", " validation_steps = args.validation_steps\n", - " print('validation_steps {}'.format(validation_steps)) \n", + " print(\"validation_steps {}\".format(validation_steps))\n", " test_steps = args.test_steps\n", - " print('test_steps {}'.format(test_steps)) \n", + " print(\"test_steps {}\".format(test_steps))\n", " freeze_bert_layer = args.freeze_bert_layer\n", - " print('freeze_bert_layer {}'.format(freeze_bert_layer)) \n", + " print(\"freeze_bert_layer {}\".format(freeze_bert_layer))\n", " run_validation = args.run_validation\n", - " print('run_validation {}'.format(run_validation)) \n", + " print(\"run_validation {}\".format(run_validation))\n", " run_test = args.run_test\n", - " print('run_test {}'.format(run_test)) \n", + " print(\"run_test {}\".format(run_test))\n", " run_sample_predictions = args.run_sample_predictions\n", - " print('run_sample_predictions {}'.format(run_sample_predictions))\n", + " print(\"run_sample_predictions {}\".format(run_sample_predictions))\n", " input_data_config = args.input_data_config\n", - " print('input_data_config {}'.format(input_data_config))\n", - " \n", - " # Determine if PipeMode is enabled \n", - " pipe_mode = (input_data_config.find('Pipe') >= 0)\n", - " print('Using pipe_mode: {}'.format(pipe_mode))\n", - " \n", - " # Model Output \n", - " transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/')\n", + " print(\"input_data_config {}\".format(input_data_config))\n", + "\n", + " # Determine if PipeMode is enabled\n", + " pipe_mode = input_data_config.find(\"Pipe\") >= 0\n", + " print(\"Using pipe_mode: {}\".format(pipe_mode))\n", + "\n", + " # Model Output\n", + " transformer_fine_tuned_model_path = os.path.join(local_model_dir, \"transformers/fine-tuned/\")\n", " os.makedirs(transformer_fine_tuned_model_path, 
exist_ok=True)\n", "\n", " # SavedModel Output\n", - " tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')\n", - " os.makedirs(tensorflow_saved_model_path, exist_ok=True) \n", - " \n", + " tensorflow_saved_model_path = os.path.join(local_model_dir, \"tensorflow/saved_model/0\")\n", + " os.makedirs(tensorflow_saved_model_path, exist_ok=True)\n", + "\n", " distributed_strategy = tf.distribute.MirroredStrategy()\n", "\n", " with distributed_strategy.scope():\n", " tf.config.optimizer.set_jit(use_xla)\n", " tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": use_amp})\n", "\n", - " train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))\n", - " print('train_data_filenames {}'.format(train_data_filenames))\n", + " train_data_filenames = glob(os.path.join(train_data, \"*.tfrecord\"))\n", + " print(\"train_data_filenames {}\".format(train_data_filenames))\n", " train_dataset = file_based_input_dataset_builder(\n", - " channel='train',\n", + " channel=\"train\",\n", " input_filenames=train_data_filenames,\n", " pipe_mode=pipe_mode,\n", " is_training=True,\n", @@ -290,7 +291,8 @@ " batch_size=train_batch_size,\n", " epochs=epochs,\n", " steps_per_epoch=train_steps_per_epoch,\n", - " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n", + " max_seq_length=max_seq_length,\n", + " ).map(select_data_and_label_from_record)\n", "\n", " tokenizer = None\n", " config = None\n", @@ -298,48 +300,46 @@ "\n", " successful_download = False\n", " retries = 0\n", - " while (retries < 5 and not successful_download):\n", + " while retries < 5 and not successful_download:\n", " try:\n", - " tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", - " config = DistilBertConfig.from_pretrained('distilbert-base-uncased',\n", - " num_labels=len(CLASSES))\n", - " model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',\n", - " config=config)\n", + " tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", + " config = DistilBertConfig.from_pretrained(\"distilbert-base-uncased\", num_labels=len(CLASSES))\n", + " model = TFDistilBertForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", config=config)\n", " successful_download = True\n", - " print('Sucessfully downloaded after {} retries.'.format(retries))\n", + " print(\"Sucessfully downloaded after {} retries.\".format(retries))\n", " except:\n", " retries = retries + 1\n", " random_sleep = random.randint(1, 30)\n", - " print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))\n", + " print(\"Retry #{}. 
Sleeping for {} seconds\".format(retries, random_sleep))\n", " time.sleep(random_sleep)\n", "\n", - " callbacks = [] \n", - " initial_epoch_number = 0 \n", + " callbacks = []\n", + " initial_epoch_number = 0\n", "\n", " if not tokenizer or not model or not config:\n", - " print('Not properly initialized...')\n", + " print(\"Not properly initialized...\")\n", "\n", " optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)\n", - " print('** use_amp {}'.format(use_amp)) \n", + " print(\"** use_amp {}\".format(use_amp))\n", " if use_amp:\n", " # loss scaling is currently required when using mixed precision\n", - " optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')\n", - " \n", - " print('*** OPTIMIZER {} ***'.format(optimizer))\n", - " \n", + " optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, \"dynamic\")\n", + "\n", + " print(\"*** OPTIMIZER {} ***\".format(optimizer))\n", + "\n", " loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", - " metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", + " metric = tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")\n", "\n", " model.compile(optimizer=optimizer, loss=loss, metrics=[metric])\n", - " print('Compiled model {}'.format(model)) \n", + " print(\"Compiled model {}\".format(model))\n", " model.layers[0].trainable = not freeze_bert_layer\n", " print(model.summary())\n", "\n", " if run_validation:\n", - " validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))\n", - " print('validation_data_filenames {}'.format(validation_data_filenames))\n", + " validation_data_filenames = glob(os.path.join(validation_data, \"*.tfrecord\"))\n", + " print(\"validation_data_filenames {}\".format(validation_data_filenames))\n", " validation_dataset = file_based_input_dataset_builder(\n", - " channel='validation',\n", + " channel=\"validation\",\n", " input_filenames=validation_data_filenames,\n", " pipe_mode=pipe_mode,\n", " is_training=False,\n", @@ -347,34 +347,39 @@ " batch_size=validation_batch_size,\n", " epochs=epochs,\n", " steps_per_epoch=validation_steps,\n", - " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n", - " \n", - " print('Starting Training and Validation...')\n", + " max_seq_length=max_seq_length,\n", + " ).map(select_data_and_label_from_record)\n", + "\n", + " print(\"Starting Training and Validation...\")\n", " validation_dataset = validation_dataset.take(validation_steps)\n", - " train_and_validation_history = model.fit(train_dataset,\n", - " shuffle=True,\n", - " epochs=epochs,\n", - " initial_epoch=initial_epoch_number,\n", - " steps_per_epoch=train_steps_per_epoch,\n", - " validation_data=validation_dataset,\n", - " validation_steps=validation_steps,\n", - " callbacks=callbacks) \n", + " train_and_validation_history = model.fit(\n", + " train_dataset,\n", + " shuffle=True,\n", + " epochs=epochs,\n", + " initial_epoch=initial_epoch_number,\n", + " steps_per_epoch=train_steps_per_epoch,\n", + " validation_data=validation_dataset,\n", + " validation_steps=validation_steps,\n", + " callbacks=callbacks,\n", + " )\n", " print(train_and_validation_history)\n", - " else: # Not running validation\n", - " print('Starting Training (Without Validation)...')\n", - " train_history = model.fit(train_dataset,\n", - " shuffle=True,\n", - " epochs=epochs,\n", - " initial_epoch=initial_epoch_number,\n", - " steps_per_epoch=train_steps_per_epoch,\n", - " callbacks=callbacks) \n", 
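As an aside on the `model.fit(...)` call being reformatted in this hunk: because the `tf.data` datasets built by `file_based_input_dataset_builder` repeat indefinitely, both the training and validation passes must be explicitly step-bounded. A minimal standalone sketch of that pattern, assuming a compiled Keras `model` and the `train_dataset`/`validation_dataset` objects built earlier in the notebook (the step counts here are illustrative, not the notebook's values):

    # Both datasets repeat, so each pass is bounded by an explicit step count.
    history = model.fit(
        train_dataset,
        epochs=1,
        steps_per_epoch=100,                 # bounds the training pass
        validation_data=validation_dataset,
        validation_steps=10,                 # bounds the validation pass
    )
    print(history.history)  # per-epoch loss/accuracy plus val_loss/val_accuracy

The notebook's `validation_dataset.take(validation_steps)` call serves the same purpose as the explicit `validation_steps` bound in this sketch.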
+ " else: # Not running validation\n", + " print(\"Starting Training (Without Validation)...\")\n", + " train_history = model.fit(\n", + " train_dataset,\n", + " shuffle=True,\n", + " epochs=epochs,\n", + " initial_epoch=initial_epoch_number,\n", + " steps_per_epoch=train_steps_per_epoch,\n", + " callbacks=callbacks,\n", + " )\n", " print(train_history)\n", "\n", " if run_test:\n", - " test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))\n", - " print('test_data_filenames {}'.format(test_data_filenames))\n", + " test_data_filenames = glob(os.path.join(test_data, \"*.tfrecord\"))\n", + " print(\"test_data_filenames {}\".format(test_data_filenames))\n", " test_dataset = file_based_input_dataset_builder(\n", - " channel='test',\n", + " channel=\"test\",\n", " input_filenames=test_data_filenames,\n", " pipe_mode=pipe_mode,\n", " is_training=False,\n", @@ -382,56 +387,50 @@ " batch_size=test_batch_size,\n", " epochs=epochs,\n", " steps_per_epoch=test_steps,\n", - " max_seq_length=max_seq_length).map(select_data_and_label_from_record)\n", - "\n", - " print('Starting test...')\n", - " test_history = model.evaluate(test_dataset,\n", - " steps=test_steps,\n", - " callbacks=callbacks)\n", - " \n", - " print('Test history {}'.format(test_history))\n", - " \n", + " max_seq_length=max_seq_length,\n", + " ).map(select_data_and_label_from_record)\n", + "\n", + " print(\"Starting test...\")\n", + " test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)\n", + "\n", + " print(\"Test history {}\".format(test_history))\n", + "\n", " # Save the Fine-Tuned Transformers Model as a New \"Pre-Trained\" Model\n", - " print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) \n", + " print(\"transformer_fine_tuned_model_path {}\".format(transformer_fine_tuned_model_path))\n", " model.save_pretrained(transformer_fine_tuned_model_path)\n", "\n", " # Save the TensorFlow SavedModel for Serving Predictions\n", - " print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path)) \n", - " model.save(tensorflow_saved_model_path, save_format='tf')\n", - " \n", + " print(\"tensorflow_saved_model_path {}\".format(tensorflow_saved_model_path))\n", + " model.save(tensorflow_saved_model_path, save_format=\"tf\")\n", + "\n", " if run_sample_predictions:\n", - " loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path,\n", - " id2label={\n", - " 0: 1,\n", - " 1: 2,\n", - " 2: 3,\n", - " 3: 4,\n", - " 4: 5\n", - " },\n", - " label2id={\n", - " 1: 0,\n", - " 2: 1,\n", - " 3: 2,\n", - " 4: 3,\n", - " 5: 4\n", - " })\n", - "\n", - " tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + " loaded_model = TFDistilBertForSequenceClassification.from_pretrained(\n", + " transformer_fine_tuned_model_path,\n", + " id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},\n", + " label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},\n", + " )\n", + "\n", + " tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", " if num_gpus >= 1:\n", - " inference_device = 0 # GPU 0\n", + " inference_device = 0 # GPU 0\n", " else:\n", - " inference_device = -1 # CPU\n", - " print('inference_device {}'.format(inference_device))\n", + " inference_device = -1 # CPU\n", + " print(\"inference_device {}\".format(inference_device))\n", "\n", - " inference_pipeline = TextClassificationPipeline(model=loaded_model, \n", - " tokenizer=tokenizer,\n", - " framework='tf',\n", - " device=inference_device) \n", + " 
inference_pipeline = TextClassificationPipeline(\n", + " model=loaded_model, tokenizer=tokenizer, framework=\"tf\", device=inference_device\n", + " )\n", "\n", - " print(\"\"\"I loved it! I will recommend this to everyone.\"\"\", inference_pipeline(\"\"\"I loved it! I will recommend this to everyone.\"\"\"))\n", + " print(\n", + " \"\"\"I loved it! I will recommend this to everyone.\"\"\",\n", + " inference_pipeline(\"\"\"I loved it! I will recommend this to everyone.\"\"\"),\n", + " )\n", " print(\"\"\"It's OK.\"\"\", inference_pipeline(\"\"\"It's OK.\"\"\"))\n", - " print(\"\"\"Really bad. I hope they don't make this anymore.\"\"\", inference_pipeline(\"\"\"Really bad. I hope they don't make this anymore.\"\"\"))" + " print(\n", + " \"\"\"Really bad. I hope they don't make this anymore.\"\"\",\n", + " inference_pipeline(\"\"\"Really bad. I hope they don't make this anymore.\"\"\"),\n", + " )" ] } ], diff --git a/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb b/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb index 99a730b0..1550a60d 100644 --- a/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb +++ b/07_train/container-demo/03_Run_ML_Training_SageMaker.ipynb @@ -44,7 +44,7 @@ "import boto3\n", "import sagemaker\n", "\n", - "session = sagemaker.Session()\n", + "session = sagemaker.Session()\n", "bucket = session.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -64,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "processed_train_data_s3_uri='s3://fsx-container-demo/input/data/train'\n", + "processed_train_data_s3_uri = \"s3://fsx-container-demo/input/data/train\"\n", "\n", "!aws s3 ls $processed_train_data_s3_uri/" ] @@ -75,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "processed_validation_data_s3_uri='s3://fsx-container-demo/input/data/validation'\n", + "processed_validation_data_s3_uri = \"s3://fsx-container-demo/input/data/validation\"\n", "\n", "!aws s3 ls $processed_validation_data_s3_uri/" ] @@ -88,7 +88,7 @@ }, "outputs": [], "source": [ - "processed_test_data_s3_uri='s3://fsx-container-demo/input/data/test'\n", + "processed_test_data_s3_uri = \"s3://fsx-container-demo/input/data/test\"\n", "\n", "!aws s3 ls $processed_test_data_s3_uri/" ] @@ -110,12 +110,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -135,22 +132,22 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=3\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=64\n", - "test_batch_size=64\n", - "train_steps_per_epoch=100\n", - "validation_steps=10\n", - "test_steps=10\n", - "use_xla=True\n", - "use_amp=True\n", - "max_seq_length=64\n", - 
"freeze_bert_layer=True\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 3\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 64\n", + "test_batch_size = 64\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 10\n", + "test_steps = 10\n", + "use_xla = True\n", + "use_amp = True\n", + "max_seq_length = 64\n", + "freeze_bert_layer = True\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -191,10 +188,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -225,36 +222,39 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='train.py',\n", - " source_dir='code',\n", - " role=role,\n", - " instance_count=1,\n", - " instance_type='ml.c5.9xlarge',\n", - " use_spot_instances=True,\n", - " max_run=3600,\n", - " max_wait=3600,\n", - " volume_size=1024,\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode='Pipe',\n", - " metric_definitions=metrics_definitions\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"train.py\",\n", + " source_dir=\"code\",\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=\"ml.c5.9xlarge\",\n", + " use_spot_instances=True,\n", + " max_run=3600,\n", + " max_wait=3600,\n", + " volume_size=1024,\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=\"Pipe\",\n", + " 
metric_definitions=metrics_definitions,\n", + ")" ] }, { @@ -270,11 +270,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - "              'validation': s3_input_validation_data,\n", - "              'test': s3_input_test_data\n", - "             }, \n", - "              wait=False)" + "estimator.fit(\n", + "    inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + "    wait=False,\n", + ")" ] }, { @@ -284,7 +283,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -295,7 +294,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}\">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))\n" + "display(\n", + "    HTML(\n", + "        '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}\">Training Job</a> After About 5 Minutes</b>'.format(\n", + "            region, training_job_name\n", + "        )\n", + "    )\n", + ")" ] }, { @@ -306,7 +311,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))\n" + "display(\n", + "    HTML(\n", + "        '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(\n", + "            region, training_job_name\n", + "        )\n", + "    )\n", + ")" ] }, { @@ -317,7 +328,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview\">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_name, region)))\n" + "display(\n", + "    HTML(\n", + "        '<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview\">S3 Output Data</a> After The Training Job Has Completed</b>'.format(\n", + "            bucket, training_job_name, region\n", + "        )\n", + "    )\n", + ")" ] }, { diff --git a/07_train/container-demo/code/train.py b/07_train/container-demo/code/train.py index 94bc8fc8..38f7e539 100644 --- a/07_train/container-demo/code/train.py +++ b/07_train/container-demo/code/train.py @@ -10,9 +10,9 @@ import os import tensorflow as tf -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==2.8.0"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker-tensorflow==2.1.0.1.0.0"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) from transformers import DistilBertTokenizer from transformers import TFDistilBertForSequenceClassification @@ -25,65 +25,64 @@ def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. 
# For eval, we want no shuffling and parallel reading doesn't matter. if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): if row_count == 1: break @@ -93,159 +92,114 @@ def _decode_record(record, name_to_features): return dataset -if __name__ == '__main__': - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('Listing /opt...') - for root, subFolder, files in os.walk('/opt'): +if __name__ == "__main__": + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("Listing /opt...") + for root, subFolder, files in os.walk("/opt"): for item in files: - print('{},{},{}'.format(root, subFolder, item)) - print('Done.') - + print("{},{},{}".format(root, subFolder, item)) + print("Done.") + parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--input_data_config', - type=str, - default=os.environ['SM_INPUT_DATA_CONFIG']) - parser.add_argument('--local_model_dir', - type=str, - default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', 
- type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=64) - parser.add_argument('--test_batch_size', - type=int, - default=64) - parser.add_argument('--epochs', - type=int, - default=3) - parser.add_argument('--learning_rate', - type=float, - default=0.00001) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=100) - parser.add_argument('--validation_steps', - type=int, - default=10) - parser.add_argument('--test_steps', - type=int, - default=10) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--input_data_config", type=str, default=os.environ["SM_INPUT_DATA_CONFIG"]) + parser.add_argument("--local_model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--use_xla", type=eval, default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=64) + parser.add_argument("--test_batch_size", type=int, default=64) + parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--learning_rate", type=float, default=0.00001) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=100) + parser.add_argument("--validation_steps", type=int, default=10) + parser.add_argument("--test_steps", type=int, default=10) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) + print("test_data {}".format(test_data)) local_model_dir = args.local_model_dir - print('local_model_dir {}'.format(local_model_dir)) + print("local_model_dir {}".format(local_model_dir)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) + print("num_gpus {}".format(num_gpus)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + 
print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) input_data_config = args.input_data_config - print('input_data_config {}'.format(input_data_config)) - - - # Determine if PipeMode is enabled - pipe_mode = (input_data_config.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + print("input_data_config {}".format(input_data_config)) + + # Determine if PipeMode is enabled + pipe_mode = input_data_config.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + # Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') - os.makedirs(tensorflow_saved_model_path, exist_ok=True) - + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") + os.makedirs(tensorflow_saved_model_path, exist_ok=True) + distributed_strategy = tf.distribute.MirroredStrategy() - + with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", 
input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -253,7 +207,8 @@ def _decode_record(record, name_to_features): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -261,50 +216,47 @@ def _decode_record(record, name_to_features): successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES)) - model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', - config=config) + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=len(CLASSES)) + model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config) successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") + + print("*** OPTIMIZER {} ***".format(optimizer)) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) + print("Compiled model {}".format(model)) model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -312,34 +264,39 @@ def _decode_record(record, name_to_features): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') +
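# The download block above retries the Hugging Face downloads up to five times
# with a random sleep, because many instances starting at once can get their
# requests throttled. The same pattern factored into a generic helper; a sketch
# only, retry_with_random_sleep is not part of this codebase:
import random
import time

def retry_with_random_sleep(fn, max_retries=5, max_sleep_seconds=30):
    """Call fn() until it succeeds or max_retries attempts have failed."""
    for attempt in range(1, max_retries + 1):
        try:
            return fn()
        except Exception:
            sleep_seconds = random.randint(1, max_sleep_seconds)
            print("Retry #{}. Sleeping for {} seconds".format(attempt, sleep_seconds))
            time.sleep(sleep_seconds)
    raise RuntimeError("Still failing after {} retries".format(max_retries))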
max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -347,53 +304,47 @@ def _decode_record(record, name_to_features): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) model.save_pretrained(transformer_fine_tuned_model_path) # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, save_format="tf") + if run_sample_predictions: - loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path, - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + loaded_model = TFDistilBertForSequenceClassification.from_pretrained( + transformer_fine_tuned_model_path, + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + 
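# The id2label/label2id maps passed to from_pretrained above translate between
# the model's 0-based output indices and the 1-5 star ratings used as class
# labels. A tiny round-trip sketch with a hypothetical logits vector:
import numpy as np

id2label = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
label2id = {star: index for index, star in id2label.items()}
logits = np.array([0.1, 0.2, 0.1, 1.5, 0.3])  # hypothetical model output
predicted_star_rating = id2label[int(logits.argmax())]  # index 3 -> 4 stars
assert label2id[predicted_star_rating] == 3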
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") if num_gpus >= 1: - inference_device = 0 # GPU 0 + inference_device = 0 # GPU 0 else: - inference_device = -1 # CPU - print('inference_device {}'.format(inference_device)) + inference_device = -1 # CPU + print("inference_device {}".format(inference_device)) - inference_pipeline = TextClassificationPipeline(model=loaded_model, - tokenizer=tokenizer, - framework='tf', - device=inference_device) + inference_pipeline = TextClassificationPipeline( + model=loaded_model, tokenizer=tokenizer, framework="tf", device=inference_device + ) - print("""I loved it! I will recommend this to everyone.""", inference_pipeline("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + inference_pipeline("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", inference_pipeline("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", inference_pipeline("""Really bad. I hope they don't make this anymore.""")) \ No newline at end of file + print( + """Really bad. I hope they don't make this anymore.""", + inference_pipeline("""Really bad. I hope they don't make this anymore."""), + ) diff --git a/07_train/evaluate_model_metrics.py b/07_train/evaluate_model_metrics.py index 024afdec..f3523174 100644 --- a/07_train/evaluate_model_metrics.py +++ b/07_train/evaluate_model_metrics.py @@ -4,13 +4,16 @@ from datetime import datetime import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) import pandas as pd import os import re @@ -33,99 +36,99 @@ from sklearn.utils import resample -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] -config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) +config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, +) def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: -
print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--input-model', type=str, - default='/opt/ml/processing/input/model', + parser.add_argument( + "--input-model", + type=str, + default="/opt/ml/processing/input/model", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - + ) + return parser.parse_args() - + def process(args): - print('Current host: {}'.format(args.current_host)) - - print('input_data: {}'.format(args.input_data)) - print('input_model: {}'.format(args.input_model)) - - print('Listing contents of input model dir: {}'.format(args.input_model)) + print("Current host: {}".format(args.current_host)) + + print("input_data: {}".format(args.input_data)) + print("input_model: {}".format(args.input_model)) + + print("Listing contents of input model dir: {}".format(args.input_model)) input_files = os.listdir(args.input_model) for file in input_files: print(file) - model_tar_path = '{}/model.tar.gz'.format(args.input_model) + model_tar_path = "{}/model.tar.gz".format(args.input_model) model_tar = tarfile.open(model_tar_path) model_tar.extractall(args.input_model) - model_tar.close() + model_tar.close() - model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model)) + model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model)) print(model) - + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=args.max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). 
Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -133,81 +136,86 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) - + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. I hope they don't make this anymore."""), + ) ########################################################################################### # TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz # - ########################################################################################### -# evaluation_data_path = '/opt/ml/processing/input/data/' - - print('Listing contents of input data dir: {}'.format(args.input_data)) + ########################################################################################### + # evaluation_data_path = '/opt/ml/processing/input/data/' + + print("Listing contents of input data dir: {}".format(args.input_data)) input_files = os.listdir(args.input_data) - test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data) - print('Using only {} to evaluate.'.format(test_data_path)) - df_test_reviews = pd.read_csv(test_data_path, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data) + print("Using only {} to evaluate.".format(test_data_path)) + df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[ + ["review_body", "star_rating"] + ] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - y_test = df_test_reviews['review_body'].map(predict) + y_test = df_test_reviews["review_body"].map(predict) y_test - y_actual = df_test_reviews['star_rating'] + y_actual = df_test_reviews["star_rating"] y_actual print(classification_report(y_true=y_test, y_pred=y_actual)) - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) def plot_conf_mat(cm, classes, title, cmap): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. 
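# scikit-learn's classification_report and accuracy_score expect the ground
# truth in `y_true` and the model output in `y_pred`; accuracy is symmetric in
# the two, but per-class precision and recall swap if they are reversed. Since
# `y_actual` above holds the dataset's star ratings and `y_test` holds the
# predictions, the conventional call order looks like this (a sketch with
# hypothetical values):
from sklearn.metrics import accuracy_score, classification_report

y_actual = [5, 4, 1, 3, 5]  # ground-truth star ratings (hypothetical)
y_test = [5, 4, 2, 3, 4]    # predicted star ratings (hypothetical)
print(classification_report(y_true=y_actual, y_pred=y_test))
print("accuracy:", accuracy_score(y_true=y_actual, y_pred=y_test))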
+ fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="black" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') + plt.ylabel("True label") + plt.xlabel("Predicted label") cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=CLASSES, - title='Confusion Matrix', - cmap=plt.cm.Greens) + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens) - # Save the confusion matrix + # Save the confusion matrix plt.show() - # Model Output - metrics_path = os.path.join(args.output_data, 'metrics/') + # Model Output + metrics_path = os.path.join(args.output_data, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) report_dict = { "metrics": { @@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap): evaluation_path = "{}/evaluation.json".format(metrics_path) with open(evaluation_path, "w") as f: f.write(json.dumps(report_dict)) - - print('Listing contents of output dir: {}'.format(args.output_data)) + + print("Listing contents of output dir: {}".format(args.output_data)) output_files = os.listdir(args.output_data) for file in output_files: print(file) - print('Listing contents of output/metrics dir: {}'.format(metrics_path)) - output_files = os.listdir('{}'.format(metrics_path)) + print("Listing contents of output/metrics dir: {}".format(metrics_path)) + output_files = os.listdir("{}".format(metrics_path)) for file in output_files: print(file) - print('Complete') - - + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) - process(args) + process(args) diff --git a/07_train/src/inference.py b/07_train/src/inference.py index 2975dc2d..53196737 100644 --- a/07_train/src/inference.py +++ b/07_train/src/inference.py @@ -1,102 +1,97 @@ import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: 
{}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (ie. 1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + 
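# output_handler above maps each vector of logits coming back from TensorFlow
# Serving to a star rating by applying softmax and then argmax. The same math
# in plain numpy (a sketch; the handler itself uses tf.nn.softmax/tf.argmax):
import numpy as np

classes = [1, 2, 3, 4, 5]
log_probability = np.array([-1.2, 0.3, 2.5, 0.1, -0.7])  # hypothetical logits
softmax = np.exp(log_probability) / np.exp(log_probability).sum()
# softmax is monotonic, so argmax over probabilities equals argmax over logits
predicted_class = classes[int(softmax.argmax())]
print(predicted_class)  # 3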
predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type diff --git a/07_train/src/tf_bert_reviews.py b/07_train/src/tf_bert_reviews.py index 79ae535c..34e1d0a7 100644 --- a/07_train/src/tf_bert_reviews.py +++ b/07_train/src/tf_bert_reviews.py @@ -9,96 +9,99 @@ import sys import os import csv -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) + +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) import tensorflow as tf import pandas as pd import numpy as np -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"]) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig from transformers import TFDistilBertModel -#from transformers import TFBertForSequenceClassification + +# from transformers import TFBertForSequenceClassification from tensorflow.keras.callbacks import ModelCheckpoint from tensorflow.keras.models import load_model -#from tensorflow.keras.mixed_precision import experimental as mixed_precision + +# from tensorflow.keras.mixed_precision import experimental as mixed_precision CLASSES = [1, 2, 3, 4, 5] def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. 
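# Pipe mode streams the TFRecords from S3 through a named pipe instead of
# reading files that were downloaded to local disk, which is why the builder
# below branches on pipe_mode. A standalone sketch of that selection; note the
# sagemaker_tensorflow package is only available inside SageMaker TensorFlow
# containers, and the filename here is hypothetical:
import tensorflow as tf

def build_raw_dataset(pipe_mode, channel="train"):
    if pipe_mode:
        from sagemaker_tensorflow import PipeModeDataset
        return PipeModeDataset(channel=channel, record_format="TFRecord")
    return tf.data.TFRecordDataset(["part-0.tfrecord"])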
if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -111,236 +114,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - 
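# The checkpoints are written as tf_model_{epoch:05d}.h5 by the ModelCheckpoint
# callback further down, so max() over the filenames picks the newest one (the
# zero-padded epoch makes lexicographic and numeric order agree) and the epoch
# number can be recovered from the name. A sketch with a hypothetical path:
latest_checkpoint_file = "/opt/ml/checkpoints/tf_model_00012.h5"  # hypothetical
initial_epoch_number = int(latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0])
print(initial_epoch_number)  # 12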
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
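# SM_INPUT_DATA_CONFIG carries a JSON map from channel name to channel
# configuration, so the substring search above is a shortcut. A sketch that
# parses the JSON instead; the exact payload shape is an assumption based on
# typical SageMaker training jobs:
import json
import os

input_data_config = json.loads(os.environ.get("SM_INPUT_DATA_CONFIG", "{}"))
# e.g. {"train": {"TrainingInputMode": "Pipe", ...}, "validation": {...}}
pipe_mode = any(
    channel_config.get("TrainingInputMode") == "Pipe" for channel_config in input_data_config.values()
)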
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}.
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + print(classification_report(y_true=y_actual, y_pred=y_test)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_actual, y_pred=y_test) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_actual, y_pred=y_test) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb b/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb index 0edeb425..2250389d 100644 --- a/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb +++ b/08_optimize/01_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb @@ -33,12 +33,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = 
sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -72,11 +72,11 @@ "source": [ "try:\n", " processed_train_data_s3_uri\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -105,11 +105,11 @@ "source": [ "try:\n", " processed_validation_data_s3_uri\n", - " print('[OK]') \n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous sections before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous sections before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -138,11 +138,11 @@ "source": [ "try:\n", " processed_test_data_s3_uri\n", - " print('[OK]') \n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous sections before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous sections before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -205,12 +205,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -253,11 +250,11 @@ "source": [ "try:\n", " max_seq_length\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -277,26 +274,26 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=3\n", - "epsilon=0.00000001\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - 
"train_instance_count=1\n", - "train_instance_type='ml.c5.4xlarge' #evt\n", - "#train_instance_type='ml.m5.4xlarge' #bur\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "enable_sagemaker_debugger=False\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 3\n", + "epsilon = 0.00000001\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.4xlarge\" # evt\n", + "# train_instance_type='ml.m5.4xlarge' #bur\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "enable_sagemaker_debugger = False\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -323,11 +320,11 @@ "source": [ "try:\n", " experiment_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -356,11 +353,11 @@ "source": [ "try:\n", " trial_name\n", - " print('[OK]') \n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -381,7 +378,7 @@ "import time\n", "from smexperiments.trial import Trial\n", "\n", - "timestamp = '{}'.format(int(time.time()))\n", + "timestamp = \"{}\".format(int(time.time()))\n", "\n", "trial = Trial.load(trial_name=trial_name)\n", "print(trial)" @@ -395,11 +392,10 @@ "source": [ "from smexperiments.tracker import Tracker\n", "\n", - "tracker_optimize = Tracker.create(display_name='optimize-1', \n", - " sagemaker_boto_client=sm)\n", + "tracker_optimize = Tracker.create(display_name=\"optimize-1\", sagemaker_boto_client=sm)\n", "\n", "optimize_trial_component_name = tracker_optimize.trial_component.trial_component_name\n", - "print('Optimize trial component name {}'.format(optimize_trial_component_name))" + "print(\"Optimize trial component name {}\".format(optimize_trial_component_name))" ] }, { @@ -435,11 +431,11 @@ "from sagemaker.tuner import ContinuousParameter\n", "from sagemaker.tuner import CategoricalParameter\n", "from sagemaker.tuner import HyperparameterTuner\n", - " \n", + "\n", "hyperparameter_ranges = {\n", - " 'learning_rate': ContinuousParameter(0.00001, 0.00005, scaling_type='Linear'),\n", - " 'train_batch_size': CategoricalParameter([128, 256]),\n", - " 'freeze_bert_layer': CategoricalParameter([True, False])\n", + " \"learning_rate\": 
ContinuousParameter(0.00001, 0.00005, scaling_type=\"Linear\"),\n", + "    \"train_batch_size\": CategoricalParameter([128, 256]),\n", + "    \"freeze_bert_layer\": CategoricalParameter([True, False]),\n", "}" ] }, @@ -457,10 +453,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - "    {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - "    {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - "    {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - "    {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + "    {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -472,34 +468,37 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - "                       source_dir='src',\n", - "                       role=role,\n", - "                       instance_count=train_instance_count,\n", - "                       instance_type=train_instance_type,\n", - "                       volume_size=train_volume_size,\n", - "                       py_version='py37',\n", - "                       framework_version='2.3.1',\n", - "                       hyperparameters={'epochs': epochs,\n", - "                                        'epsilon': epsilon,\n", - "                                        'validation_batch_size': validation_batch_size,\n", - "                                        'test_batch_size': test_batch_size, \n", - "                                        'train_steps_per_epoch': train_steps_per_epoch,\n", - "                                        'validation_steps': validation_steps,\n", - "                                        'test_steps': test_steps,\n", - "                                        'use_xla': use_xla,\n", - "                                        'use_amp': use_amp,\n", - "                                        'max_seq_length': max_seq_length,\n", - "                                        'enable_sagemaker_debugger': enable_sagemaker_debugger, \n", - "                                        'enable_checkpointing': enable_checkpointing,\n", - "                                        'enable_tensorboard': enable_tensorboard, \n", - "                                        'run_validation': run_validation,\n", - "                                        'run_test': run_test,\n", - "                                        'run_sample_predictions': run_sample_predictions},\n", - "                       input_mode=input_mode,\n", - "                       metric_definitions=metrics_definitions,\n", - "#                       max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n", - "                      )" + "estimator = TensorFlow(\n", + "    entry_point=\"tf_bert_reviews.py\",\n", + "    source_dir=\"src\",\n", + "    role=role,\n", + "    instance_count=train_instance_count,\n", + "    instance_type=train_instance_type,\n", + "    volume_size=train_volume_size,\n", + "    py_version=\"py37\",\n", + "    framework_version=\"2.3.1\",\n", + "    hyperparameters={\n", + "        \"epochs\": epochs,\n", + "        \"epsilon\": epsilon,\n", + "        \"validation_batch_size\": validation_batch_size,\n", + "        \"test_batch_size\": test_batch_size,\n", + "        \"train_steps_per_epoch\": train_steps_per_epoch,\n", + "        \"validation_steps\": validation_steps,\n", + "        \"test_steps\": test_steps,\n", + "        \"use_xla\": use_xla,\n", + "        \"use_amp\": use_amp,\n", + "        \"max_seq_length\": max_seq_length,\n", + "        \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + "        \"enable_checkpointing\": enable_checkpointing,\n", + "        \"enable_tensorboard\": enable_tensorboard,\n", + "        \"run_validation\": run_validation,\n", + "        \"run_test\": run_test,\n", + "        \"run_sample_predictions\": run_sample_predictions,\n", + "    },\n", + "    input_mode=input_mode,\n", + "    metric_definitions=metrics_definitions,\n", + "    # max_run=7200 # max 2 hours * 60 minutes per hour * 60 seconds per minute\n", + ")" ] }, { @@ -515,18 +514,18 @@ "metadata": {}, "outputs": [], "source": [ - "objective_metric_name = 'train:accuracy'\n", + 
"objective_metric_name = \"train:accuracy\"\n", "\n", "tuner = HyperparameterTuner(\n", " estimator=estimator,\n", - " objective_type='Maximize',\n", + " objective_type=\"Maximize\",\n", " objective_metric_name=objective_metric_name,\n", " hyperparameter_ranges=hyperparameter_ranges,\n", " metric_definitions=metrics_definitions,\n", " max_jobs=2,\n", " max_parallel_jobs=1,\n", - " strategy='Bayesian',\n", - " early_stopping_type='Auto'\n", + " strategy=\"Bayesian\",\n", + " early_stopping_type=\"Auto\",\n", ")" ] }, @@ -543,12 +542,11 @@ "metadata": {}, "outputs": [], "source": [ - "tuner.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " include_cls_metadata=False,\n", - " wait=False)" + "tuner.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " include_cls_metadata=False,\n", + " wait=False,\n", + ")" ] }, { @@ -579,8 +577,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Hyper-Parameter Tuning Job'.format(region, tuning_job_name)))" + "\n", + "display(\n", + " HTML(\n", + " 'Review Hyper-Parameter Tuning Job'.format(\n", + " region, tuning_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -624,10 +628,7 @@ "source": [ "from sagemaker.analytics import HyperparameterTuningJobAnalytics\n", "\n", - "hp_results = HyperparameterTuningJobAnalytics(\n", - " sagemaker_session=sess, \n", - " hyperparameter_tuning_job_name=tuning_job_name\n", - ")\n", + "hp_results = HyperparameterTuningJobAnalytics(sagemaker_session=sess, hyperparameter_tuning_job_name=tuning_job_name)\n", "\n", "df_results = hp_results.dataframe()\n", "df_results.shape" @@ -639,7 +640,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_results.sort_values('FinalObjectiveValue', ascending=0)" + "df_results.sort_values(\"FinalObjectiveValue\", ascending=0)" ] }, { @@ -655,7 +656,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)" + "df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)" ] }, { @@ -673,7 +674,7 @@ "metadata": {}, "outputs": [], "source": [ - "best_learning_rate = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['learning_rate']\n", + "best_learning_rate = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"learning_rate\"]\n", "print(best_learning_rate)" ] }, @@ -683,7 +684,7 @@ "metadata": {}, "outputs": [], "source": [ - "best_accuracy = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['FinalObjectiveValue']\n", + "best_accuracy = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"FinalObjectiveValue\"]\n", "print(best_accuracy)" ] }, @@ -693,9 +694,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker_optimize.log_parameters({\n", - " 'learning_rate': float(best_learning_rate)\n", - "})\n", + "tracker_optimize.log_parameters({\"learning_rate\": float(best_learning_rate)})\n", "\n", "# must save after logging\n", "tracker_optimize.trial_component.save()" @@ -707,7 +706,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker_optimize.log_metric('accuracy', float(best_accuracy))\n", + "tracker_optimize.log_metric(\"accuracy\", float(best_accuracy))\n", "\n", "# must save after logging\n", "tracker_optimize.trial_component.save()" @@ -731,7 +730,7 @@ "lineage_table = ExperimentAnalytics(\n", " 
sagemaker_session=sess,\n", " experiment_name=experiment_name,\n", - " metric_names=['validation:accuracy'],\n", + " metric_names=[\"validation:accuracy\"],\n", " sort_by=\"CreationTime\",\n", " sort_order=\"Descending\",\n", ")\n", diff --git a/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb b/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb index c07a2273..2e7e7cfd 100644 --- a/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb +++ b/08_optimize/02_Warm_Start_Hyper_Parameter_Tuning_Reviews_BERT_TensorFlow.ipynb @@ -32,12 +32,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -66,11 +66,11 @@ "source": [ "try:\n", " tuning_job_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the previous Hyperparameter Tuning notebook.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -95,9 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "job_description = sm.describe_hyper_parameter_tuning_job(\n", - " HyperParameterTuningJobName=tuning_job_name\n", - ")" + "job_description = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)" ] }, { @@ -107,15 +105,15 @@ "outputs": [], "source": [ "if not bool(job_description):\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++') \n", - " print('[ERROR] Please run the previous Hyperparameter Tuning notebook before you continue.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++') \n", - "elif job_description['HyperParameterTuningJobStatus'] == 'Completed':\n", - " print('[OK] Previous Tuning Job has completed. Please continue.')\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n", + "elif job_description[\"HyperParameterTuningJobStatus\"] == \"Completed\":\n", + " print(\"[OK] Previous Tuning Job has completed. 
Please continue.\")\n", "else:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the previous Hyperparameter Tuning notebook.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the previous Hyperparameter Tuning notebook.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -142,11 +140,11 @@ "source": [ "try:\n", " processed_train_data_s3_uri\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -175,11 +173,11 @@ "source": [ "try:\n", " processed_validation_data_s3_uri\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -208,11 +206,11 @@ "source": [ "try:\n", " processed_test_data_s3_uri\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -262,12 +260,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -310,11 +305,11 @@ "source": [ "try:\n", " max_seq_length\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous PREPARE section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous PREPARE section before you continue.\")\n", + " 
print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -332,26 +327,26 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=3\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.4xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "enable_sagemaker_debugger=False\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 3\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.4xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "enable_sagemaker_debugger = False\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -378,11 +373,11 @@ "source": [ "try:\n", " experiment_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -411,11 +406,11 @@ "source": [ "try:\n", " trial_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -436,7 +431,7 @@ "import time\n", "from smexperiments.trial import Trial\n", "\n", - "timestamp = '{}'.format(int(time.time()))\n", + "timestamp = \"{}\".format(int(time.time()))\n", "\n", "trial = Trial.load(trial_name=trial_name)\n", "print(trial)" @@ -450,11 +445,10 @@ "source": [ "from smexperiments.tracker import Tracker\n", "\n", - "tracker_optimize = Tracker.create(display_name='optimize-2', \n", - " sagemaker_boto_client=sm)\n", + "tracker_optimize = Tracker.create(display_name=\"optimize-2\", sagemaker_boto_client=sm)\n", "\n", "optimize_trial_component_name = tracker_optimize.trial_component.trial_component_name\n", - "print('Optimize trial component name {}'.format(optimize_trial_component_name))" + "print(\"Optimize trial component name {}\".format(optimize_trial_component_name))" ] }, { @@ -491,11 +485,11 @@ "from sagemaker.tuner import ContinuousParameter\n", "from sagemaker.tuner import CategoricalParameter\n", "from sagemaker.tuner import HyperparameterTuner\n", - " \n", + "\n", "hyperparameter_ranges = {\n", - " 'learning_rate': ContinuousParameter(0.00015, 0.00075, scaling_type='Linear'),\n", - " 'train_batch_size': CategoricalParameter([64, 
128]), \n", - "    'freeze_bert_layer': CategoricalParameter([True, False]),\n", + "    \"learning_rate\": ContinuousParameter(0.00015, 0.00075, scaling_type=\"Linear\"),\n", + "    \"train_batch_size\": CategoricalParameter([64, 128]),\n", + "    \"freeze_bert_layer\": CategoricalParameter([True, False]),\n", "}" ] }, @@ -532,10 +526,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - "    {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - "    {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - "    {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - "    {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + "    {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + "    {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -547,34 +541,37 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - "                       source_dir='src',\n", - "                       role=role,\n", - "                       instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n", - "                       instance_type=train_instance_type,\n", - "                       volume_size=train_volume_size,\n", - "                       py_version='py37',\n", - "                       framework_version='2.3.1',\n", - "                       hyperparameters={'epochs': epochs,\n", - "                                        'epsilon': epsilon,\n", - "                                        'validation_batch_size': validation_batch_size,\n", - "                                        'test_batch_size': test_batch_size, \n", - "                                        'train_steps_per_epoch': train_steps_per_epoch,\n", - "                                        'validation_steps': validation_steps,\n", - "                                        'test_steps': test_steps,\n", - "                                        'use_xla': use_xla,\n", - "                                        'use_amp': use_amp,\n", - "                                        'max_seq_length': max_seq_length,\n", - "                                        'enable_sagemaker_debugger': enable_sagemaker_debugger, \n", - "                                        'enable_checkpointing': enable_checkpointing,\n", - "                                        'enable_tensorboard': enable_tensorboard, \n", - "                                        'run_validation': run_validation,\n", - "                                        'run_test': run_test,\n", - "                                        'run_sample_predictions': run_sample_predictions},\n", - "                       input_mode=input_mode,\n", - "                       metric_definitions=metrics_definitions,\n", - "#                       max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute\n", - "                      )" + "estimator = TensorFlow(\n", + "    entry_point=\"tf_bert_reviews.py\",\n", + "    source_dir=\"src\",\n", + "    role=role,\n", + "    instance_count=train_instance_count,  # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available\n", + "    instance_type=train_instance_type,\n", + "    volume_size=train_volume_size,\n", + "    py_version=\"py37\",\n", + "    framework_version=\"2.3.1\",\n", + "    hyperparameters={\n", + "        \"epochs\": epochs,\n", + "        \"epsilon\": epsilon,\n", + "        \"validation_batch_size\": validation_batch_size,\n", + "        \"test_batch_size\": test_batch_size,\n", + "        \"train_steps_per_epoch\": train_steps_per_epoch,\n", + "        \"validation_steps\": validation_steps,\n", + "        \"test_steps\": test_steps,\n", + "        \"use_xla\": use_xla,\n", + "        \"use_amp\": use_amp,\n", + "        \"max_seq_length\": max_seq_length,\n", + "        \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + "        \"enable_checkpointing\": enable_checkpointing,\n", + "        \"enable_tensorboard\": enable_tensorboard,\n", + "        \"run_validation\": run_validation,\n", + "        \"run_test\": 
run_test,\n", + "        \"run_sample_predictions\": run_sample_predictions,\n", + "    },\n", + "    input_mode=input_mode,\n", + "    metric_definitions=metrics_definitions,\n", + "    # max_run=7200 # max 2 hours * 60 minutes per hour * 60 seconds per minute\n", + ")" ] }, { @@ -598,7 +595,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Previous Tuning Job Name: {}'.format(tuning_job_name))" + "print(\"Previous Tuning Job Name: {}\".format(tuning_job_name))" ] }, { @@ -610,8 +607,9 @@ "from sagemaker.tuner import WarmStartConfig\n", "from sagemaker.tuner import WarmStartTypes\n", "\n", - "warm_start_config = WarmStartConfig(warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM, \n", - "                                    parents={tuning_job_name})" + "warm_start_config = WarmStartConfig(\n", + "    warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM, parents={tuning_job_name}\n", + ")" ] }, { @@ -627,19 +625,19 @@ "metadata": {}, "outputs": [], "source": [ - "objective_metric_name = 'train:accuracy'\n", + "objective_metric_name = \"train:accuracy\"\n", "\n", "tuner = HyperparameterTuner(\n", "    estimator=estimator,\n", - "    objective_type='Maximize',\n", + "    objective_type=\"Maximize\",\n", "    objective_metric_name=objective_metric_name,\n", "    hyperparameter_ranges=hyperparameter_ranges,\n", "    metric_definitions=metrics_definitions,\n", "    max_jobs=2,\n", "    max_parallel_jobs=1,\n", - "    strategy='Bayesian',\n", - "    early_stopping_type='Auto',\n", - "    warm_start_config=warm_start_config\n", + "    strategy=\"Bayesian\",\n", + "    early_stopping_type=\"Auto\",\n", + "    warm_start_config=warm_start_config,\n", ")" ] }, @@ -658,12 +656,11 @@ }, "outputs": [], "source": [ - "tuner.fit({'train': s3_input_train_data, \n", - "           'validation': s3_input_validation_data,\n", - "           'test': s3_input_test_data\n", - "          }, \n", - "          include_cls_metadata=False,\n", - "          wait=False)" + "tuner.fit(\n", + "    {\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + "    include_cls_metadata=False,\n", + "    wait=False,\n", + ")" ] }, { @@ -692,27 +689,25 @@ "\n", "tuning_job_name = tuner.latest_tuning_job.job_name\n", "\n", - "job_description = sm.describe_hyper_parameter_tuning_job(\n", - "    HyperParameterTuningJobName=tuning_job_name\n", - ")\n", + "job_description = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)\n", "\n", - "status = job_description['HyperParameterTuningJobStatus']\n", + "status = job_description[\"HyperParameterTuningJobStatus\"]\n", "\n", - "print('\\n')\n", + "print(\"\\n\")\n", "print(status)\n", - "print('\\n')\n", + "print(\"\\n\")\n", "pprint(job_description)\n", "\n", - "if status != 'Completed':\n", - "    job_count = job_description['TrainingJobStatusCounters']['Completed']\n", - "    print('Not yet complete, but {} jobs have completed.'.format(job_count))\n", - "    \n", - "    if job_description.get('BestTrainingJob', None):\n", + "if status != \"Completed\":\n", + "    job_count = job_description[\"TrainingJobStatusCounters\"][\"Completed\"]\n", + "    print(\"Not yet complete, but {} jobs have completed.\".format(job_count))\n", + "\n", + "    if job_description.get(\"BestTrainingJob\", None):\n", "        print(\"Best candidate:\")\n", - "        pprint(job_description['BestTrainingJob']['TrainingJobName'])\n", - "        pprint(job_description['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric'])\n", + "        pprint(job_description[\"BestTrainingJob\"][\"TrainingJobName\"])\n", + "        
pprint(job_description[\"BestTrainingJob\"][\"FinalHyperParameterTuningJobObjectiveMetric\"])\n", "    else:\n", - "        print(\"No training jobs have reported results yet.\") " + "        print(\"No training jobs have reported results yet.\")" ] }, { @@ -724,8 +719,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - "    \n", - "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}\">Hyper-Parameter Tuning Job</a></b>'.format(region, tuning_job_name)))" + "\n", + "display(\n", + "    HTML(\n", + "        '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}\">Hyper-Parameter Tuning Job</a></b>'.format(\n", + "            region, tuning_job_name\n", + "        )\n", + "    )\n", + ")" ] }, { @@ -760,10 +761,7 @@ "source": [ "from sagemaker.analytics import HyperparameterTuningJobAnalytics\n", "\n", - "hp_results = HyperparameterTuningJobAnalytics(\n", - "    sagemaker_session=sess, \n", - "    hyperparameter_tuning_job_name=tuning_job_name\n", - ")\n", + "hp_results = HyperparameterTuningJobAnalytics(sagemaker_session=sess, hyperparameter_tuning_job_name=tuning_job_name)\n", "\n", "df_results = hp_results.dataframe()\n", "df_results.shape" ] }, { @@ -775,7 +773,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_results.sort_values('FinalObjectiveValue', ascending=0)" + "df_results.sort_values(\"FinalObjectiveValue\", ascending=0)" ] }, { @@ -793,7 +791,7 @@ }, "outputs": [], "source": [ - "df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)" + "df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)" ] }, { @@ -804,7 +802,7 @@ }, "outputs": [], "source": [ - "best_candidate_tuning_job_name = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['TrainingJobName']" + "best_candidate_tuning_job_name = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"TrainingJobName\"]" ] }, { @@ -822,7 +820,7 @@ "metadata": {}, "outputs": [], "source": [ - "best_learning_rate = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['learning_rate']\n", + "best_learning_rate = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"learning_rate\"]\n", "print(best_learning_rate)" ] }, @@ -832,7 +830,7 @@ "metadata": {}, "outputs": [], "source": [ - "best_accuracy = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['FinalObjectiveValue']\n", + "best_accuracy = df_results.sort_values(\"FinalObjectiveValue\", ascending=0).head(1)[\"FinalObjectiveValue\"]\n", "print(best_accuracy)" ] }, @@ -842,9 +840,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker_optimize.log_parameters({\n", - "    'learning_rate': float(best_learning_rate)\n", - "})\n", + "tracker_optimize.log_parameters({\"learning_rate\": float(best_learning_rate)})\n", "\n", "# must save after logging\n", "tracker_optimize.trial_component.save()" ] }, @@ -856,7 +852,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker_optimize.log_metric('accuracy', float(best_accuracy))\n", + "tracker_optimize.log_metric(\"accuracy\", float(best_accuracy))\n", "\n", "tracker_optimize.trial_component.save()" ] }, @@ -886,9 +882,9 @@ "lineage_table = ExperimentAnalytics(\n", "    sagemaker_session=sess,\n", "    experiment_name=experiment_name,\n", - "    metric_names=['validation:accuracy'],\n", + "    metric_names=[\"validation:accuracy\"],\n", "    sort_by=\"CreationTime\",\n", - "    sort_order=\"Descending\"\n", + "    sort_order=\"Descending\",\n", ")\n", "\n", "lineage_df = lineage_table.dataframe()\n", diff --git a/08_optimize/src/inference.py b/08_optimize/src/inference.py index 2975dc2d..53196737 100644 --- a/08_optimize/src/inference.py +++ b/08_optimize/src/inference.py @@ -1,102 +1,97 @@ 
import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: {}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (ie. 
1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type diff --git a/08_optimize/src/tf_bert_reviews.py b/08_optimize/src/tf_bert_reviews.py index 
79ae535c..34e1d0a7 100644 --- a/08_optimize/src/tf_bert_reviews.py +++ b/08_optimize/src/tf_bert_reviews.py @@ -9,96 +9,99 @@ import sys import os import csv -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) + +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) import tensorflow as tf import pandas as pd import numpy as np -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"]) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig from transformers import TFDistilBertModel -#from transformers import TFBertForSequenceClassification + +# from transformers import TFBertForSequenceClassification from tensorflow.keras.callbacks import ModelCheckpoint from tensorflow.keras.models import load_model -#from tensorflow.keras.mixed_precision import experimental as mixed_precision + +# from tensorflow.keras.mixed_precision import experimental as mixed_precision CLASSES = [1, 2, 3, 4, 5] def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. 
if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -111,236 +114,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - 
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( -                                      latest_checkpoint_file, -                                      config=config) +    from transformers import TFDistilBertForSequenceClassification  # not imported at the top of this script +    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + +    print("loaded_model {}".format(loaded_model)) +    print("initial_epoch_number {}".format(initial_epoch_number)) -    print('loaded_model {}'.format(loaded_model)) -    print('initial_epoch_number {}'.format(initial_epoch_number)) -    return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() -    parser.add_argument('--train_data', -                        type=str, -                        default=os.environ['SM_CHANNEL_TRAIN']) -    parser.add_argument('--validation_data', -                        type=str, -                        default=os.environ['SM_CHANNEL_VALIDATION']) -    parser.add_argument('--test_data', -                        type=str, -                        default=os.environ['SM_CHANNEL_TEST']) -    parser.add_argument('--output_dir', -                        type=str, -                        default=os.environ['SM_OUTPUT_DIR']) -    parser.add_argument('--hosts', -                        type=list, -                        default=json.loads(os.environ['SM_HOSTS'])) -    parser.add_argument('--current_host', -                        type=str, -                        default=os.environ['SM_CURRENT_HOST']) -    parser.add_argument('--num_gpus', -                        type=int, -                        default=os.environ['SM_NUM_GPUS']) -    parser.add_argument('--checkpoint_base_path', -                        type=str, -                        default='/opt/ml/checkpoints') -    parser.add_argument('--use_xla', -                        type=eval, -                        default=False) -    parser.add_argument('--use_amp', -                        type=eval, -                        default=False) -    parser.add_argument('--max_seq_length', -                        type=int, -                        default=64) -    parser.add_argument('--train_batch_size', -                        type=int, -                        default=128) -    parser.add_argument('--validation_batch_size', -                        type=int, -                        default=256) -    parser.add_argument('--test_batch_size', -                        type=int, -                        default=256) -    parser.add_argument('--epochs', -                        type=int, -                        default=2) -    parser.add_argument('--learning_rate', -                        type=float, -                        default=0.00003) -    parser.add_argument('--epsilon', -                        type=float, -                        default=0.00000001) -    parser.add_argument('--train_steps_per_epoch', -                        type=int, -                        default=None) -    parser.add_argument('--validation_steps', -                        type=int, -                        default=None) -    parser.add_argument('--test_steps', -                        type=int, -                        default=None) -    parser.add_argument('--freeze_bert_layer', -                        type=eval, -                        default=False) -    parser.add_argument('--enable_sagemaker_debugger', -                        type=eval, -                        default=False) -    parser.add_argument('--run_validation', -                        type=eval, -                        default=False) -    parser.add_argument('--run_test', -                        type=eval, -                        default=False) -    parser.add_argument('--run_sample_predictions', -                        type=eval, -                        default=False) -    parser.add_argument('--enable_tensorboard', -                        type=eval, -                        default=False) -    parser.add_argument('--enable_checkpointing', -                        type=eval, -                        default=False) -    parser.add_argument('--output_data_dir', # This is unused -                        type=str, -                        default=os.environ['SM_OUTPUT_DATA_DIR']) -     +    parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) +    parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) +    parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) +    parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) +    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) +    parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) +    parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) +    parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") +    parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}. 
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path 
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + # note: y_test holds the model predictions and y_actual the ground-truth star ratings + print(classification_report(y_true=y_actual, y_pred=y_test)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_actual, y_pred=y_test) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_actual, y_pred=y_test) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb b/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb index 83315587..0f898a39 100644 --- a/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb +++ b/09_deploy/01_Invoke_SageMaker_Autopilot_Model_From_Athena.ipynb @@ -51,12 +51,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", 
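# The report_dict above is truncated in this hunk at "metrics" -> "accuracy".
# As a hedged illustration only (the "value" key and the evaluation.json file
# name are assumptions, not taken from this patch), such an evaluation report
# is typically serialized as JSON beside the model artifacts so a downstream
# pipeline or model-registry step can read the metrics back:

import json
import os


def write_metrics_report(metrics_path, accuracy):
    # Hypothetical helper mirroring the report_dict pattern above.
    report_dict = {"metrics": {"accuracy": {"value": accuracy}}}
    with open(os.path.join(metrics_path, "evaluation.json"), "w") as f:
        json.dump(report_dict, f)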
"region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -65,15 +65,15 @@ "metadata": {}, "outputs": [], "source": [ - "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n", - " print(' [OK] AthenaML IS SUPPORTED IN {}'.format(region))\n", - " print(' [OK] Please proceed with this notebook.')\n", + "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n", + " print(\" [OK] AthenaML IS SUPPORTED IN {}\".format(region))\n", + " print(\" [OK] Please proceed with this notebook.\")\n", "else:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print(' [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!'.format(region))\n", - " print(' [INFO] This is OK. SKIP this notebook and move ahead with the workshop.' )\n", - " print(' [INFO] This notebook is not required for the rest of this workshop.' )\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\" [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!\".format(region))\n", + " print(\" [INFO] This is OK. SKIP this notebook and move ahead with the workshop.\")\n", + " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -102,13 +102,13 @@ "source": [ "try:\n", " autopilot_endpoint_name\n", - " print('[OK]') \n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print('[ERROR] There is no Autopilot Model Endpoint deployed.')\n", - " print('[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.')\n", - " print('[INFO] This notebook is not required for the rest of this workshop.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' ) " + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] There is no Autopilot Model Endpoint deployed.\")\n", + " print(\"[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.\")\n", + " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -128,25 +128,25 @@ "source": [ "try:\n", " resp = sm.describe_endpoint(EndpointName=autopilot_endpoint_name)\n", - " status = resp['EndpointStatus']\n", - " if status == 'InService':\n", - " print('[OK] Your Autopilot Model Endpoint is in status: {}'.format(status))\n", - " elif status == 'Creating':\n", - " print('[INFO] Your Autopilot Model Endpoint is in status: {}'.format(status))\n", - " print('[INFO] Waiting for the endpoint to be InService. Please be patient. This might take a few minutes.')\n", - " sm.get_waiter('endpoint_in_service').wait(EndpointName=autopilot_endpoint_name) \n", - " else: \n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print('[ERROR] Your Autopilot Model is in status: {}'.format(status))\n", - " print('[INFO] This is OK. 
Just skip this notebook and move ahead with the next notebook.')\n", - " print('[INFO] This notebook is not required for the rest of this workshop.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", + " status = resp[\"EndpointStatus\"]\n", + " if status == \"InService\":\n", + " print(\"[OK] Your Autopilot Model Endpoint is in status: {}\".format(status))\n", + " elif status == \"Creating\":\n", + " print(\"[INFO] Your Autopilot Model Endpoint is in status: {}\".format(status))\n", + " print(\"[INFO] Waiting for the endpoint to be InService. Please be patient. This might take a few minutes.\")\n", + " sm.get_waiter(\"endpoint_in_service\").wait(EndpointName=autopilot_endpoint_name)\n", + " else:\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Your Autopilot Model is in status: {}\".format(status))\n", + " print(\"[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.\")\n", + " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", "except:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print('[ERROR] There is no Autopilot Model Endpoint deployed.')\n", - " print('[INFO] This is OK. Just skip this notebook and move ahead with the next notebook.')\n", - " print('[INFO] This notebook is not required for the rest of this workshop.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' ) " + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] There is no Autopilot Model Endpoint deployed.\")\n", + " print(\"[INFO] This is OK. 
Just skip this notebook and move ahead with the next notebook.\")\n", + " print(\"[INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -197,9 +197,9 @@ "try:\n", " ingest_create_athena_table_tsv_passed\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -218,11 +218,11 @@ "outputs": [], "source": [ "if not ingest_create_athena_table_tsv_passed:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++')\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE `INGEST` SECTION.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -231,7 +231,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_staging_dir = 's3://{}/athena/staging'.format(bucket)" + "s3_staging_dir = \"s3://{}/athena/staging\".format(bucket)" ] }, { @@ -240,10 +240,10 @@ "metadata": {}, "outputs": [], "source": [ - "tsv_prefix = 'amazon-reviews-pds/tsv'\n", - "database_name = 'dsoaws'\n", - "table_name_tsv = 'amazon_reviews_tsv'\n", - "table_name = 'product_reviews'" + "tsv_prefix = \"amazon-reviews-pds/tsv\"\n", + "database_name = \"dsoaws\"\n", + "table_name_tsv = \"amazon_reviews_tsv\"\n", + "table_name = \"product_reviews\"" ] }, { @@ -256,7 +256,9 @@ "CREATE TABLE IF NOT EXISTS {}.{} AS \n", "SELECT review_id, review_body \n", "FROM {}.{}\n", - "\"\"\".format(database_name, table_name, database_name, table_name_tsv)\n", + "\"\"\".format(\n", + " database_name, table_name, database_name, table_name_tsv\n", + ")\n", "\n", "print(statement)" ] @@ -269,17 +271,17 @@ "source": [ "import pandas as pd\n", "\n", - "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n", + "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n", " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)\n", " pd.read_sql(statement, conn)\n", "\n", - " print('[OK]')\n", - "else: \n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n", - " print(' [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!'.format(region))\n", - " print(' [INFO] This is OK. SKIP this notebook and move ahead with the workshop.' )\n", - " print(' [INFO] This notebook is not required for the rest of this workshop.' )\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )" + " print(\"[OK]\")\n", + "else:\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\" [ERROR] AthenaML IS *NOT* SUPPORTED IN {} !!\".format(region))\n", + " print(\" [INFO] This is OK. 
SKIP this notebook and move ahead with the workshop.\")\n", + " print(\" [INFO] This notebook is not required for the rest of this workshop.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -288,8 +290,8 @@ "metadata": {}, "outputs": [], "source": [ - "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n", - " statement = 'SELECT * FROM {}.{} LIMIT 10'.format(database_name, table_name)\n", + "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n", + " statement = \"SELECT * FROM {}.{} LIMIT 10\".format(database_name, table_name)\n", " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)\n", " df_table = pd.read_sql(statement, conn)\n", " print(df_table)" @@ -310,17 +312,17 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "client = boto3.client('athena')\n", + "client = boto3.client(\"athena\")\n", "\n", - "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n", + "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n", " try:\n", - " response = client.create_work_group(Name='AmazonAthenaPreviewFunctionality') \n", + " response = client.create_work_group(Name=\"AmazonAthenaPreviewFunctionality\")\n", " print(response)\n", " except ClientError as e:\n", - " if e.response['Error']['Code'] == 'InvalidRequestException':\n", + " if e.response[\"Error\"][\"Code\"] == \"InvalidRequestException\":\n", " print(\"[OK] Workgroup already exists.\")\n", " else:\n", - " print('[ERROR] {}'.format(e))" + " print(\"[ERROR] {}\".format(e))" ] }, { @@ -352,7 +354,9 @@ ")\n", "SELECT review_id, review_body, predict_star_rating(REPLACE(review_body, ',', ' ')) AS predicted_star_rating \n", " FROM {}.{} LIMIT 10\n", - " \"\"\".format(autopilot_endpoint_name, database_name, table_name)\n", + " \"\"\".format(\n", + " autopilot_endpoint_name, database_name, table_name\n", + ")\n", "\n", "print(statement)" ] @@ -370,8 +374,8 @@ "metadata": {}, "outputs": [], "source": [ - "if region in ['eu-west-1', 'ap-south-1', 'us-east-1', 'us-west-2']:\n", - " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir, work_group='AmazonAthenaPreviewFunctionality')\n", + "if region in [\"eu-west-1\", \"ap-south-1\", \"us-east-1\", \"us-west-2\"]:\n", + " conn = connect(region_name=region, s3_staging_dir=s3_staging_dir, work_group=\"AmazonAthenaPreviewFunctionality\")\n", " df = pd.read_sql(statement, conn)\n", " print(df)" ] @@ -389,12 +393,10 @@ "metadata": {}, "outputs": [], "source": [ - "sm = boto3.client('sagemaker')\n", + "sm = boto3.client(\"sagemaker\")\n", "\n", "if autopilot_endpoint_name:\n", - " sm.delete_endpoint(\n", - " EndpointName=autopilot_endpoint_name\n", - " )" + " sm.delete_endpoint(EndpointName=autopilot_endpoint_name)" ] }, { diff --git a/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb b/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb index 8d482558..222dc164 100644 --- a/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb +++ b/09_deploy/02_Deploy_Reviews_BERT_PyTorch_REST_Endpoint.ipynb @@ -34,12 +34,12 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', 
region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -94,10 +94,12 @@ "source": [ "class StarRatingPredictor(Predictor):\n", " def __init__(self, endpoint_name, sagemaker_session):\n", - " super().__init__(endpoint_name, \n", - " sagemaker_session=sagemaker_session, \n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer())" + " super().__init__(\n", + " endpoint_name,\n", + " sagemaker_session=sagemaker_session,\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + " )" ] }, { @@ -107,9 +109,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "pytorch_model_name = '{}-{}-{}'.format(training_job_name, 'pt', timestamp)\n", + "pytorch_model_name = \"{}-{}-{}\".format(training_job_name, \"pt\", timestamp)\n", "\n", "print(pytorch_model_name)" ] @@ -120,14 +123,16 @@ "metadata": {}, "outputs": [], "source": [ - "model = PyTorchModel(model_data=transformer_pytorch_model_dir_s3_uri + 'model.tar.gz',\n", - " name=pytorch_model_name,\n", - " role=role, \n", - " entry_point='inference.py',\n", - " source_dir='code-pytorch',\n", - " framework_version='1.6.0',\n", - " py_version='py3',\n", - " predictor_cls=StarRatingPredictor)" + "model = PyTorchModel(\n", + " model_data=transformer_pytorch_model_dir_s3_uri + \"model.tar.gz\",\n", + " name=pytorch_model_name,\n", + " role=role,\n", + " entry_point=\"inference.py\",\n", + " source_dir=\"code-pytorch\",\n", + " framework_version=\"1.6.0\",\n", + " py_version=\"py3\",\n", + " predictor_cls=StarRatingPredictor,\n", + ")" ] }, { @@ -138,7 +143,7 @@ "source": [ "import time\n", "\n", - "pytorch_endpoint_name = '{}-{}-{}'.format(training_job_name, 'pt', timestamp)\n", + "pytorch_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"pt\", timestamp)\n", "\n", "print(pytorch_endpoint_name)" ] @@ -149,10 +154,9 @@ "metadata": {}, "outputs": [], "source": [ - "predictor = model.deploy(initial_instance_count=1, \n", - " instance_type='ml.m5.4xlarge', \n", - " endpoint_name=pytorch_endpoint_name, \n", - " wait=False)" + "predictor = model.deploy(\n", + " initial_instance_count=1, instance_type=\"ml.m5.4xlarge\", endpoint_name=pytorch_endpoint_name, wait=False\n", + ")" ] }, { @@ -172,7 +176,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, pytorch_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, pytorch_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -183,7 +193,7 @@ "source": [ "%%time\n", "\n", - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=pytorch_endpoint_name)" ] }, @@ -200,7 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "pytorch_endpoint_arn = sm.describe_endpoint(EndpointName=pytorch_endpoint_name)['EndpointArn']\n", + "pytorch_endpoint_arn = sm.describe_endpoint(EndpointName=pytorch_endpoint_name)[\"EndpointArn\"]\n", "print(pytorch_endpoint_arn)" ] }, @@ -233,15 +243,12 @@ "source": [ "import json\n", "\n", - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " 
print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ -260,12 +267,14 @@ "import csv\n", "import pandas as pd\n", "\n", - "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_reviews = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "\n", - "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=50)\n", + "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n", "df_sample_reviews = df_sample_reviews.reset_index()\n", "df_sample_reviews.shape" ] @@ -278,14 +287,14 @@ "source": [ "import pandas as pd\n", "\n", + "\n", "def predict(review_body):\n", - " inputs = [\n", - " {\"features\": [review_body]}\n", - " ]\n", + " inputs = [{\"features\": [review_body]}]\n", " predicted_classes = predictor.predict(inputs)\n", - " return predicted_classes[0]['predicted_label']\n", - " \n", - "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n", + " return predicted_classes[0][\"predicted_label\"]\n", + "\n", + "\n", + "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n", "df_sample_reviews.head(5)" ] }, diff --git a/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb index 483b2f83..8f415cc4 100644 --- a/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb +++ b/09_deploy/03_Deploy_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb @@ -28,12 +28,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -53,11 +53,11 @@ "source": [ "try:\n", " training_job_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous TRAIN section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -162,7 +162,7 @@ "\n", "timestamp = int(time.time())\n", "\n", - "tensorflow_model_name = '{}-{}-{}'.format(training_job_name, 'tf', timestamp)\n", + "tensorflow_model_name = \"{}-{}-{}\".format(training_job_name, \"tf\", timestamp)\n", "\n", "print(tensorflow_model_name)" ] @@ -185,7 +185,7 @@ "outputs": [], "source": [ "# requires enough disk space for tensorflow, transformers, and bert downloads\n", - "instance_type = 'ml.m5.4xlarge' # evt " + "instance_type = \"ml.m5.4xlarge\" # evt" ] }, { @@ -196,12 +196,14 @@ "source": [ "from sagemaker.tensorflow.model import TensorFlowModel\n", "\n", - "tensorflow_model = TensorFlowModel(name=tensorflow_model_name,\n", - " source_dir='code',\n", - " entry_point='inference.py',\n", - " 
model_data='s3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name),\n", - " role=role,\n", - " framework_version='2.3.1')" + "tensorflow_model = TensorFlowModel(\n", + " name=tensorflow_model_name,\n", + " source_dir=\"code\",\n", + " entry_point=\"inference.py\",\n", + " model_data=\"s3://{}/{}/output/model.tar.gz\".format(bucket, training_job_name),\n", + " role=role,\n", + " framework_version=\"2.3.1\",\n", + ")" ] }, { @@ -210,7 +212,7 @@ "metadata": {}, "outputs": [], "source": [ - "tensorflow_endpoint_name = '{}-{}-{}'.format(training_job_name, 'tf', timestamp)\n", + "tensorflow_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"tf\", timestamp)\n", "\n", "print(tensorflow_endpoint_name)" ] @@ -223,10 +225,12 @@ }, "outputs": [], "source": [ - "tensorflow_model.deploy(endpoint_name=tensorflow_endpoint_name,\n", - " initial_instance_count=1, # Should use >=2 for high(er) availability \n", - " instance_type=instance_type,\n", - " wait=False)" + "tensorflow_model.deploy(\n", + " endpoint_name=tensorflow_endpoint_name,\n", + " initial_instance_count=1, # Should use >=2 for high(er) availability\n", + " instance_type=instance_type,\n", + " wait=False,\n", + ")" ] }, { @@ -239,7 +243,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, tensorflow_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -257,7 +267,7 @@ "source": [ "%%time\n", "\n", - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=tensorflow_endpoint_name)" ] }, @@ -274,7 +284,7 @@ "metadata": {}, "outputs": [], "source": [ - "tensorflow_endpoint_arn = sm.describe_endpoint(EndpointName=tensorflow_endpoint_name)['EndpointArn']\n", + "tensorflow_endpoint_arn = sm.describe_endpoint(EndpointName=tensorflow_endpoint_name)[\"EndpointArn\"]\n", "print(tensorflow_endpoint_arn)" ] }, @@ -315,15 +325,17 @@ "from sagemaker.tensorflow.model import TensorFlowPredictor\n", "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", - " \n", - "predictor = TensorFlowPredictor(endpoint_name=tensorflow_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer())" + "\n", + "predictor = TensorFlowPredictor(\n", + " endpoint_name=tensorflow_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -357,15 +369,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ 
-383,11 +392,13 @@ "source": [ "import csv\n", "\n", - "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", - "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=5)\n", + "df_reviews = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", + "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=5)\n", "df_sample_reviews = df_sample_reviews.reset_index()\n", "df_sample_reviews.shape" ] @@ -400,14 +411,14 @@ "source": [ "import pandas as pd\n", "\n", + "\n", "def predict(review_body):\n", - " inputs = [\n", - " {\"features\": [review_body]}\n", - " ]\n", + " inputs = [{\"features\": [review_body]}]\n", " predicted_classes = predictor.predict(inputs)\n", - " return predicted_classes[0]['predicted_label']\n", - " \n", - "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n", + " return predicted_classes[0][\"predicted_label\"]\n", + "\n", + "\n", + "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n", "df_sample_reviews.head(5)" ] }, diff --git a/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb index efd0e83a..0145e5a8 100644 --- a/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb +++ b/09_deploy/04_Autoscale_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb @@ -17,13 +17,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "autoscale = boto3.Session().client(service_name='application-autoscaling', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "autoscale = boto3.Session().client(service_name=\"application-autoscaling\", region_name=region)" ] }, { @@ -34,7 +34,7 @@ }, "outputs": [], "source": [ - "%store -r tensorflow_endpoint_name " + "%store -r tensorflow_endpoint_name" ] }, { @@ -45,11 +45,11 @@ "source": [ "try:\n", " tensorflow_endpoint_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the previous notebook before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the previous notebook before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -77,17 +77,17 @@ "outputs": [], "source": [ "autoscale.register_scalable_target(\n", - " ServiceNamespace='sagemaker',\n", + " ServiceNamespace=\"sagemaker\",\n", " ResourceId=\"endpoint/\" + tensorflow_endpoint_name + \"/variant/AllTraffic\",\n", - " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n", + " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n", " MinCapacity=1,\n", " MaxCapacity=2,\n", " RoleARN=role,\n", " SuspendedState={\n", - " 'DynamicScalingInSuspended': False,\n", - " 'DynamicScalingOutSuspended': False,\n", - " 'ScheduledScalingSuspended': False\n", 
- " }\n", + " \"DynamicScalingInSuspended\": False,\n", + " \"DynamicScalingOutSuspended\": False,\n", + " \"ScheduledScalingSuspended\": False,\n", + " },\n", ")" ] }, @@ -99,7 +99,7 @@ "source": [ "# check the target is available\n", "autoscale.describe_scalable_targets(\n", - " ServiceNamespace='sagemaker',\n", + " ServiceNamespace=\"sagemaker\",\n", " MaxResults=100,\n", ")" ] @@ -111,19 +111,19 @@ "outputs": [], "source": [ "autoscale.put_scaling_policy(\n", - " PolicyName='bert-reviews-autoscale-policy',\n", - " ServiceNamespace='sagemaker',\n", + " PolicyName=\"bert-reviews-autoscale-policy\",\n", + " ServiceNamespace=\"sagemaker\",\n", " ResourceId=\"endpoint/\" + tensorflow_endpoint_name + \"/variant/AllTraffic\",\n", - " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n", - " PolicyType='TargetTrackingScaling',\n", + " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n", + " PolicyType=\"TargetTrackingScaling\",\n", " TargetTrackingScalingPolicyConfiguration={\n", - " 'TargetValue': 2.0,\n", - " 'PredefinedMetricSpecification': {\n", - " 'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance',\n", + " \"TargetValue\": 2.0,\n", + " \"PredefinedMetricSpecification\": {\n", + " \"PredefinedMetricType\": \"SageMakerVariantInvocationsPerInstance\",\n", " },\n", - " 'ScaleOutCooldown': 60,\n", - " 'ScaleInCooldown': 300,\n", - " }\n", + " \"ScaleOutCooldown\": 60,\n", + " \"ScaleInCooldown\": 300,\n", + " },\n", ")" ] }, @@ -137,7 +137,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, tensorflow_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -148,7 +154,7 @@ "source": [ "%%time\n", "\n", - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=tensorflow_endpoint_name)" ] }, @@ -170,14 +176,16 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=tensorflow_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer()) " + "predictor = TensorFlowPredictor(\n", + " endpoint_name=tensorflow_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -213,7 +221,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, tensorflow_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, tensorflow_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -224,16 +238,13 @@ }, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "for i in range(0, 100000):\n", " 
predicted_classes = predictor.predict(inputs)\n", "\n", " for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { diff --git a/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb b/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb index 03f149e9..b6370504 100644 --- a/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb +++ b/09_deploy/05_Perform_AB_Test_Reviews_BERT_TensorFlow_REST_Endpoints.ipynb @@ -47,13 +47,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "cw = boto3.Session().client(service_name='cloudwatch', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "cw = boto3.Session().client(service_name=\"cloudwatch\", region_name=region)" ] }, { @@ -78,14 +78,12 @@ "metadata": {}, "outputs": [], "source": [ - "try: \n", + "try:\n", " autopilot_endpoint_name\n", - " sm.delete_endpoint(\n", - " EndpointName=autopilot_endpoint_name\n", - " )\n", - " print('Autopilot Endpoint has been deleted to save resources. This is good.') \n", + " sm.delete_endpoint(EndpointName=autopilot_endpoint_name)\n", + " print(\"Autopilot Endpoint has been deleted to save resources. This is good.\")\n", "except:\n", - " print('Endpoints are cleaned up. This is good. Keep moving forward!')" + " print(\"Endpoints are cleaned up. This is good. Keep moving forward!\")" ] }, { @@ -193,7 +191,7 @@ " version=\"2.3.1\",\n", " py_version=\"py37\",\n", " instance_type=\"ml.m5.4xlarge\",\n", - " image_scope=\"inference\"\n", + " image_scope=\"inference\",\n", ")\n", "print(inference_image_uri)" ] @@ -205,14 +203,14 @@ "outputs": [], "source": [ "import time\n", - "timestamp = '{}'.format(int(time.time()))\n", "\n", - "model_a_name = '{}-{}-{}'.format(training_job_name, 'varianta', timestamp)\n", + "timestamp = \"{}\".format(int(time.time()))\n", + "\n", + "model_a_name = \"{}-{}-{}\".format(training_job_name, \"varianta\", timestamp)\n", "\n", - "sess.create_model_from_job(name=model_a_name,\n", - " training_job_name=training_job_name,\n", - " role=role,\n", - " image_uri=inference_image_uri)" + "sess.create_model_from_job(\n", + " name=model_a_name, training_job_name=training_job_name, role=role, image_uri=inference_image_uri\n", + ")" ] }, { @@ -234,12 +232,11 @@ "metadata": {}, "outputs": [], "source": [ - "model_b_name = '{}-{}-{}'.format(training_job_name, 'variantb', timestamp)\n", + "model_b_name = \"{}-{}-{}\".format(training_job_name, \"variantb\", timestamp)\n", "\n", - "sess.create_model_from_job(name=model_b_name,\n", - " training_job_name=training_job_name,\n", - " role=role,\n", - " image_uri=inference_image_uri)" + "sess.create_model_from_job(\n", + " name=model_b_name, training_job_name=training_job_name, role=role, image_uri=inference_image_uri\n", + ")" ] }, { @@ -271,25 +268,28 @@ "source": [ "from sagemaker.session import production_variant\n", "\n", - "timestamp = '{}'.format(int(time.time()))\n", + "timestamp = \"{}\".format(int(time.time()))\n", "\n", - "endpoint_config_name = '{}-{}-{}'.format(training_job_name, 'abtest', timestamp)\n", + "endpoint_config_name = 
\"{}-{}-{}\".format(training_job_name, \"abtest\", timestamp)\n", "\n", - "variantA = production_variant(model_name=model_a_name,\n", - " instance_type='ml.m5.4xlarge',\n", - " initial_instance_count=1,\n", - " variant_name='VariantA',\n", - " initial_weight=50)\n", + "variantA = production_variant(\n", + " model_name=model_a_name,\n", + " instance_type=\"ml.m5.4xlarge\",\n", + " initial_instance_count=1,\n", + " variant_name=\"VariantA\",\n", + " initial_weight=50,\n", + ")\n", "\n", - "variantB = production_variant(model_name=model_b_name,\n", - " instance_type='ml.m5.4xlarge',\n", - " initial_instance_count=1,\n", - " variant_name='VariantB',\n", - " initial_weight=50)\n", + "variantB = production_variant(\n", + " model_name=model_b_name,\n", + " instance_type=\"ml.m5.4xlarge\",\n", + " initial_instance_count=1,\n", + " variant_name=\"VariantB\",\n", + " initial_weight=50,\n", + ")\n", "\n", "endpoint_config = sm.create_endpoint_config(\n", - " EndpointConfigName=endpoint_config_name,\n", - " ProductionVariants=[variantA, variantB]\n", + " EndpointConfigName=endpoint_config_name, ProductionVariants=[variantA, variantB]\n", ")" ] }, @@ -301,7 +301,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review REST Endpoint Configuration'.format(region, endpoint_config_name)))\n" + "display(\n", + " HTML(\n", + " 'Review REST Endpoint Configuration'.format(\n", + " region, endpoint_config_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -310,11 +316,9 @@ "metadata": {}, "outputs": [], "source": [ - "model_ab_endpoint_name = '{}-{}-{}'.format(training_job_name, 'abtest', timestamp)\n", + "model_ab_endpoint_name = \"{}-{}-{}\".format(training_job_name, \"abtest\", timestamp)\n", "\n", - "endpoint_response = sm.create_endpoint(\n", - " EndpointName=model_ab_endpoint_name,\n", - " EndpointConfigName=endpoint_config_name)" + "endpoint_response = sm.create_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=endpoint_config_name)" ] }, { @@ -384,7 +388,7 @@ "source": [ "from smexperiments.trial import Trial\n", "\n", - "timestamp = '{}'.format(int(time.time()))\n", + "timestamp = \"{}\".format(int(time.time()))\n", "\n", "trial = Trial.load(trial_name=trial_name)\n", "print(trial)" @@ -398,11 +402,10 @@ "source": [ "from smexperiments.tracker import Tracker\n", "\n", - "tracker_deploy = Tracker.create(display_name='deploy', \n", - " sagemaker_boto_client=sm)\n", + "tracker_deploy = Tracker.create(display_name=\"deploy\", sagemaker_boto_client=sm)\n", "\n", "deploy_trial_component_name = tracker_deploy.trial_component.trial_component_name\n", - "print('Deploy trial component name {}'.format(deploy_trial_component_name))" + "print(\"Deploy trial component name {}\".format(deploy_trial_component_name))" ] }, { @@ -434,9 +437,11 @@ "metadata": {}, "outputs": [], "source": [ - "tracker_deploy.log_parameters({\n", - " 'endpoint_name': model_ab_endpoint_name,\n", - "})\n", + "tracker_deploy.log_parameters(\n", + " {\n", + " \"endpoint_name\": model_ab_endpoint_name,\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "tracker_deploy.trial_component.save()" @@ -453,7 +458,7 @@ "lineage_table = ExperimentAnalytics(\n", " sagemaker_session=sess,\n", " experiment_name=experiment_name,\n", - " metric_names=['validation:accuracy'],\n", + " metric_names=[\"validation:accuracy\"],\n", " sort_by=\"CreationTime\",\n", " sort_order=\"Ascending\",\n", ")\n", @@ -479,7 +484,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - 
"display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review REST Endpoint'.format(\n", + " region, model_ab_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -495,7 +506,7 @@ "metadata": {}, "outputs": [], "source": [ - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=model_ab_endpoint_name)" ] }, @@ -523,14 +534,16 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=model_ab_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer()) " + "predictor = TensorFlowPredictor(\n", + " endpoint_name=model_ab_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -564,15 +577,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ -590,11 +600,13 @@ "source": [ "import csv\n", "\n", - "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", - "df_sample_reviews = df_reviews[['review_body', 'star_rating']].sample(n=50)\n", + "df_reviews = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", + "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n", "df_sample_reviews = df_sample_reviews.reset_index()\n", "df_sample_reviews.shape" ] @@ -607,14 +619,14 @@ "source": [ "import pandas as pd\n", "\n", + "\n", "def predict(review_body):\n", - " inputs = [\n", - " {\"features\": [review_body]}\n", - " ]\n", + " inputs = [{\"features\": [review_body]}]\n", " predicted_classes = predictor.predict(inputs)\n", - " return predicted_classes[0]['predicted_label']\n", - " \n", - "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n", + " return predicted_classes[0][\"predicted_label\"]\n", + "\n", + "\n", + "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n", "df_sample_reviews.head(5)" ] }, @@ -633,7 +645,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review REST Endpoint Performance Metrics'.format(region, model_ab_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review REST Endpoint Performance Metrics'.format(\n", + " region, model_ab_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -656,12 +674,10 @@ "import boto3\n", 
"import pandas as pd\n", "\n", - "def get_invocation_metrics_for_endpoint_variant(endpoint_name,\n", - " namespace_name,\n", - " metric_name,\n", - " variant_name,\n", - " start_time,\n", - " end_time):\n", + "\n", + "def get_invocation_metrics_for_endpoint_variant(\n", + " endpoint_name, namespace_name, metric_name, variant_name, start_time, end_time\n", + "):\n", " metrics = cw.get_metric_statistics(\n", " Namespace=namespace_name,\n", " MetricName=metric_name,\n", @@ -669,55 +685,48 @@ " EndTime=end_time,\n", " Period=60,\n", " Statistics=[\"Sum\"],\n", - " Dimensions=[\n", - " {\n", - " \"Name\": \"EndpointName\",\n", - " \"Value\": endpoint_name\n", - " },\n", - " {\n", - " \"Name\": \"VariantName\",\n", - " \"Value\": variant_name\n", - " }\n", - " ]\n", + " Dimensions=[{\"Name\": \"EndpointName\", \"Value\": endpoint_name}, {\"Name\": \"VariantName\", \"Value\": variant_name}],\n", " )\n", "\n", - " if metrics['Datapoints']:\n", - " return pd.DataFrame(metrics[\"Datapoints\"])\\\n", - " .sort_values(\"Timestamp\")\\\n", - " .set_index(\"Timestamp\")\\\n", - " .drop(\"Unit\", axis=1)\\\n", - " .rename(columns={\"Sum\": variant_name})\n", + " if metrics[\"Datapoints\"]:\n", + " return (\n", + " pd.DataFrame(metrics[\"Datapoints\"])\n", + " .sort_values(\"Timestamp\")\n", + " .set_index(\"Timestamp\")\n", + " .drop(\"Unit\", axis=1)\n", + " .rename(columns={\"Sum\": variant_name})\n", + " )\n", " else:\n", " return pd.DataFrame()\n", "\n", "\n", - "def plot_endpoint_metrics_for_variants(endpoint_name,\n", - " namespace_name,\n", - " metric_name,\n", - " start_time=None):\n", + "def plot_endpoint_metrics_for_variants(endpoint_name, namespace_name, metric_name, start_time=None):\n", " try:\n", " start_time = start_time or datetime.now() - timedelta(minutes=60)\n", " end_time = datetime.now()\n", "\n", - " metrics_variantA = get_invocation_metrics_for_endpoint_variant(endpoint_name=model_ab_endpoint_name, \n", - " namespace_name=namespace_name,\n", - " metric_name=metric_name,\n", - " variant_name=variantA[\"VariantName\"], \n", - " start_time=start_time, \n", - " end_time=end_time)\n", - "\n", - " metrics_variantB = get_invocation_metrics_for_endpoint_variant(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name=namespace_name,\n", - " metric_name=metric_name, \n", - " variant_name=variantB[\"VariantName\"], \n", - " start_time=start_time, \n", - " end_time=end_time)\n", + " metrics_variantA = get_invocation_metrics_for_endpoint_variant(\n", + " endpoint_name=model_ab_endpoint_name,\n", + " namespace_name=namespace_name,\n", + " metric_name=metric_name,\n", + " variant_name=variantA[\"VariantName\"],\n", + " start_time=start_time,\n", + " end_time=end_time,\n", + " )\n", + "\n", + " metrics_variantB = get_invocation_metrics_for_endpoint_variant(\n", + " endpoint_name=model_ab_endpoint_name,\n", + " namespace_name=namespace_name,\n", + " metric_name=metric_name,\n", + " variant_name=variantB[\"VariantName\"],\n", + " start_time=start_time,\n", + " end_time=end_time,\n", + " )\n", "\n", " metrics_variants = metrics_variantA.join(metrics_variantB, how=\"outer\")\n", " metrics_variants.plot()\n", " except:\n", - " pass\n", - " " + " pass" ] }, { @@ -737,13 +746,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(20)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='/aws/sagemaker/Endpoints',\n", - " 
metric_name='CPUUtilization')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n", + ")" ] }, { @@ -753,13 +763,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='Invocations')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n", + ")" ] }, { @@ -769,13 +780,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='InvocationsPerInstance')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n", + ")" ] }, { @@ -785,13 +797,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='ModelLatency')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n", + ")" ] }, { @@ -812,13 +825,13 @@ "source": [ "updated_endpoint_config = [\n", " {\n", - " 'VariantName': variantA['VariantName'],\n", - " 'DesiredWeight': 0,\n", + " \"VariantName\": variantA[\"VariantName\"],\n", + " \"DesiredWeight\": 0,\n", " },\n", " {\n", - " 'VariantName': variantB['VariantName'],\n", - " 'DesiredWeight': 100,\n", - " }\n", + " \"VariantName\": variantB[\"VariantName\"],\n", + " \"DesiredWeight\": 100,\n", + " },\n", "]" ] }, @@ -829,8 +842,7 @@ "outputs": [], "source": [ "sm.update_endpoint_weights_and_capacities(\n", - " EndpointName=model_ab_endpoint_name,\n", - " DesiredWeightsAndCapacities=updated_endpoint_config\n", + " EndpointName=model_ab_endpoint_name, DesiredWeightsAndCapacities=updated_endpoint_config\n", ")" ] }, @@ -844,7 +856,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review REST Endpoint'.format(\n", + " region, model_ab_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -861,7 +879,7 @@ "metadata": {}, "outputs": [], "source": [ - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=model_ab_endpoint_name)" ] }, @@ -878,7 +896,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n", + "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n", "df_sample_reviews.head(5)" ] }, @@ -899,13 +917,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(20)\n", - 
"plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='/aws/sagemaker/Endpoints',\n", - " metric_name='CPUUtilization')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n", + ")" ] }, { @@ -915,13 +934,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='Invocations')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n", + ")" ] }, { @@ -931,13 +951,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='InvocationsPerInstance')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n", + ")" ] }, { @@ -947,13 +968,14 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'\n", "\n", "time.sleep(5)\n", - "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n", - " namespace_name='AWS/SageMaker', \n", - " metric_name='ModelLatency')" + "plot_endpoint_metrics_for_variants(\n", + " endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n", + ")" ] }, { @@ -975,21 +997,23 @@ "outputs": [], "source": [ "import time\n", - "timestamp = '{}'.format(int(time.time()))\n", "\n", - "updated_endpoint_config_name = '{}-{}'.format(training_job_name, timestamp)\n", + "timestamp = \"{}\".format(int(time.time()))\n", + "\n", + "updated_endpoint_config_name = \"{}-{}\".format(training_job_name, timestamp)\n", "\n", "updated_endpoint_config = sm.create_endpoint_config(\n", " EndpointConfigName=updated_endpoint_config_name,\n", " ProductionVariants=[\n", " {\n", - " 'VariantName': variantB['VariantName'],\n", - " 'ModelName': model_b_name, # Only specify variant B to remove variant A\n", - " 'InstanceType':'ml.m5.4xlarge',\n", - " 'InitialInstanceCount': 1,\n", - " 'InitialVariantWeight': 100\n", + " \"VariantName\": variantB[\"VariantName\"],\n", + " \"ModelName\": model_b_name, # Only specify variant B to remove variant A\n", + " \"InstanceType\": \"ml.m5.4xlarge\",\n", + " \"InitialInstanceCount\": 1,\n", + " \"InitialVariantWeight\": 100,\n", " }\n", - " ])" + " ],\n", + ")" ] }, { @@ -1000,10 +1024,7 @@ }, "outputs": [], "source": [ - "sm.update_endpoint(\n", - " EndpointName=model_ab_endpoint_name,\n", - " EndpointConfigName=updated_endpoint_config_name\n", - ")" + "sm.update_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=updated_endpoint_config_name)" ] }, { @@ -1021,7 +1042,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review REST Endpoint'.format(region, model_ab_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review REST Endpoint'.format(\n", + " region, model_ab_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ 
-1040,7 +1067,7 @@
 },
 "outputs": [],
 "source": [
-    "waiter = sm.get_waiter('endpoint_in_service')\n",
+    "waiter = sm.get_waiter(\"endpoint_in_service\")\n",
     "waiter.wait(EndpointName=model_ab_endpoint_name)"
 ]
},
@@ -1057,7 +1084,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-    "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n",
+    "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n",
     "df_sample_reviews"
 ]
},
@@ -1078,13 +1105,14 @@
 "outputs": [],
 "source": [
     "import matplotlib.pyplot as plt\n",
+    "\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format='retina'\n",
     "\n",
     "time.sleep(20)\n",
-    "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
-    "                                   namespace_name='/aws/sagemaker/Endpoints',\n",
-    "                                   metric_name='CPUUtilization')"
+    "plot_endpoint_metrics_for_variants(\n",
+    "    endpoint_name=model_ab_endpoint_name, namespace_name=\"/aws/sagemaker/Endpoints\", metric_name=\"CPUUtilization\"\n",
+    ")"
 ]
},
{
@@ -1094,13 +1122,14 @@
 "outputs": [],
 "source": [
     "import matplotlib.pyplot as plt\n",
+    "\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format='retina'\n",
     "\n",
     "time.sleep(5)\n",
-    "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
-    "                                   namespace_name='AWS/SageMaker', \n",
-    "                                   metric_name='Invocations')"
+    "plot_endpoint_metrics_for_variants(\n",
+    "    endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"Invocations\"\n",
+    ")"
 ]
},
{
@@ -1110,13 +1139,14 @@
 "outputs": [],
 "source": [
     "import matplotlib.pyplot as plt\n",
+    "\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format='retina'\n",
     "\n",
     "time.sleep(5)\n",
-    "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
-    "                                   namespace_name='AWS/SageMaker', \n",
-    "                                   metric_name='InvocationsPerInstance')"
+    "plot_endpoint_metrics_for_variants(\n",
+    "    endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"InvocationsPerInstance\"\n",
+    ")"
 ]
},
{
@@ -1126,13 +1156,14 @@
 "outputs": [],
 "source": [
     "import matplotlib.pyplot as plt\n",
+    "\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format='retina'\n",
     "\n",
     "time.sleep(5)\n",
-    "plot_endpoint_metrics_for_variants(endpoint_name=model_ab_endpoint_name,\n",
-    "                                   namespace_name='AWS/SageMaker', \n",
-    "                                   metric_name='ModelLatency')"
+    "plot_endpoint_metrics_for_variants(\n",
+    "    endpoint_name=model_ab_endpoint_name, namespace_name=\"AWS/SageMaker\", metric_name=\"ModelLatency\"\n",
+    ")"
 ]
},
{
@@ -1160,9 +1191,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-    "sm.delete_endpoint(\n",
-    "    EndpointName=model_ab_endpoint_name\n",
-    ")"
+    "sm.delete_endpoint(EndpointName=model_ab_endpoint_name)"
 ]
},
{
diff --git a/09_deploy/code-pytorch/inference.py b/09_deploy/code-pytorch/inference.py
index 72075ffd..85ce8084 100644
--- a/09_deploy/code-pytorch/inference.py
+++ b/09_deploy/code-pytorch/inference.py
@@ -10,67 +10,69 @@ logger.addHandler(logging.StreamHandler(sys.stdout))

 ###################################
-### VARIABLES 
+### VARIABLES
 ###################################

 max_seq_length = 64
 classes = [1, 2, 3, 4, 5]

-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

 ###################################
-### SAGEMKAER LOAD MODEL FUNCTION
-###################################
+### SAGEMAKER LOAD MODEL FUNCTION
+###################################

-# You need to put in config.json from saved fine-tuned Hugging Face model in code/ 
+# You need to put in config.json from saved fine-tuned Hugging Face model in code/
 # Reference it in the inference container at /opt/ml/model/code
 # The model needs to be called 'model.pth' per https://github.com/aws/sagemaker-pytorch-inference-toolkit/blob/6936c08581e26ff3bac26824b1e4946ec68ffc85/src/sagemaker_pytorch_serving_container/torchserve.py#L45

+
 def model_fn(model_dir):
-    config = DistilBertConfig.from_json_file('/opt/ml/model/code/config.json')
-    
-    model_path = '{}/{}'.format(model_dir, 'model.pth')
+    config = DistilBertConfig.from_json_file("/opt/ml/model/code/config.json")
+
+    model_path = "{}/{}".format(model_dir, "model.pth")
     model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
-    
+
     return model


 ###################################
-### SAGEMKAER PREDICT FUNCTION
-###################################
+### SAGEMAKER PREDICT FUNCTION
+###################################
+

 def predict_fn(input_data, model):
     model.eval()

-    print('input_data: {}'.format(input_data))
-    print('type(input_data): {}'.format(type(input_data)))
-    
-    data_str = input_data.decode('utf-8')
-    print('data_str: {}'.format(data_str))
-    print('type data_str: {}'.format(type(data_str)))
-    
+    print("input_data: {}".format(input_data))
+    print("type(input_data): {}".format(type(input_data)))
+
+    data_str = input_data.decode("utf-8")
+    print("data_str: {}".format(data_str))
+    print("type data_str: {}".format(type(data_str)))
+
     jsonlines = data_str.split("\n")
-    print('jsonlines: {}'.format(jsonlines))
-    print('type jsonlines: {}'.format(type(jsonlines)))
+    print("jsonlines: {}".format(jsonlines))
+    print("type jsonlines: {}".format(type(jsonlines)))

     predicted_classes = []

     for jsonline in jsonlines:
-        print('jsonline: {}'.format(jsonline))
-        print('type jsonline: {}'.format(type(jsonline)))
+        print("jsonline: {}".format(jsonline))
+        print("type jsonline: {}".format(type(jsonline)))

         # features[0]: review_body
         # features[1..n]: is anything else (we can define the order ourselves)
-        # Example: 
-        #    {"features": ["The best gift ever", "Gift Cards"]} 
+        # Example:
+        #    {"features": ["The best gift ever", "Gift Cards"]}
         #
         review_body = json.loads(jsonline)["features"][0]
         print("""review_body: {}""".format(review_body))
-        
+
         encode_plus_token = tokenizer.encode_plus(
             review_body,
             max_length=max_seq_length,
@@ -78,49 +80,53 @@ def predict_fn(input_data, model):
             return_token_type_ids=False,
             pad_to_max_length=True,
             return_attention_mask=True,
-            return_tensors='pt',
-            truncation=True)
+            return_tensors="pt",
+            truncation=True,
+        )

-        input_ids = encode_plus_token['input_ids']
-        attention_mask = encode_plus_token['attention_mask']
+        input_ids = encode_plus_token["input_ids"]
+        attention_mask = encode_plus_token["attention_mask"]

         output = model(input_ids, attention_mask)
-        print('output: {}'.format(output))
+        print("output: {}".format(output))

-        # output is a tuple: 
+        # output is a tuple:
         # output: (tensor([[-1.9840, -0.9870, 2.8947]], grad_fn=),
-        # for torch.max() you need to pass in the tensor, output[0]  
+        # for torch.max() you need to pass in the tensor, output[0]
         _, prediction = torch.max(output[0], dim=1)

         predicted_class_idx = prediction.item()
         predicted_class = classes[predicted_class_idx]
-        print('predicted_class: {}'.format(predicted_class))
+        print("predicted_class: {}".format(predicted_class))

         prediction_dict = {}
-        prediction_dict['predicted_label'] = predicted_class
+        prediction_dict["predicted_label"] = predicted_class

         jsonline = json.dumps(prediction_dict)
-        print('jsonline: {}'.format(jsonline))
+        print("jsonline: {}".format(jsonline))

         predicted_classes.append(jsonline)
-        print('predicted_classes in the loop: {}'.format(predicted_classes))
+        print("predicted_classes in the loop: {}".format(predicted_classes))

-    predicted_classes_jsonlines = '\n'.join(predicted_classes)
-    print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+    predicted_classes_jsonlines = "\n".join(predicted_classes)
+    print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))

     return predicted_classes_jsonlines


 ###################################
-### SAGEMKAER MODEL INPUT FUNCTION
-###################################
+### SAGEMAKER MODEL INPUT FUNCTION
+###################################
+

-def input_fn(serialized_input_data, content_type='application/jsonlines'):
+def input_fn(serialized_input_data, content_type="application/jsonlines"):
     return serialized_input_data

+
+###################################
+### SAGEMAKER MODEL OUTPUT FUNCTION
 ###################################
-### SAGEMKAER MODEL OUTPUT FUNCTION
-###################################

-def output_fn(prediction_output, accept='application/jsonlines'):
+
+def output_fn(prediction_output, accept="application/jsonlines"):
     return prediction_output, accept
diff --git a/09_deploy/code/inference.py b/09_deploy/code/inference.py
index 2975dc2d..53196737 100644
--- a/09_deploy/code/inference.py
+++ b/09_deploy/code/inference.py
@@ -1,102 +1,97 @@
 import json
 import subprocess
 import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
 # Workaround for https://github.com/huggingface/tokenizers/issues/120 and
 # https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])

 import tensorflow as tf
 from transformers import DistilBertTokenizer

-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64

-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

 def input_handler(data, context):
-    data_str = data.read().decode('utf-8')
-    print('data_str: {}'.format(data_str))
-    print('type data_str: {}'.format(type(data_str)))
-    
+    data_str = data.read().decode("utf-8")
+    print("data_str: {}".format(data_str))
+    print("type data_str: {}".format(type(data_str)))
+
     jsonlines = data_str.split("\n")
-    print('jsonlines: {}'.format(jsonlines))
-    print('type jsonlines: {}'.format(type(jsonlines)))
-    
+    print("jsonlines: {}".format(jsonlines))
+    print("type jsonlines: {}".format(type(jsonlines)))
+
     transformed_instances = []
-    
+
     for jsonline in jsonlines:
-        print('jsonline: {}'.format(jsonline))
-        print('type jsonline: {}'.format(type(jsonline)))
+        print("jsonline: {}".format(jsonline))
+        print("type jsonline: {}".format(type(jsonline)))

        # 
features[0] is review_body # features[1..n] are others (ie. 1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type diff --git 
a/09_deploy/common/docker_utils.py b/09_deploy/common/docker_utils.py index 5c34c7b0..54870328 100644 --- a/09_deploy/common/docker_utils.py +++ b/09_deploy/common/docker_utils.py @@ -29,7 +29,7 @@ IMAGE_TEMPLATE = "{account}.dkr.ecr.{region}.amazonaws.com/{image_name}:{version}" -def build_and_push_docker_image(repository_name, dockerfile='Dockerfile', build_args={}): +def build_and_push_docker_image(repository_name, dockerfile="Dockerfile", build_args={}): """Builds a docker image from the specified dockerfile, and pushes it to ECR. Handles things like ECR login, creating the repository. @@ -42,15 +42,15 @@ def build_and_push_docker_image(repository_name, dockerfile='Dockerfile', build_ return ecr_tag -def _build_from_dockerfile(repository_name, dockerfile='Dockerfile', build_args={}): - build_cmd = ['docker', 'build', '-t', repository_name, '-f', dockerfile, '.'] - for k,v in build_args.items(): - build_cmd += ['--build-arg', '%s=%s' % (k,v)] +def _build_from_dockerfile(repository_name, dockerfile="Dockerfile", build_args={}): + build_cmd = ["docker", "build", "-t", repository_name, "-f", dockerfile, "."] + for k, v in build_args.items(): + build_cmd += ["--build-arg", "%s=%s" % (k, v)] print("Building docker image %s from %s" % (repository_name, dockerfile)) _execute(build_cmd) print("Done building docker image %s" % repository_name) - + def _find_base_image_in_dockerfile(dockerfile): dockerfile_lines = open(dockerfile).readlines() @@ -72,14 +72,14 @@ def push(tag, aws_account=None, aws_region=None): (string): ECR repo image that was pushed """ session = boto3.Session() - aws_account = aws_account or session.client("sts").get_caller_identity()['Account'] + aws_account = aws_account or session.client("sts").get_caller_identity()["Account"] aws_region = aws_region or session.region_name try: - repository_name, version = tag.split(':') + repository_name, version = tag.split(":") except ValueError: # split failed because no : repository_name = tag version = "latest" - ecr_client = session.client('ecr', region_name=aws_region) + ecr_client = session.client("ecr", region_name=aws_region) _create_ecr_repo(ecr_client, repository_name) _ecr_login(ecr_client, aws_account) @@ -89,11 +89,11 @@ def push(tag, aws_account=None, aws_region=None): def _push(aws_account, aws_region, tag): - ecr_repo = '%s.dkr.ecr.%s.amazonaws.com' % (aws_account, aws_region) - ecr_tag = '%s/%s' % (ecr_repo, tag) - _execute(['docker', 'tag', tag, ecr_tag]) + ecr_repo = "%s.dkr.ecr.%s.amazonaws.com" % (aws_account, aws_region) + ecr_tag = "%s/%s" % (ecr_repo, tag) + _execute(["docker", "tag", tag, ecr_tag]) print("Pushing docker image to ECR repository %s/%s\n" % (ecr_repo, tag)) - _execute(['docker', 'push', ecr_tag]) + _execute(["docker", "push", ecr_tag]) print("Done pushing %s" % ecr_tag) return ecr_tag @@ -111,34 +111,34 @@ def _create_ecr_repo(ecr_client, repository_name): def _ecr_login(ecr_client, aws_account): auth = ecr_client.get_authorization_token(registryIds=[aws_account]) - authorization_data = auth['authorizationData'][0] + authorization_data = auth["authorizationData"][0] - raw_token = base64.b64decode(authorization_data['authorizationToken']) - token = raw_token.decode('utf-8').strip('AWS:') - ecr_url = auth['authorizationData'][0]['proxyEndpoint'] + raw_token = base64.b64decode(authorization_data["authorizationToken"]) + token = raw_token.decode("utf-8").strip("AWS:") + ecr_url = auth["authorizationData"][0]["proxyEndpoint"] - cmd = ['docker', 'login', '-u', 'AWS', '-p', token, ecr_url] + cmd = 
["docker", "login", "-u", "AWS", "-p", token, ecr_url] _execute(cmd, quiet=True) print("Logged into ECR") def _ecr_login_if_needed(image): - ecr_client = boto3.client('ecr') + ecr_client = boto3.client("ecr") # Only ECR images need login - if not ('dkr.ecr' in image and 'amazonaws.com' in image): + if not ("dkr.ecr" in image and "amazonaws.com" in image): return # do we have the image? - if _check_output('docker images -q %s' % image).strip(): + if _check_output("docker images -q %s" % image).strip(): return - aws_account = image.split('.')[0] + aws_account = image.split(".")[0] _ecr_login(ecr_client, aws_account) @contextlib.contextmanager -def _tmpdir(suffix='', prefix='tmp', dir=None): # type: (str, str, str) -> None +def _tmpdir(suffix="", prefix="tmp", dir=None): # type: (str, str, str) -> None """Create a temporary directory with a context manager. The file is deleted when the context exits. The prefix, suffix, and dir arguments are the same as for mkstemp(). @@ -160,10 +160,8 @@ def _tmpdir(suffix='', prefix='tmp', dir=None): # type: (str, str, str) -> None def _execute(command, quiet=False): if not quiet: - print("$ %s" % ' '.join(command)) - process = subprocess.Popen(command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + print("$ %s" % " ".join(command)) + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) try: _stream_output(process) except RuntimeError as e: diff --git a/09_deploy/common/env_utils.py b/09_deploy/common/env_utils.py index 516af621..6d512731 100644 --- a/09_deploy/common/env_utils.py +++ b/09_deploy/common/env_utils.py @@ -6,10 +6,12 @@ gym.logger.set_level(40) -class VectoredGymEnvironment(): + +class VectoredGymEnvironment: """ Envrioment class to run multiple similations and collect rollout data """ + def __init__(self, registered_gym_env, num_of_envs=1): self.envs_initialized = False self.initialized_envs = {} @@ -19,14 +21,11 @@ def __init__(self, registered_gym_env, num_of_envs=1): self.data_rows = [] self.initialize_envs(num_of_envs, registered_gym_env) - + def is_initialized(self): return self.envs_initialized - - def initialize_envs( - self, - num_of_envs, - registered_gym_env): + + def initialize_envs(self, num_of_envs, registered_gym_env): """Initialize multiple Openai gym environments. Each envrionment will start with a different random seed. 
@@ -51,7 +50,7 @@ def get_environment_states(self):

     def dump_environment_states(self, dir_path, file_name):
         """Dumping current states of all the environments into file
-        
+
         Arguments:
             dir_path {str} -- Directory path of the target file
             file_name {str} -- File name of the target file
@@ -59,43 +58,42 @@
         data_folder = Path(dir_path)
         file_path = data_folder / file_name

-        with open(file_path, 'w') as outfile:
+        with open(file_path, "w") as outfile:
             for state in self.env_states.values():
                 json.dump(list(state), outfile)
-                outfile.write('\n')
+                outfile.write("\n")

     def get_environment_ids(self):
         return list(self.initialized_envs.keys())
-    
+
     def step(self, environment_id, action):
         local_env = self.initialized_envs[environment_id]
         observation, reward, done, info = local_env.step(action)
         self.env_states[environment_id] = observation
         return observation, reward, done, info
-    
+
     def reset(self, environment_id):
-        self.env_states[environment_id] = \
-            self.initialized_envs[environment_id].reset()
+        self.env_states[environment_id] = self.initialized_envs[environment_id].reset()
         return self.env_states[environment_id]

     def reset_all_envs(self):
         print("Resetting all the environments...")
-        for i in range(0, self.num_of_envs): 
+        for i in range(0, self.num_of_envs):
             environment_id = "environment_" + str(i)
             self.reset(environment_id)
-    
+
     def close(self, environment_id):
         self.initialized_envs[environment_id].close()
         return
-    
+
     def render(self, environment_id):
         self.initialized_envs[environment_id].render()
         return

     def collect_rollouts_for_single_env_with_given_episodes(self, environment_id, action_prob, num_episodes):
         """Collect rollouts with given episodes from one environment
-        
+
         Arguments:
             environment_id {str} -- Environment id for the environment
             action_prob {list} -- Action probabilities of the simulated policy
@@ -116,8 +114,9 @@
             cur_state_features = self.env_states[environment_id]
             _, reward, done, _ = self.step(environment_id, action)
             cumulative_rewards += reward
-            episode_id = int(environment_id.split('_')[-1]) + \
-                self.num_of_envs * self.env_reset_counter[environment_id]
+            episode_id = (
+                int(environment_id.split("_")[-1]) + self.num_of_envs * self.env_reset_counter[environment_id]
+            )
             if not done:
                 data_item.extend([action, action_prob, episode_id, reward, 0.0])
             else:
@@ -131,7 +130,7 @@

     def collect_rollouts_for_single_env_with_given_steps(self, environment_id, action_prob, num_steps):
         """Collect rollouts with given steps from one environment
-        
+
         Arguments:
             environment_id {str} -- Environment id for the environment
             action_prob {list} -- Action probabilities of the simulated policy
@@ -148,8 +147,7 @@
             action = np.random.choice(len(action_prob), p=action_prob)
             cur_state_features = self.env_states[environment_id]
             _, reward, done, _ = self.step(environment_id, action)
-            episode_id = int(environment_id.split('_')[-1]) + \
-                self.num_of_envs * self.env_reset_counter[environment_id]
+            episode_id = int(environment_id.split("_")[-1]) + self.num_of_envs * self.env_reset_counter[environment_id]
             data_item.extend([action, action_prob, episode_id, reward])
             for j in range(len(cur_state_features)):
                 data_item.append(cur_state_features[j])
@@ -158,25 +156,27 @@ def collect_rollouts_for_single_env_with_given_steps(self, 
environment_id, actio self.reset(environment_id) self.env_reset_counter[environment_id] += 1 - def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes=None, action_probs=None, file_name=None): + def collect_rollouts_with_given_action_probs( + self, num_steps=None, num_episodes=None, action_probs=None, file_name=None + ): """Collect rollouts from all the initiated environments with given action probs - + Keyword Arguments: num_steps {int} -- Number of steps to run rollouts (default: {None}) num_episodes {int} -- Number of episodes to run rollouts (default: {None}) action_probs {list} -- Action probs for the policy (default: {None}) file_name {str} -- Batch transform output that contain predictions of probs (default: {None}) - + Returns: [Dataframe] -- Dataframe that contains the rollout data from all envs """ if file_name is not None: assert action_probs is None - json_lines = [json.loads(line.rstrip('\n')) for line in open(file_name) if line is not ''] + json_lines = [json.loads(line.rstrip("\n")) for line in open(file_name) if line is not ""] action_probs = [] for line in json_lines: - if line.get('SageMakerOutput') is not None: - action_probs.append(line['SageMakerOutput'].get("predictions")[0]) + if line.get("SageMakerOutput") is not None: + action_probs.append(line["SageMakerOutput"].get("predictions")[0]) else: action_probs.append(line.get("predictions")[0]) @@ -184,9 +184,7 @@ def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes= for index, environment_id in enumerate(self.get_environment_ids()): if num_steps is not None: assert num_episodes is None - self.collect_rollouts_for_single_env_with_given_steps( - environment_id, action_probs[index], num_steps - ) + self.collect_rollouts_for_single_env_with_given_steps(environment_id, action_probs[index], num_steps) else: assert num_episodes is not None self.collect_rollouts_for_single_env_with_given_episodes( @@ -194,18 +192,18 @@ def collect_rollouts_with_given_action_probs(self, num_steps=None, num_episodes= ) col_names = self._create_col_names() - df = pd.DataFrame(self.data_rows, columns = col_names) + df = pd.DataFrame(self.data_rows, columns=col_names) return df def _create_col_names(self): """Create column names of dataframe that can be consumed by Coach - + Returns: [list] -- List of column names """ - col_names = ['action', 'all_action_probabilities', 'episode_id', 'reward', 'cumulative_rewards'] + col_names = ["action", "all_action_probabilities", "episode_id", "reward", "cumulative_rewards"] for i in range(self.state_dims): - col_names.append('state_feature_' + str(i)) + col_names.append("state_feature_" + str(i)) - return col_names \ No newline at end of file + return col_names diff --git a/09_deploy/common/markdown_helper.py b/09_deploy/common/markdown_helper.py index f545cffd..66d67260 100644 --- a/09_deploy/common/markdown_helper.py +++ b/09_deploy/common/markdown_helper.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. + def generate_s3_write_permission_for_sagemaker_role(role): role_name = role.split("/")[-1] url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name @@ -19,6 +20,7 @@ def generate_s3_write_permission_for_sagemaker_role(role): text += "3. 
Search and select `AmazonKinesisVideoStreamsFullAccess` policy\n" return text + def generate_kinesis_create_permission_for_sagemaker_role(role): role_name = role.split("/")[-1] url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name @@ -27,6 +29,7 @@ def generate_kinesis_create_permission_for_sagemaker_role(role): text += "3. Search and select `AmazonS3FullAccess` policy\n" return text + def generate_help_for_s3_endpoint_permissions(role): role_name = role.split("/")[-1] url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name @@ -138,12 +141,14 @@ def generate_robomaker_links(job_arns, aws_region): simulation_ids = [job_arn.split("/")[-1] for job_arn in job_arns] robomaker_links = [] for simulation_id in simulation_ids: - robomaker_link = "https://%s.console.aws.amazon.com/robomaker/home?region=%s#simulationJobs/%s" % (aws_region, - aws_region, - simulation_id) + robomaker_link = "https://%s.console.aws.amazon.com/robomaker/home?region=%s#simulationJobs/%s" % ( + aws_region, + aws_region, + simulation_id, + ) robomaker_links.append(robomaker_link) - markdown_content = '> Click on the following links for visualization of simulation jobs on RoboMaker Console\n' + markdown_content = "> Click on the following links for visualization of simulation jobs on RoboMaker Console\n" for i in range(len(robomaker_links)): markdown_content += "- [Simulation %s](%s) \n" % (i + 1, robomaker_links[i]) @@ -152,12 +157,16 @@ def generate_robomaker_links(job_arns, aws_region): def create_s3_endpoint_manually(aws_region, default_vpc): - url = "https://%s.console.aws.amazon.com/vpc/home?region=%s#Endpoints:sort=vpcEndpointId" % (aws_region, aws_region) + url = "https://%s.console.aws.amazon.com/vpc/home?region=%s#Endpoints:sort=vpcEndpointId" % ( + aws_region, + aws_region, + ) text = ">VPC S3 endpoint creation failed. Please do the following to create an endpoint manually:\n" text += "1. Go to [VPC console | Endpoints](%s)\n" % url text += "2. Click on `Create Endpoint`. Select Service Name as `com.amazonaws.%s.s3`.\n" % (aws_region) text += "3. Next, select your Default VPC: `%s` and click the checkbox against the main Route Table ID\n" % ( - default_vpc) + default_vpc + ) text += "4. Select `Full Access` in policy and click on `Create Endpoint`\n" text += "5. That should be it! Now wait for a few seconds before proceeding to the next cell." return text @@ -174,6 +183,7 @@ def generate_help_for_administrator_policy(role): text += "6. Once this is complete, you are all set." return text + def generate_help_for_experiment_manager_permissions(role): role_name = role.split("/")[-1] url = "https://console.aws.amazon.com/iam/home#/roles/%s" % role_name @@ -222,4 +232,3 @@ def generate_help_for_experiment_manager_permissions(role): },```\n""" text += "4. Now wait for a few minutes before executing this cell again!" return text - diff --git a/09_deploy/common/misc.py b/09_deploy/common/misc.py index 45ad3c78..e51551bd 100644 --- a/09_deploy/common/misc.py +++ b/09_deploy/common/misc.py @@ -26,10 +26,19 @@ import boto3 import json - -def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='', - aws_account=None, aws_region=None, timeout=1200, limit=20, - fetch_only=None, training_job_name=None): + +def wait_for_s3_object( + s3_bucket, + key, + local_dir, + local_prefix="", + aws_account=None, + aws_region=None, + timeout=1200, + limit=20, + fetch_only=None, + training_job_name=None, +): """ Keep polling s3 object until it is generated. 
Pulling down latest data to local directory with short key @@ -50,15 +59,15 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='', A list of all downloaded files, as local filenames """ session = boto3.Session() - aws_account = aws_account or session.client("sts").get_caller_identity()['Account'] + aws_account = aws_account or session.client("sts").get_caller_identity()["Account"] aws_region = aws_region or session.region_name - s3 = session.resource('s3') - sagemaker = session.client('sagemaker') + s3 = session.resource("s3") + sagemaker = session.client("sagemaker") bucket = s3.Bucket(s3_bucket) objects = [] - print("Waiting for s3://%s/%s..." % (s3_bucket, key), end='', flush=True) + print("Waiting for s3://%s/%s..." % (s3_bucket, key), end="", flush=True) start_time = time.time() cnt = 0 while len(objects) == 0: @@ -67,7 +76,7 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='', objects = list(filter(fetch_only, objects)) if objects: continue - print('.', end='', flush=True) + print(".", end="", flush=True) time.sleep(5) cnt += 1 if cnt % 80 == 0: @@ -75,12 +84,17 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='', if time.time() > start_time + timeout: raise FileNotFoundError("S3 object s3://%s/%s never appeared after %d seconds" % (s3_bucket, key, timeout)) if training_job_name: - training_job_status = sagemaker.describe_training_job(TrainingJobName=training_job_name)['TrainingJobStatus'] - if training_job_status == 'Failed': - raise RuntimeError("Training job {} failed while waiting for S3 object s3://{}/{}" - .format(training_job_name, s3_bucket, key)) - - print('\n', end='', flush=True) + training_job_status = sagemaker.describe_training_job(TrainingJobName=training_job_name)[ + "TrainingJobStatus" + ] + if training_job_status == "Failed": + raise RuntimeError( + "Training job {} failed while waiting for S3 object s3://{}/{}".format( + training_job_name, s3_bucket, key + ) + ) + + print("\n", end="", flush=True) if len(objects) > limit: print("Only downloading %d of %d files" % (limit, len(objects))) @@ -89,7 +103,7 @@ def wait_for_s3_object(s3_bucket, key, local_dir, local_prefix='', fetched_files = [] for obj in objects: print("Downloading %s" % obj.key) - local_path = os.path.join(local_dir, local_prefix, obj.key.split('/')[-1]) + local_path = os.path.join(local_dir, local_prefix, obj.key.split("/")[-1]) obj.Object().download_file(local_path) fetched_files.append(local_path) @@ -106,38 +120,30 @@ def get_execution_role(role_name="sagemaker", aws_account=None, aws_region=None) aws_region (string): aws region where the repo is located """ session = boto3.Session() - aws_account = aws_account or session.client("sts").get_caller_identity()['Account'] + aws_account = aws_account or session.client("sts").get_caller_identity()["Account"] aws_region = aws_region or session.region_name - assume_role_policy_document = json.dumps({ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": ["sagemaker.amazonaws.com", "robomaker.amazonaws.com"] - }, - "Action": "sts:AssumeRole" - } - ] - }) - - client = session.client('iam') + assume_role_policy_document = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["sagemaker.amazonaws.com", "robomaker.amazonaws.com"]}, + "Action": "sts:AssumeRole", + } + ], + } + ) + + client = session.client("iam") try: client.get_role(RoleName=role_name) except client.exceptions.NoSuchEntityException: - 
client.create_role( - RoleName=role_name, - AssumeRolePolicyDocument=str(assume_role_policy_document) - ) + client.create_role(RoleName=role_name, AssumeRolePolicyDocument=str(assume_role_policy_document)) print("Created new sagemaker execution role: %s" % role_name) - client.attach_role_policy( - PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess', - RoleName=role_name - ) - - return client.get_role(RoleName=role_name)['Role']['Arn'] - + client.attach_role_policy(PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", RoleName=role_name) + return client.get_role(RoleName=role_name)["Role"]["Arn"] diff --git a/09_deploy/common/sagemaker_rl/coach_launcher.py b/09_deploy/common/sagemaker_rl/coach_launcher.py index 4eff7a5f..4708e8d3 100644 --- a/09_deploy/common/sagemaker_rl/coach_launcher.py +++ b/09_deploy/common/sagemaker_rl/coach_launcher.py @@ -5,7 +5,7 @@ from rl_coach.base_parameters import VisualizationParameters, TaskParameters, Frameworks from rl_coach.utils import short_dynamic_import from rl_coach.core_types import SelectedPhaseOnlyDumpFilter, MaxDumpFilter, RunPhase -import rl_coach.core_types +import rl_coach.core_types from rl_coach import logger from rl_coach.logger import screen import argparse @@ -22,22 +22,22 @@ screen.set_use_colors(False) # Simple text logging so it looks good in CloudWatch + class CoachConfigurationList(ConfigurationList): - """Helper Object for converting CLI arguments (or SageMaker hyperparameters) + """Helper Object for converting CLI arguments (or SageMaker hyperparameters) into Coach configuration. """ # Being security-paranoid and not instantiating any arbitrary string the customer passes in ALLOWED_TYPES = { - 'Frames': rl_coach.core_types.Frames, - 'EnvironmentSteps': rl_coach.core_types.EnvironmentSteps, - 'EnvironmentEpisodes': rl_coach.core_types.EnvironmentEpisodes, - 'TrainingSteps': rl_coach.core_types.TrainingSteps, - 'Time': rl_coach.core_types.Time, + "Frames": rl_coach.core_types.Frames, + "EnvironmentSteps": rl_coach.core_types.EnvironmentSteps, + "EnvironmentEpisodes": rl_coach.core_types.EnvironmentEpisodes, + "TrainingSteps": rl_coach.core_types.TrainingSteps, + "Time": rl_coach.core_types.Time, } - class SageMakerCoachPresetLauncher(CoachLauncher): """Base class for training RL tasks using RL-Coach. Customers subclass this to define specific kinds of workloads, overriding these methods as needed. @@ -47,7 +47,6 @@ def __init__(self): super().__init__() self.hyperparams = None - def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace: """Overrides the default CLI parsing. Sets the configuration parameters for what a SageMaker run should do. @@ -58,20 +57,20 @@ def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace args, _ = parser.parse_known_args(args=empty_arg_list) parser = self.sagemaker_argparser() sage_args, unknown = parser.parse_known_args() - + # Now fill in the args that we care about. 
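        # For example (hypothetical values, not taken from this launcher): a
        # SageMaker estimator hyperparameter dict such as
        #     {"rl.agent_params.algorithm.discount": 0.9, "improve_steps": 10000}
        # reaches the training container as unknown command-line arguments
        #     --rl.agent_params.algorithm.discount 0.9 --improve_steps 10000
        # and each name/value pair is routed through map_hyperparameter() below.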
sagemaker_job_name = os.environ.get("sagemaker_job_name", "sagemaker-experiment") args.experiment_name = logger.get_experiment_name(sagemaker_job_name) - + # Override experiment_path used for outputs - args.experiment_path = '/opt/ml/output/intermediate' - rl_coach.logger.experiment_path = '/opt/ml/output/intermediate' # for gifs + args.experiment_path = "/opt/ml/output/intermediate" + rl_coach.logger.experiment_path = "/opt/ml/output/intermediate" # for gifs - args.checkpoint_save_dir = '/opt/ml/output/data/checkpoint' - args.checkpoint_save_secs = 10 # should avoid hardcoding + args.checkpoint_save_dir = "/opt/ml/output/data/checkpoint" + args.checkpoint_save_secs = 10 # should avoid hardcoding # onnx for deployment for mxnet (not tensorflow) - save_model = (sage_args.save_model == 1) - backend = os.getenv('COACH_BACKEND', 'tensorflow') + save_model = sage_args.save_model == 1 + backend = os.getenv("COACH_BACKEND", "tensorflow") if save_model and backend == "mxnet": args.export_onnx_graph = True @@ -92,7 +91,7 @@ def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace name = name[2:] else: raise ValueError("Unknown command-line argument %s" % name) - val = unknown[i+1] + val = unknown[i + 1] self.map_hyperparameter(name, val) return args @@ -107,29 +106,29 @@ def map_hyperparameter(self, name, value): else: raise ValueError("Unknown hyperparameter %s" % name) - def apply_hyperparameter(self, name, value): """Save this hyperparameter to be applied to the graph_manager object when it's ready. """ - print("Applying RL hyperparameter %s=%s" % (name,value)) + print("Applying RL hyperparameter %s=%s" % (name, value)) self.hyperparameters.store(name, value) - def default_preset_name(self): """ Sub-classes will typically return a single hard-coded string. """ try: - #TODO: remove this after converting all samples. + # TODO: remove this after converting all samples. default_preset = self.DEFAULT_PRESET screen.warning("Deprecated configuration of default preset. Please implement default_preset_name()") return default_preset except: pass - raise NotImplementedError("Sub-classes must specify the name of the default preset "+ - "for this RL problem. This will be the name of a python "+ - "file (without .py) that defines a graph_manager variable") + raise NotImplementedError( + "Sub-classes must specify the name of the default preset " + + "for this RL problem. This will be the name of a python " + + "file (without .py) that defines a graph_manager variable" + ) def sagemaker_argparser(self) -> argparse.ArgumentParser: """ @@ -138,27 +137,32 @@ def sagemaker_argparser(self) -> argparse.ArgumentParser: parser = argparse.ArgumentParser() # Arguably this would be cleaner if we copied the config from the base class argparser. - parser.add_argument('-n', '--num_workers', - help="(int) Number of workers for multi-process based agents, e.g. A3C", - default=1, - type=int) - parser.add_argument('-p', '--RLCOACH_PRESET', - help="(string) Name of the file with the RLCoach preset", - default=self.default_preset_name(), - type=str) - parser.add_argument('--save_model', - help="(int) Flag to save model artifact after training finish", - default=0, - type=int) + parser.add_argument( + "-n", + "--num_workers", + help="(int) Number of workers for multi-process based agents, e.g. 
A3C", + default=1, + type=int, + ) + parser.add_argument( + "-p", + "--RLCOACH_PRESET", + help="(string) Name of the file with the RLCoach preset", + default=self.default_preset_name(), + type=str, + ) + parser.add_argument( + "--save_model", help="(int) Flag to save model artifact after training finish", default=0, type=int + ) return parser def path_of_main_launcher(self): """ A bit of python magic to find the path of the file that launched the current process. """ - main_mod = sys.modules['__main__'] + main_mod = sys.modules["__main__"] try: - launcher_file = os.path.abspath(sys.modules['__main__'].__file__) + launcher_file = os.path.abspath(sys.modules["__main__"].__file__) return os.path.dirname(launcher_file) except AttributeError: # If __main__.__file__ is missing, then we're probably in an interactive python shell @@ -167,7 +171,7 @@ def path_of_main_launcher(self): def preset_from_name(self, preset_name): preset_path = self.path_of_main_launcher() print("Loading preset %s from %s" % (preset_name, preset_path)) - preset_path = os.path.join(self.path_of_main_launcher(),preset_name) + '.py:graph_manager' + preset_path = os.path.join(self.path_of_main_launcher(), preset_name) + ".py:graph_manager" graph_manager = short_dynamic_import(preset_path, ignore_module_case=True) return graph_manager @@ -178,56 +182,63 @@ def get_graph_manager_from_args(self, args): self.hyperparameters.apply_subset(graph_manager, "rl.") # Set framework # Note: Some graph managers (e.g. HAC preset) create multiple agents and the attribute is called agents_params - if hasattr(graph_manager, 'agent_params'): + if hasattr(graph_manager, "agent_params"): for network_parameters in graph_manager.agent_params.network_wrappers.values(): network_parameters.framework = args.framework - elif hasattr(graph_manager, 'agents_params'): + elif hasattr(graph_manager, "agents_params"): for ap in graph_manager.agents_params: for network_parameters in ap.network_wrappers.values(): network_parameters.framework = args.framework return graph_manager def _save_tf_model(self): - ckpt_dir = '/opt/ml/output/data/checkpoint' - model_dir = '/opt/ml/model' + ckpt_dir = "/opt/ml/output/data/checkpoint" + model_dir = "/opt/ml/model" # Re-Initialize from the checkpoint so that you will have the latest models up. - tf.train.init_from_checkpoint(ckpt_dir, - {'main_level/agent/online/network_0/': 'main_level/agent/online/network_0'}) - tf.train.init_from_checkpoint(ckpt_dir, - {'main_level/agent/online/network_1/': 'main_level/agent/online/network_1'}) + tf.train.init_from_checkpoint( + ckpt_dir, {"main_level/agent/online/network_0/": "main_level/agent/online/network_0"} + ) + tf.train.init_from_checkpoint( + ckpt_dir, {"main_level/agent/online/network_1/": "main_level/agent/online/network_1"} + ) # Create a new session with a new tf graph. sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run(tf.global_variables_initializer()) # initialize the checkpoint. # This is the node that will accept the input. - input_nodes = tf.get_default_graph().get_tensor_by_name('main_level/agent/main/online/' + \ - 'network_0/observation/observation:0') + input_nodes = tf.get_default_graph().get_tensor_by_name( + "main_level/agent/main/online/" + "network_0/observation/observation:0" + ) # This is the node that will produce the output. 
-        output_nodes = tf.get_default_graph().get_operation_by_name('main_level/agent/main/online/' + \
-                                                                    'network_1/ppo_head_0/policy')
+        output_nodes = tf.get_default_graph().get_operation_by_name(
+            "main_level/agent/main/online/" + "network_1/ppo_head_0/policy"
+        )

         # Save the model as a servable model.
-        tf.saved_model.simple_save(session=sess,
-                                   export_dir='model',
-                                   inputs={"observation": input_nodes},
-                                   outputs={"policy": output_nodes.outputs[0]})
+        tf.saved_model.simple_save(
+            session=sess,
+            export_dir="model",
+            inputs={"observation": input_nodes},
+            outputs={"policy": output_nodes.outputs[0]},
+        )
         # Move to the appropriate folder. Don't mind the directory, this just works.
         # rl-cart-pole is the name of the model. Remember it.
-        shutil.move('model/', model_dir + '/model/tf-model/00000001/')
+        shutil.move("model/", model_dir + "/model/tf-model/00000001/")
         # EASE will pick it up and upload to the right path.
         print("Success")

     def _save_onnx_model(self):
         from .onnx_utils import fix_onnx_model
-        ckpt_dir = '/opt/ml/output/data/checkpoint'
-        model_dir = '/opt/ml/model'
+
+        ckpt_dir = "/opt/ml/output/data/checkpoint"
+        model_dir = "/opt/ml/model"
         # find latest onnx file
         # currently done by name, expected to be changed in future release of coach.
-        glob_pattern = os.path.join(ckpt_dir, '*.onnx')
+        glob_pattern = os.path.join(ckpt_dir, "*.onnx")
         onnx_files = [file for file in glob.iglob(glob_pattern, recursive=True)]
         if len(onnx_files) > 0:
-            extract_step = lambda string: int(re.search('/(\d*)_Step.*', string, re.IGNORECASE).group(1))
+            extract_step = lambda string: int(re.search("/(\d*)_Step.*", string, re.IGNORECASE).group(1))
             onnx_files.sort(key=extract_step)
             latest_onnx_file = onnx_files[-1]
             # move to model directory
@@ -237,10 +248,10 @@ def _save_onnx_model(self):
             fix_onnx_model(filepath_to)
         else:
             screen.warning("No ONNX files found in {}".format(ckpt_dir))
-    
+
     @classmethod
     def train_main(cls):
-        """Entrypoint for training. 
+        """Entrypoint for training.
         Parses command-line arguments and starts training.
         """
         trainer = cls()
@@ -250,10 +261,10 @@ def train_main(cls):
         parser = trainer.sagemaker_argparser()
         sage_args, unknown = parser.parse_known_args()
         if sage_args.save_model == 1:
-            backend = os.getenv('COACH_BACKEND', 'tensorflow')
-            if backend == 'tensorflow':
+            backend = os.getenv("COACH_BACKEND", "tensorflow")
+            if backend == "tensorflow":
                 trainer._save_tf_model()
-            if backend == 'mxnet':
+            if backend == "mxnet":
                 trainer._save_onnx_model()


@@ -265,14 +276,15 @@ class SageMakerCoachLauncher(SageMakerCoachPresetLauncher):
     def __init__(self):
         super().__init__()
         screen.warning("DEPRECATION WARNING: Please switch to SageMakerCoachPresetLauncher")
-        #TODO: Remove this whole class when nobody's using it any more.
+        # TODO: Remove this whole class when nobody's using it any more.

     def define_environment(self):
-        return NotImplementedEror("Sub-class must define environment e.g. GymVectorEnvironment(level='your_module:YourClass')")
+        raise NotImplementedError(
+            "Sub-class must define environment e.g. GymVectorEnvironment(level='your_module:YourClass')"
+        )

     def get_graph_manager_from_args(self, args):
-        """Returns the GraphManager object for coach to use to train by calling improve()
-        """
+        """Returns the GraphManager object for coach to use to train by calling improve()"""
         # NOTE: TaskParameters are not configurable at this time. 
# Visualization @@ -306,8 +318,10 @@ def config_schedule(self, schedule_params): pass def define_agent(self): - raise NotImplementedError("Subclass must create define_agent() method which returns an AgentParameters object. e.g.\n" \ - " return rl_coach.agents.dqn_agent.DQNAgentParameters()"); + raise NotImplementedError( + "Subclass must create define_agent() method which returns an AgentParameters object. e.g.\n" + " return rl_coach.agents.dqn_agent.DQNAgentParameters()" + ) def config_visualization(self, vis_params): vis_params.dump_gifs = True diff --git a/09_deploy/common/sagemaker_rl/configuration_list.py b/09_deploy/common/sagemaker_rl/configuration_list.py index 6768d0c3..ac4b92ce 100644 --- a/09_deploy/common/sagemaker_rl/configuration_list.py +++ b/09_deploy/common/sagemaker_rl/configuration_list.py @@ -8,14 +8,13 @@ class ConfigurationList(object): def __init__(self): """Args: - - arg_list [list]: list of arguments on the command-line like [key1, value1, key2, value2, ...] - - prefix [str]: Prefix for every key that must be present, e.g. "--" for common command-line args + - arg_list [list]: list of arguments on the command-line like [key1, value1, key2, value2, ...] + - prefix [str]: Prefix for every key that must be present, e.g. "--" for common command-line args """ self.hp_dict = {} def store(self, name, value): - """Store a key/value hyperparameter combination - """ + """Store a key/value hyperparameter combination""" self.hp_dict[name] = value def apply_subset(self, config_object, prefix): @@ -31,7 +30,7 @@ def apply_subset(self, config_object, prefix): for key, val in list(self.hp_dict.items()): if key.startswith(prefix): logging.debug("Configuring %s with %s=%s" % (prefix, key, val)) - subkey = key[ len(prefix): ] + subkey = key[len(prefix) :] msg = "%s%s=%s" % (prefix, subkey, val) try: self._set_rl_property_value(config_object, subkey, val, prefix) @@ -41,20 +40,19 @@ def apply_subset(self, config_object, prefix): del self.hp_dict[key] def _set_rl_property_value(self, obj, key, val, path=""): - """Sets a property on obj to val, or to a sub-object within obj if key looks like "foo.bar" - """ + """Sets a property on obj to val, or to a sub-object within obj if key looks like "foo.bar" """ if key.find(".") >= 0: - top_key, sub_keys = key_list = key.split(".",1) + top_key, sub_keys = key_list = key.split(".", 1) if top_key.startswith("__"): raise ValueError("Attempting to set unsafe property name %s" % top_key) - if isinstance(obj,dict): + if isinstance(obj, dict): sub_obj = obj[top_key] else: sub_obj = obj.__dict__[top_key] # Recurse - return self._set_rl_property_value(sub_obj, sub_keys, val, "%s.%s" % (path,top_key) ) + return self._set_rl_property_value(sub_obj, sub_keys, val, "%s.%s" % (path, top_key)) else: - key, val = self._parse_type(key,val) + key, val = self._parse_type(key, val) if key.startswith("__"): raise ValueError("Attempting to set unsafe property name %s" % key) if isinstance(obj, dict): @@ -63,8 +61,7 @@ def _set_rl_property_value(self, obj, key, val, path=""): obj.__dict__[key] = val def _autotype(self, val): - """Converts string to an int or float as possible. - """ + """Converts string to an int or float as possible.""" if type(val) == dict: return val if type(val) == list: @@ -96,6 +93,8 @@ def _parse_type(self, key, val): key, obj_type = key.split(":", 1) cls = self.ALLOWED_TYPES.get(obj_type) if not cls: - raise ValueError("Unrecognized object type %s. 
Allowed values are %s" % (obj_type, self.ALLOWED_TYPES.keys())) + raise ValueError( + "Unrecognized object type %s. Allowed values are %s" % (obj_type, self.ALLOWED_TYPES.keys()) + ) val = cls(val) return key, val diff --git a/09_deploy/common/sagemaker_rl/docker_utils.py b/09_deploy/common/sagemaker_rl/docker_utils.py index d9111b6b..eced1d2f 100644 --- a/09_deploy/common/sagemaker_rl/docker_utils.py +++ b/09_deploy/common/sagemaker_rl/docker_utils.py @@ -1,6 +1,7 @@ import socket import time + def get_ip_from_host(timeout=100, host_name=None): counter = 0 ip_address = None @@ -17,8 +18,11 @@ def get_ip_from_host(timeout=100, host_name=None): time.sleep(1) if counter == timeout and not ip_address: - error_string = "Platform Error: Could not retrieve IP address \ - for %s in past %s seconds" % (host_name, timeout) + error_string = ( + "Platform Error: Could not retrieve IP address \ + for %s in past %s seconds" + % (host_name, timeout) + ) raise RuntimeError(error_string) - return ip_address \ No newline at end of file + return ip_address diff --git a/09_deploy/common/sagemaker_rl/mpi_launcher.py b/09_deploy/common/sagemaker_rl/mpi_launcher.py index 5fe9f169..5d8f0146 100644 --- a/09_deploy/common/sagemaker_rl/mpi_launcher.py +++ b/09_deploy/common/sagemaker_rl/mpi_launcher.py @@ -38,21 +38,18 @@ def _change_hostname(current_host): def _start_ssh_daemon(): - """Starts the ssh deamon - """ + """Starts the ssh deamon""" subprocess.Popen(["/usr/sbin/sshd", "-D"]) def _setup_mpi_environment(env): - """Setup MPI environment, i.e. executing change hostname scrip and starting ssh deamon. - """ + """Setup MPI environment, i.e. executing change hostname scrip and starting ssh deamon.""" _change_hostname(env.current_host) _start_ssh_daemon() def _can_connect(host, port, s): - """Checks if the connection to provided ``host`` and ``port`` is possible or not. - """ + """Checks if the connection to provided ``host`` and ``port`` is possible or not.""" try: print("Testing connection to host {}".format(host)) s.connect((host, port)) @@ -86,15 +83,18 @@ def _create_mpi_script(env, train_script, train_script_args): python_cmd.extend(hyperparameters) python_cmd.extend(channels) - content = textwrap.dedent("""#!/usr/bin/env bash + content = textwrap.dedent( + """#!/usr/bin/env bash touch /mpi_is_running %s EXIT_CODE=$? touch /mpi_is_finished exit ${EXIT_CODE} -""" % ' '.join(python_cmd)) +""" + % " ".join(python_cmd) + ) - with open(_MPI_SCRIPT, 'w') as w: + with open(_MPI_SCRIPT, "w") as w: w.write(content) st = os.stat(_MPI_SCRIPT) @@ -104,11 +104,11 @@ def _create_mpi_script(env, train_script, train_script_args): class MPIMaster(object): """MPI Master - Args: - env (TrainingEnv): an instance of the training environment. - process_per_host (int): Number of processes per host to be executed by MPI - instance_type (str): Type of instance used for this job. It will be "local" for local mode. Its used to - perform different setup for local mode or sagemaker mode. + Args: + env (TrainingEnv): an instance of the training environment. + process_per_host (int): Number of processes per host to be executed by MPI + instance_type (str): Type of instance used for this job. It will be "local" for local mode. Its used to + perform different setup for local mode or sagemaker mode. 
""" def __init__(self, env, process_per_host, instance_type): @@ -117,8 +117,7 @@ def __init__(self, env, process_per_host, instance_type): self.instance_type = instance_type def _wait_for_worker_nodes_to_start_sshd(self, hosts, interval=1, timeout_in_seconds=180): - """Wait for worker nodes to start their ssh deamon to allow MPI communication. - """ + """Wait for worker nodes to start their ssh deamon to allow MPI communication.""" with timeout(seconds=timeout_in_seconds): while hosts: print("hosts that aren't SSHable yet: {}".format(str(hosts))) @@ -130,8 +129,7 @@ def _wait_for_worker_nodes_to_start_sshd(self, hosts, interval=1, timeout_in_sec time.sleep(interval) def _run_mpi_on_all_nodes(self): - """Run MPI command to execute MPI_SCRIPT on all hosts. - """ + """Run MPI command to execute MPI_SCRIPT on all hosts.""" mpi_command = self._build_mpi_command() cmd = shlex.split(mpi_command) @@ -139,44 +137,50 @@ def _run_mpi_on_all_nodes(self): print("MPI Command: {}".format(mpi_command)) with open(_MPI_SCRIPT) as f: - print('Running user script:\n\n%s', f.read()) + print("Running user script:\n\n%s", f.read()) subprocess.check_call(cmd) def _build_mpi_command(self): - """Build MPI command. - """ + """Build MPI command.""" num_hosts = len(self.env.hosts) num_processes = self.process_per_host * num_hosts # By default, use one process per GPU, or one process per node (if training with CPU). - host_list = self.env.hosts if self.process_per_host == 1 else \ - [host + ':{}'.format(self.process_per_host) for host in self.env.hosts] - - print("Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format(self.env.hosts, host_list, - self.process_per_host, - num_processes)) - credential_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN'] + host_list = ( + self.env.hosts + if self.process_per_host == 1 + else [host + ":{}".format(self.process_per_host) for host in self.env.hosts] + ) + + print( + "Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format( + self.env.hosts, host_list, self.process_per_host, num_processes + ) + ) + credential_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"] interface_name = interface_name = self.env.network_interface_name if self.instance_type == "local": interface_name = "eth0" - print('network interface name:' + interface_name + " " + str(self.instance_type)) - - mpi_command = 'mpirun --host {}'.format(",".join(host_list)) \ - + " -np {} ".format(num_processes) \ - + " --allow-run-as-root" \ - + " --display-map" \ - + " --tag-output" \ - + " -mca btl_tcp_if_include {}".format(interface_name) \ - + " -mca oob_tcp_if_include {}".format(interface_name) \ - + " -x NCCL_SOCKET_IFNAME={}".format(interface_name) \ - + " --mca plm_rsh_no_tree_spawn 1" \ - + " -mca orte_abort_on_non_zero_status 1" \ - + " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO" \ - + " -x LD_LIBRARY_PATH -x PATH" \ - + " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY) + print("network interface name:" + interface_name + " " + str(self.instance_type)) + + mpi_command = ( + "mpirun --host {}".format(",".join(host_list)) + + " -np {} ".format(num_processes) + + " --allow-run-as-root" + + " --display-map" + + " --tag-output" + + " -mca btl_tcp_if_include {}".format(interface_name) + + " -mca oob_tcp_if_include {}".format(interface_name) + + " -x NCCL_SOCKET_IFNAME={}".format(interface_name) + + " --mca plm_rsh_no_tree_spawn 1" + + " -mca orte_abort_on_non_zero_status 1" + + " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO" + + " -x 
LD_LIBRARY_PATH -x PATH" + + " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY) + ) for v in credential_vars: if v in os.environ: @@ -194,8 +198,7 @@ def __call__(self): self._run_mpi_on_all_nodes() def is_master(self, hosts, current_host): - """Checks if the current host is master or worker. - """ + """Checks if the current host is master or worker.""" print("Hosts: " + str(hosts) + " current host: " + str(current_host)) return current_host == sorted(list(hosts))[0] @@ -205,14 +208,12 @@ class MPIWorker(object): @retry(stop_max_delay=30000 * 1000, wait_fixed=1000, retry_on_result=lambda result: result is False) def _wait_for_mpi_to_start_running(self): - """Wait and retry loop until the MPI training starts on this worker. - """ + """Wait and retry loop until the MPI training starts on this worker.""" return os.path.isfile(_MPI_IS_RUNNING) @retry(wait_fixed=5000, retry_on_result=lambda result: result is False) def _wait_until_mpi_stops_running(self): - """Wait and retry loop until the MPI training is finished on this worker. - """ + """Wait and retry loop until the MPI training is finished on this worker.""" return os.path.isfile(_MPI_IS_FINISHED) def __call__(self, env): @@ -248,7 +249,7 @@ def timeout(seconds=0, minutes=0, hours=0): limit = seconds + 60 * minutes + 3600 * hours def handler(signum, frame): # pylint: disable=W0613 - raise TimeoutError('timed out after {} seconds'.format(limit)) + raise TimeoutError("timed out after {} seconds".format(limit)) try: signal.signal(signal.SIGALRM, handler) @@ -280,8 +281,7 @@ def __init__(self, train_script, train_script_args=None, num_of_processes_per_ho def mpi_run(self): env = sagemaker_containers.training_env() - print("MPI requested with process per hosts: {}" - .format(self._num_of_processes_per_host)) + print("MPI requested with process per hosts: {}".format(self._num_of_processes_per_host)) _setup_mpi_environment(env) _create_mpi_script(env, self._train_script, self._train_script_args) diff --git a/09_deploy/common/sagemaker_rl/onnx_utils.py b/09_deploy/common/sagemaker_rl/onnx_utils.py index 1840bed7..d5712db4 100644 --- a/09_deploy/common/sagemaker_rl/onnx_utils.py +++ b/09_deploy/common/sagemaker_rl/onnx_utils.py @@ -23,23 +23,19 @@ def get_correct_outputs(model): else: raise Exception("Can't determine the RL Agent used from the ONNX graph provided.") - + def make_output(node_name, shape): """ Given a node name and output shape, will construct the correct Protobuf object. """ - return helper.make_tensor_value_info( - name=node_name, - elem_type=TensorProto.FLOAT, - shape=shape - ) + return helper.make_tensor_value_info(name=node_name, elem_type=TensorProto.FLOAT, shape=shape) def ppo_continuous_outputs(model): """ Collects the output nodes for continuous PPO. """ - # determine number of actions + # determine number of actions log_std_node_name = "generalmodel0_singlemodel1_scaledgradhead0_continuousppohead0_log_std" log_std_node = [i for i in model.graph.input if i.name == log_std_node_name][0] num_actions = log_std_node.type.tensor_type.shape.dim[0].dim_value @@ -59,7 +55,7 @@ def ppo_discrete_outputs(model): """ Collects the output nodes for discrete PPO. 
""" - # determine number of actions + # determine number of actions bias_node_name = "generalmodel0_singlemodel1_scaledgradhead0_discreteppohead0_dense0_bias" bias_node = [i for i in model.graph.input if i.name == bias_node_name][0] num_actions = bias_node.type.tensor_type.shape.dim[0].dim_value @@ -77,21 +73,23 @@ def save_model(model, output_nodes, filepath): """ Given an in memory model, will save to disk at given filepath. """ - new_graph = helper.make_graph(nodes=model.graph.node, - name='new_graph', - inputs=model.graph.input, - outputs=output_nodes, - initializer=model.graph.initializer) + new_graph = helper.make_graph( + nodes=model.graph.node, + name="new_graph", + inputs=model.graph.input, + outputs=output_nodes, + initializer=model.graph.initializer, + ) checker.check_graph(new_graph) new_model = helper.make_model(new_graph) with open(filepath, "wb") as file_handle: serialized = new_model.SerializeToString() file_handle.write(serialized) - + def fix_onnx_model(filepath): """ - Applies an inplace fix to ONNX file from Coach. + Applies an inplace fix to ONNX file from Coach. """ model = onnx.load_model(filepath) output_nodes = get_correct_outputs(model) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py index 8f3f3013..018c8d5c 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/experiment_db_client.py @@ -2,7 +2,8 @@ from boto3.dynamodb.conditions import Key from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException -logger=logging.getLogger(__name__) +logger = logging.getLogger(__name__) + class ExperimentDbClient(object): def __init__(self, table_session): @@ -10,151 +11,138 @@ def __init__(self, table_session): def get_experiment_record(self, experiment_id): response = self.table_session.query( - ConsistentRead=True, - KeyConditionExpression=Key('experiment_id').eq(experiment_id) + ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id) ) - for i in response['Items']: + for i in response["Items"]: return i return None def create_new_experiment_record(self, record): try: - self.table_session.put_item( - Item=record, - ConditionExpression='attribute_not_exists(experiment_id)' - ) + self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(experiment_id)") except Exception as e: if "ConditionalCheckFailedException" in str(e): raise RecordAlreadyExistsException() raise e def update_experiment_record(self, record): - self.table_session.put_item( - Item=record - ) + self.table_session.put_item(Item=record) def delete_item(self, experiment_id): logger.warning("Deleting experiment record...") - self.table_session.delete_item( - Key={ - "experiment_id": experiment_id - } - ) + self.table_session.delete_item(Key={"experiment_id": experiment_id}) #### Update states for training workflow def update_training_workflow_metadata_with_validation( - self, - experiment_id, - training_workflow_metadata, - expected_current_next_model_to_train_id - ): - ''' + self, experiment_id, training_workflow_metadata, expected_current_next_model_to_train_id + ): + """ Updates ExperimentDb record for experiment_id with new training_workflow_metadata, while validating, next_model_to_train_id is as expected in the old record. 
- ''' + """ self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET training_workflow_metadata = :new_val', - ConditionExpression='training_workflow_metadata.next_model_to_train_id = :exp_model_id', + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET training_workflow_metadata = :new_val", + ConditionExpression="training_workflow_metadata.next_model_to_train_id = :exp_model_id", ExpressionAttributeValues={ - ':new_val': training_workflow_metadata, - ':exp_model_id': expected_current_next_model_to_train_id - } + ":new_val": training_workflow_metadata, + ":exp_model_id": expected_current_next_model_to_train_id, + }, ) def update_experiment_training_state(self, experiment_id, training_state): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET training_workflow_metadata.training_state = :val', - ExpressionAttributeValues={':val': training_state} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET training_workflow_metadata.training_state = :val", + ExpressionAttributeValues={":val": training_state}, ) def update_experiment_last_trained_model_id(self, experiment_id, last_trained_model_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET training_workflow_metadata.last_trained_model_id = :val', - ExpressionAttributeValues={':val': last_trained_model_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET training_workflow_metadata.last_trained_model_id = :val", + ExpressionAttributeValues={":val": last_trained_model_id}, ) def update_experiment_next_model_to_train_id(self, experiment_id, next_model_to_train_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET training_workflow_metadata.next_model_to_train_id = :val', - ExpressionAttributeValues={':val': next_model_to_train_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET training_workflow_metadata.next_model_to_train_id = :val", + ExpressionAttributeValues={":val": next_model_to_train_id}, ) #### Update states for hosting workflow def update_experiment_hosting_state(self, experiment_id, hosting_state): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET hosting_workflow_metadata.hosting_state = :val', - ExpressionAttributeValues={':val': hosting_state} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET hosting_workflow_metadata.hosting_state = :val", + ExpressionAttributeValues={":val": hosting_state}, ) def update_experiment_last_hosted_model_id(self, experiment_id, last_hosted_model_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET hosting_workflow_metadata.last_hosted_model_id = :val', - ExpressionAttributeValues={':val': last_hosted_model_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET hosting_workflow_metadata.last_hosted_model_id = :val", + ExpressionAttributeValues={":val": last_hosted_model_id}, ) def update_experiment_next_model_to_host_id(self, experiment_id, next_model_to_host_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET hosting_workflow_metadata.next_model_to_host_id = :val', - ExpressionAttributeValues={':val': next_model_to_host_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET hosting_workflow_metadata.next_model_to_host_id = :val", + ExpressionAttributeValues={":val": 
next_model_to_host_id}, ) def update_experiment_hosting_endpoint(self, experiment_id, hosting_endpoint): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET hosting_workflow_metadata.hosting_endpoint = :val', - ExpressionAttributeValues={':val': hosting_endpoint} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET hosting_workflow_metadata.hosting_endpoint = :val", + ExpressionAttributeValues={":val": hosting_endpoint}, ) #### Update states for joining workflow def update_experiment_joining_state(self, experiment_id, joining_state): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET joining_workflow_metadata.joining_state = :val', - ExpressionAttributeValues={':val': joining_state} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET joining_workflow_metadata.joining_state = :val", + ExpressionAttributeValues={":val": joining_state}, ) def update_experiment_last_joined_job_id(self, experiment_id, last_joined_job_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET joining_workflow_metadata.last_joined_job_id = :val', - ExpressionAttributeValues={':val': last_joined_job_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET joining_workflow_metadata.last_joined_job_id = :val", + ExpressionAttributeValues={":val": last_joined_job_id}, ) def update_experiment_next_join_job_id(self, experiment_id, next_join_job_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET joining_workflow_metadata.next_join_job_id = :val', - ExpressionAttributeValues={':val': next_join_job_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET joining_workflow_metadata.next_join_job_id = :val", + ExpressionAttributeValues={":val": next_join_job_id}, ) #### Update states for evaluation workflow def update_experiment_evaluation_state(self, experiment_id, evaluation_state): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET evaluation_workflow_metadata.evaluation_state = :val', - ExpressionAttributeValues={':val': evaluation_state} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET evaluation_workflow_metadata.evaluation_state = :val", + ExpressionAttributeValues={":val": evaluation_state}, ) def update_experiment_last_evaluation_job_id(self, experiment_id, last_evaluation_job_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET evaluation_workflow_metadata.last_evaluation_job_id = :val', - ExpressionAttributeValues={':val': last_evaluation_job_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET evaluation_workflow_metadata.last_evaluation_job_id = :val", + ExpressionAttributeValues={":val": last_evaluation_job_id}, ) def update_experiment_next_evaluation_job_id(self, experiment_id, next_evaluation_job_id): self.table_session.update_item( - Key={'experiment_id': experiment_id}, - UpdateExpression=f'SET evaluation_workflow_metadata.next_evaluation_job_id = :val', - ExpressionAttributeValues={':val': next_evaluation_job_id} + Key={"experiment_id": experiment_id}, + UpdateExpression=f"SET evaluation_workflow_metadata.next_evaluation_job_id = :val", + ExpressionAttributeValues={":val": next_evaluation_job_id}, ) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py 
b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py index 9299266e..9d7a7605 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/join_db_client.py @@ -2,7 +2,8 @@ from boto3.dynamodb.conditions import Key from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException -logger=logging.getLogger(__name__) +logger = logging.getLogger(__name__) + class JoinDbClient(object): def __init__(self, table_session): @@ -17,35 +18,29 @@ def check_join_job_record_exists(self, experiment_id, join_job_id): def get_join_job_record(self, experiment_id, join_job_id): response = self.table_session.query( ConsistentRead=True, - KeyConditionExpression=Key('experiment_id').eq(experiment_id) & Key('join_job_id').eq(join_job_id) + KeyConditionExpression=Key("experiment_id").eq(experiment_id) & Key("join_job_id").eq(join_job_id), ) - for i in response['Items']: + for i in response["Items"]: return i return None def create_new_join_job_record(self, record): try: - self.table_session.put_item( - Item=record, - ConditionExpression='attribute_not_exists(join_job_id)' - ) + self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(join_job_id)") except Exception as e: if "ConditionalCheckFailedException" in str(e): raise RecordAlreadyExistsException() raise e def update_join_job_record(self, record): - self.table_session.put_item( - Item=record - ) + self.table_session.put_item(Item=record) def get_all_join_job_records_of_experiment(self, experiment_id): response = self.table_session.query( - ConsistentRead=True, - KeyConditionExpression=Key('experiment_id').eq(experiment_id) + ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id) ) - if response['Items']: - return response['Items'] + if response["Items"]: + return response["Items"] else: return None @@ -54,69 +49,64 @@ def batch_delete_items(self, experiment_id, join_job_id_list): with self.table_session.batch_writer() as batch: for join_job_id in join_job_id_list: logger.debug(f"Deleting join job record {join_job_id}...") - batch.delete_item( - Key={ - 'experiment_id': experiment_id, - 'join_job_id': join_job_id - } - ) + batch.delete_item(Key={"experiment_id": experiment_id, "join_job_id": join_job_id}) def update_join_job_current_state(self, experiment_id, join_job_id, current_state): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET current_state = :val', - ExpressionAttributeValues={':val': current_state} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET current_state = :val", + ExpressionAttributeValues={":val": current_state}, ) - def update_join_job_input_obs_data_s3_path(self, experiment_id, - join_job_id, input_obs_data_s3_path): + def update_join_job_input_obs_data_s3_path(self, experiment_id, join_job_id, input_obs_data_s3_path): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET input_obs_data_s3_path = :val', - ExpressionAttributeValues={':val': input_obs_data_s3_path} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET input_obs_data_s3_path = :val", + ExpressionAttributeValues={":val": input_obs_data_s3_path}, ) - - def update_join_job_input_reward_data_s3_path(self, experiment_id, - join_job_id, input_reward_data_s3_path): + + def 
update_join_job_input_reward_data_s3_path(self, experiment_id, join_job_id, input_reward_data_s3_path): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET input_reward_data_s3_path = :val', - ExpressionAttributeValues={':val': input_reward_data_s3_path} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET input_reward_data_s3_path = :val", + ExpressionAttributeValues={":val": input_reward_data_s3_path}, ) def update_join_job_join_query_ids(self, experiment_id, join_job_id, join_query_ids): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET join_query_ids = :val', - ExpressionAttributeValues={':val': join_query_ids} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET join_query_ids = :val", + ExpressionAttributeValues={":val": join_query_ids}, ) def update_join_job_obs_end_time(self, experiment_id, join_job_id, obs_end_time): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET obs_end_time = :val', - ExpressionAttributeValues={':val': obs_end_time} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET obs_end_time = :val", + ExpressionAttributeValues={":val": obs_end_time}, ) def update_join_job_obs_start_time(self, experiment_id, join_job_id, obs_start_time): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET obs_start_time = :val', - ExpressionAttributeValues={':val': obs_start_time} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET obs_start_time = :val", + ExpressionAttributeValues={":val": obs_start_time}, ) - def update_join_job_output_joined_eval_data_s3_path(self, experiment_id, - join_job_id, output_joined_eval_data_s3_path): + def update_join_job_output_joined_eval_data_s3_path( + self, experiment_id, join_job_id, output_joined_eval_data_s3_path + ): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET output_joined_eval_data_s3_path = :val', - ExpressionAttributeValues={':val': output_joined_eval_data_s3_path} + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET output_joined_eval_data_s3_path = :val", + ExpressionAttributeValues={":val": output_joined_eval_data_s3_path}, ) - def update_join_job_output_joined_train_data_s3_path(self, experiment_id, - join_job_id, output_joined_train_data_s3_path): + def update_join_job_output_joined_train_data_s3_path( + self, experiment_id, join_job_id, output_joined_train_data_s3_path + ): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'join_job_id': join_job_id}, - UpdateExpression=f'SET output_joined_train_data_s3_path = :val', - ExpressionAttributeValues={':val': output_joined_train_data_s3_path} - ) \ No newline at end of file + Key={"experiment_id": experiment_id, "join_job_id": join_job_id}, + UpdateExpression=f"SET output_joined_train_data_s3_path = :val", + ExpressionAttributeValues={":val": output_joined_train_data_s3_path}, + ) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py index 11510e59..7d14d496 100644 --- 
a/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/clients/ddb/model_db_client.py @@ -4,12 +4,14 @@ from boto3.dynamodb.conditions import Key from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException -logger=logging.getLogger(__name__) +logger = logging.getLogger(__name__) + class ModelDbClient: """ - TODO: Deprecate and embed this class in ModelRecord. + TODO: Deprecate and embed this class in ModelRecord. """ + def __init__(self, table_session): self.table_session = table_session @@ -22,9 +24,9 @@ def check_model_record_exists(self, experiment_id, model_id): def get_model_record(self, experiment_id, model_id): response = self.table_session.query( ConsistentRead=True, - KeyConditionExpression=Key('experiment_id').eq(experiment_id) & Key('model_id').eq(model_id) + KeyConditionExpression=Key("experiment_id").eq(experiment_id) & Key("model_id").eq(model_id), ) - for i in response['Items']: + for i in response["Items"]: return i return None @@ -38,52 +40,46 @@ def get_model_record_with_retry(self, experiment_id, model_id, retry_gap=5): def create_new_model_record(self, record): try: - self.table_session.put_item( - Item=record, - ConditionExpression='attribute_not_exists(model_id)' - ) + self.table_session.put_item(Item=record, ConditionExpression="attribute_not_exists(model_id)") except Exception as e: if "ConditionalCheckFailedException" in str(e): raise RecordAlreadyExistsException() raise e - + def update_model_job_state(self, model_record): self.update_model_record(model_record) - + def update_model_as_pending(self, model_record): # TODO: a model can only be put to pending, from pending state. self.update_model_record(model_record) - + def update_model_as_failed(self, model_record): self.update_model_record(model_record) def update_model_eval_job_state(self, model_record): - # TODO: conditional check to verify model is in *ing state while updating... + # TODO: conditional check to verify model is in *ing state while updating... # Not Trained or some final state. self.update_model_record(model_record) def update_model_eval_as_pending(self, model_record): - # TODO: a model eval_state can only be put to pending, from pending state + # TODO: a model eval_state can only be put to pending, from pending state # or a final state. (coz of reruns of evaluation) self.update_model_record(model_record) def update_model_eval_as_failed(self, model_record): - # TODO: conditional check to verify model is in *ing state while updating... + # TODO: conditional check to verify model is in *ing state while updating... # Not Trained or some final state. 
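The create_new_model_record hunk above (and its experiment/join counterparts earlier in this patch) is DynamoDB's standard create-if-absent idiom. A minimal sketch under stated assumptions: the table name is illustrative, and the failure is detected via the ClientError response code, a more precise variant of the string matching these clients use.

import boto3
from botocore.exceptions import ClientError

class RecordAlreadyExistsException(Exception):
    # stands in for orchestrator.exceptions.ddb_client_exceptions
    pass

table = boto3.resource("dynamodb").Table("ModelDb")  # illustrative table name

def create_model_record_once(record):
    # attribute_not_exists() makes the put conditional on the key being
    # new, so an existing item is never silently overwritten.
    try:
        table.put_item(Item=record, ConditionExpression="attribute_not_exists(model_id)")
    except ClientError as e:
        if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
            raise RecordAlreadyExistsException() from e
        raise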
self.update_model_record(model_record) def update_model_record(self, record): - self.table_session.put_item( - Item=record - ) + self.table_session.put_item(Item=record) def get_all_model_records_of_experiment(self, experiment_id): response = self.table_session.query( - ConsistentRead=True, - KeyConditionExpression=Key('experiment_id').eq(experiment_id) + ConsistentRead=True, KeyConditionExpression=Key("experiment_id").eq(experiment_id) ) - if response['Items']: - return response['Items'] + if response["Items"]: + return response["Items"] else: return None @@ -92,88 +88,82 @@ def batch_delete_items(self, experiment_id, model_id_list): with self.table_session.batch_writer() as batch: for model_id in model_id_list: logger.debug(f"Deleting model record '{model_id}'...") - batch.delete_item( - Key={ - 'experiment_id': experiment_id, - 'model_id': model_id - } - ) + batch.delete_item(Key={"experiment_id": experiment_id, "model_id": model_id}) def update_model_input_model_id(self, experiment_id, model_id, input_model_id): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET input_model_id = :val', - ExpressionAttributeValues={':val': input_model_id} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET input_model_id = :val", + ExpressionAttributeValues={":val": input_model_id}, ) def update_model_input_data_s3_prefix(self, experiment_id, model_id, input_data_s3_prefix): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET input_data_s3_prefix = :val', - ExpressionAttributeValues={':val': input_data_s3_prefix} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET input_data_s3_prefix = :val", + ExpressionAttributeValues={":val": input_data_s3_prefix}, ) + def update_model_s3_model_output_path(self, experiment_id, model_id, s3_model_output_path): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET s3_model_output_path = :val', - ExpressionAttributeValues={':val': s3_model_output_path} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET s3_model_output_path = :val", + ExpressionAttributeValues={":val": s3_model_output_path}, ) def update_model_train_state(self, experiment_id, model_id, train_state): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET train_state = :val', - ExpressionAttributeValues={':val': train_state} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET train_state = :val", + ExpressionAttributeValues={":val": train_state}, ) - + def update_model_eval_state(self, experiment_id, model_id, eval_state): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET eval_state = :val', - ExpressionAttributeValues={':val': eval_state} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET eval_state = :val", + ExpressionAttributeValues={":val": eval_state}, ) def update_model_eval_scores(self, experiment_id, model_id, eval_scores): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET eval_scores = :val', - ExpressionAttributeValues={':val': eval_scores} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET 
eval_scores = :val", + ExpressionAttributeValues={":val": eval_scores}, ) def update_model_eval_scores_and_state(self, experiment_id, model_id, eval_scores, eval_state): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET eval_scores = :score_val, eval_state = :state_val', - ExpressionAttributeValues={ - ':score_val': eval_scores, - ':state_val': eval_state - } - ) + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET eval_scores = :score_val, eval_state = :state_val", + ExpressionAttributeValues={":score_val": eval_scores, ":state_val": eval_state}, + ) def update_model_training_start_time(self, experiment_id, model_id, training_start_time): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET training_start_time = :val', - ExpressionAttributeValues={':val': training_start_time} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET training_start_time = :val", + ExpressionAttributeValues={":val": training_start_time}, ) def update_model_training_end_time(self, experiment_id, model_id, training_end_time): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, - UpdateExpression=f'SET training_end_time = :val', - ExpressionAttributeValues={':val': training_end_time} + Key={"experiment_id": experiment_id, "model_id": model_id}, + UpdateExpression=f"SET training_end_time = :val", + ExpressionAttributeValues={":val": training_end_time}, ) - def update_model_training_stats(self, experiment_id, model_id, - s3_model_output_path, training_start_time, training_end_time, train_state): + def update_model_training_stats( + self, experiment_id, model_id, s3_model_output_path, training_start_time, training_end_time, train_state + ): self.table_session.update_item( - Key={'experiment_id': experiment_id, 'model_id': model_id}, + Key={"experiment_id": experiment_id, "model_id": model_id}, UpdateExpression=f"SET s3_model_output_path = :path_val, training_start_time = :start_time_val, " f"training_end_time = :end_time_val, train_state = :state_val", ExpressionAttributeValues={ - ':path_val': s3_model_output_path, - ':start_time_val': training_start_time, - ':end_time_val': training_end_time, - ':state_val': train_state - } - ) \ No newline at end of file + ":path_val": s3_model_output_path, + ":start_time_val": training_start_time, + ":end_time_val": training_end_time, + ":state_val": train_state, + }, + ) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py index 1855e790..d22cc96a 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/ddb_client_exceptions.py @@ -1,8 +1,10 @@ class RecordAlreadyExistsException(Exception): pass + class ConcurrentModificationException(Exception): pass + class ConditionalCheckFailure(Exception): - pass \ No newline at end of file + pass diff --git a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py index a46a65f1..334226f7 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/exceptions/workflow_exceptions.py @@ -1,20 +1,26 @@ class 
UnhandledWorkflowException(Exception): pass + class SageMakerTrainingJobException(Exception): pass + class SageMakerHostingException(Exception): pass + class WorkflowJoiningJobException(Exception): pass + class EvalScoreNotAvailableException(Exception): pass + class JoinQueryIdsNotAvailableException(Exception): pass + class InvalidUsageException(Exception): - pass \ No newline at end of file + pass diff --git a/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py index b09fb0a0..001cfeed 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/resource_manager.py @@ -17,18 +17,15 @@ logger = logging.getLogger(__name__) + class ResourceManager(object): """A resource manager entity to manage computing resource creation and cleanup for the experiment. """ - def __init__( - self, - resource_config, - boto_session=None - ): + def __init__(self, resource_config, boto_session=None): """Initialize a resource manager entity given a resource config - + Args: resource_config (dict): A dictionary containing configuration of the computing resource @@ -56,7 +53,7 @@ def __init__( @property def firehose_bucket(self): - if hasattr(self, 'firehose_s3_bucket_name'): + if hasattr(self, "firehose_s3_bucket_name"): return self.firehose_s3_bucket_name account = self.boto_session.client("sts").get_caller_identity()["Account"] region = self.boto_session.region_name @@ -71,32 +68,36 @@ def create_shared_resource_if_not_exist(self): and IAM role to grant relevant resource permission """ if self._usable_shared_cf_stack_exists(): - logger.info("Using Resources in CloudFormation stack named: {} " \ - "for Shared Resources.".format(self.shared_resource_stack_name)) + logger.info( + "Using Resources in CloudFormation stack named: {} " + "for Shared Resources.".format(self.shared_resource_stack_name) + ) else: - logger.info("Creating a new CloudFormation stack for Shared Resources. " \ - "You can always reuse this StackName in your other experiments") + logger.info( + "Creating a new CloudFormation stack for Shared Resources. " + "You can always reuse this StackName in your other experiments" + ) self._create_new_cloudformation_stack() # use Output Resources Names from CloudFromation stack - self.exp_db_table_name = self._get_cf_output_by_key('ExperimentDbTableName') - self.join_db_table_name = self._get_cf_output_by_key('JoinDbTableName') + self.exp_db_table_name = self._get_cf_output_by_key("ExperimentDbTableName") + self.join_db_table_name = self._get_cf_output_by_key("JoinDbTableName") self.model_db_table_name = self._get_cf_output_by_key("ModelDbTableName") - self.iam_role_arn = self._get_cf_output_by_key('IAMRoleArn') - + self.iam_role_arn = self._get_cf_output_by_key("IAMRoleArn") + # initialize DynamoDb clients! 
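create_shared_resource_if_not_exist above reads the table names and role ARN back out of the stack's Outputs via _get_cf_output_by_key (reformatted later in this file). Stripped of its status validation, that lookup is just one describe_stacks call and a linear scan; the stack and key names below are illustrative.

import boto3

cf = boto3.client("cloudformation")

def get_stack_output(stack_name, output_key):
    # One describe_stacks call, then scan the Outputs list for the key,
    # mirroring _get_cf_output_by_key (minus its StackStatus checks).
    stack = cf.describe_stacks(StackName=stack_name)["Stacks"][0]
    for output in stack.get("Outputs", []):
        if output["OutputKey"] == output_key:
            return output["OutputValue"]
    return None

# e.g. get_stack_output("shared-resources-stack", "ExperimentDbTableName")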
- experiment_db_session = self.boto_session.resource('dynamodb').Table(self.exp_db_table_name) + experiment_db_session = self.boto_session.resource("dynamodb").Table(self.exp_db_table_name) self.exp_db_client = ExperimentDbClient(experiment_db_session) - join_db_session = self.boto_session.resource('dynamodb').Table(self.join_db_table_name) + join_db_session = self.boto_session.resource("dynamodb").Table(self.join_db_table_name) self.join_db_client = JoinDbClient(join_db_session) - model_db_session = self.boto_session.resource('dynamodb').Table(self.model_db_table_name) + model_db_session = self.boto_session.resource("dynamodb").Table(self.model_db_table_name) self.model_db_client = ModelDbClient(model_db_session) def _usable_shared_cf_stack_exists(self): """Check if the shared cf stack exist and is usable - + Returns: bool: Whether the shared cf stack is usable """ @@ -104,44 +105,54 @@ def _usable_shared_cf_stack_exists(self): # CF stack in one of [CREATE|UPDATE|ROLLBACK]_COMPLETE state try: stack_name = self.shared_resource_stack_name - response = self.cf_client.describe_stacks( - StackName=stack_name)["Stacks"] + response = self.cf_client.describe_stacks(StackName=stack_name)["Stacks"] if len(response) == 0: return False except Exception as e: if "UnauthorizedOperation" in str(e): - raise Exception("You are unauthorized to describe a CloudFormation Stack. Please update your Role with " - " appropriate permissions.") + raise Exception( + "You are unauthorized to describe a CloudFormation Stack. Please update your Role with " + " appropriate permissions." + ) elif "ValidationError" in str(e): # stack doesn't exists return False else: raise e - + stack_details = response[0] - stack_status = stack_details['StackStatus'] - if stack_status in ['UPDATE_COMPLETE', 'CREATE_COMPLETE']: + stack_status = stack_details["StackStatus"] + if stack_status in ["UPDATE_COMPLETE", "CREATE_COMPLETE"]: return True elif stack_status in ["DELETE_COMPLETE"]: return False elif stack_status in ["ROLLBACK_COMPLETE"]: - logger.error(f"Stack with name {stack_name} is in {stack_status} state! Please delete/ stabilize/ or " - "or update Config.yaml to create a new stack") - raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. " - f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}" + logger.error( + f"Stack with name {stack_name} is in {stack_status} state! Please delete, stabilize, " + "or update Config.yaml to create a new stack" + ) + raise Exception( + f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. " + f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}" + ) elif "FAILED" in stack_status: - logger.error(f"Stack with name {stack_name} in {stack_status} state! Please delete the stack" - " or update Config.yaml to create a new stack") - raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. " - f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}" + logger.error( + f"Stack with name {stack_name} is in {stack_status} state! Please delete the stack" + " or update Config.yaml to create a new stack" + ) + raise Exception( + f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. 
" + f"Please debug/ or delete the stack here: {self._get_cf_stack_events_link()}" ) elif "DELETE" in stack_status: # already checked DELETE_COMPLETE above - logger.error("Stack with name {} is in {} state! Cannot continue further!" \ - " Please wait for the delete to complete".format(stack_name, stack_status)) - raise Exception(f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. " - f"Please retry after the stack gets Deleted/or debug the stack here: {self._get_cf_stack_events_link()}" + logger.error( + "Stack with name {} is in {} state! Cannot continue further!" + " Please wait for the delete to complete".format(stack_name, stack_status) + ) + raise Exception( + f"A Cloudformation Stack with name {stack_name}, already exists in {stack_status} State. " + f"Please retry after the stack gets Deleted/or debug the stack here: {self._get_cf_stack_events_link()}" ) elif "CREATE" in stack_status: # one of the create statuses! @@ -150,15 +161,14 @@ def _usable_shared_cf_stack_exists(self): self._wait_for_cf_stack_create_to_complete() return True else: - # assume stack in modifying. wait for it to goto + # assume stack in modifying. wait for it to goto logger.info("Stack in {} state. Waiting for it's to end in successful state...".format(stack_status)) self._wait_for_cf_stack_update_to_complete() return True - def _create_new_cloudformation_stack(self): """Create a new cloudformation stack - + Returns: bool: whether successfully create a new cloudformation stack """ @@ -167,9 +177,9 @@ def _create_new_cloudformation_stack(self): parameters = [ { "ParameterKey": "IAMRoleName", - "ParameterValue": self._get_iam_role_property('role_name', 'role_for_cl'), + "ParameterValue": self._get_iam_role_property("role_name", "role_for_cl"), "UsePreviousValue": True, - "ResolvedValue": "string" + "ResolvedValue": "string", }, ] parameters.extend(self._get_cloudformation_parameters_for_db()) @@ -179,28 +189,30 @@ def _create_new_cloudformation_stack(self): StackName=cf_stack_name, TemplateBody=self._parse_template(), Parameters=parameters, - Capabilities=[ - 'CAPABILITY_NAMED_IAM' - ] + Capabilities=["CAPABILITY_NAMED_IAM"], ) logger.info("Creating CloudFormation Stack for shared resource!") self._wait_for_cf_stack_create_to_complete() return True - except Exception as e: + except Exception as e: if "UnauthorizedOperation" in str(e): - raise Exception("You are unauthorized to create a CloudFormation Stack. Please update your Role with " - " appropriate permissions.") + raise Exception( + "You are unauthorized to create a CloudFormation Stack. Please update your Role with " + " appropriate permissions." + ) elif "AlreadyExists" in str(e): # it came here it means it must be in one for "CREATING states" - logger.warn(f"A stack with name {cf_stack_name} already exists. Reusing the stack" \ - " resources for this experiment") + logger.warn( + f"A stack with name {cf_stack_name} already exists. 
Reusing the stack" + " resources for this experiment" + ) self._wait_for_cf_stack_create_to_complete() return False - raise(e) + raise (e) def _get_cf_stack_events_link(self): """Get events link for the given shared cf stack - + Returns: str: events link for the cf stack """ @@ -209,43 +221,31 @@ def _get_cf_stack_events_link(self): return f"https://{region}.console.aws.amazon.com/cloudformation/home?region={region}#/stacks/events?stackId={self.shared_resource_stack_name}" def _wait_for_cf_stack_create_to_complete(self): - """Wait until the cf stack creation complete - """ - cf_waiter = self.cf_client.get_waiter('stack_create_complete') + """Wait until the cf stack creation complete""" + cf_waiter = self.cf_client.get_waiter("stack_create_complete") logger.info("Waiting for stack to get to CREATE_COMPLETE state....") try: - cf_waiter.wait( - StackName=self.shared_resource_stack_name, - WaiterConfig={ - 'Delay': 10, - 'MaxAttempts': 60 - } - ) + cf_waiter.wait(StackName=self.shared_resource_stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60}) except Exception as e: logger.error(e) logger.error("Failed to Create Stack with name {} ".format(self.shared_resource_stack_name)) - raise Exception(f"Failed to Create Shared Resource Stack. " - f"Please debug the stack here: {self._get_cf_stack_events_link()}" + raise Exception( + f"Failed to Create Shared Resource Stack. " + f"Please debug the stack here: {self._get_cf_stack_events_link()}" ) def _wait_for_cf_stack_update_to_complete(self): - """Wait until the cf stack update complete - """ - cf_waiter = self.cf_client.get_waiter('stack_update_complete') + """Wait until the cf stack update complete""" + cf_waiter = self.cf_client.get_waiter("stack_update_complete") logger.info("Waiting for stack to get to Successful Update state....") try: - cf_waiter.wait( - StackName=self.shared_resource_stack_name, - WaiterConfig={ - 'Delay': 10, - 'MaxAttempts': 6 - } - ) + cf_waiter.wait(StackName=self.shared_resource_stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 6}) except Exception as e: logger.error(e) logger.error("Failed to use Stack with name {} ".format(self.shared_resource_stack_name)) - raise Exception(f"The provided CloudFormation Stack for Shared Resource is unstable. " - f"Please debug the stack here: {self._get_cf_stack_events_link()}" + raise Exception( + f"The provided CloudFormation Stack for Shared Resource is unstable. 
" + f"Please debug the stack here: {self._get_cf_stack_events_link()}" ) def _parse_template(self): @@ -275,32 +275,32 @@ def _get_cloudformation_parameters_for_db(self): "ParameterKey": parameter_prefix + "Name", "ParameterValue": self._get_resource_property(parameter_prefix, "table_name"), "UsePreviousValue": True, - "ResolvedValue": "string" - }, + "ResolvedValue": "string", + }, { "ParameterKey": parameter_prefix + "RCU", - "ParameterValue": self._get_resource_property(parameter_prefix, "rcu", '5'), + "ParameterValue": self._get_resource_property(parameter_prefix, "rcu", "5"), "UsePreviousValue": True, - "ResolvedValue": "string" - }, + "ResolvedValue": "string", + }, { "ParameterKey": parameter_prefix + "WCU", - "ParameterValue": self._get_resource_property(parameter_prefix, "wcu", '5'), + "ParameterValue": self._get_resource_property(parameter_prefix, "wcu", "5"), "UsePreviousValue": True, - "ResolvedValue": "string" - } + "ResolvedValue": "string", + }, ] json_parameter_list.extend(json_params) return json_parameter_list def _get_resource_property(self, resource_name, property_name, default_value=None): """Get property value of given resource - + Args: - resource_name (str): Name of the resource + resource_name (str): Name of the resource property_name (str): Name of the property default_value (str): Default value of the property - + Returns: str: Property value of the resource """ @@ -320,43 +320,43 @@ def _get_experiment_db_property(self, property_name, default_value=None): Args: property_name (str): name of property default_value (): default value of the property - + Returns: value of the property """ experiment_db_config = self._resource_config.get("shared_resource").get("experiment_db") return experiment_db_config.get(property_name, default_value) - + def _get_model_db_property(self, property_name, default_value=None): """Return property value of model table Args: property_name (str): name of property default_value (): default value of the property - + Returns: value of the property """ model_db_config = self._resource_config.get("shared_resource").get("model_db") return model_db_config.get(property_name, default_value) - def _get_join_db_property(self, property_name,default_value=None): + def _get_join_db_property(self, property_name, default_value=None): """Return property value of join table Args: property_name (str): name of property default_value (): default value of the property - + Returns: value of the property - """ + """ join_db_config = self._resource_config.get("shared_resource").get("join_db") return join_db_config.get(property_name, default_value) - + def _get_iam_role_property(self, property_name, default_value=None): """Return property value of iam role Args: property_name (str): name of property default_value (): default value of the property - + Returns: value of the property """ @@ -365,25 +365,30 @@ def _get_iam_role_property(self, property_name, default_value=None): def _get_cf_output_by_key(self, output_key): """Return cf output value of given output key - + Args: output_key (str): key of a specific output - + Returns: str: value of the output key """ - stack_json = self.cf_client.describe_stacks( - StackName=self.shared_resource_stack_name - )["Stacks"][0] - + stack_json = self.cf_client.describe_stacks(StackName=self.shared_resource_stack_name)["Stacks"][0] + # validate stack has been successfully updater - if stack_json["StackStatus"] not in \ - ["CREATE_COMPLETE", "UPDATE_COMPLETE", - "ROLLBACK_COMPLETE", "UPDATE_ROLLBACK_COMPLETE"]: - 
logger.error("Looks like Resource CF Stack is in {} state. " \ - "Cannot continue forward. ".format(stack_json["StackStatus"])) - raise Exception("Please wait while the Shared Resources Stack gets into a usable state." \ - "Currently in state {}!".format(stack_json["StackStatus"])) + if stack_json["StackStatus"] not in [ + "CREATE_COMPLETE", + "UPDATE_COMPLETE", + "ROLLBACK_COMPLETE", + "UPDATE_ROLLBACK_COMPLETE", + ]: + logger.error( + "Looks like Resource CF Stack is in {} state. " + "Cannot continue forward. ".format(stack_json["StackStatus"]) + ) + raise Exception( + "Please wait while the Shared Resources Stack gets into a usable state." + "Currently in state {}!".format(stack_json["StackStatus"]) + ) stack_outputs = stack_json["Outputs"] for stack_output in stack_outputs: @@ -393,33 +398,33 @@ def _get_cf_output_by_key(self, output_key): def _wait_for_active_firehose(self, stream_name): """Wait until the firehose stream creation complete and be active - + Args: stream_name (str): stream name of the firehose """ - status = 'CREATING' + status = "CREATING" timeout = 60 * 2 - while status != 'ACTIVE' and timeout >= 0: + while status != "ACTIVE" and timeout >= 0: logger.info("Creating firehose delivery stream...") try: result = self.firehose_client.describe_delivery_stream(DeliveryStreamName=stream_name) except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError(f"Failed to describe delivery stream '{stream_name}' " - f"with error {error_code}: {message}") - status = result['DeliveryStreamDescription']['DeliveryStreamStatus'] + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError( + f"Failed to describe delivery stream '{stream_name}' " f"with error {error_code}: {message}" + ) + status = result["DeliveryStreamDescription"]["DeliveryStreamStatus"] time.sleep(10) timeout = timeout - 10 - if status == 'ACTIVE': + if status == "ACTIVE": logger.info(f"Successfully created delivery stream '{stream_name}'") else: raise RuntimeError(f"Failed to create delivery stream '{stream_name}'") - def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix, - buffer_size=128, buffer_time=60): + def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix, buffer_size=128, buffer_time=60): """Initiate a firehose stream with given config - + Args: stream_name (str): name of the firehose stream s3_bucket (str): s3 bucket for delivering the firehose streaming data @@ -429,42 +434,39 @@ def _init_firehose_from_config(self, stream_name, s3_bucket, s3_prefix, buffer_time (int): buffer time(s) in firehose before pushing data to s3 destination """ - exist_delivery_streams = self.firehose_client.list_delivery_streams(Limit=1000)['DeliveryStreamNames'] + exist_delivery_streams = self.firehose_client.list_delivery_streams(Limit=1000)["DeliveryStreamNames"] if stream_name in exist_delivery_streams: - logger.warning(f"Delivery stream {stream_name} already exist. " - "No new delivery stream created.") + logger.warning(f"Delivery stream {stream_name} already exist. 
" "No new delivery stream created.") else: firehose_role_arn = self.iam_role_arn s3_bucket_arn = f"arn:aws:s3:::{s3_bucket}" s3_config = { - 'BucketARN': s3_bucket_arn, - 'RoleARN': firehose_role_arn, - 'Prefix': s3_prefix.strip() + '/', - 'BufferingHints': { - 'IntervalInSeconds': buffer_time, - 'SizeInMBs': buffer_size - }, + "BucketARN": s3_bucket_arn, + "RoleARN": firehose_role_arn, + "Prefix": s3_prefix.strip() + "/", + "BufferingHints": {"IntervalInSeconds": buffer_time, "SizeInMBs": buffer_size}, } try: self.firehose_client.create_delivery_stream( DeliveryStreamName=stream_name, - DeliveryStreamType='DirectPut', - ExtendedS3DestinationConfiguration=s3_config + DeliveryStreamType="DirectPut", + ExtendedS3DestinationConfiguration=s3_config, ) except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError(f"Failed to create delivery stream '{stream_name}' " - f"with error {error_code}: {message}") + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError( + f"Failed to create delivery stream '{stream_name}' " f"with error {error_code}: {message}" + ) # check if delivery stream created self._wait_for_active_firehose(stream_name) def create_firehose_stream_if_not_exists(self, stream_name, s3_prefix): """Create firehose stream with given stream name - + Arguments: stream_name (str): name of the firehose stream s3_prefix (str): s3 prefix path for delivering the firehose data @@ -475,29 +477,28 @@ def create_firehose_stream_if_not_exists(self, stream_name, s3_prefix): def delete_firehose_stream(self, stream_name): """Delete the firehose with given stream name - + Args: stream_name (str): name of the firehose stream """ logger.warning(f"Deleting firehose stream '{stream_name}'...") try: - self.firehose_client.delete_delivery_stream( - DeliveryStreamName=stream_name - ) + self.firehose_client.delete_delivery_stream(DeliveryStreamName=stream_name) except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError(f"Failed to delete delivery stream '{stream_name}' " - f"with error {error_code}: {message}") + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError( + f"Failed to delete delivery stream '{stream_name}' " f"with error {error_code}: {message}" + ) def _create_s3_bucket_if_not_exist(self, prefix): """Create s3 bucket if not exist - + Args: prefix (str): A bucket name prefix, followed by region name - and account id - + and account id + Returns: str: s3 bucket name """ @@ -513,9 +514,7 @@ def _create_s3_bucket_if_not_exist(self, prefix): if region == "us-east-1": s3.create_bucket(Bucket=s3_bucket_name) else: - s3.create_bucket( - Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region} - ) + s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region}) logger.info("Successfully create S3 bucket '{}' for storing {} data".format(s3_bucket_name, prefix)) except ClientError as e: error_code = e.response["Error"]["Code"] @@ -523,9 +522,7 @@ def _create_s3_bucket_if_not_exist(self, prefix): if error_code == "BucketAlreadyOwnedByYou": pass - elif ( - error_code == "OperationAborted" and "conflicting conditional operation" in message - ): + elif error_code == "OperationAborted" and "conflicting conditional operation" in message: # If this bucket is already being concurrently created, we don't need to create 
it again. pass elif error_code == "TooManyBuckets": @@ -533,8 +530,8 @@ def _create_s3_bucket_if_not_exist(self, prefix): s3.meta.client.head_bucket(Bucket=s3_bucket_name) else: raise - - s3_waiter = s3_client.get_waiter('bucket_exists') + + s3_waiter = s3_client.get_waiter("bucket_exists") s3_waiter.wait(Bucket=s3_bucket_name) return s3_bucket_name @@ -548,14 +545,16 @@ def __init__(self, endpoint_name, sagemaker_session=None): with the Amazon SageMaker APIs and any other AWS services needed. """ self.endpoint_name = endpoint_name - self._realtime_predictor = sagemaker.predictor.Predictor(endpoint_name, - serializer=sagemaker.serializers.JSONSerializer(), - deserializer=sagemaker.deserializers.JSONDeserializer(), - sagemaker_session=sagemaker_session) + self._realtime_predictor = sagemaker.predictor.Predictor( + endpoint_name, + serializer=sagemaker.serializers.JSONSerializer(), + deserializer=sagemaker.deserializers.JSONDeserializer(), + sagemaker_session=sagemaker_session, + ) def get_action(self, obs=None): """Get prediction from the endpoint - + Args: obs (list/str): observation of the environment @@ -567,32 +566,31 @@ def get_action(self, obs=None): sample_prob: sample probability distribution used for data split """ payload = {} - payload['request_type'] = "observation" - payload['observation'] = obs + payload["request_type"] = "observation" + payload["observation"] = obs response = self._realtime_predictor.predict(payload) - action = response['action'] - action_prob = response['action_prob'] - event_id = response['event_id'] - model_id = response['model_id'] - sample_prob = response['sample_prob'] + action = response["action"] + action_prob = response["action_prob"] + event_id = response["event_id"] + model_id = response["model_id"] + sample_prob = response["sample_prob"] return action, event_id, model_id, action_prob, sample_prob def get_hosted_model_id(self): """Return hostdd model id in the hosting endpoint - + Returns: str: model id of the model being hosted """ payload = {} - payload['request_type'] = "model_id" - payload['observation'] = None + payload["request_type"] = "model_id" + payload["observation"] = None response = self._realtime_predictor.predict(payload) - model_id = response['model_id'] + model_id = response["model_id"] return model_id def delete_endpoint(self): - """Delete the Sagemaker endpoint - """ + """Delete the Sagemaker endpoint""" logger.warning(f"Deleting hosting endpoint '{self.endpoint_name}'...") - self._realtime_predictor.delete_endpoint() \ No newline at end of file + self._realtime_predictor.delete_endpoint() diff --git a/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py b/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py index f907383a..d9d5a930 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/utils/cloudwatch_logger.py @@ -2,34 +2,30 @@ import json -class CloudWatchLogger(): - +class CloudWatchLogger: def __init__(self, cw_client, region_name): self.region_name = region_name self.cw_client = cw_client - + def get_cloudwatch_dashboard_details(self, experiment_id): # update for non-commercial region cw_dashboard_url = f"https://{self.region_name}.console.aws.amazon.com/cloudwatch/home?region={self.region_name}#dashboards:name={experiment_id};start=PT1H" text = f"You can monitor your Training/Hosting evaluation metrics on this [CloudWatch Dashboard]({cw_dashboard_url})" - text += "\n\n(Note: This would need Trained/Hosted Models 
to be evaluated in order to publish Evaluation Scores)" + text += ( + "\n\n(Note: This would need Trained/Hosted Models to be evaluated in order to publish Evaluation Scores)" + ) return text - - def publish_latest_hosting_information( - self, - experiment_id, - latest_hosted_model_id, - latest_hosted_model_score - ): + + def publish_latest_hosting_information(self, experiment_id, latest_hosted_model_id, latest_hosted_model_score): self.cw_client.put_metric_data( Namespace=experiment_id, MetricData=[ { "MetricName": "latest_hosted_model_id_continuous", "Timestamp": time.time(), - "Value": int(latest_hosted_model_id.split('-')[-1]) + "Value": int(latest_hosted_model_id.split("-")[-1]), } - ] + ], ) self.cw_client.put_metric_data( Namespace=experiment_id, @@ -37,26 +33,21 @@ def publish_latest_hosting_information( { "MetricName": "latest_hosted_model_score_continuous", "Timestamp": time.time(), - "Value": float(latest_hosted_model_score) + "Value": float(latest_hosted_model_score), } - ] + ], ) - - def publish_latest_training_information( - self, - experiment_id, - latest_trained_model_id, - latest_trained_model_score - ): + + def publish_latest_training_information(self, experiment_id, latest_trained_model_id, latest_trained_model_score): self.cw_client.put_metric_data( Namespace=experiment_id, MetricData=[ { "MetricName": "latest_trained_model_id_continuous", "Timestamp": time.time(), - "Value": int(latest_trained_model_id.split('-')[-1]) + "Value": int(latest_trained_model_id.split("-")[-1]), } - ] + ], ) self.cw_client.put_metric_data( Namespace=experiment_id, @@ -64,16 +55,13 @@ def publish_latest_training_information( { "MetricName": "latest_trained_model_score_continuous", "Timestamp": time.time(), - "Value": float(latest_trained_model_score) + "Value": float(latest_trained_model_score), } - ] + ], ) - + def publish_newly_trained_model_eval_information( - self, - experiment_id, - new_trained_model_id, - new_trained_model_score + self, experiment_id, new_trained_model_id, new_trained_model_score ): self.cw_client.put_metric_data( Namespace=experiment_id, @@ -81,9 +69,9 @@ def publish_newly_trained_model_eval_information( { "MetricName": "newly_trained_model_id", "Timestamp": time.time(), - "Value": int(new_trained_model_id.split('-')[-1]) + "Value": int(new_trained_model_id.split("-")[-1]), } - ] + ], ) self.cw_client.put_metric_data( Namespace=experiment_id, @@ -91,45 +79,28 @@ def publish_newly_trained_model_eval_information( { "MetricName": "newly_trained_model_score", "Timestamp": time.time(), - "Value": float(new_trained_model_score) + "Value": float(new_trained_model_score), } - ] + ], ) - - def publish_rewards_for_simulation( - self, - experiment_id, - reported_rewards_sum - ): + + def publish_rewards_for_simulation(self, experiment_id, reported_rewards_sum): self.cw_client.put_metric_data( Namespace=experiment_id, MetricData=[ { "MetricName": "reported_rewards_score", "Timestamp": time.time(), - "Value": float(reported_rewards_sum) + "Value": float(reported_rewards_sum), } - ] + ], ) - def create_cloudwatch_dashboard_from_experiment_id( - self, - experiment_id - ): - cw_json = self.get_cloudwatch_dashboard_json_for_experiment_id( - experiment_id, - self.region_name - ) - self.cw_client.put_dashboard( - DashboardName=experiment_id, - DashboardBody=cw_json - ) + def create_cloudwatch_dashboard_from_experiment_id(self, experiment_id): + cw_json = self.get_cloudwatch_dashboard_json_for_experiment_id(experiment_id, self.region_name) + 
self.cw_client.put_dashboard(DashboardName=experiment_id, DashboardBody=cw_json) - def get_cloudwatch_dashboard_json_for_experiment_id( - self, - experiment_id, - region_name - ): + def get_cloudwatch_dashboard_json_for_experiment_id(self, experiment_id, region_name): dashboard_json = { "widgets": [ { @@ -143,17 +114,15 @@ def get_cloudwatch_dashboard_json_for_experiment_id( [ experiment_id, "latest_hosted_model_id_continuous", - { - "label": "(ModelId suffix part only)" - } + {"label": "(ModelId suffix part only)"}, ] ], "view": "singleValue", "region": region_name, "title": "Currently Hosted Model Id", "period": 60, - "stat": "Maximum" - } + "stat": "Maximum", + }, }, { "type": "metric", @@ -162,19 +131,13 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "width": 9, "height": 3, "properties": { - "metrics": [ - [ - experiment_id, - "latest_hosted_model_score_continuous", - {"label": "EvalScore" } - ] - ], + "metrics": [[experiment_id, "latest_hosted_model_score_continuous", {"label": "EvalScore"}]], "view": "singleValue", "region": region_name, "title": "Currently Hosted Model Eval Score (On latest data)", "period": 60, - "stat": "Minimum" - } + "stat": "Minimum", + }, }, { "type": "metric", @@ -184,11 +147,7 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "height": 3, "properties": { "metrics": [ - [ - experiment_id, - "latest_trained_model_id_continuous", - { "label": "(ModelId suffix only)" } - ] + [experiment_id, "latest_trained_model_id_continuous", {"label": "(ModelId suffix only)"}] ], "view": "singleValue", "region": region_name, @@ -196,8 +155,8 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "stat": "Maximum", "period": 60, "setPeriodToTimeRange": False, - "stacked": True - } + "stacked": True, + }, }, { "type": "metric", @@ -206,19 +165,13 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "width": 9, "height": 3, "properties": { - "metrics": [ - [ - experiment_id, - "latest_trained_model_score_continuous", - { "label": "EvalScore" } - ] - ], + "metrics": [[experiment_id, "latest_trained_model_score_continuous", {"label": "EvalScore"}]], "view": "singleValue", "region": region_name, "title": "Latest Trained Model Eval Score", "period": 60, - "stat": "Maximum" - } + "stat": "Maximum", + }, }, { "type": "metric", @@ -227,26 +180,15 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "width": 9, "height": 9, "properties": { - "metrics": [ - [ - experiment_id, - "newly_trained_model_score", - {"label": "EvalScore" } - ] - ], + "metrics": [[experiment_id, "newly_trained_model_score", {"label": "EvalScore"}]], "view": "timeSeries", "stacked": False, "region": region_name, "stat": "Maximum", "period": 60, "title": "New Model Eval Score Over Time", - "yAxis": { - "left": { - "min": 0, - "max": 1 - } - } - } + "yAxis": {"left": {"min": 0, "max": 1}}, + }, }, { "type": "metric", @@ -255,31 +197,18 @@ def get_cloudwatch_dashboard_json_for_experiment_id( "width": 9, "height": 9, "properties": { - "metrics": [ - [ - experiment_id, - "reported_rewards_score", - {"label": "Rewards" } - ] - ], + "metrics": [[experiment_id, "reported_rewards_score", {"label": "Rewards"}]], "view": "timeSeries", "stacked": False, "region": region_name, "stat": "Maximum", "period": 60, "title": "Experiment's Reported Rewards", - "yAxis": { - "left": { - "min": 0, - "max": 1 - } - }, + "yAxis": {"left": {"min": 0, "max": 1}}, "liveData": True, - "legend": { - "position": "bottom" - } - } - } + "legend": {"position": "bottom"}, + }, + }, ] } return json.dumps(dashboard_json) diff 
--git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py index a1de2316..f018c352 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/experiment_record.py @@ -1,16 +1,17 @@ -class ExperimentRecord(): - ''' +class ExperimentRecord: + """ This class captures all the data that is needed to run a experiment for Continuosly Training and Updating models on SageMaker - ''' + """ + def __init__( - self, - experiment_id, - training_workflow_metadata={}, - hosting_workflow_metadata={}, - joining_workflow_metadata={}, - evaluation_workflow_metadata={} - ): + self, + experiment_id, + training_workflow_metadata={}, + hosting_workflow_metadata={}, + joining_workflow_metadata={}, + evaluation_workflow_metadata={}, + ): # unique id common across all experiments in the account self.experiment_id = experiment_id @@ -26,13 +27,13 @@ def __init__( self._last_hosted_model_id = hosting_workflow_metadata.get("last_hosted_model_id", None) self._next_model_to_host_id = hosting_workflow_metadata.get("next_model_to_host_id", None) self._hosting_endpoint = hosting_workflow_metadata.get("hosting_endpoint", None) - + # joining workflow metadata self.joining_workflow_metadata = joining_workflow_metadata self._joining_state = joining_workflow_metadata.get("joining_state", None) self._last_joined_job_id = joining_workflow_metadata.get("last_joined_job_id", None) self._next_join_job_id = joining_workflow_metadata.get("next_join_job_id", None) - + # evaluation workflow metadata self.evaluation_workflow_metadata = evaluation_workflow_metadata self._evaluation_state = evaluation_workflow_metadata.get("evaluation_state", None) @@ -58,11 +59,11 @@ def to_ddb_record(self): self.evaluation_workflow_metadata["next_evaluation_job_id"] = self._next_evaluation_job_id return { - 'experiment_id': self.experiment_id, - 'training_workflow_metadata': self.training_workflow_metadata, - 'hosting_workflow_metadata': self.hosting_workflow_metadata, - 'joining_workflow_metadata': self.joining_workflow_metadata, - 'evaluation_workflow_metadata': self.evaluation_workflow_metadata + "experiment_id": self.experiment_id, + "training_workflow_metadata": self.training_workflow_metadata, + "hosting_workflow_metadata": self.hosting_workflow_metadata, + "joining_workflow_metadata": self.joining_workflow_metadata, + "evaluation_workflow_metadata": self.evaluation_workflow_metadata, } @classmethod @@ -72,5 +73,5 @@ def load_from_ddb_record(cls, record): record["training_workflow_metadata"], record["hosting_workflow_metadata"], record["joining_workflow_metadata"], - record["evaluation_workflow_metadata"] - ) \ No newline at end of file + record["evaluation_workflow_metadata"], + ) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py index c6841a5d..5cab8320 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/join_job_record.py @@ -1,22 +1,25 @@ from datetime import datetime -class JoinJobRecord(): - ''' + +class JoinJobRecord: + """ This class captures all the data that is needed to run a joining job for Continuosly Training and Updating models on SageMaker - ''' + """ + def __init__( - self, - 
experiment_id, - join_job_id, - current_state=None, - input_obs_data_s3_path=None, - obs_start_time=None, - obs_end_time=None, - input_reward_data_s3_path=None, - output_joined_train_data_s3_path=None, - output_joined_eval_data_s3_path=None, - join_query_ids=[]): + self, + experiment_id, + join_job_id, + current_state=None, + input_obs_data_s3_path=None, + obs_start_time=None, + obs_end_time=None, + input_reward_data_s3_path=None, + output_joined_train_data_s3_path=None, + output_joined_eval_data_s3_path=None, + join_query_ids=[], + ): self.experiment_id = experiment_id self.join_job_id = join_job_id @@ -32,29 +35,31 @@ def __init__( self._join_query_ids = join_query_ids def to_ddb_record(self): - obs_start_time_str = self._obs_start_time.strftime("%Y-%m-%d-%H") if \ - self._obs_start_time is not None else None - obs_end_time_str = self._obs_end_time.strftime("%Y-%m-%d-%H") if \ - self._obs_end_time is not None else None + obs_start_time_str = self._obs_start_time.strftime("%Y-%m-%d-%H") if self._obs_start_time is not None else None + obs_end_time_str = self._obs_end_time.strftime("%Y-%m-%d-%H") if self._obs_end_time is not None else None return { - 'experiment_id': self.experiment_id, - 'join_job_id': self.join_job_id, - 'current_state': self._current_state, - 'input_obs_data_s3_path': self._input_obs_data_s3_path, - 'obs_start_time': obs_start_time_str, - 'obs_end_time': obs_end_time_str, - 'input_reward_data_s3_path': self._input_reward_data_s3_path, - 'output_joined_train_data_s3_path': self._output_joined_train_data_s3_path, - 'output_joined_eval_data_s3_path': self._output_joined_eval_data_s3_path, - 'join_query_ids': self._join_query_ids + "experiment_id": self.experiment_id, + "join_job_id": self.join_job_id, + "current_state": self._current_state, + "input_obs_data_s3_path": self._input_obs_data_s3_path, + "obs_start_time": obs_start_time_str, + "obs_end_time": obs_end_time_str, + "input_reward_data_s3_path": self._input_reward_data_s3_path, + "output_joined_train_data_s3_path": self._output_joined_train_data_s3_path, + "output_joined_eval_data_s3_path": self._output_joined_eval_data_s3_path, + "join_query_ids": self._join_query_ids, } @classmethod def load_from_ddb_record(cls, record): - obs_start_time = datetime.strptime(record["obs_start_time"], "%Y-%m-%d-%H") if \ - record["obs_start_time"] is not None else None - obs_end_time = datetime.strptime(record["obs_end_time"], "%Y-%m-%d-%H") if \ - record["obs_end_time"] is not None else None + obs_start_time = ( + datetime.strptime(record["obs_start_time"], "%Y-%m-%d-%H") + if record["obs_start_time"] is not None + else None + ) + obs_end_time = ( + datetime.strptime(record["obs_end_time"], "%Y-%m-%d-%H") if record["obs_end_time"] is not None else None + ) return JoinJobRecord( record["experiment_id"], @@ -66,8 +71,8 @@ def load_from_ddb_record(cls, record): record["input_reward_data_s3_path"], record["output_joined_train_data_s3_path"], record["output_joined_eval_data_s3_path"], - record["join_query_ids"] - ) + record["join_query_ids"], + ) def get_input_obs_data_s3_path(self): return self._input_obs_data_s3_path diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py index 00ec9df8..14ba8589 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/datatypes/model_record.py @@ -1,23 +1,25 @@ -class ModelRecord(): - ''' +class 
ModelRecord: + """ This class captures all the data that is needed to run a training job for Continuosly Training and Updating models on SageMaker - ''' + """ + def __init__( - self, - experiment_id, - model_id, - train_state=None, - evaluation_job_name=None, - eval_state=None, - eval_scores={}, - input_model_id=None, - input_data_s3_prefix=None, - manifest_file_path=None, - eval_data_s3_path=None, - s3_model_output_path=None, - training_start_time=None, - training_end_time=None): + self, + experiment_id, + model_id, + train_state=None, + evaluation_job_name=None, + eval_state=None, + eval_scores={}, + input_model_id=None, + input_data_s3_prefix=None, + manifest_file_path=None, + eval_data_s3_path=None, + s3_model_output_path=None, + training_start_time=None, + training_end_time=None, + ): self.experiment_id = experiment_id self.model_id = model_id @@ -37,19 +39,19 @@ def __init__( def to_ddb_record(self): return { - 'experiment_id': self.experiment_id, - 'model_id': self.model_id, - 'train_state': self._train_state, - 'evaluation_job_name': self._evaluation_job_name, - 'eval_state': self._eval_state, - 'eval_scores': self._eval_scores, - 'input_model_id': self._input_model_id, - 'input_data_s3_prefix': self._input_data_s3_prefix, - 'manifest_file_path': self._manifest_file_path, - 'eval_data_s3_path': self._eval_data_s3_path, - 's3_model_output_path': self._s3_model_output_path, - 'training_start_time': self._training_start_time, - 'training_end_time': self._training_end_time + "experiment_id": self.experiment_id, + "model_id": self.model_id, + "train_state": self._train_state, + "evaluation_job_name": self._evaluation_job_name, + "eval_state": self._eval_state, + "eval_scores": self._eval_scores, + "input_model_id": self._input_model_id, + "input_data_s3_prefix": self._input_data_s3_prefix, + "manifest_file_path": self._manifest_file_path, + "eval_data_s3_path": self._eval_data_s3_path, + "s3_model_output_path": self._s3_model_output_path, + "training_start_time": self._training_start_time, + "training_end_time": self._training_end_time, } @classmethod @@ -67,15 +69,10 @@ def load_from_ddb_record(cls, record): record["eval_data_s3_path"], record["s3_model_output_path"], record["training_start_time"], - record["training_end_time"] - ) - - def add_new_training_job_info( - self, - input_model_id=None, - input_data_s3_prefix=None, - manifest_file_path=None - ): + record["training_end_time"], + ) + + def add_new_training_job_info(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None): self._input_model_id = input_model_id self._input_data_s3_prefix = input_data_s3_prefix self._manifest_file_path = manifest_file_path @@ -87,10 +84,10 @@ def add_new_training_job_info( self._eval_scores = {} # eval score for a new model would always be empty. 
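    # A minimal usage sketch (hypothetical IDs and paths) of how a training
    # iteration round-trips this record through DynamoDB:
    #
    #   record = ModelRecord(experiment_id="exp-1", model_id="exp-1-model-2")
    #   record.add_new_training_job_info(
    #       input_model_id="exp-1-model-1",        # warm-start from the last model
    #       input_data_s3_prefix="s3://bucket/exp-1/joined/train",
    #   )
    #   item = record.to_ddb_record()              # plain dict, ready for put_item
    #   restored = ModelRecord.load_from_ddb_record(item)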
def add_new_evaluation_job_info( - self, - evaluation_job_name=None, - eval_data_s3_path=None, - ): + self, + evaluation_job_name=None, + eval_data_s3_path=None, + ): self._evaluation_job_name = evaluation_job_name self._eval_data_s3_path = eval_data_s3_path @@ -107,20 +104,16 @@ def model_in_terminal_state(self): return False def update_model_job_status( - self, - training_start_time=None, - training_end_time=None, - train_state=None, - s3_model_output_path=None - ): + self, training_start_time=None, training_end_time=None, train_state=None, s3_model_output_path=None + ): self._training_start_time = training_start_time self._training_end_time = training_end_time - self._train_state = train_state + self._train_state = train_state self._s3_model_output_path = s3_model_output_path def update_model_as_failed(self): self._train_state = "Failed" - + def eval_in_terminal_state(self): if self._eval_state: return self._eval_state.endswith("ed") @@ -130,17 +123,15 @@ def add_model_eval_scores(self, eval_score): if self._eval_scores is None: self._eval_scores = {} self._eval_scores[self._eval_data_s3_path] = eval_score - + def update_eval_job_state(self, eval_state): self._eval_state = eval_state - + def update_eval_job_as_failed(self): self._eval_state = "Failed" def is_train_completed(self): - if self._train_state and \ - self._train_state == "Completed" and \ - self._s3_model_output_path is not None: + if self._train_state and self._train_state == "Completed" and self._s3_model_output_path is not None: return True return False diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py index 692e4c60..608b3860 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/experiment_manager.py @@ -12,7 +12,7 @@ import sagemaker logging.basicConfig() -logger = logging.getLogger('orchestrator') +logger = logging.getLogger("orchestrator") logger.setLevel(logging.INFO) try: @@ -40,64 +40,68 @@ from orchestrator.resource_manager import ResourceManager from orchestrator.utils.cloudwatch_logger import CloudWatchLogger from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException -from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, \ - SageMakerHostingException, SageMakerTrainingJobException, WorkflowJoiningJobException, \ - EvalScoreNotAvailableException, InvalidUsageException - - +from orchestrator.exceptions.workflow_exceptions import ( + UnhandledWorkflowException, + SageMakerHostingException, + SageMakerTrainingJobException, + WorkflowJoiningJobException, + EvalScoreNotAvailableException, + InvalidUsageException, +) class HostingState(str, Enum): - PENDING = "PENDING" # A hosting update request is pending - DEPLOYING = "DEPLOYING" # A hosting update request is in process - DEPLOYED = "DEPLOYED" # Hosting update request was completed. - FAILED = "FAILED" # hosting update request failed. + PENDING = "PENDING" # A hosting update request is pending + DEPLOYING = "DEPLOYING" # A hosting update request is in process + DEPLOYED = "DEPLOYED" # Hosting update request was completed. + FAILED = "FAILED" # hosting update request failed. 
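
# A minimal sketch of what subclassing both str and Enum buys: state values read
# back from DynamoDB as plain strings compare equal to the enum members, and str
# methods keep working on them (assumed usage, mirroring the .endswith("ING")
# checks elsewhere in this module):
#
#   assert HostingState.DEPLOYED == "DEPLOYED"
#   state = HostingState("PENDING")    # round-trip from a stored string value
#   assert state.endswith("ING")       # str methods apply directly to members
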
class TrainingState(str, Enum):
-    PENDING = "PENDING" # A new model/training job create request is made
-    TRAINING = "TRAINING" # Model/Training job is in status of 'Training'
-    TRAINED = "TRAINED" # Model/Training job has been completed
-    STOPPED = "STOPPED" # Model/Training job has been stopped
-    FAILED = "FAILED" # Model/Training job has been failed
+    PENDING = "PENDING"  # A new model/training job create request is made
+    TRAINING = "TRAINING"  # Model/Training job is in status of 'Training'
+    TRAINED = "TRAINED"  # Model/Training job has been completed
+    STOPPED = "STOPPED"  # Model/Training job has been stopped
+    FAILED = "FAILED"  # Model/Training job has been failed


class EvaluationState(str, Enum):
-    PENDING = "PENDING" # A new evaluation job create request is made
-    EVALUATING = "EVALUATING" # Evaluation job is in status of 'Evaluating'
-    EVALUATED = "EVALUATED" # Evaluation job has been completed
-    STOPPED = "STOPPED" # Evaluation job has been stopped
-    FAILED = "FAILED" # Evaluation job has been failed
+    PENDING = "PENDING"  # A new evaluation job create request is made
+    EVALUATING = "EVALUATING"  # Evaluation job is in status of 'Evaluating'
+    EVALUATED = "EVALUATED"  # Evaluation job has been completed
+    STOPPED = "STOPPED"  # Evaluation job has been stopped
+    FAILED = "FAILED"  # Evaluation job has been failed


class JoiningState(str, Enum):
-    PENDING = "PENDING" # A joining request is pending
-    RUNNING = "RUNNING" # A joining job is running
-    SUCCEEDED = "SUCCEEDED" # A joining job has been completed
-    FAILED = "FAILED" # A joining job has been failed
-    CANCELLED = "CANCELLED" # A joining job has been cancelled
+    PENDING = "PENDING"  # A joining request is pending
+    RUNNING = "RUNNING"  # A joining job is running
+    SUCCEEDED = "SUCCEEDED"  # A joining job has been completed
+    FAILED = "FAILED"  # A joining job has been failed
+    CANCELLED = "CANCELLED"  # A joining job has been cancelled
+

-# Using SageMakerTrainingJob primary status 
+# Using SageMakerTrainingJob primary status
TRAINING_JOB_STATUS_MAP = {
    "Pending": TrainingState.PENDING,
    "InProgress": TrainingState.TRAINING,
    "Stopping": TrainingState.TRAINING,
    "Stopped": TrainingState.STOPPED,
    "Failed": TrainingState.FAILED,
-    "Completed": TrainingState.TRAINED
+    "Completed": TrainingState.TRAINED,
}

-# Using SageMakerTrainingJob primary status 
+# Using SageMakerTrainingJob primary status
EVALUATION_JOB_STATUS_MAP = {
    "Pending": EvaluationState.PENDING,
    "InProgress": EvaluationState.EVALUATING,
    "Stopping": EvaluationState.EVALUATING,
    "Stopped": EvaluationState.STOPPED,
    "Failed": EvaluationState.FAILED,
-    "Completed": EvaluationState.EVALUATED
+    "Completed": EvaluationState.EVALUATED,
}

-# Using SageMakerHostingEndpoint primary status 
+# Using SageMakerHostingEndpoint primary status
HOSTING_ENDPOINT_STATUS_MAP = {
    "OutOfService": HostingState.FAILED,
    "Creating": HostingState.DEPLOYING,
@@ -106,7 +110,7 @@ class JoiningState(str, Enum):
    "RollingBack": HostingState.DEPLOYING,
    "InService": HostingState.DEPLOYED,
    "Deleting": HostingState.DEPLOYING,
-    "Failed": HostingState.FAILED
+    "Failed": HostingState.FAILED,
}

@@ -118,10 +122,7 @@ class ExperimentManagerSyncThread(Thread):
    for the latest state and update the table. The run() loop polls roughly
    twice per second and, after an unhandled error, resumes syncing after a
    ten-second pause.
    """

-    def __init__(
-        self,
-        experiment_manager
-    ):
+    def __init__(self, experiment_manager):
        """Initialize a synchronization thread for the experiment

        Args:
@@ -152,23 +153,23 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
        Four things happen here:
            a) Checks if current TrainingWorkflowMetadata needs an update.
            b) Fetches latest TrainingJob state from ModelDb for next_model_to_train
-            c) Updates ExperimentDb TrainingWorkflowMetadata with latest information. 
+            c) Updates ExperimentDb TrainingWorkflowMetadata with latest information.
            d) Finally, updates the local ExperimentManager context to latest.
-        
+
        Args:
            training_workflow_metadata (dict): A dictionary containing
                training workflow related metadata
        """
        if training_workflow_metadata is None:
-            # A training request hasn't been made yet. 
+            # A training request hasn't been made yet.
            # Nothing to process. Return.
            return
-        
+
        next_model_to_train_id = training_workflow_metadata.get("next_model_to_train_id", None)
        training_state = training_workflow_metadata.get("training_state", None)

        if training_state is None:
-            # A training request hasn't been made yet. 
+            # A training request hasn't been made yet.
            # Nothing to process. Return.
            return
        elif not training_state.endswith("ING"):
@@ -177,14 +178,16 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
            return
        elif training_state.endswith("ING") and next_model_to_train_id is None:
            # A training is in progress, but the training model-id is None!
-            logger.warn(f"Model Training in {training_state}, while next_model_to_train_id is None. "
-                "Training Workflow would be stuck if this continues."
+            logger.warn(
+                f"Model Training in {training_state}, while next_model_to_train_id is None. "
+                "Training Workflow would be stuck if this continues."
            )
            return
        else:
            # A training is in progress. Fetch the status of that training job from ModelDb.
            training_job_record = self.model_db_client.get_model_record_with_retry(
-                self.experiment_id, next_model_to_train_id)
+                self.experiment_id, next_model_to_train_id
+            )

            # Get updated TrainingWorkflowState in {new_training_state}
            if training_job_record is None:
@@ -199,56 +202,60 @@ def _update_experiment_db_training_workflow_metadata(self, training_workflow_met
            if train_state_from_modeldb is not None:
                new_training_state = TRAINING_JOB_STATUS_MAP[train_state_from_modeldb]
            else:
-                # Since ModelDb training job state is None, 
+                # Since ModelDb training job state is None,
                # keep the ExperimentDb TrainingWorkflowState same.
-                logger.warn(f"ModelDb has model-id {next_model_to_train_id} 's state as 'None'. "
-                    "Training Worklow would be stuck if this continues."
+                logger.warn(
+                    f"ModelDb has model-id {next_model_to_train_id} 's state as 'None'. "
+                    "Training Workflow would be stuck if this continues."
) new_training_state = training_state expected_next_model_to_train_id = next_model_to_train_id # Generate new TrainingWorkflowState for ExperimentDb based on new_training_state if new_training_state == TrainingState.TRAINED: - training_workflow_metadata['last_trained_model_id'] = next_model_to_train_id - training_workflow_metadata['next_model_to_train_id'] = None - training_workflow_metadata['training_state'] = new_training_state + training_workflow_metadata["last_trained_model_id"] = next_model_to_train_id + training_workflow_metadata["next_model_to_train_id"] = None + training_workflow_metadata["training_state"] = new_training_state elif new_training_state == TrainingState.FAILED or new_training_state == TrainingState.STOPPED: # training_workflow_metadata['last_trained_model_id'] remains the same # training_workflow_metadata['next_model_to_train_id'] remains the same or change to None # update the ExperimentDb TrainingWorkflowState to Failed - training_workflow_metadata['training_state'] = new_training_state + training_workflow_metadata["training_state"] = new_training_state else: # training_workflow_metadata['last_trained_model_id'] remains the same # training_workflow_metadata['next_model_to_train_id'] remains the same # update the ExperimentDb TrainingWorkflowState to new_training_state - training_workflow_metadata['training_state'] = new_training_state + training_workflow_metadata["training_state"] = new_training_state # Try to save the update in ExperimentDb # This can update the status only if in the current record, # next_model_to_train_id == expected_next_model_to_train_id try: self.exp_db_client.update_training_workflow_metadata_with_validation( - self.experiment_id, - training_workflow_metadata, - expected_next_model_to_train_id + self.experiment_id, training_workflow_metadata, expected_next_model_to_train_id ) except Exception as e: if "ConditionalCheckFailedException" in str(e): - # Most likely Sync Thread went out of sync :( - # Just return here without updating local ExperimentManager. - logger.warn("Sync Thread trying to update ExperimentDb with old state. This should " - "get fixed in next run!" + # Most likely Sync Thread went out of sync :( + # Just return here without updating local ExperimentManager. + logger.warn( + "Sync Thread trying to update ExperimentDb with old state. This should " "get fixed in next run!" ) return logger.error("Failed to update ExperimentDb with latest information: " + str(e)) - raise UnhandledWorkflowException("Some error occurred while update ExperimentDb record TrainingWorkflowMetadata") + raise UnhandledWorkflowException( + "Some error occurred while update ExperimentDb record TrainingWorkflowMetadata" + ) # Finally, update local ExperimentManager with new states. 
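        # The conditional update above is assumed to follow DynamoDB's standard
        # optimistic-concurrency pattern (sketch only; names are hypothetical):
        #
        #   table.update_item(
        #       Key={"experiment_id": experiment_id},
        #       UpdateExpression="SET training_workflow_metadata = :new",
        #       ConditionExpression=(
        #           "training_workflow_metadata.next_model_to_train_id = :expected"
        #       ),
        #       ExpressionAttributeValues={
        #           ":new": training_workflow_metadata,
        #           ":expected": expected_next_model_to_train_id,
        #       },
        #   )
        #
        # so a stale writer raises ConditionalCheckFailedException instead of
        # overwriting newer state, which is why that error is tolerated above.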
- self.experiment_manager.experiment_record._last_trained_model_id = training_workflow_metadata['last_trained_model_id'] - self.experiment_manager.experiment_record._next_model_to_train_id = training_workflow_metadata['next_model_to_train_id'] - self.experiment_manager.experiment_record._training_state = training_workflow_metadata['training_state'] - + self.experiment_manager.experiment_record._last_trained_model_id = training_workflow_metadata[ + "last_trained_model_id" + ] + self.experiment_manager.experiment_record._next_model_to_train_id = training_workflow_metadata[ + "next_model_to_train_id" + ] + self.experiment_manager.experiment_record._training_state = training_workflow_metadata["training_state"] def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow_metadata): """ @@ -266,9 +273,8 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow # some evaluation request is in progress if evaluation_state is not None and evaluation_state.endswith("ING"): - evaluation_model_id = next_evaluation_job_id.split('-eval-')[0] - evaluation_job_record = self.model_db_client.get_model_record( - self.experiment_id, evaluation_model_id) + evaluation_model_id = next_evaluation_job_id.split("-eval-")[0] + evaluation_job_record = self.model_db_client.get_model_record(self.experiment_id, evaluation_model_id) # if evaluation model record exists in the model table if evaluation_job_record is not None: @@ -281,9 +287,7 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow self.experiment_manager.experiment_record._evaluation_state = evaluation_state # update table states via ddb client - self.exp_db_client.update_experiment_evaluation_state( - self.experiment_id, evaluation_state - ) + self.exp_db_client.update_experiment_evaluation_state(self.experiment_id, evaluation_state) if evaluation_state == EvaluationState.EVALUATED: self.experiment_manager.experiment_record._last_evaluation_job_id = next_evaluation_job_id @@ -292,20 +296,18 @@ def _update_experiment_db_evaluation_workflow_metadata(self, evaluation_workflow self.exp_db_client.update_experiment_last_evaluation_job_id( self.experiment_id, next_evaluation_job_id ) - self.exp_db_client.update_experiment_next_evaluation_job_id( - self.experiment_id, None - ) - + self.exp_db_client.update_experiment_next_evaluation_job_id(self.experiment_id, None) + # update latest_train/eval metrics to publish to CW self._update_metrics_from_latest_eval_job(next_evaluation_job_id) def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metadata): """Update the hosting workflow metadata in the experiment table - + Args: hosting_workflow_metadata (dict): A dictionary containing hosting workflow related metadata - """ + """ if hosting_workflow_metadata is None: return @@ -321,12 +323,8 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad model_id = predictor.get_hosted_model_id() assert model_id == last_hosted_model_id except Exception: - self.exp_db_client.update_experiment_hosting_state( - self.experiment_id, None - ) - self.exp_db_client.update_experiment_hosting_endpoint( - self.experiment_id, None - ) + self.exp_db_client.update_experiment_hosting_state(self.experiment_id, None) + self.exp_db_client.update_experiment_hosting_endpoint(self.experiment_id, None) self.experiment_manager.experiment_record._hosting_state = None self.experiment_manager.experiment_record._hosting_endpoint = None @@ -337,9 +335,7 @@ def 
_update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad # describe endpoint to get state of the deployment try: - sm_endpoint_info = self.sagemaker_client.describe_endpoint( - EndpointName=self.experiment_id - ) + sm_endpoint_info = self.sagemaker_client.describe_endpoint(EndpointName=self.experiment_id) except Exception: # Do not raise exception return @@ -348,16 +344,14 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad self.experiment_manager.experiment_record._hosting_state = hosting_state # update table states via ddb client - self.exp_db_client.update_experiment_hosting_state( - self.experiment_id, hosting_state - ) + self.exp_db_client.update_experiment_hosting_state(self.experiment_id, hosting_state) if hosting_state == HostingState.DEPLOYED: # update local record self.experiment_manager.experiment_record._hosting_endpoint = sm_endpoint_info.get("EndpointArn") self.experiment_manager.experiment_record._last_hosted_model_id = next_model_to_host_id self.experiment_manager.experiment_record._next_model_to_host_id = None - + # update DynamoDB record self.exp_db_client.update_experiment_hosting_endpoint( self.experiment_id, sm_endpoint_info.get("EndpointArn") @@ -365,9 +359,7 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad self.exp_db_client.update_experiment_last_hosted_model_id( self.experiment_id, next_model_to_host_id ) - self.exp_db_client.update_experiment_next_model_to_host_id( - self.experiment_id, None - ) + self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, None) self._update_metrics_from_latest_hosting_update(next_model_to_host_id) else: @@ -394,31 +386,27 @@ def _update_experiment_db_hosting_workflow_metadata(self, hosting_workflow_metad self.experiment_manager.experiment_record._hosting_state = hosting_state # update hosting_state in exp table - self.exp_db_client.update_experiment_hosting_state( - self.experiment_id, hosting_state - ) + self.exp_db_client.update_experiment_hosting_state(self.experiment_id, hosting_state) if hosting_state == HostingState.DEPLOYED: # update local record self.experiment_manager.experiment_record._last_hosted_model_id = next_model_to_host_id self.experiment_manager.experiment_record._next_model_to_host_id = None - + # update DynamoDB record self.exp_db_client.update_experiment_last_hosted_model_id( self.experiment_id, next_model_to_host_id ) - self.exp_db_client.update_experiment_next_model_to_host_id( - self.experiment_id, None - ) + self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, None) self._update_metrics_from_latest_hosting_update(next_model_to_host_id) def _update_experiment_db_joining_workflow_metadata(self, joining_workflow_metadata): """Update the joining workflow metadata in the experiment table - + Args: joining_workflow_metadata (dict): A dictionary containing joining workflow related metadata - """ + """ if joining_workflow_metadata is None: return @@ -427,8 +415,7 @@ def _update_experiment_db_joining_workflow_metadata(self, joining_workflow_metad # some joining job request is in progress if joining_state is not None and joining_state.endswith("ING"): - join_job_record = self.join_db_client.get_join_job_record( - self.experiment_id, next_join_job_id) + join_job_record = self.join_db_client.get_join_job_record(self.experiment_id, next_join_job_id) # if join job record exists in the join table if join_job_record is not None: @@ -441,35 +428,29 @@ def 
_update_experiment_db_joining_workflow_metadata(self, joining_workflow_metad self.experiment_manager.experiment_record._joining_state = joining_state # update table states via ddb client - self.exp_db_client.update_experiment_joining_state( - self.experiment_id, joining_state - ) + self.exp_db_client.update_experiment_joining_state(self.experiment_id, joining_state) if joining_state == JoiningState.SUCCEEDED: self.experiment_manager.experiment_record._last_joined_job_id = next_join_job_id self.experiment_manager.experiment_record._next_join_job_id = None - self.exp_db_client.update_experiment_last_joined_job_id( - self.experiment_id, next_join_job_id - ) - self.exp_db_client.update_experiment_next_join_job_id( - self.experiment_id, None - ) + self.exp_db_client.update_experiment_last_joined_job_id(self.experiment_id, next_join_job_id) + self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, None) def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id): """ Updates SyncThread's local information on every Evaluation Job complete run. - Also Emit CW metric for New Model Evaluation Scores plot, while updating + Also Emit CW metric for New Model Evaluation Scores plot, while updating local latest_trained_model_* information, for continuous CW puts (for Number plots) """ try: last_trained_model_id = self.experiment_manager.last_trained_model_id currently_hosted_model_id = self.experiment_manager.last_hosted_model_id - + if last_trained_model_id in latest_evaluation_job_id: # using in as latest_evaluation_job_id would be of format last_trained_model_id-{eval}-{timestamp} - # If the EvaluationJob was for latest Trained Model + # If the EvaluationJob was for latest Trained Model eval_score = self.get_latest_eval_score_for_model_id(last_trained_model_id) if eval_score == "n.a.": logger.debug("EvalScore from last run in n.a.") @@ -481,9 +462,7 @@ def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id): # Also publish this score once, for Eval Score over time Graph self.experiment_manager.cw_logger.publish_newly_trained_model_eval_information( - self.experiment_id, - last_trained_model_id, - eval_score + self.experiment_id, last_trained_model_id, eval_score ) elif currently_hosted_model_id in latest_evaluation_job_id: # using in as latest_evaluation_job_id would be of format currently_hosted_model_id-{eval}-{timestamp} @@ -497,8 +476,10 @@ def _update_metrics_from_latest_eval_job(self, latest_evaluation_job_id): self.latest_hosted_model_eval_score = eval_score else: # Evaluation Job not for latest-trained-model - logger.debug("Latest Evaluated Model doesn't match Latest Trained Model, or" - " Currently Hosted Model. Skipping reporting EvalScore") + logger.debug( + "Latest Evaluated Model doesn't match Latest Trained Model, or" + " Currently Hosted Model. Skipping reporting EvalScore" + ) return except Exception as e: @@ -520,18 +501,13 @@ def _update_metrics_from_latest_hosting_update(self, latest_hosted_model_id): # Also publish this score once, for Eval Score over time Graph self.experiment_manager.cw_logger.publish_latest_hosting_information( - self.experiment_id, - latest_hosted_model_id, - eval_score + self.experiment_id, latest_hosted_model_id, eval_score ) except Exception as e: logger.warn("Failed to emit latest training job eval metrics." 
+ str(e)) def get_latest_eval_score_for_model_id(self, model_id): - model_record = self.model_db_client.get_model_record( - self.experiment_id, - model_id - ) + model_record = self.model_db_client.get_model_record(self.experiment_id, model_id) eval_score = "n.a." if model_record is not None: eval_keys = model_record["eval_scores"].keys() @@ -540,36 +516,32 @@ def get_latest_eval_score_for_model_id(self, model_id): return eval_score # sort eval score by s3 prefix as joining job is ordered by time eval_keys = sorted(eval_keys) - return model_record["eval_scores"][eval_keys[-1]] - else: + return model_record["eval_scores"][eval_keys[-1]] + else: return eval_score - + def emit_cloudwatch_metrics_for_training_and_hosting(self): try: # emit CloudWatch Training metrics if self.latest_trained_model_id and self.latest_trained_model_eval_score: self.experiment_manager.cw_logger.publish_latest_training_information( - self.experiment_id, - self.latest_trained_model_id, - self.latest_trained_model_eval_score + self.experiment_id, self.latest_trained_model_id, self.latest_trained_model_eval_score ) else: - #logger.debug("Train CW Metrics Not Set") + # logger.debug("Train CW Metrics Not Set") pass except Exception: logger.debug("Failed to publish CW Metrics for Training State") logger.debug(e) - try: + try: # emit CloudWatch Hosting metrics if self.latest_hosted_model_id and self.latest_hosted_model_eval_score: self.experiment_manager.cw_logger.publish_latest_hosting_information( - self.experiment_id, - self.latest_hosted_model_id, - self.latest_hosted_model_eval_score + self.experiment_id, self.latest_hosted_model_id, self.latest_hosted_model_eval_score ) else: - #logger.debug("Host CW Metrics Not Set") + # logger.debug("Host CW Metrics Not Set") pass except Exception: logger.debug("Failed to publish CW Metrics for Training State") @@ -600,7 +572,8 @@ def sync_experiment_state_with_ddb(self): next_model_to_train = ModelManager( model_db_client=self.model_db_client, experiment_id=self.experiment_id, - model_id=next_model_to_train_id) + model_id=next_model_to_train_id, + ) next_model_to_train.update_model_training_state() time.sleep(1) self._update_experiment_db_training_workflow_metadata(training_workflow_metadata) @@ -615,12 +588,17 @@ def sync_experiment_state_with_ddb(self): self.experiment_manager.next_model_to_evaluate.update_model_evaluation_state() else: # only init the ModelManager() if the evaluation job record already exists - if self.model_db_client.get_model_record(self.experiment_id, \ - next_evaluation_job_id.split('-eval-')[0]) is not None: + if ( + self.model_db_client.get_model_record( + self.experiment_id, next_evaluation_job_id.split("-eval-")[0] + ) + is not None + ): next_model_to_evaluate = ModelManager( model_db_client=self.model_db_client, experiment_id=self.experiment_id, - model_id=next_evaluation_job_id.split('-eval-')[0]) + model_id=next_evaluation_job_id.split("-eval-")[0], + ) next_model_to_evaluate.update_model_evaluation_state() time.sleep(1) self._update_experiment_db_evaluation_workflow_metadata(evaluation_workflow_metadata) @@ -643,7 +621,8 @@ def sync_experiment_state_with_ddb(self): next_join_job = JoinManager( join_db_client=self.join_db_client, experiment_id=self.experiment_id, - join_job_id=next_join_job_id) + join_job_id=next_join_job_id, + ) next_join_job.update_join_job_state() time.sleep(1) self._update_experiment_db_joining_workflow_metadata(joining_workflow_metadata) @@ -663,10 +642,10 @@ def run(self): logger.error(e) logger.warn("Resuming Sync in 10 
seconds...")
                time.sleep(10)
-            time.sleep(.5)
+            time.sleep(0.5)


-class ExperimentManager():
+class ExperimentManager:
    """
    An experiment entity to manage different components in the continual learning
    iteration loops. One experiment will be initiated to solve a single RL problem.
@@ -674,15 +653,16 @@ class ExperimentManager():
    entity provides methods/functionalities for model training/evaluation/deployment
    and data joining.
    """
-
-    def __init__(self,
-                 config,
-                 experiment_id,
-                 training_workflow_metadata={},
-                 hosting_workflow_metadata={},
-                 joining_workflow_metadata={},
-                 evaluation_workflow_metadata={}
-                 ):
+
+    def __init__(
+        self,
+        config,
+        experiment_id,
+        training_workflow_metadata={},
+        hosting_workflow_metadata={},
+        joining_workflow_metadata={},
+        evaluation_workflow_metadata={},
+    ):
        """Initialize/Reload an experiment entity to manage the workflow

        Args:
@@ -696,12 +676,12 @@ def __init__(self,
        Returns:
            sagemaker_rl.orchestrator.workflow.experiment_manager.ExperimentManager:
            A ``ExperimentManager`` object to manage the workflow
-        """ 
+        """
        self.boto_session = boto3.Session()
        self._region_name = self.boto_session.region_name
        self.account = self.boto_session.client("sts").get_caller_identity()["Account"]
        if self._region_name is None:
-            raise ValueError('Must setup AWS configuration with a valid region')
+            raise ValueError("Must set up AWS configuration with a valid region")

        # unique id common across all experiments in the account
        self.experiment_id = experiment_id
@@ -709,7 +689,7 @@ def __init__(self,
        # load configs
        self.config = config
        self.image = self.config.get("image", None).replace("{AWS_REGION}", self._region_name)
-        
+
        self.algor_config = self.config.get("algor", {})

        self.local_mode = self.config.get("local_mode", True)
        if self.local_mode:
@@ -721,62 +701,55 @@ def __init__(self,
        self.soft_deployment = self.config.get("soft_deployment", False)

        # load resource config and init shared resources if not exists
-        self.resource_manager = ResourceManager(self.config.get("resource", {}),
-                                                boto_session=self.boto_session)
+        self.resource_manager = ResourceManager(self.config.get("resource", {}), boto_session=self.boto_session)
        self.resource_manager.create_shared_resource_if_not_exist()

        # init clients
        self.exp_db_client = self.resource_manager.exp_db_client
        self.model_db_client = self.resource_manager.model_db_client
        self.join_db_client = self.resource_manager.join_db_client
-        self.cw_logger = CloudWatchLogger(
-            self.boto_session.client("cloudwatch"),
-            self._region_name
-        )
+        self.cw_logger = CloudWatchLogger(self.boto_session.client("cloudwatch"), self._region_name)
        self.sagemaker_client = self.sagemaker_session.sagemaker_client
-        
+
        # init s3 client for rewards upload
-        self.s3_client = self.boto_session.client('s3')
+        self.s3_client = self.boto_session.client("s3")

-        # create a local JoinJobRecord object. 
+        # create a local ExperimentRecord object.
        self.experiment_record = ExperimentRecord(
            experiment_id,
            training_workflow_metadata,
            hosting_workflow_metadata,
            joining_workflow_metadata,
-            evaluation_workflow_metadata
+            evaluation_workflow_metadata,
        )

        self.next_model_to_train = None
        self.next_join_job = None
        self.next_model_to_evaluate = None
-
        # Try to save new ExperimentRecord to ExperimentDb. 
If it throws # RecordAlreadyExistsException, re-read the ExperimentRecord from ExperimentDb, # and use it as initial state try: - self.exp_db_client.create_new_experiment_record( - self.experiment_record.to_ddb_record() - ) + self.exp_db_client.create_new_experiment_record(self.experiment_record.to_ddb_record()) except RecordAlreadyExistsException: - logger.warn(f"Experiment with name {self.experiment_id} already exists. " - "Reusing current state from ExperimentDb.") - experiment_record = self.exp_db_client.get_experiment_record( - experiment_id + logger.warn( + f"Experiment with name {self.experiment_id} already exists. " + "Reusing current state from ExperimentDb." ) + experiment_record = self.exp_db_client.get_experiment_record(experiment_id) self.experiment_record = ExperimentRecord.load_from_ddb_record(experiment_record) except Exception as e: logger.error("Unhandled Exception! " + str(e)) raise UnhandledWorkflowException("Something went wrong while creating a new experiment") try: - self.cw_logger.create_cloudwatch_dashboard_from_experiment_id( - self.experiment_id - ) + self.cw_logger.create_cloudwatch_dashboard_from_experiment_id(self.experiment_id) except Exception as e: logger.error("Unable to create CloudWatch Dashboard." + str(e)) - logger.error("To see metrics on CloudWatch, run bandit_experiment." - "cw_logger.create_cloudwatch_dashboard_from_experiment_id function again.") - + logger.error( + "To see metrics on CloudWatch, run bandit_experiment." + "cw_logger.create_cloudwatch_dashboard_from_experiment_id function again." + ) # start a daemon thread to sync ExperimentDb states to local states # the daemon thread will keep running till the session ends @@ -784,8 +757,8 @@ def __init__(self, # Run the thread in SageMaker mode only if not self.local_mode: - self.sync_thread.setDaemon(True) - self.sync_thread.start() + self.sync_thread.setDaemon(True) + self.sync_thread.start() def _sync_experiment_state_with_ddb(self): """ @@ -796,51 +769,51 @@ def _sync_experiment_state_with_ddb(self): self.sync_thread.sync_experiment_state_with_ddb() def _update_instance_type_for_local_mode(self): - """Update the instance type if running in 'local' mode - """ + """Update the instance type if running in 'local' mode""" self.config["resource"]["private_resource"]["hosting_fleet"]["instance_type"] = "local" self.config["resource"]["private_resource"]["training_fleet"]["instance_type"] = "local" self.config["resource"]["private_resource"]["evaluation_fleet"]["instance_type"] = "local" def _jsonify(self): - """Return a jsonify dict with metadata of the 'Experiment' object - """ + """Return a jsonify dict with metadata of the 'Experiment' object""" return self.experiment_record.to_ddb_record() def _get_prefix_and_relative_path(self, path_list): """Return shared prefix and relative paths given a list of paths - + Args: path_list (list): A list of string representing S3 paths - + Returns: (str, list): Return shared prefix and a list of relative paths """ # example of path: s3://custom-bucket/exp-1/exp-1-join-id-time-stamp/train # use s3 bucket as prefix # allow data from different experiments but in same account - parts = path_list[0].split('/') - shared_prefix = '/'.join(parts[0:3]) # s3://custom-bucket + parts = path_list[0].split("/") + shared_prefix = "/".join(parts[0:3]) # s3://custom-bucket key_path_list = [] for path in path_list: - parts = path.split('/') - prefix = '/'.join(parts[0:3]) + parts = path.split("/") + prefix = "/".join(parts[0:3]) if prefix != shared_prefix: - logger.error(f" 
Prefix `{prefix}` is different from the shared prefix '{shared_prefix}'. "
-                    "Data in the list are not coming from same s3 bucket.")
-            object_path = '/'.join(parts[3:])
+                logger.error(
+                    f" Prefix `{prefix}` is different from the shared prefix '{shared_prefix}'. "
+                    "Data in the list do not come from the same S3 bucket."
+                )
+            object_path = "/".join(parts[3:])
             key_path_list.append(object_path)
 
         return shared_prefix, key_path_list
 
     def _write_manifest_to_s3(self, manifest_file):
         """Upload manifest file to S3 bucket
-        
+
         Args:
             manifest_file (dict): A JSON blob that contains manifest
                 shared prefix and list of relative paths
-        
+
         Returns:
             str: S3 data path for the uploaded manifest file
         """
@@ -852,31 +825,27 @@ def _write_manifest_to_s3(self, manifest_file):
         manifest_bucket_name = "sagemaker-{}-{}".format(region, account)
         timestamp = str(int(time.time()))
         manifest_s3_file_key = f"{self.experiment_id}/manifest_files/manifest-{timestamp}"
-        body = b''
-        body += str(json.dumps(manifest_file, sort_keys=True, indent=4)).encode('utf_8')
+        body = b""
+        body += str(json.dumps(manifest_file, sort_keys=True, indent=4)).encode("utf_8")
 
         try:
-            s3_client.put_object(Body=body,
-                                 Bucket=manifest_bucket_name,
-                                 Key=manifest_s3_file_key)
+            s3_client.put_object(Body=body, Bucket=manifest_bucket_name, Key=manifest_s3_file_key)
         except ClientError as e:
-            error_code = e.response['Error']['Code']
-            message = e.response['Error']['Message']
-            raise RuntimeError("Failed to upload manifest data with error {}: {}".format(
-                error_code, message
-            ))
-
+            error_code = e.response["Error"]["Code"]
+            message = e.response["Error"]["Message"]
+            raise RuntimeError("Failed to upload manifest data with error {}: {}".format(error_code, message))
+
         manifest_file_path = f"s3://{manifest_bucket_name}/{manifest_s3_file_key}"
         logger.info(f"Successfully uploaded manifest file to S3 path '{manifest_file_path}'")
         return manifest_file_path
 
     def _generate_manifest(self, input_data_path_list):
         """Generate manifest file and upload it to S3 bucket
-        
+
         Args:
             input_data_path_list (list): A list of strings representing input
                 S3 data paths
-        
+
         Returns:
             str: S3 data path for the uploaded manifest file
         """
@@ -890,7 +859,7 @@ def _generate_manifest(self, input_data_path_list):
         manifest = []
         shared_prefix, key_path_list = self._get_prefix_and_relative_path(input_data_path_list)
         logger.info(f"Generating manifest file with shared prefix '{shared_prefix}/' ...")
-        manifest.append({'prefix': shared_prefix + '/'})
+        manifest.append({"prefix": shared_prefix + "/"})
 
         for relative_key_path in key_path_list:
             manifest.append(relative_key_path)
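
Note: the list built above follows SageMaker's ManifestFile S3DataSource format, in
which the first JSON element carries the shared prefix and the remaining elements are
keys relative to it. A sketch with assumed paths (illustration only, not part of this
patch):

    input_paths = [
        "s3://custom-bucket/exp-1/join-a/train",
        "s3://custom-bucket/exp-1/join-b/train",
    ]
    # _generate_manifest(input_paths) uploads JSON equivalent to:
    # [
    #     {"prefix": "s3://custom-bucket/"},
    #     "exp-1/join-a/train",
    #     "exp-1/join-b/train",
    # ]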
Please check later.") - if self.experiment_record._training_state is not None and \ - self.experiment_record._training_state.endswith("ING"): - logger.warning(f"A training job with model id '{self.experiment_record._next_model_to_train_id}' " - f"is running in state of '{self.experiment_record._training_state}'") + if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith( + "ING" + ): + logger.warning( + f"A training job with model id '{self.experiment_record._next_model_to_train_id}' " + f"is running in state of '{self.experiment_record._training_state}'" + ) return self.experiment_record._last_trained_model_id @@ -914,10 +886,13 @@ def last_evaluation_job_id(self): if self.experiment_record._last_evaluation_job_id is None: logger.warning("No model has been evaluated. Please check later.") - if self.experiment_record._evaluation_state is not None \ - and self.experiment_record._evaluation_state.endswith("ING"): - logger.warning(f"A evaluation job with job id '{self.experiment_record._next_evaluation_job_id}' " - f"is running in state of '{self.experiment_record._evaluation_state}'") + if self.experiment_record._evaluation_state is not None and self.experiment_record._evaluation_state.endswith( + "ING" + ): + logger.warning( + f"A evaluation job with job id '{self.experiment_record._next_evaluation_job_id}' " + f"is running in state of '{self.experiment_record._evaluation_state}'" + ) return self.experiment_record._last_evaluation_job_id @@ -926,10 +901,11 @@ def last_hosted_model_id(self): if self.experiment_record._last_hosted_model_id is None: logger.warning("No model has been hosted. Please deploy a model and check later.") - if self.experiment_record._hosting_state is not None \ - and self.experiment_record._hosting_state.endswith("ING"): - logger.warning(f"A deployment with model id '{self.experiment_record._next_model_to_host_id}' " - f"is running in state of '{self.experiment_record._hosting_state}'") + if self.experiment_record._hosting_state is not None and self.experiment_record._hosting_state.endswith("ING"): + logger.warning( + f"A deployment with model id '{self.experiment_record._next_model_to_host_id}' " + f"is running in state of '{self.experiment_record._hosting_state}'" + ) return self.experiment_record._last_hosted_model_id @@ -938,15 +914,16 @@ def last_joined_job_id(self): if self.experiment_record._last_joined_job_id is None: logger.warning("No joining job has been completed. 
Please check later.") - if self.experiment_record._joining_state is not None \ - and self.experiment_record._joining_state.endswith("ING"): - logger.warning(f"A joining job with job id '{self.experiment_record._next_join_job_id}' " - f"is running in state of '{self.experiment_record._joining_state}'") + if self.experiment_record._joining_state is not None and self.experiment_record._joining_state.endswith("ING"): + logger.warning( + f"A joining job with job id '{self.experiment_record._next_join_job_id}' " + f"is running in state of '{self.experiment_record._joining_state}'" + ) return self.experiment_record._last_joined_job_id @property - def last_joined_job_train_data(self): + def last_joined_job_train_data(self): record = self.join_db_client.get_join_job_record(self.experiment_id, self.last_joined_job_id) return record["output_joined_train_data_s3_path"] @@ -957,31 +934,32 @@ def last_joined_job_eval_data(self): def _get_hosting_environ_vars(self, model_id): """Return hosting endpoint environment variables - + Args: model_id (str): A unique string representing which model to be hosted by the endpoint - + Returns: dict: A dictionary containing environment variables of hosting endpoint """ - environ_vars = {"AWS_DEFAULT_REGION": self._region_name, - "EXPERIMENT_ID": self.experiment_id, - "EXP_METADATA_DYNAMO_TABLE": self.resource_manager.exp_db_table_name, - "MODEL_METADATA_DYNAMO_TABLE": self.resource_manager.model_db_table_name, - "MODEL_ID": model_id, - "AWS_REGION": self._region_name, - "FIREHOSE_STREAM": None, - # Set to true if inference logging is required. - "LOG_INFERENCE_DATA": str(not self.local_mode).lower(), - # For efficient soft model updates. - "MODEL_METADATA_POLLING": str(self.soft_deployment).lower() - } + environ_vars = { + "AWS_DEFAULT_REGION": self._region_name, + "EXPERIMENT_ID": self.experiment_id, + "EXP_METADATA_DYNAMO_TABLE": self.resource_manager.exp_db_table_name, + "MODEL_METADATA_DYNAMO_TABLE": self.resource_manager.model_db_table_name, + "MODEL_ID": model_id, + "AWS_REGION": self._region_name, + "FIREHOSE_STREAM": None, + # Set to true if inference logging is required. + "LOG_INFERENCE_DATA": str(not self.local_mode).lower(), + # For efficient soft model updates. + "MODEL_METADATA_POLLING": str(self.soft_deployment).lower(), + } return environ_vars def _setup_hosting_endpoint(self, model_id, wait, **kwargs): """Initiate a hosting endpoint deployment - + Args: model_id (str): A unique string representing which model to deploy wait (bool): Whether to wait until the deployment finished @@ -1004,24 +982,29 @@ def _setup_hosting_endpoint(self, model_id, wait, **kwargs): name=model_id, sagemaker_session=self.sagemaker_session, env=environ_vars, - **kwargs) + **kwargs, + ) hosting_instance_count = self.resource_manager.hosting_fleet_config.get("instance_count", 1) hosting_instance_type = self.resource_manager.hosting_fleet_config.get("instance_type", "local") try: - sagemaker_model.deploy(initial_instance_count=hosting_instance_count, - instance_type=hosting_instance_type, - endpoint_name=self.experiment_id, - wait=wait) + sagemaker_model.deploy( + initial_instance_count=hosting_instance_count, + instance_type=hosting_instance_type, + endpoint_name=self.experiment_id, + wait=wait, + ) except Exception as e: logger.error(f"Failed to deploy experiment {self.experiment_id}: " + str(e)) - raise UnhandledWorkflowException( "Some error occurred while setting up hosting endpoint. 
" - "Please check SageMaker console for more information.") + raise UnhandledWorkflowException( + "Some error occurred while setting up hosting endpoint. " + "Please check SageMaker console for more information." + ) def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True): """Update the model hosted in an existing endpoint - + Args: soft_deploy (bool): Whether to update the model hosted by the endpoint with soft deployment support @@ -1029,12 +1012,8 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True): to deploy/update """ # update 'next_model_to_host_id' and 'hosting_state' - self.exp_db_client.update_experiment_next_model_to_host_id( - self.experiment_id, model_id - ) - self.exp_db_client.update_experiment_hosting_state( - self.experiment_id, HostingState.PENDING - ) + self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, model_id) + self.exp_db_client.update_experiment_hosting_state(self.experiment_id, HostingState.PENDING) # soft deployment will happen once the 'next_model_host_id' is persisted into ExperimentDB if not soft_deploy: update_endpoint = True @@ -1054,8 +1033,10 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True): if closed: logger.info("Closed docker container[s] that was already running (maybe from previous job)") else: - logger.exception("Failed to close a docker container that was already running (maybe from " - "previous job). Please close it manually and retry.") + logger.exception( + "Failed to close a docker container that was already running (maybe from " + "previous job). Please close it manually and retry." + ) model_record = self.model_db_client.get_model_record(self.experiment_id, model_id) sagemaker_model = sagemaker.model.Model( @@ -1064,26 +1045,29 @@ def _update_model_in_endpoint(self, soft_deploy, model_id, wait=True): role=self.resource_manager.iam_role_arn, name=model_id, sagemaker_session=self.sagemaker_session, - env=environ_vars) + env=environ_vars, + ) hosting_instance_count = self.resource_manager.hosting_fleet_config.get("instance_count", 1) hosting_instance_type = self.resource_manager.hosting_fleet_config.get("instance_type", "local") try: - sagemaker_model.deploy(initial_instance_count=hosting_instance_count, - instance_type=hosting_instance_type, - endpoint_name=self.experiment_id, - update_endpoint=update_endpoint, - wait=wait) + sagemaker_model.deploy( + initial_instance_count=hosting_instance_count, + instance_type=hosting_instance_type, + endpoint_name=self.experiment_id, + update_endpoint=update_endpoint, + wait=wait, + ) except Exception as e: logger.error(e) pass def _check_if_model_ready(self, model_id): """Check if the model exists and already trained - + Args: model_id (str): A unique string representing which model to check - + Returns: bool: Whether the model exists and is already trained """ @@ -1093,24 +1077,23 @@ def _check_if_model_ready(self, model_id): return False # check if the model training is completed successfully to consume by next step - model_exist = self.model_db_client.check_model_record_exists( - self.experiment_id, model_id - ) + model_exist = self.model_db_client.check_model_record_exists(self.experiment_id, model_id) if not model_exist: - logger.error(f"Model with mode_id '{model_id}' was not found in model table. " - "Please create a model first") + logger.error( + f"Model with mode_id '{model_id}' was not found in model table. 
" "Please create a model first" + ) return False # 'model_id' found in table, check if the 'model_id' is trained model_to_deploy = ModelManager( - model_db_client=self.model_db_client, - experiment_id=self.experiment_id, - model_id=model_id - ) + model_db_client=self.model_db_client, experiment_id=self.experiment_id, model_id=model_id + ) if not model_to_deploy.model_record.is_train_completed(): - logger.warning(f"Model '{model_id}' is in status of " - f"{model_to_deploy.model_record._train_state}, Please check later.") + logger.warning( + f"Model '{model_id}' is in status of " + f"{model_to_deploy.model_record._train_state}, Please check later." + ) return False return True @@ -1118,21 +1101,23 @@ def _check_if_model_ready(self, model_id): def deploy_model(self, model_id, wait=True, **kwargs): """Deploy a new model by creating a new hosting endpoint or update the model hosted by an existing endpoint - + Args: model_id (str): A unique string representing which model to deploy/update wait (bool): Whether to wait until the deployment finish """ # TODO: add validation/instructions if multiple deployment - # request happened in th same experiment - + # request happened in th same experiment + # Sync experiment state if required self._sync_experiment_state_with_ddb() # check if 'model_id' is already hosted - if self.experiment_record._last_hosted_model_id == model_id \ - and self.experiment_record._hosting_state == HostingState.DEPLOYED: + if ( + self.experiment_record._last_hosted_model_id == model_id + and self.experiment_record._hosting_state == HostingState.DEPLOYED + ): logger.info(f"Model {model_id} is already being hosted. No deployment needed.") return @@ -1152,39 +1137,39 @@ def deploy_model(self, model_id, wait=True, **kwargs): if closed: logger.info("Closed docker container[s] that was already running (maybe from previous job).") else: - logger.exception("Failed to close a docker container that was already running (maybe from " - "previous job). Please close it manually and retry.") + logger.exception( + "Failed to close a docker container that was already running (maybe from " + "previous job). Please close it manually and retry." 
+ ) else: logger.info("No hosting endpoint found, creating a new hosting endpoint.") # update 'next_model_to_host_id' and 'hosting_state' - self.exp_db_client.update_experiment_next_model_to_host_id( - self.experiment_id, model_id - ) - self.exp_db_client.update_experiment_hosting_state( - self.experiment_id, HostingState.PENDING - ) - + self.exp_db_client.update_experiment_next_model_to_host_id(self.experiment_id, model_id) + self.exp_db_client.update_experiment_hosting_state(self.experiment_id, HostingState.PENDING) + # starting hosting endpoint try: self._setup_hosting_endpoint(model_id, wait=wait, **kwargs) except Exception as e: logger.error(e) pass - + else: if self.experiment_record._hosting_state.endswith("ING"): logger.warning("Some deployment request is in progress, canceled this one") return elif self.experiment_record._hosting_state.endswith("ED"): self._update_model_in_endpoint(self.soft_deployment, model_id, wait=wait) - + # wait until exp ddb table updated if self.local_mode or wait: - deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \ - and self.experiment_record._last_hosted_model_id == model_id \ - and self.experiment_record._next_model_to_host_id is None - + deployed_state = ( + self.experiment_record._hosting_state == HostingState.DEPLOYED + and self.experiment_record._last_hosted_model_id == model_id + and self.experiment_record._next_model_to_host_id is None + ) + num_retries = 0 num_retries_blue_green_deployment = 0 max_retries = 100 @@ -1193,62 +1178,75 @@ def deploy_model(self, model_id, wait=True, **kwargs): # local mode is fast, 'num_retries' increases exponentially self._sync_experiment_state_with_ddb() logger.debug("Waiting for experiment table hosting status to be updated...") - + if self.soft_deployment: time.sleep(10 * max_retries) - deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \ - and self.experiment_record._last_hosted_model_id == model_id \ - and self.experiment_record._next_model_to_host_id is None + deployed_state = ( + self.experiment_record._hosting_state == HostingState.DEPLOYED + and self.experiment_record._last_hosted_model_id == model_id + and self.experiment_record._next_model_to_host_id is None + ) num_retries += 1 - if num_retries >= max_retries and self.local_mode: - raise UnhandledWorkflowException(f"Deployment with model " - f"'{self.experiment_record._next_model_to_host_id}' was in " - f"state of '{self.experiment_record._hosting_state}'. Failed " - "to sync table status.") - + if num_retries >= max_retries and self.local_mode: + raise UnhandledWorkflowException( + f"Deployment with model " + f"'{self.experiment_record._next_model_to_host_id}' was in " + f"state of '{self.experiment_record._hosting_state}'. Failed " + "to sync table status." 
+ ) + else: # blue-green deployment takes ~8 min, retry every 30 seconds time.sleep(30) - deployed_state = self.experiment_record._hosting_state == HostingState.DEPLOYED \ - and self.experiment_record._last_hosted_model_id == model_id \ - and self.experiment_record._next_model_to_host_id is None + deployed_state = ( + self.experiment_record._hosting_state == HostingState.DEPLOYED + and self.experiment_record._last_hosted_model_id == model_id + and self.experiment_record._next_model_to_host_id is None + ) num_retries_blue_green_deployment += 1 - - if num_retries_blue_green_deployment%2 == 0: - logger.debug(f"Waited {int(num_retries_blue_green_deployment / 2)} " - f"minutes for blue-green deployment...") - - if num_retries_blue_green_deployment >=30: # restrict maximum wait time to 15min - raise UnhandledWorkflowException(f"Deployment with model " - f"'{self.experiment_record._next_model_to_host_id}' was in " - f"state of '{self.experiment_record._hosting_state}'. Failed " - "to sync table status.") - + + if num_retries_blue_green_deployment % 2 == 0: + logger.debug( + f"Waited {int(num_retries_blue_green_deployment / 2)} " + f"minutes for blue-green deployment..." + ) + + if num_retries_blue_green_deployment >= 30: # restrict maximum wait time to 15min + raise UnhandledWorkflowException( + f"Deployment with model " + f"'{self.experiment_record._next_model_to_host_id}' was in " + f"state of '{self.experiment_record._hosting_state}'. Failed " + "to sync table status." + ) + if self.experiment_record._hosting_state == HostingState.FAILED: - raise SageMakerHostingException("Deployment with model " - f"'{self.experiment_record._next_model_to_host_id}' ended " - f"with state '{self.experiment_record._hosting_state}'. " - "Please check Sagemaker log for more information.") - + raise SageMakerHostingException( + "Deployment with model " + f"'{self.experiment_record._next_model_to_host_id}' ended " + f"with state '{self.experiment_record._hosting_state}'. " + "Please check Sagemaker log for more information." + ) + @property def predictor(self): if self.experiment_record._hosting_endpoint: - return Predictor(endpoint_name=self.experiment_id, - sagemaker_session=self.sagemaker_session) + return Predictor(endpoint_name=self.experiment_id, sagemaker_session=self.sagemaker_session) else: - logger.warning("Hosting endpoint is not ready yet. A deployment " - f"with model id '{self.experiment_record._next_model_to_host_id}' is in state of " - f"'{self.experiment_record._hosting_state}'. Please check later.") + logger.warning( + "Hosting endpoint is not ready yet. A deployment " + f"with model id '{self.experiment_record._next_model_to_host_id}' is in state of " + f"'{self.experiment_record._hosting_state}'. Please check later." 
+            )
             return None
 
     def ingest_rewards(self, rewards_buffer):
         """Upload rewards data in a rewards buffer to S3 bucket
-        
+
         Args:
             rewards_buffer (list): A list of JSON blobs containing rewards
                 data
-        
+
         Returns:
             str: S3 data prefix path that contains the rewards file
         """
@@ -1256,37 +1254,33 @@ def ingest_rewards(self, rewards_buffer):
         rewards_bucket_name = self.resource_manager._create_s3_bucket_if_not_exist("sagemaker")
         timestamp = str(int(time.time()))
         rewards_s3_file_key = f"{self.experiment_id}/rewards_data/{self.experiment_id}-{timestamp}/rewards-{timestamp}"
-        body = b''
+        body = b""
 
         for reward in rewards_buffer:
-            body += str(json.dumps(reward) + '\n').encode('utf_8')
+            body += str(json.dumps(reward) + "\n").encode("utf_8")
 
         try:
-            self.s3_client.put_object(Body=body,
-                                      Bucket=rewards_bucket_name,
-                                      Key=rewards_s3_file_key)
+            self.s3_client.put_object(Body=body, Bucket=rewards_bucket_name, Key=rewards_s3_file_key)
         except ClientError as e:
-            error_code = e.response['Error']['Code']
-            message = e.response['Error']['Message']
-            raise RuntimeError("Failed to upload rewards data with error {}: {}".format(
-                error_code, message
-            ))
+            error_code = e.response["Error"]["Code"]
+            message = e.response["Error"]["Message"]
+            raise RuntimeError("Failed to upload rewards data with error {}: {}".format(error_code, message))
 
         rewards_file_path = f"s3://{rewards_bucket_name}/{rewards_s3_file_key}"
         logger.info("Waiting for reward data to be uploaded.")
-        waiter = self.s3_client.get_waiter('object_exists')
+        waiter = self.s3_client.get_waiter("object_exists")
         waiter.wait(Bucket=rewards_bucket_name, Key=rewards_s3_file_key)
         logger.info(f"Successfully uploaded reward files to S3 path {rewards_file_path}")
 
-        reward_s3_prefix = '/'.join(rewards_file_path.split('/')[:-1])
+        reward_s3_prefix = "/".join(rewards_file_path.split("/")[:-1])
 
         return reward_s3_prefix
 
     def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
         """Upload joined data in joined data buffer to S3 bucket
-        
+
         Args:
             joined_data_buffer (list): A list of JSON blobs containing
                 joined data
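
Note: a minimal sketch of how the two ingestion paths above are typically driven from
a bandit application loop. The reward fields and the `experiment_manager` instance are
assumptions for illustration, not part of this patch:

    rewards_buffer = [
        {"event_id": "0001", "action": 2, "reward": 1.0},
        {"event_id": "0002", "action": 0, "reward": 0.0},
    ]
    # Upload rewards, then join them with logged observations from the past day:
    rewards_s3_prefix = experiment_manager.ingest_rewards(rewards_buffer)
    experiment_manager.join(rewards_s3_prefix, obs_time_window=24, ratio=0.8)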
@@ -1297,28 +1291,27 @@ def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
 
         # update next_join_job_id and joining state
         next_join_job_id = JoinManager.name_next_join_job(experiment_id=self.experiment_id)
-        self.exp_db_client.update_experiment_next_join_job_id(
-            self.experiment_id,
-            next_join_job_id)
-        self.exp_db_client.update_experiment_joining_state(
-            self.experiment_id,
-            JoiningState.PENDING)
-
-        self.next_join_job = JoinManager(join_db_client=self.join_db_client,
-                                         experiment_id=self.experiment_id,
-                                         join_job_id=next_join_job_id,
-                                         input_obs_data_s3_path="local-join-does-not-apply",
-                                         input_reward_data_s3_path="local-join-does-not-apply",
-                                         boto_session=self.boto_session)
-
+        self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, next_join_job_id)
+        self.exp_db_client.update_experiment_joining_state(self.experiment_id, JoiningState.PENDING)
+
+        self.next_join_job = JoinManager(
+            join_db_client=self.join_db_client,
+            experiment_id=self.experiment_id,
+            join_job_id=next_join_job_id,
+            input_obs_data_s3_path="local-join-does-not-apply",
+            input_reward_data_s3_path="local-join-does-not-apply",
+            boto_session=self.boto_session,
+        )
+
         logger.info("Started dummy local joining job...")
-        self.next_join_job.start_dummy_join(joined_data_buffer=joined_data_buffer,
-                                            ratio=ratio)
+        self.next_join_job.start_dummy_join(joined_data_buffer=joined_data_buffer, ratio=ratio)
 
         # this method can be invoked either in local/SM mode
-        succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
-            and self.experiment_record._last_joined_job_id == next_join_job_id \
-            and self.experiment_record._next_join_job_id is None
+        succeeded_state = (
+            self.experiment_record._joining_state == JoiningState.SUCCEEDED
+            and self.experiment_record._last_joined_job_id == next_join_job_id
+            and self.experiment_record._next_join_job_id is None
+        )
         num_retries = 0
         max_retries = 100
         while not succeeded_state:
@@ -1326,23 +1319,31 @@ def ingest_joined_data(self, joined_data_buffer, ratio=0.8):
            self._sync_experiment_state_with_ddb()
            logger.debug("Waiting for experiment table joining status to be updated...")
            time.sleep(10 * num_retries)
-            succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
-                and self.experiment_record._last_joined_job_id == next_join_job_id \
-                and self.experiment_record._next_join_job_id is None
+            succeeded_state = (
+                self.experiment_record._joining_state == JoiningState.SUCCEEDED
+                and self.experiment_record._last_joined_job_id == next_join_job_id
+                and self.experiment_record._next_join_job_id is None
+            )
             num_retries += 1
             if num_retries >= max_retries:
-                raise UnhandledWorkflowException(f"Joining job '{self.experiment_record._next_join_job_id}' "
-                    f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states.")
+                raise UnhandledWorkflowException(
+                    f"Joining job '{self.experiment_record._next_join_job_id}' "
+                    f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states."
+                )
-        if self.experiment_record._joining_state == JoiningState.FAILED or \
-                self.experiment_record._joining_state == JoiningState.CANCELLED:
-            raise WorkflowJoiningJobException(f"Joining job '{self.experiment_record._next_join_job_id}' "
-                f"ended with state '{self.experiment_record._joining_state}'. Please check if provided "
-                "joined_data_buffer was in correct data format.")
+        if (
+            self.experiment_record._joining_state == JoiningState.FAILED
+            or self.experiment_record._joining_state == JoiningState.CANCELLED
+        ):
+            raise WorkflowJoiningJobException(
+                f"Joining job '{self.experiment_record._next_join_job_id}' "
+                f"ended with state '{self.experiment_record._joining_state}'. Please check if provided "
+                "joined_data_buffer was in correct data format.
+ ) + def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True): """Start a joining job given rewards data path and observation data time window - + Args: rewards_s3_path (str): S3 data path containing the rewards data obs_time_window (int): Define a time window of past X hours to @@ -1355,25 +1356,24 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True): self._sync_experiment_state_with_ddb() if obs_time_window is None: - logger.warning(f"Start a join job to join reward data " - f"under '{rewards_s3_path}' with all the observation data") + logger.warning( + f"Start a join job to join reward data " f"under '{rewards_s3_path}' with all the observation data" + ) obs_end_time = None obs_start_time = None else: - logger.info(f"Start a join job to join reward data " - f"under '{rewards_s3_path}' with observation " - f"data in the past {obs_time_window} hours") + logger.info( + f"Start a join job to join reward data " + f"under '{rewards_s3_path}' with observation " + f"data in the past {obs_time_window} hours" + ) obs_end_time = datetime.utcnow() obs_start_time = obs_end_time - timedelta(hours=obs_time_window) # update next_join_job_id and joining state next_join_job_id = JoinManager.name_next_join_job(experiment_id=self.experiment_id) - self.exp_db_client.update_experiment_next_join_job_id( - self.experiment_id, - next_join_job_id) - self.exp_db_client.update_experiment_joining_state( - self.experiment_id, - JoiningState.PENDING) + self.exp_db_client.update_experiment_next_join_job_id(self.experiment_id, next_join_job_id) + self.exp_db_client.update_experiment_joining_state(self.experiment_id, JoiningState.PENDING) input_obs_data_s3_path = f"s3://{self.resource_manager.firehose_bucket}/{self.experiment_id}" input_obs_data_s3_path = f"{input_obs_data_s3_path}/inference_data" @@ -1381,14 +1381,16 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True): logger.info("Creating resource for joining job...") try: - self.next_join_job = JoinManager(join_db_client=self.join_db_client, - experiment_id=self.experiment_id, - join_job_id=next_join_job_id, - input_obs_data_s3_path=input_obs_data_s3_path, - obs_start_time=obs_start_time, - obs_end_time=obs_end_time, - input_reward_data_s3_path=rewards_s3_path, - boto_session=self.boto_session) + self.next_join_job = JoinManager( + join_db_client=self.join_db_client, + experiment_id=self.experiment_id, + join_job_id=next_join_job_id, + input_obs_data_s3_path=input_obs_data_s3_path, + obs_start_time=obs_start_time, + obs_end_time=obs_end_time, + input_reward_data_s3_path=rewards_s3_path, + boto_session=self.boto_session, + ) logger.info("Started joining job...") self.next_join_job.start_join(ratio=ratio, wait=wait) @@ -1398,35 +1400,45 @@ def join(self, rewards_s3_path, obs_time_window=None, ratio=0.8, wait=True): # wait until exp ddb table updated if self.local_mode or wait: - succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \ - and self.experiment_record._last_joined_job_id == next_join_job_id \ - and self.experiment_record._next_join_job_id is None - num_retries = 0 + succeeded_state = ( + self.experiment_record._joining_state == JoiningState.SUCCEEDED + and self.experiment_record._last_joined_job_id == next_join_job_id + and self.experiment_record._next_join_job_id is None + ) + num_retries = 0 max_retries = 100 - + while not succeeded_state: # Sync experiment state if required self._sync_experiment_state_with_ddb() logger.debug("Waiting for experiment table 
joining status to be updated...")
                 time.sleep(10 * num_retries)
-                succeeded_state = self.experiment_record._joining_state == JoiningState.SUCCEEDED \
-                    and self.experiment_record._last_joined_job_id == next_join_job_id \
-                    and self.experiment_record._next_join_job_id is None
+                succeeded_state = (
+                    self.experiment_record._joining_state == JoiningState.SUCCEEDED
+                    and self.experiment_record._last_joined_job_id == next_join_job_id
+                    and self.experiment_record._next_join_job_id is None
+                )
                 num_retries += 1
                 if num_retries > max_retries:
-                    raise UnhandledWorkflowException(f"Joining job '{self.experiment_record._next_join_job_id}' "
-                        f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states.")
+                    raise UnhandledWorkflowException(
+                        f"Joining job '{self.experiment_record._next_join_job_id}' "
+                        f"was in state of '{self.experiment_record._joining_state}'. Failed to sync table states."
+                    )
 
-            if self.experiment_record._joining_state == JoiningState.FAILED or \
-                    self.experiment_record._joining_state == JoiningState.CANCELLED:
-                raise WorkflowJoiningJobException(f"Joining job '{self.experiment_record._next_join_job_id}' "
-                    f"ended with state '{self.experiment_record._joining_state}'. Please check Athena queries logs "
-                    "for more information.")
+            if (
+                self.experiment_record._joining_state == JoiningState.FAILED
+                or self.experiment_record._joining_state == JoiningState.CANCELLED
+            ):
+                raise WorkflowJoiningJobException(
+                    f"Joining job '{self.experiment_record._next_join_job_id}' "
+                    f"ended with state '{self.experiment_record._joining_state}'. Please check Athena query logs "
+                    "for more information."
+                )
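
Note: the workflow methods in this class compose into a short cold-start sequence. A
sketch under assumed names (the config object, bucket, and experiment id are
illustrative only, not part of this patch):

    manager = ExperimentManager(config, experiment_id="exp-1")
    manager.initialize_first_model(wait=True, input_data_s3_prefix="s3://my-bucket/exp-1/seed-data")
    manager.deploy_model(manager.last_trained_model_id, wait=True)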
     def initialize_first_model(self, wait=True, input_data_s3_prefix=None):
         """
         Initializes the first Model training for an Experiment
-        
+
         Args:
             wait (bool): Whether to wait until the training job finishes
             input_data_s3_prefix (str): S3 data path containing data
@@ -1437,25 +1449,23 @@
 
         # experiment only allows one training job at a time,
         # validate no other training request is in progress
-        if self.experiment_record._training_state is not None \
-                and self.experiment_record._training_state.endswith("ING"):
-            logger.error(f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
-                f"was in the state of '{self.experiment_record._training_state}'. "
-                "Wait until the training job finished or canceled the request.")
+        if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith(
+            "ING"
+        ):
+            logger.error(
+                f"A training request with model id '{self.experiment_record._next_model_to_train_id}' "
+                f"was in the state of '{self.experiment_record._training_state}'. "
+                "Wait until the training job finishes or cancel the request."
+            )
             raise InvalidUsageException("Please wait for old Training Job to Complete before requesting a new one!")
         else:
             # update next_model_to_train_id and training state
             next_model_to_train_id = ModelManager.name_next_model(experiment_id=self.experiment_id)
 
             logger.info(f"Next Model name would be {next_model_to_train_id}")
-            self.exp_db_client.update_experiment_next_model_to_train_id(
-                self.experiment_id,
-                next_model_to_train_id)
-            self.exp_db_client.update_experiment_training_state(
-                self.experiment_id,
-                TrainingState.PENDING)
+            self.exp_db_client.update_experiment_next_model_to_train_id(self.experiment_id, next_model_to_train_id)
+            self.exp_db_client.update_experiment_training_state(self.experiment_id, TrainingState.PENDING)
 
             logger.info(f"Start training job for model '{next_model_to_train_id}'")
-
             # generate manifest file if input is a list
             manifest_file_path = None
             if isinstance(input_data_s3_prefix, list):
@@ -1472,53 +1482,62 @@
                     role=self.resource_manager.iam_role_arn,
                     instance_config=self.resource_manager.training_fleet_config,
                     boto_session=self.boto_session,
-                    algor_config=self.algor_config
-                )
+                    algor_config=self.algor_config,
+                )
 
                 self.next_model_to_train.fit(
                     wait=wait,
                     input_model_id=None,
                     input_data_s3_prefix=input_data_s3_prefix,
                     manifest_file_path=manifest_file_path,
-                    logs=wait
-                )
+                    logs=wait,
+                )
             except Exception as e:
-                logger.error(f"Failed to start new Model Training job for"
-                    " ModelId {next_model_to_train_id}")
+                logger.error(f"Failed to start new Model Training job for ModelId {next_model_to_train_id}")
                 logger.error(e)
                 pass
 
             # wait until ExperimentDb state is updated
             if self.local_mode or wait:
-                trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
-                    and self.experiment_record._last_trained_model_id == next_model_to_train_id \
-                    and self.experiment_record._next_model_to_train_id is None
+                trained_state = (
+                    self.experiment_record._training_state == TrainingState.TRAINED
+                    and self.experiment_record._last_trained_model_id == next_model_to_train_id
+                    and self.experiment_record._next_model_to_train_id is None
+                )
                 num_retries = 0
                 max_retries = 100
-                
+
                 while not trained_state:
                     # Sync experiment state if required
                     self._sync_experiment_state_with_ddb()
                     logger.debug("Waiting for experiment table training status to be updated...")
                     time.sleep(10 * num_retries)
-                    trained_state = self.experiment_record._training_state == TrainingState.TRAINED \
-                        and self.experiment_record._last_trained_model_id == next_model_to_train_id \
-                        and self.experiment_record._next_model_to_train_id is None
+                    trained_state = (
+                        self.experiment_record._training_state == TrainingState.TRAINED
+                        and self.experiment_record._last_trained_model_id == next_model_to_train_id
+                        and self.experiment_record._next_model_to_train_id is None
+                    )
                     num_retries += 1
                     if num_retries >= max_retries:
-                        raise UnhandledWorkflowException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
-                            f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED.")
-                    if self.experiment_record._training_state == TrainingState.FAILED \
-                        or self.experiment_record._training_state == TrainingState.STOPPED:
-                        raise SageMakerTrainingJobException(f"Training job '{self.experiment_record._next_model_to_train_id}' "
-                            f"ended in state of '{self.experiment_record._training_state}'. 
Please check Sagemaker logs for " - "more information.") + raise UnhandledWorkflowException( + f"Training job '{self.experiment_record._next_model_to_train_id}' " + f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED." + ) + if ( + self.experiment_record._training_state == TrainingState.FAILED + or self.experiment_record._training_state == TrainingState.STOPPED + ): + raise SageMakerTrainingJobException( + f"Training job '{self.experiment_record._next_model_to_train_id}' " + f"ended in state of '{self.experiment_record._training_state}'. Please check Sagemaker logs for " + "more information." + ) def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id=None): """ Train a new model given the training data and a pretrained model - + Args: wait (bool): Whether to wait until the training finish input_data_s3_prefix (str): S3 data path containing data @@ -1531,8 +1550,10 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id= # use 'last_trained_model_id' by default as input model for next training if input_model_id is None and self.experiment_record._last_trained_model_id is not None: - logger.info(f"Use last trained model {self.experiment_record._last_trained_model_id} " - "as pre-trained model for training") + logger.info( + f"Use last trained model {self.experiment_record._last_trained_model_id} " + "as pre-trained model for training" + ) input_model_id = self.experiment_record._last_trained_model_id @@ -1543,11 +1564,14 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id= # experiment only allows one training job at a time, # validate no other training request is in progress - if self.experiment_record._training_state is not None and \ - self.experiment_record._training_state.endswith("ING"): - logger.error(f"A training request with model id '{self.experiment_record._next_model_to_train_id}' " - f"was in the state of '{self.experiment_record._training_state}'. " - "Please wait until the training job is finished.") + if self.experiment_record._training_state is not None and self.experiment_record._training_state.endswith( + "ING" + ): + logger.error( + f"A training request with model id '{self.experiment_record._next_model_to_train_id}' " + f"was in the state of '{self.experiment_record._training_state}'. " + "Please wait until the training job is finished." 
+ ) raise InvalidUsageException("Please wait for old Training Job to Complete before requesting a new one!") else: # update next_model_to_train_id and training state @@ -1555,12 +1579,8 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id= logger.info(f"Starting training job for ModelId '{next_model_to_train_id}''") - self.exp_db_client.update_experiment_next_model_to_train_id( - self.experiment_id, - next_model_to_train_id) - self.exp_db_client.update_experiment_training_state( - self.experiment_id, - TrainingState.PENDING) + self.exp_db_client.update_experiment_next_model_to_train_id(self.experiment_id, next_model_to_train_id) + self.exp_db_client.update_experiment_training_state(self.experiment_id, TrainingState.PENDING) manifest_file_path = None if isinstance(input_data_s3_prefix, list): @@ -1576,22 +1596,26 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id= role=self.resource_manager.iam_role_arn, instance_config=self.resource_manager.training_fleet_config, boto_session=self.boto_session, - algor_config=self.algor_config - ) - self.next_model_to_train.fit(wait=wait, - input_model_id=input_model_id, - input_data_s3_prefix=input_data_s3_prefix, - manifest_file_path=manifest_file_path, - logs=wait) + algor_config=self.algor_config, + ) + self.next_model_to_train.fit( + wait=wait, + input_model_id=input_model_id, + input_data_s3_prefix=input_data_s3_prefix, + manifest_file_path=manifest_file_path, + logs=wait, + ) except Exception as e: logger.error(e) pass # wait until exp ddb table updated if self.local_mode or wait: - trained_state = self.experiment_record._training_state == TrainingState.TRAINED \ - and self.experiment_record._last_trained_model_id == next_model_to_train_id \ - and self.experiment_record._next_model_to_train_id is None + trained_state = ( + self.experiment_record._training_state == TrainingState.TRAINED + and self.experiment_record._last_trained_model_id == next_model_to_train_id + and self.experiment_record._next_model_to_train_id is None + ) num_retries = 0 max_retries = 100 while not trained_state: @@ -1599,23 +1623,31 @@ def train_next_model(self, wait=True, input_data_s3_prefix=None, input_model_id= self._sync_experiment_state_with_ddb() logger.debug("Waiting for experiment table training status to be updated...") time.sleep(10 * num_retries) - trained_state = self.experiment_record._training_state == TrainingState.TRAINED \ - and self.experiment_record._last_trained_model_id == next_model_to_train_id \ - and self.experiment_record._next_model_to_train_id is None + trained_state = ( + self.experiment_record._training_state == TrainingState.TRAINED + and self.experiment_record._last_trained_model_id == next_model_to_train_id + and self.experiment_record._next_model_to_train_id is None + ) num_retries += 1 if num_retries >= max_retries: - raise UnhandledWorkflowException(f"Training job '{self.experiment_record._next_model_to_train_id}' " - f"was in state of '{self.experiment_record._training_state}'. Expected it to be TRAINED.") - if self.experiment_record._training_state == TrainingState.FAILED \ - or self.experiment_record._training_state == TrainingState.STOPPED: - raise SageMakerTrainingJobException(f"Training job '{self.experiment_record._next_model_to_train_id}' " - f"ended in state of '{self.experiment_record._training_state}'. 
Please check Sagemaker logs for "
+                            "more information."
+                        )
 
     def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait=True):
         """
         Start an evaluation job to evaluate a model
-        
+
         Args:
             input_data_s3_prefix (str): S3 data path containing data used
                 for evaluation
@@ -1629,8 +1661,9 @@
         if evaluate_model_id is None:
             if self.experiment_record._last_trained_model_id:
                 # use 'last_trained_model_id' by default as input model for evaluation
-                logger.info(f"Using last trained model {self.experiment_record._last_trained_model_id}" "for evaluation")
+                logger.info(
+                    f"Using last trained model {self.experiment_record._last_trained_model_id} for evaluation"
+                )
             else:
                 logger.error("Evaluation ModelId is None!")
@@ -1650,24 +1683,23 @@
 
             # evaluate_model_id is still None. Raise an exception...
             raise InvalidUsageException("Please provide a valid ModelId to be evaluated")
 
-        if self.experiment_record._evaluation_state is not None \
-                and self.experiment_record._evaluation_state.endswith("ING"):
-            logger.warning(f"A evaluation request with job id '{self.experiment_record._next_evaluation_job_id}' "
-                f"was in the state of '{self.experiment_record._evaluation_state}'. "
-                "Wait until the evaluation job finished or canceled the request.")
+        if self.experiment_record._evaluation_state is not None and self.experiment_record._evaluation_state.endswith(
+            "ING"
+        ):
+            logger.warning(
+                f"An evaluation request with job id '{self.experiment_record._next_evaluation_job_id}' "
+                f"was in the state of '{self.experiment_record._evaluation_state}'. "
+                "Wait until the evaluation job finishes or cancel the request."
+ ) raise InvalidUsageException("Please wait for old Evaluation Job to Complete before requesting a new one!") else: next_evaluation_job_id = f"{evaluate_model_id}-eval-{str(int(time.time()))}" logger.info(f"Evaluating model '{evaluate_model_id}' with evaluation job id '{next_evaluation_job_id}'") - self.exp_db_client.update_experiment_next_evaluation_job_id( - self.experiment_id, - next_evaluation_job_id) + self.exp_db_client.update_experiment_next_evaluation_job_id(self.experiment_id, next_evaluation_job_id) - self.exp_db_client.update_experiment_evaluation_state( - self.experiment_id, - EvaluationState.PENDING) + self.exp_db_client.update_experiment_evaluation_state(self.experiment_id, EvaluationState.PENDING) manifest_file_path = None if isinstance(input_data_s3_prefix, list): @@ -1686,55 +1718,65 @@ def evaluate_model(self, input_data_s3_prefix=None, evaluate_model_id=None, wait role=self.resource_manager.iam_role_arn, instance_config=self.resource_manager.evaluation_fleet_config, boto_session=self.boto_session, - algor_config=self.algor_config - ) + algor_config=self.algor_config, + ) self.next_model_to_evaluate.evaluate( input_data_s3_prefix=input_data_s3_prefix, manifest_file_path=manifest_file_path, evaluation_job_name=next_evaluation_job_id, - local_mode = self.local_mode, + local_mode=self.local_mode, wait=wait, - logs=True - ) + logs=True, + ) except Exception as e: logger.error(e) pass # wait until exp ddb table updated if self.local_mode or wait: - evaluated_state = self.experiment_record._evaluation_state == EvaluationState.EVALUATED \ - and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id \ - and self.experiment_record._next_evaluation_job_id is None + evaluated_state = ( + self.experiment_record._evaluation_state == EvaluationState.EVALUATED + and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id + and self.experiment_record._next_evaluation_job_id is None + ) num_retries = 0 - max_retries = 100 + max_retries = 100 while not evaluated_state: # Sync experiment state if required self._sync_experiment_state_with_ddb() logger.debug("Waiting for experiment table evaluation status to be updated...") time.sleep(10 * num_retries) - evaluated_state = self.experiment_record._evaluation_state == EvaluationState.EVALUATED \ - and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id \ - and self.experiment_record._next_evaluation_job_id is None + evaluated_state = ( + self.experiment_record._evaluation_state == EvaluationState.EVALUATED + and self.experiment_record._last_evaluation_job_id == next_evaluation_job_id + and self.experiment_record._next_evaluation_job_id is None + ) num_retries += 1 if num_retries >= max_retries: - raise UnhandledWorkflowException(f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' " - f"was in state of '{self.experiment_record._evaluation_state}'. Failed to sync table states.") - if self.experiment_record._evaluation_state == EvaluationState.FAILED \ - or self.experiment_record._evaluation_state == EvaluationState.STOPPED: - raise SageMakerTrainingJobException(f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' " - f"ended in state of '{self.experiment_record._evaluation_state}'. Please check Sagemaker logs for " - "more information.") + raise UnhandledWorkflowException( + f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' " + f"was in state of '{self.experiment_record._evaluation_state}'. Failed to sync table states." 
+                        )
+                    if (
+                        self.experiment_record._evaluation_state == EvaluationState.FAILED
+                        or self.experiment_record._evaluation_state == EvaluationState.STOPPED
+                    ):
+                        raise SageMakerTrainingJobException(
+                            f"Evaluation job '{self.experiment_record._next_evaluation_job_id}' "
+                            f"ended in state of '{self.experiment_record._evaluation_state}'. Please check Sagemaker logs for "
+                            "more information."
+                        )
 
     def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
         """
         Return evaluation score given model id and evaluation data path
-        
+
         Args:
             evaluate_model_id (str): Model id used for evaluation
             eval_data_path (str): S3 data path of evaluation data
-        
+
         Returns:
             float: evaluation score of given model and evaluation data
         """
@@ -1745,13 +1787,12 @@ def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
         if evaluate_model_id != self.experiment_record._last_trained_model_id:
             if not self._check_if_model_ready(evaluate_model_id):
                 return
-        
+
         # use last joined job's eval data by default
         if eval_data_path is None:
             eval_data_path = self.last_joined_job_eval_data
 
-        logger.info(f"Getting eval scores for model '{evaluate_model_id}'"
-            f" on eval data set '{eval_data_path}'")
+        logger.info(f"Getting eval scores for model '{evaluate_model_id}'" f" on eval data set '{eval_data_path}'")
 
         eval_score = "n.a."
         if not evaluate_model_id or not eval_data_path:
@@ -1760,41 +1801,44 @@ def get_eval_score(self, evaluate_model_id=None, eval_data_path=None):
         else:
             model_record = self.model_db_client.get_model_record(self.experiment_id, evaluate_model_id)
             if model_record:
-                eval_scores_map = model_record.get('eval_scores', {})
+                eval_scores_map = model_record.get("eval_scores", {})
                 eval_score = eval_scores_map.get(eval_data_path, eval_score)
             else:
                 logger.warning(f"Model Record not found with ModelId: {evaluate_model_id}")
                 pass
 
         if eval_score == "n.a.":
-            raise EvalScoreNotAvailableException(f"Evaluation score is not available for model '{evaluate_model_id}'"
-                f"with data '{eval_data_path}'.'")
+            raise EvalScoreNotAvailableException(
+                f"Evaluation score is not available for model '{evaluate_model_id}' " f"with data '{eval_data_path}'."
+            )
         else:
            eval_score = float(eval_score)
 
-        logger.info(f"Evaluation score for model '{evaluate_model_id}'"
-            f"with data '{eval_data_path}' is {eval_score}.")
+        logger.info(
+            f"Evaluation score for model '{evaluate_model_id}' " f"with data '{eval_data_path}' is {eval_score}." 
+ ) return eval_score - + def get_cloudwatch_dashboard_details(self): return self.cw_logger.get_cloudwatch_dashboard_details(self.experiment_id) - + def clean_resource(self, experiment_id): """Clean up resource of the given experiment, including hosting endpoint and firehose stream """ if not self.local_mode: self.resource_manager.delete_firehose_stream(experiment_id) - + # clean athena tables logger.info(f"Deleting athena tables for '{experiment_id}'...") last_join_job = JoinManager( join_db_client=self.join_db_client, experiment_id=self.experiment_id, - join_job_id=self.last_joined_job_id) + join_job_id=self.last_joined_job_id, + ) last_join_job._delete_obs_table_if_exist() last_join_job._delete_rewards_table_if_exist() - + logger.info(f"Deleting hosting endpoint '{experiment_id}'...") self.sagemaker_session.delete_endpoint_config(experiment_id) self.sagemaker_session.delete_endpoint(experiment_id) @@ -1807,26 +1851,18 @@ def clean_table_records(self, experiment_id): to be cleaned up """ # delete join job records from table - join_job_records = self.join_db_client.get_all_join_job_records_of_experiment( - experiment_id - ) + join_job_records = self.join_db_client.get_all_join_job_records_of_experiment(experiment_id) if join_job_records: self.join_db_client.batch_delete_items( - experiment_id, - [record["join_job_id"] for record in join_job_records] + experiment_id, [record["join_job_id"] for record in join_job_records] ) # delete model records from table - model_records = self.model_db_client.get_all_model_records_of_experiment( - experiment_id - ) + model_records = self.model_db_client.get_all_model_records_of_experiment(experiment_id) if model_records: - self.model_db_client.batch_delete_items( - experiment_id, - [record["model_id"] for record in model_records] - ) + self.model_db_client.batch_delete_items(experiment_id, [record["model_id"] for record in model_records]) # # exit sync thread self.sync_thread.thread_running.clear() @@ -1838,7 +1874,7 @@ def clean_table_records(self, experiment_id): def _close_existing_containers(self): """closing local running containers if exist - + Returns: (bool, bool): Whether a running container exist, Whether successfully close the container diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py index d64dab86..09ca9f6e 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/join_manager.py @@ -11,8 +11,7 @@ from orchestrator.clients.ddb.join_db_client import JoinDbClient from orchestrator.workflow.datatypes.join_job_record import JoinJobRecord from orchestrator.exceptions.ddb_client_exceptions import RecordAlreadyExistsException -from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, \ - JoinQueryIdsNotAvailableException +from orchestrator.exceptions.workflow_exceptions import UnhandledWorkflowException, JoinQueryIdsNotAvailableException logger = logging.getLogger("orchestrator") @@ -22,20 +21,22 @@ class JoinManager: will handle the joining job creation and joining job metadata management. 
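
    Illustrative usage (editorial sketch; the client object, ids, and S3 paths shown
    here are assumptions, not part of this patch):

        join_job = JoinManager(
            join_db_client=join_db_client,
            experiment_id="exp-1",
            join_job_id=JoinManager.name_next_join_job("exp-1"),
            input_obs_data_s3_path="s3://bucket/exp-1/inference_data",
            input_reward_data_s3_path="s3://bucket/exp-1/rewards_data",
        )
        join_job.start_join(ratio=0.8, wait=True)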
""" + def __init__( - self, - join_db_client: JoinDbClient, - experiment_id, - join_job_id, - current_state=None, - input_obs_data_s3_path=None, - obs_start_time=None, - obs_end_time=None, - input_reward_data_s3_path=None, - output_joined_train_data_s3_path=None, - output_joined_eval_data_s3_path=None, - join_query_ids=[], - boto_session=None): + self, + join_db_client: JoinDbClient, + experiment_id, + join_job_id, + current_state=None, + input_obs_data_s3_path=None, + obs_start_time=None, + obs_end_time=None, + input_reward_data_s3_path=None, + output_joined_train_data_s3_path=None, + output_joined_eval_data_s3_path=None, + join_query_ids=[], + boto_session=None, + ): """Initialize a joining job entity in the current experiment Args: @@ -80,7 +81,7 @@ def __init__( self.query_s3_output_bucket = self._create_athena_s3_bucket_if_not_exist() self.athena_client = self.boto_session.client("athena") - # create a local JoinJobRecord object. + # create a local JoinJobRecord object. self.join_job_record = JoinJobRecord( experiment_id, join_job_id, @@ -91,8 +92,8 @@ def __init__( input_reward_data_s3_path, output_joined_train_data_s3_path, output_joined_eval_data_s3_path, - join_query_ids - ) + join_query_ids, + ) # create obs partitioned/non-partitioned table if not exists if input_obs_data_s3_path and input_obs_data_s3_path != "local-join-does-not-apply": @@ -104,28 +105,22 @@ def __init__( if obs_start_time and obs_end_time: self._add_time_partitions(obs_start_time, obs_end_time) - # try to save this record file. if it throws RecordAlreadyExistsException + # try to save this record file. if it throws RecordAlreadyExistsException # reload the record from JoinJobDb, and recreate try: - self.join_db_client.create_new_join_job_record( - self.join_job_record.to_ddb_record() - ) + self.join_db_client.create_new_join_job_record(self.join_job_record.to_ddb_record()) except RecordAlreadyExistsException: logger.debug("Join job already exists. Reloading from join job record.") - join_job_record = self.join_db_client.get_join_job_record( - experiment_id, - join_job_id - ) + join_job_record = self.join_db_client.get_join_job_record(experiment_id, join_job_id) self.join_job_record = JoinJobRecord.load_from_ddb_record(join_job_record) except Exception as e: logger.error("Unhandled Exception! 
" + str(e)) raise UnhandledWorkflowException("Something went wrong while creating a new join job") def _jsonify(self): - """Return a jsonify dict with metadata of the 'JoinJob' object - """ + """Return a jsonify dict with metadata of the 'JoinJob' object""" return self.join_job_record.to_ddb_record() - + @classmethod def name_next_join_job(cls, experiment_id): """Generate unique join job id of a new joining job in the experiment @@ -149,11 +144,11 @@ def _formatted_table_name(self, table_name_string): """ # athena does not allow special characters other than '_' # replace all special characters with '_' - return re.sub('[^A-Za-z0-9]+', '_', table_name_string) + return re.sub("[^A-Za-z0-9]+", "_", table_name_string) def _create_athena_s3_bucket_if_not_exist(self): """Create s3 bucket for athena data if not exists - Use sagemaker-{region}-{account_id} bucket to store data + Use sagemaker-{region}-{account_id} bucket to store data Returns: str: s3 bucket name for athena @@ -171,9 +166,7 @@ def _create_athena_s3_bucket_if_not_exist(self): if region == "us-east-1": s3.create_bucket(Bucket=s3_bucket_name) else: - s3.create_bucket( - Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region} - ) + s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={"LocationConstraint": region}) logger.info("Successfully create S3 bucket '{}' for athena queries".format(s3_bucket_name)) except ClientError as e: error_code = e.response["Error"]["Code"] @@ -181,9 +174,7 @@ def _create_athena_s3_bucket_if_not_exist(self): if error_code == "BucketAlreadyOwnedByYou": pass - elif ( - error_code == "OperationAborted" and "conflicting conditional operation" in message - ): + elif error_code == "OperationAborted" and "conflicting conditional operation" in message: # If this bucket is already being concurrently created, we don't need to create it again. 
pass elif error_code == "TooManyBuckets": @@ -191,18 +182,17 @@ def _create_athena_s3_bucket_if_not_exist(self): s3.meta.client.head_bucket(Bucket=s3_bucket_name) else: raise - - s3_waiter = s3_client.get_waiter('bucket_exists') + + s3_waiter = s3_client.get_waiter("bucket_exists") s3_waiter.wait(Bucket=s3_bucket_name) return s3_bucket_name def _create_obs_table_if_not_exist(self): - """Create athena table for observation data if not exists - """ + """Create athena table for observation data if not exists""" # create both partitioned and non-partitioned table for obs data # ensure input path ending with '/' input_obs_data_s3_path = self.join_job_record.get_input_obs_data_s3_path() - input_obs_data_s3_path = input_obs_data_s3_path.strip('/')+'/' + input_obs_data_s3_path = input_obs_data_s3_path.strip("/") + "/" query_string = f""" CREATE EXTERNAL TABLE IF NOT EXISTS {self.obs_table_partitioned} ( @@ -238,8 +228,10 @@ def _create_obs_table_if_not_exist(self): query_id = self._start_query(query_string, s3_output_path) self.wait_query_to_finish(query_id) - logger.debug(f"Successfully create observation table " - f"'{self.obs_table_non_partitioned}' and '{self.obs_table_partitioned}' for query") + logger.debug( + f"Successfully create observation table " + f"'{self.obs_table_non_partitioned}' and '{self.obs_table_partitioned}' for query" + ) def _delete_obs_table_if_exist(self): query_string = f""" @@ -257,12 +249,11 @@ def _delete_obs_table_if_exist(self): self.wait_query_to_finish(query_id) def _create_rewards_table_if_not_exist(self): - """Create athena table for rewards data if not exists - """ + """Create athena table for rewards data if not exists""" # create table if not exists # ensure input path ending with '/' input_reward_data_s3_path = self.join_job_record.get_input_reward_data_s3_path() - input_reward_data_s3_path = input_reward_data_s3_path.strip('/')+'/' + input_reward_data_s3_path = input_reward_data_s3_path.strip("/") + "/" query_string = f""" CREATE EXTERNAL TABLE IF NOT EXISTS {self.rewards_table} ( @@ -288,7 +279,7 @@ def _create_rewards_table_if_not_exist(self): self.wait_query_to_finish(query_id) logger.debug(f"Successfully update s3 location of rewards table '{self.rewards_table}'") - + def _delete_rewards_table_if_exist(self): query_string = f""" DROP TABLE IF EXISTS {self.rewards_table} @@ -309,20 +300,20 @@ def _add_time_partitions(self, start_time, end_time): input_obs_data_s3_path = self.join_job_record.get_input_obs_data_s3_path() # Adding partitions for each hour - partition_string_list = [] + partition_string_list = [] time_delta = end_time - start_time days = time_delta.days seconds = time_delta.seconds - hours = int(days*24 + seconds/3600) + hours = int(days * 24 + seconds / 3600) for i in range(hours + 1): - dt = start_time + timedelta(hours=i) + dt = start_time + timedelta(hours=i) dt_str = dt.strftime("%Y-%m-%d-%H") bucket_dt_str = dt.strftime("%Y/%m/%d/%H") partition_string = f"PARTITION (dt = '{dt_str}') LOCATION '{input_obs_data_s3_path}/{bucket_dt_str}/'" partition_string_list.append(partition_string) query_string = f"ALTER TABLE {self.obs_table_partitioned} ADD IF NOT EXISTS" - + for partition_string in partition_string_list: query_string = f""" {query_string}\n{partition_string}""" @@ -389,13 +380,13 @@ def _get_join_query_string(self, ratio=0.8, train_data=True, start_time=None, en query_sample_string = f"SELECT * FROM joined_table WHERE joined_table.sample_prob <= {ratio}" else: query_sample_string = f"SELECT * FROM joined_table WHERE 
joined_table.sample_prob > {ratio}" - + query_string = f""" {query_string_prefix} {query_sample_string}""" - + return query_string - + def _start_query(self, query_string, s3_output_path): """Start query with given query string and output path @@ -411,16 +402,14 @@ def _start_query(self, query_string, s3_output_path): response = self.athena_client.start_query_execution( QueryString=query_string, ResultConfiguration={ - 'OutputLocation': s3_output_path, - } - ) - query_id = response['QueryExecutionId'] + "OutputLocation": s3_output_path, + }, + ) + query_id = response["QueryExecutionId"] except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError("Failed to submit athena query with error {}: {}".format( - error_code, message - )) + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError("Failed to submit athena query with error {}: {}".format(error_code, message)) return query_id def wait_query_to_finish(self, query_id): @@ -429,28 +418,28 @@ def wait_query_to_finish(self, query_id): Args: query_id (str): query id of Athena query """ - status = 'QUEUED' - while status == 'RUNNING' or status == 'QUEUED': + status = "QUEUED" + while status == "RUNNING" or status == "QUEUED": try: - response = self.athena_client.get_query_execution( - QueryExecutionId=query_id - ) - status = response['QueryExecution']['Status']['State'] + response = self.athena_client.get_query_execution(QueryExecutionId=query_id) + status = response["QueryExecution"]["Status"]["State"] logger.debug(f"Waiting query to finish...") time.sleep(5) except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format( - error_code, message - )) - - if status == 'FAILED': - raise RuntimeError(f"Query failed with reason: {response['QueryExecution']['Status']['StateChangeReason']}") - elif status == 'CANCELLED': + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError( + "Failed to retrieve athena query status with error {}: {}".format(error_code, message) + ) + + if status == "FAILED": + raise RuntimeError( + f"Query failed with reason: {response['QueryExecution']['Status']['StateChangeReason']}" + ) + elif status == "CANCELLED": logger.warning("Query was cancelled...") - elif status == 'SUCCEEDED': - logger.debug("Query finished successfully") + elif status == "SUCCEEDED": + logger.debug("Query finished successfully") def get_query_status(self, query_id): """Return query status given query ID @@ -462,18 +451,14 @@ def get_query_status(self, query_id): str: Status of the query """ try: - response = self.athena_client.get_query_execution( - QueryExecutionId=query_id - ) - status = response['QueryExecution']['Status']['State'] + response = self.athena_client.get_query_execution(QueryExecutionId=query_id) + status = response["QueryExecution"]["Status"]["State"] except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format( - error_code, message - )) + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + raise RuntimeError("Failed to retrieve athena query status with error {}: {}".format(error_code, message)) return status - + def start_join(self, ratio=0.8, wait=True): """Start 
Athena queries for the joining @@ -486,23 +471,21 @@ def start_join(self, ratio=0.8, wait=True): obs_start_time, obs_end_time = self.join_job_record.get_obs_start_end_time() - join_query_for_train_data = self._get_join_query_string(ratio=ratio, - train_data=True, start_time=obs_start_time, end_time=obs_end_time) - join_query_for_eval_data = self._get_join_query_string(ratio=ratio, - train_data=False, start_time=obs_start_time, end_time=obs_end_time) + join_query_for_train_data = self._get_join_query_string( + ratio=ratio, train_data=True, start_time=obs_start_time, end_time=obs_end_time + ) + join_query_for_eval_data = self._get_join_query_string( + ratio=ratio, train_data=False, start_time=obs_start_time, end_time=obs_end_time + ) - s3_output_path = f"s3://{self.query_s3_output_bucket}/" \ - f"{self.experiment_id}/joined_data/{self.join_job_id}" + s3_output_path = f"s3://{self.query_s3_output_bucket}/" f"{self.experiment_id}/joined_data/{self.join_job_id}" logger.info(f"Joined data will be stored under {s3_output_path}") - join_query_id_for_train = self._start_query(join_query_for_train_data, f"{s3_output_path}/train") join_query_id_for_eval = self._start_query(join_query_for_eval_data, f"{s3_output_path}/eval") # updates join table states vid ddb client - self.join_db_client.update_join_job_current_state( - self.experiment_id, self.join_job_id, 'PENDING' - ) + self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, "PENDING") self.join_db_client.update_join_job_output_joined_train_data_s3_path( self.experiment_id, self.join_job_id, f"{s3_output_path}/train" ) @@ -526,8 +509,8 @@ def _val_list_to_csv_byte_string(self, val_list): Return: str: A string in csv format, concatenated by ',' """ - val_str_list = list(map(lambda x: f"\"{x}\"", val_list)) - return str(','.join(val_str_list) + '\n').encode('utf_8') + val_str_list = list(map(lambda x: f'"{x}"', val_list)) + return str(",".join(val_str_list) + "\n").encode("utf_8") def _upload_data_buffer_as_joined_data_format(self, data_buffer, s3_bucket, s3_prefix): """Upload joined data buffer to s3 bucket @@ -553,24 +536,20 @@ def _upload_data_buffer_as_joined_data_format(self, data_buffer, s3_bucket, s3_p s3_client = self.boto_session.client("s3") try: - logger.info("_upload_data_buffer_as_joined_data_format put s3://{}/{}".format( - s3_bucket, joined_data_s3_file_key - )) - s3_client.put_object(Body=body, - Bucket=s3_bucket, - Key=joined_data_s3_file_key) + logger.info( + "_upload_data_buffer_as_joined_data_format put s3://{}/{}".format(s3_bucket, joined_data_s3_file_key) + ) + s3_client.put_object(Body=body, Bucket=s3_bucket, Key=joined_data_s3_file_key) except ClientError as e: - error_code = e.response['Error']['Code'] - message = e.response['Error']['Message'] - logger.error("Failed to upload local joined data with error {}: {}".format( - error_code, message - )) + error_code = e.response["Error"]["Code"] + message = e.response["Error"]["Message"] + logger.error("Failed to upload local joined data with error {}: {}".format(error_code, message)) return None joined_data_file_path = f"s3://{s3_bucket}/{joined_data_s3_file_key}" logger.debug("Waiting for local joined data to be uploaded.") - waiter = s3_client.get_waiter('object_exists') + waiter = s3_client.get_waiter("object_exists") waiter.wait(Bucket=s3_bucket, Key=joined_data_s3_file_key) logger.debug(f"Successfully upload local joined data files to s3 bucket path {joined_data_file_path}") @@ -596,14 +575,11 @@ def start_dummy_join(self, joined_data_buffer, 
ratio=0.8): else: joined_eval_data_buffer.append(record) - s3_output_path = f"s3://{self.query_s3_output_bucket}/" \ - f"{self.experiment_id}/joined_data/{self.join_job_id}" + s3_output_path = f"s3://{self.query_s3_output_bucket}/" f"{self.experiment_id}/joined_data/{self.join_job_id}" logger.info(f"Joined data will be stored under {s3_output_path}") # updates join table states vid ddb client - self.join_db_client.update_join_job_current_state( - self.experiment_id, self.join_job_id, 'PENDING' - ) + self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, "PENDING") self.join_db_client.update_join_job_output_joined_train_data_s3_path( self.experiment_id, self.join_job_id, f"{s3_output_path}/train" ) @@ -615,12 +591,14 @@ def start_dummy_join(self, joined_data_buffer, ratio=0.8): joined_train_data_path = self._upload_data_buffer_as_joined_data_format( joined_train_data_buffer, self.query_s3_output_bucket, - f"{self.experiment_id}/joined_data/{self.join_job_id}/train") + f"{self.experiment_id}/joined_data/{self.join_job_id}/train", + ) joined_eval_data_path = self._upload_data_buffer_as_joined_data_format( joined_eval_data_buffer, self.query_s3_output_bucket, - f"{self.experiment_id}/joined_data/{self.join_job_id}/eval") + f"{self.experiment_id}/joined_data/{self.join_job_id}/eval", + ) # dummy join finished, update joining job state if joined_train_data_path and joined_eval_data_path: @@ -628,28 +606,26 @@ def start_dummy_join(self, joined_data_buffer, ratio=0.8): else: current_state = "FAILED" - self.join_db_client.update_join_job_current_state( - self.experiment_id, self.join_job_id, current_state - ) - + self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, current_state) + def update_join_job_state(self): for num_retries in range(3): try: - join_job_record = self.join_db_client.get_join_job_record( - self.experiment_id, self.join_job_id - ) + join_job_record = self.join_db_client.get_join_job_record(self.experiment_id, self.join_job_id) self._update_join_table_states(join_job_record) except Exception as e: if num_retries >= 2: - current_state = 'FAILED' + current_state = "FAILED" self.join_db_client.update_join_job_current_state( self.experiment_id, self.join_job_id, current_state ) logger.error(f"Failing join job '{self.join_job_id}'...") return else: - logger.warn(f"Received exception '{e}' while updating join " - "job status. This exception will be ignored, and retried.") + logger.warn( + f"Received exception '{e}' while updating join " + "job status. This exception will be ignored, and retried." + ) time.sleep(5) continue @@ -664,7 +640,7 @@ def _update_join_table_states(self, join_job_record): """ if join_job_record is None: return - + current_state = join_job_record.get("current_state", None) join_query_ids = join_job_record.get("join_query_ids", []) @@ -673,8 +649,9 @@ def _update_join_table_states(self, join_job_record): return if not join_query_ids: - raise JoinQueryIdsNotAvailableException(f"Query ids for Joining job " - f"'{self.join_job_id}' cannot be found.") + raise JoinQueryIdsNotAvailableException( + f"Query ids for Joining job " f"'{self.join_job_id}' cannot be found." 
+ ) query_states = [] @@ -682,22 +659,14 @@ def _update_join_table_states(self, join_job_record): query_states.append(self.get_query_status(query_id)) # only 'SUCCEEDED' if both queries are 'SUCCEEDED' - if query_states[0] == 'SUCCEEDED' and query_states[1] == 'SUCCEEDED': - current_state = 'SUCCEEDED' - elif 'FAILED' in query_states: - current_state = 'FAILED' - elif 'CANCELLED' in query_states: - current_state = 'CANCELLED' + if query_states[0] == "SUCCEEDED" and query_states[1] == "SUCCEEDED": + current_state = "SUCCEEDED" + elif "FAILED" in query_states: + current_state = "FAILED" + elif "CANCELLED" in query_states: + current_state = "CANCELLED" else: - current_state = 'RUNNING' + current_state = "RUNNING" # update table states via ddb client - self.join_db_client.update_join_job_current_state( - self.experiment_id, self.join_job_id, current_state - ) - - - - - - + self.join_db_client.update_join_job_current_state(self.experiment_id, self.join_job_id, current_state) diff --git a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py index dc92a68a..122c2e3c 100644 --- a/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py +++ b/09_deploy/common/sagemaker_rl/orchestrator/workflow/manager/model_manager.py @@ -22,50 +22,52 @@ from src.vw_utils import EVAL_CHANNEL logger = logging.getLogger("orchestrator") - - + + class CaptureStdout(list): def __enter__(self): self._stdout = sys.stdout sys.stdout = self._stringio = StringIO() return self + def __exit__(self, type, value, traceback): self.extend(self._stringio.getvalue().splitlines()) - del self._stringio # free up some memory + del self._stringio # free up some memory sys.stdout = self._stdout - + # Capture the exception and don't throw it back for graceful exit. return True -class ModelManager(): +class ModelManager: """A model entity with the given experiment. This class will handle the model creation, model training, model evaluation and model metadata management. """ def __init__( - self, - model_db_client: ModelDbClient, - experiment_id, - model_id, - image=None, - role=None, - instance_config={}, - boto_session=None, - algor_config={}, - train_state=None, - evaluation_job_name=None, - eval_state=None, - eval_scores={}, - input_model_id=None, - rl_estimator=None, - input_data_s3_prefix=None, - manifest_file_path=None, - eval_data_s3_path=None, - s3_model_output_path=None, - training_start_time=None, - training_end_time=None): + self, + model_db_client: ModelDbClient, + experiment_id, + model_id, + image=None, + role=None, + instance_config={}, + boto_session=None, + algor_config={}, + train_state=None, + evaluation_job_name=None, + eval_state=None, + eval_scores={}, + input_model_id=None, + rl_estimator=None, + input_data_s3_prefix=None, + manifest_file_path=None, + eval_data_s3_path=None, + s3_model_output_path=None, + training_start_time=None, + training_end_time=None, + ): """Initialize a model entity in the current experiment Args: @@ -83,7 +85,7 @@ def __init__( configuration for the model training/evaluation job. boto_session (boto3.session.Session): A session stores configuration state and allows you to create service clients and resources. - algor_config (dict): A dictionary that specify the algorithm type + algor_config (dict): A dictionary that specify the algorithm type and hyper parameters of the training/evaluation job. train_state (str): State of the model training job. 
evaluation_job_name (str): Job name for Latest Evaluation Job for this model @@ -95,7 +97,7 @@ def __init__( a SageMaker Training Job. input_data_s3_prefix (str): Input data path for the data source of the model training job. - s3_model_output_path (str): Output data path of model artifact for the + s3_model_output_path (str): Output data path of model artifact for the model training job. training_start_time (str): Starting timestamp of the model training job. training_end_time (str): Finished timestamp of the model training job. @@ -120,7 +122,7 @@ def __init__( self.instance_count = self.instance_config.get("instance_count", 1) self.algor_params = self.algor_config.get("algorithms_parameters", {}) - # create a local ModelRecord object. + # create a local ModelRecord object. self.model_record = ModelRecord( experiment_id, model_id, @@ -134,21 +136,16 @@ def __init__( eval_data_s3_path, s3_model_output_path, training_start_time, - training_end_time - ) - - # try to save this record file. if it throws RecordAlreadyExistsException + training_end_time, + ) + + # try to save this record file. if it throws RecordAlreadyExistsException # reload the record from ModelDb, and recreate try: - self.model_db_client.create_new_model_record( - self.model_record.to_ddb_record() - ) + self.model_db_client.create_new_model_record(self.model_record.to_ddb_record()) except RecordAlreadyExistsException: logger.debug("Model already exists. Reloading from model record.") - model_record = self.model_db_client.get_model_record( - experiment_id, - model_id - ) + model_record = self.model_db_client.get_model_record(experiment_id, model_id) self.model_record = ModelRecord.load_from_ddb_record(model_record) except Exception as e: logger.error("Unhandled Exception! " + str(e)) @@ -158,7 +155,7 @@ def __init__( boto_session = boto3.Session() self.boto_session = boto_session - if self.instance_type == 'local': + if self.instance_type == "local": self.sagemaker_session = LocalSession() else: self.sagemaker_session = sagemaker.session.Session(self.boto_session) @@ -198,30 +195,28 @@ def _get_rl_estimator_args(self, eval=False): job_types = "evaluation_jobs" if eval else "training_jobs" sagemaker_bucket = self.sagemaker_session.default_bucket() - output_path = f"s3://{sagemaker_bucket}/{self.experiment_id}/{job_types}/" + output_path = f"s3://{sagemaker_bucket}/{self.experiment_id}/{job_types}/" metric_definitions = [ - { - 'Name': 'average_loss', - 'Regex': 'average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$' - } - ] - - args = dict(entry_point=entry_point, - source_dir='src', - dependencies=["common/sagemaker_rl"], - image_uri=self.image, - role=self.role, - sagemaker_session=self.sagemaker_session, - instance_type=self.instance_type, - instance_count=self.instance_count, - metric_definitions=metric_definitions, - hyperparameters=self.algor_params, - output_path=output_path, - code_location=output_path.strip('/') - ) - - if self.instance_type == 'local': + {"Name": "average_loss", "Regex": "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$"} + ] + + args = dict( + entry_point=entry_point, + source_dir="src", + dependencies=["common/sagemaker_rl"], + image_uri=self.image, + role=self.role, + sagemaker_session=self.sagemaker_session, + instance_type=self.instance_type, + instance_count=self.instance_count, + metric_definitions=metric_definitions, + hyperparameters=self.algor_params, + output_path=output_path, + code_location=output_path.strip("/"), + ) + + if self.instance_type == "local": 
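            # "local" selects SageMaker local mode (the LocalSession created in
            # __init__), running the job in a local container instead of on
            # managed training instances.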
logger.info(f"{estimator_type} job will be executed in 'local' mode") else: logger.info(f"{estimator_type} job will be executed in 'SageMaker' mode") @@ -231,29 +226,19 @@ def _fit_first_model(self, input_data_s3_prefix=None, manifest_file_path=None, w """ A Estimator fit() call to initiate the first model of the experiment """ - - + rl_estimator_args = self._get_rl_estimator_args() self.rl_estimator = RLEstimator(**rl_estimator_args) if manifest_file_path: input_data = sagemaker.session.s3_input( - s3_data=manifest_file_path, - input_mode='File', - s3_data_type='ManifestFile' - ) + s3_data=manifest_file_path, input_mode="File", s3_data_type="ManifestFile" + ) self.rl_estimator.fit(job_name=self.model_id, inputs=input_data, wait=wait, logs=logs) else: - self.rl_estimator.fit(job_name=self.model_id, inputs=input_data_s3_prefix, wait=wait,logs=logs) - - def fit( - self, - input_model_id=None, - input_data_s3_prefix=None, - manifest_file_path=None, - wait=False, - logs=True - ): + self.rl_estimator.fit(job_name=self.model_id, inputs=input_data_s3_prefix, wait=wait, logs=logs) + + def fit(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True): """A Estimator fit() call to start a model training job. Args: @@ -269,52 +254,39 @@ def fit( self.model_record.add_new_training_job_info( input_model_id=input_model_id, input_data_s3_prefix=input_data_s3_prefix, - manifest_file_path=manifest_file_path + manifest_file_path=manifest_file_path, ) self.model_db_client.update_model_record(self._jsonify()) if input_model_id is None: self._fit_first_model( - input_data_s3_prefix=input_data_s3_prefix, - manifest_file_path=manifest_file_path, - wait=wait, - logs=logs) + input_data_s3_prefix=input_data_s3_prefix, manifest_file_path=manifest_file_path, wait=wait, logs=logs + ) else: # use 'input_model_id' as pretrained model for training - input_model_record = self.model_db_client.get_model_record( - self.experiment_id, - input_model_id - ) + input_model_record = self.model_db_client.get_model_record(self.experiment_id, input_model_id) model_artifact_path = input_model_record.get("s3_model_output_path") rl_estimator_args = self._get_rl_estimator_args() - rl_estimator_args['model_channel_name'] = 'pretrained_model' - rl_estimator_args['model_uri'] = model_artifact_path + rl_estimator_args["model_channel_name"] = "pretrained_model" + rl_estimator_args["model_uri"] = model_artifact_path self.rl_estimator = RLEstimator(**rl_estimator_args) if manifest_file_path: - inputs = sagemaker.session.s3_input( - s3_data=manifest_file_path, - s3_data_type='ManifestFile' - ) + inputs = sagemaker.session.s3_input(s3_data=manifest_file_path, s3_data_type="ManifestFile") else: inputs = input_data_s3_prefix - self.rl_estimator.fit( - job_name=self.model_id, - inputs=inputs, - wait=wait, - logs=logs - ) + self.rl_estimator.fit(job_name=self.model_id, inputs=inputs, wait=wait, logs=logs) def evaluate( - self, - input_data_s3_prefix=None, - manifest_file_path=None, - evaluation_job_name=None, - local_mode=True, - wait=False, - logs=True - ): + self, + input_data_s3_prefix=None, + manifest_file_path=None, + evaluation_job_name=None, + local_mode=True, + wait=False, + logs=True, + ): """A Estimator fit() call to start a model evaluation job. Args: @@ -331,33 +303,29 @@ def evaluate( # Model object has already been initialized with up-to-date DDb record. 
model_artifact_path = self.model_record.get_model_artifact_path() rl_estimator_args = self._get_rl_estimator_args(eval=True) - rl_estimator_args['model_channel_name'] = 'pretrained_model' - rl_estimator_args['model_uri'] = model_artifact_path + rl_estimator_args["model_channel_name"] = "pretrained_model" + rl_estimator_args["model_uri"] = model_artifact_path if manifest_file_path: - inputs = sagemaker.session.s3_input( - s3_data=manifest_file_path, - s3_data_type='ManifestFile' - ) + inputs = sagemaker.session.s3_input(s3_data=manifest_file_path, s3_data_type="ManifestFile") if local_mode: rl_estimator_args["hyperparameters"].update({"local_mode_manifest": True}) else: inputs = input_data_s3_prefix - + # (dict[str, str] or dict[str, sagemaker.session.s3_input]) for evaluation channel eval_channel_inputs = {EVAL_CHANNEL: inputs} self.rl_estimator = RLEstimator(**rl_estimator_args) - # update to save eval_data_s3_path in DDb as well, or + # update to save eval_data_s3_path in DDb as well, or # update to read from SM describe call... maybe will not work in local mode but. eval_data_s3_path = manifest_file_path if (manifest_file_path is not None) else input_data_s3_prefix # we keep eval job state as pending, before the SM job has been submitted. # the syncer function should update this state, based on SM job status. self.model_record.add_new_evaluation_job_info( - evaluation_job_name=evaluation_job_name, - eval_data_s3_path=eval_data_s3_path + evaluation_job_name=evaluation_job_name, eval_data_s3_path=eval_data_s3_path ) self.model_db_client.update_model_record(self._jsonify()) @@ -369,26 +337,16 @@ def evaluate( # Capture eval score by regex expression # log should contain only one "average loss = some number" pattern with CaptureStdout() as log_output: - self.rl_estimator.fit( - job_name=evaluation_job_name, - inputs=eval_channel_inputs, - wait=wait, - logs=logs - ) + self.rl_estimator.fit(job_name=evaluation_job_name, inputs=eval_channel_inputs, wait=wait, logs=logs) - self.log_output = '\n'.join(log_output) + self.log_output = "\n".join(log_output) logger.debug(self.log_output) else: - self.rl_estimator.fit( - job_name=evaluation_job_name, - inputs=eval_channel_inputs, - wait=wait, - logs=logs - ) + self.rl_estimator.fit(job_name=evaluation_job_name, inputs=eval_channel_inputs, wait=wait, logs=logs) def update_model_training_state(self): self._update_model_table_training_states() - + def update_model_evaluation_state(self): self._update_model_table_evaluation_states() @@ -411,59 +369,57 @@ def _update_model_table_training_states(self): # need not do anything. self.model_db_client.update_model_record(self._jsonify()) return self._jsonify() - + # Else, try and fetch updated SageMaker TrainingJob status sm_job_info = {} - - max_describe_retries = 100 + + max_describe_retries = 100 sleep_between_describe_retries = 10 - + for i in range(max_describe_retries): try: - sm_job_info = self.sagemaker_client.describe_training_job( - TrainingJobName=self.model_id) + sm_job_info = self.sagemaker_client.describe_training_job(TrainingJobName=self.model_id) except Exception as e: if "ValidationException" in str(e): if i > max_describe_retries: # max attempts for DescribeTrainingJob. Fail with ValidationException - logger.warn(f"Looks like SageMaker Job was not submitted successfully." - f" Failing Training Job with ModelId {self.model_id}" + logger.warn( + f"Looks like SageMaker Job was not submitted successfully." 
+ f" Failing Training Job with ModelId {self.model_id}" ) self.model_record.update_model_as_failed() self.model_db_client.update_model_as_failed(self._jsonify()) return - else: + else: time.sleep(sleep_between_describe_retries) continue else: - # Do not raise exception, most probably throttling. - logger.warn(f"Failed to check SageMaker Training Job state for ModelId {self.model_id}." - " This exception will be ignored, and retried." + # Do not raise exception, most probably throttling. + logger.warn( + f"Failed to check SageMaker Training Job state for ModelId {self.model_id}." + " This exception will be ignored, and retried." ) logger.debug(e) time.sleep(sleep_between_describe_retries) return self._jsonify() - train_state = sm_job_info.get('TrainingJobStatus', "Pending") - training_start_time = sm_job_info.get('TrainingStartTime', None) + train_state = sm_job_info.get("TrainingJobStatus", "Pending") + training_start_time = sm_job_info.get("TrainingStartTime", None) training_end_time = sm_job_info.get("TrainingEndTime", None) if training_start_time is not None: - training_start_time = training_start_time.strftime("%Y-%m-%d %H:%M:%S") + training_start_time = training_start_time.strftime("%Y-%m-%d %H:%M:%S") if training_end_time is not None: - training_end_time = training_end_time.strftime("%Y-%m-%d %H:%M:%S") - - model_artifacts = sm_job_info.get('ModelArtifacts', None) + training_end_time = training_end_time.strftime("%Y-%m-%d %H:%M:%S") + + model_artifacts = sm_job_info.get("ModelArtifacts", None) if model_artifacts is not None: s3_model_output_path = model_artifacts.get("S3ModelArtifacts", None) else: s3_model_output_path = None self.model_record.update_model_job_status( - training_start_time, - training_end_time, - train_state, - s3_model_output_path + training_start_time, training_end_time, train_state, s3_model_output_path ) self.model_db_client.update_model_job_state(self._jsonify()) @@ -481,63 +437,65 @@ def _update_model_table_evaluation_states(self): """ if self.model_record.eval_in_terminal_state(): - self.model_db_client.update_model_record( - self._jsonify() - ) + self.model_db_client.update_model_record(self._jsonify()) return self._jsonify() - + # Try and fetch updated SageMaker Training Job Status sm_eval_job_info = {} - - max_describe_retries = 100 + + max_describe_retries = 100 sleep_between_describe_retries = 10 for i in range(max_describe_retries): try: sm_eval_job_info = self.sagemaker_client.describe_training_job( - TrainingJobName=self.model_record._evaluation_job_name) + TrainingJobName=self.model_record._evaluation_job_name + ) except Exception as e: if "ValidationException" in str(e): print(e) if i > max_describe_retries: # 3rd attempt for DescribeTrainingJob with validation failure - logger.warn("Looks like SageMaker Job was not submitted successfully." - f" Failing EvaluationJob {self.model_record._evaluation_job_name}" + logger.warn( + "Looks like SageMaker Job was not submitted successfully." + f" Failing EvaluationJob {self.model_record._evaluation_job_name}" ) self.model_record.update_eval_job_as_failed() self.model_db_client.update_model_eval_as_failed(self._jsonify()) return - else: + else: time.sleep(sleep_between_describe_retries) continue else: - # Do not raise exception, most probably throttling. - logger.warn("Failed to check SageMaker Training Job state for EvaluationJob: " - f" {self.model_record._evaluation_job_name}. This exception will be ignored," - " and retried." + # Do not raise exception, most probably throttling. 
+ logger.warn( + "Failed to check SageMaker Training Job state for EvaluationJob: " + f" {self.model_record._evaluation_job_name}. This exception will be ignored," + " and retried." ) time.sleep(sleep_between_describe_retries) return self._jsonify() - - eval_state = sm_eval_job_info.get('TrainingJobStatus', 'Pending') - if eval_state == 'Completed': + eval_state = sm_eval_job_info.get("TrainingJobStatus", "Pending") + if eval_state == "Completed": eval_score = "n.a." if self.local_mode: - rgx = re.compile('average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$', re.M) + rgx = re.compile("average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$", re.M) eval_score_rgx = rgx.findall(self.log_output) - + if len(eval_score_rgx) == 0: logger.warning("No eval score available from vw job log.") else: - eval_score = eval_score_rgx[0][0] # [('eval_score', '')] + eval_score = eval_score_rgx[0][0] # [('eval_score', '')] else: attempts = 0 - while eval_score == 'n.a.' and attempts < 4: + while eval_score == "n.a." and attempts < 4: try: - metric_df = TrainingJobAnalytics(self.model_record._evaluation_job_name, ['average_loss']).dataframe() - eval_score = str(metric_df[metric_df['metric_name'] == 'average_loss']['value'][0]) + metric_df = TrainingJobAnalytics( + self.model_record._evaluation_job_name, ["average_loss"] + ).dataframe() + eval_score = str(metric_df[metric_df["metric_name"] == "average_loss"]["value"][0]) except Exception: # to avoid throttling time.sleep(5) @@ -549,4 +507,4 @@ def _update_model_table_evaluation_states(self): else: # update eval state via ddb client self.model_record.update_eval_job_state(eval_state) - self.model_db_client.update_model_eval_job_state(self._jsonify()) \ No newline at end of file + self.model_db_client.update_model_eval_job_state(self._jsonify()) diff --git a/09_deploy/common/sagemaker_rl/ray_launcher.py b/09_deploy/common/sagemaker_rl/ray_launcher.py index f787859d..2a64a670 100644 --- a/09_deploy/common/sagemaker_rl/ray_launcher.py +++ b/09_deploy/common/sagemaker_rl/ray_launcher.py @@ -28,13 +28,14 @@ class Cluster(Enum): for Neural Network training and secondary cluster has CPU instances for rollouts. For single machine or homogeneous cluster, primary is the default type. """ + Primary = "primary" Secondary = "secondary" class SageMakerRayLauncher(object): """Base class for SageMaker RL applications using Ray-RLLib. - Customers should sub-class this, fill in the required methods, and + Customers should sub-class this, fill in the required methods, and call .train_main() to start a training process. Example:: @@ -47,7 +48,7 @@ def create_environment(env_config): class MyLauncher(SageMakerRayLauncher): def register_env_creator(self): register_env("RoboschoolHumanoid-v1", create_environment) - + def get_experiment_config(self): return { "training": { @@ -81,16 +82,14 @@ def _get_cluster_type(self): return Cluster.Secondary def register_env_creator(self): - """Sub-classes must implement this. - """ + """Sub-classes must implement this.""" raise NotImplementedError("Subclasses should implement this to call ray.tune.registry.register_env") def get_experiment_config(self): raise NotImplementedError("Subclasses must define the experiment config to pass to ray.tune.run_experiments") def customize_experiment_config(self, config): - """Applies command-line hyperparameters to the config. - """ + """Applies command-line hyperparameters to the config.""" # TODO: use ConfigList from Coach launcher, and share customization code. 
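        # SM_HPS holds the job's hyperparameters as a JSON-encoded dict, injected
        # into the container environment by the SageMaker training toolkit.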
hyperparams_dict = json.loads(os.environ.get("SM_HPS", "{}")) @@ -98,7 +97,7 @@ def customize_experiment_config(self, config): # TODO: move this to before customer-specified so they can override hyperparams_dict["rl.training.local_dir"] = INTERMEDIATE_DIR hyperparams_dict["rl.training.checkpoint_at_end"] = True - hyperparams_dict["rl.training.checkpoint_freq"] = config['training'].get('checkpoint_freq', 10) + hyperparams_dict["rl.training.checkpoint_freq"] = config["training"].get("checkpoint_freq", 10) self.hyperparameters = ConfigurationList() # TODO: move to shared for name, value in hyperparams_dict.items(): @@ -132,9 +131,9 @@ def ray_init_config(self): return config master_ip = get_ip_from_host(host_name=self.host_name) self.start_ray_cluster(master_ip) - self.sage_cluster_communicator.write_host_config(ip=master_ip, - host_name="%s:%s" % ( - self.cluster_type.value, self.host_name)) + self.sage_cluster_communicator.write_host_config( + ip=master_ip, host_name="%s:%s" % (self.cluster_type.value, self.host_name) + ) self.sage_cluster_communicator.create_s3_signal("%s:%s" % (self.cluster_type.value, self.host_name)) print("Waiting for %s worker nodes to join!" % (len(all_wokers_host_names))) self.sage_cluster_communicator.wait_for_signals(all_wokers_host_names) @@ -158,13 +157,17 @@ def ray_init_config(self): def start_ray_cluster(self, master_ip): if ray.__version__ >= "0.6.5": - p = subprocess.Popen("ray start --head --redis-port=6379 --node-ip-address=%s" % master_ip, - shell=True, - stderr=subprocess.STDOUT) + p = subprocess.Popen( + "ray start --head --redis-port=6379 --node-ip-address=%s" % master_ip, + shell=True, + stderr=subprocess.STDOUT, + ) else: - p = subprocess.Popen("ray start --head --redis-port=6379 --no-ui --node-ip-address=%s" % master_ip, - shell=True, - stderr=subprocess.STDOUT) + p = subprocess.Popen( + "ray start --head --redis-port=6379 --no-ui --node-ip-address=%s" % master_ip, + shell=True, + stderr=subprocess.STDOUT, + ) time.sleep(3) if p.poll() != 0: @@ -172,11 +175,18 @@ def start_ray_cluster(self, master_ip): def join_ray_cluster(self, master_ip, node_ip): if ray.__version__ >= "0.8.2": - p = subprocess.Popen("ray start --address=%s:6379" % (master_ip), - shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + p = subprocess.Popen( + "ray start --address=%s:6379" % (master_ip), + shell=True, + stderr=subprocess.STDOUT, + stdout=subprocess.PIPE, + ) else: - p = subprocess.Popen("ray start --redis-address=%s:6379 --node-ip-address=%s" % (master_ip, node_ip), - shell=True, stderr=subprocess.STDOUT) + p = subprocess.Popen( + "ray start --redis-address=%s:6379 --node-ip-address=%s" % (master_ip, node_ip), + shell=True, + stderr=subprocess.STDOUT, + ) time.sleep(3) if p.poll() != 0: raise RuntimeError("Could not join Ray server running at %s:6379" % master_ip) @@ -196,8 +206,9 @@ def copy_checkpoints_to_model_output(self): checkpoints.sort(key=natural_keys) latest_checkpoints = checkpoints[-2:] - validation = sum(1 if x.endswith("tune_metadata") or x.endswith("extra_data") else 0 for x in - latest_checkpoints) + validation = sum( + 1 if x.endswith("tune_metadata") or x.endswith("extra_data") else 0 for x in latest_checkpoints + ) if ray.__version__ >= "0.6.5": if validation is not 1: @@ -254,8 +265,8 @@ def save_checkpoint_and_serving_model(self, algorithm=None, env_string=None): def set_up_checkpoint(self, config=None): try: - checkpoint_dir = config['training']['restore'] - print("Found checkpoint dir %s in user config." 
%checkpoint_dir) + checkpoint_dir = config["training"]["restore"] + print("Found checkpoint dir %s in user config." % checkpoint_dir) return config except KeyError: pass @@ -269,13 +280,15 @@ def set_up_checkpoint(self, config=None): print("checkpoint_dir is {}".format(checkpoint_dir)) checkpoint_dir_contents = os.listdir(checkpoint_dir) if len(checkpoint_dir_contents) not in [2, 3]: - raise RuntimeError(f"Unexpected files {checkpoint_dir_contents} in checkpoint dir. " - "Please check ray documents for the correct checkpoint format.") + raise RuntimeError( + f"Unexpected files {checkpoint_dir_contents} in checkpoint dir. " + "Please check ray documents for the correct checkpoint format." + ) validation = 0 checkpoint_file_in_container = "" for filename in checkpoint_dir_contents: - is_tune_metadata= filename.endswith("tune_metadata") + is_tune_metadata = filename.endswith("tune_metadata") is_extra_data = filename.endswith("extra_data") is_checkpoint_meta = is_tune_metadata + is_extra_data validation += is_checkpoint_meta @@ -288,20 +301,21 @@ def set_up_checkpoint(self, config=None): else: if validation is not 2: raise RuntimeError("Failed to find .tune_metadata or .extra_data to restore checkpoint") - + if checkpoint_file_in_container: - print("Found checkpoint: %s. Setting `restore` path in ray config." %checkpoint_file_in_container) - config['training']['restore'] = checkpoint_file_in_container + print("Found checkpoint: %s. Setting `restore` path in ray config." % checkpoint_file_in_container) + config["training"]["restore"] = checkpoint_file_in_container else: - print("No valid checkpoint found in %s. Training from scratch." %checkpoint_dir) + print("No valid checkpoint found in %s. Training from scratch." % checkpoint_dir) return config - + def _checkpoint_dir_finder(self, current_dir=None): current_dir_subfolders = os.walk(current_dir).__next__()[1] if len(current_dir_subfolders) > 1: - raise RuntimeError(f"Multiple folders detected: '{current_dir_subfolders}'." - "Please provide one checkpoint only." ) + raise RuntimeError( + f"Multiple folders detected: '{current_dir_subfolders}'." "Please provide one checkpoint only." + ) elif not current_dir_subfolders: return current_dir return self._checkpoint_dir_finder(os.path.join(current_dir, *current_dir_subfolders)) @@ -322,11 +336,12 @@ def launch(self): experiment_config = self.get_experiment_config() experiment_config = self.customize_experiment_config(experiment_config) experiment_config = self.set_up_checkpoint(experiment_config) - - print("Important! Ray with version <=7.2 may report \"Did not find checkpoint file\" even if the", - "experiment is actually restored successfully. If restoration is expected, please check", - "\"training_iteration\" in the experiment info to confirm." - ) + + print( + 'Important! Ray with version <=7.2 may report "Did not find checkpoint file" even if the', + "experiment is actually restored successfully. If restoration is expected, please check", + '"training_iteration" in the experiment info to confirm.', + ) run_experiments(experiment_config) all_wokers_host_names = self.get_all_host_names()[1:] # If distributed job, send TERMINATION_SIGNAL to all workers. 
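For context between these hunks: the launcher is driven entirely by the two hooks reformatted above, register_env_creator() and get_experiment_config(). A minimal sketch of a subclass, assuming the module is importable as sagemaker_rl.ray_launcher and using CartPole purely as an illustrative environment (neither assumption comes from this patch), might look like:

    import gym
    from ray.tune.registry import register_env

    from sagemaker_rl.ray_launcher import SageMakerRayLauncher  # import path assumed from this repo layout


    class CartPoleLauncher(SageMakerRayLauncher):
        def register_env_creator(self):
            # Any gym-compatible creator works; CartPole keeps the sketch self-contained.
            register_env("CartPole-v0-ray", lambda env_config: gym.make("CartPole-v0"))

        def get_experiment_config(self):
            # launch() reads training.run and training.config.env from this dict,
            # which is why both keys appear here.
            return {
                "training": {
                    "run": "PPO",
                    "stop": {"training_iteration": 5},
                    "config": {"env": "CartPole-v0-ray", "num_workers": 2},
                }
            }


    if __name__ == "__main__":
        CartPoleLauncher.train_main()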
@@ -335,12 +350,10 @@ def launch(self): algo = experiment_config["training"]["run"] env_string = experiment_config["training"]["config"]["env"] - self.save_checkpoint_and_serving_model(algorithm=algo, - env_string=env_string) + self.save_checkpoint_and_serving_model(algorithm=algo, env_string=env_string) @classmethod def train_main(cls): - """main function that kicks things off - """ + """main function that kicks things off""" launcher = cls() launcher.launch() diff --git a/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py b/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py index 6a2e3184..cd47d95e 100644 --- a/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py +++ b/09_deploy/common/sagemaker_rl/sage_cluster_communicator.py @@ -5,7 +5,7 @@ import time -class SageClusterCommunicator(): +class SageClusterCommunicator: def __init__(self): bucket = os.environ.get("SM_HP_S3_BUCKET", None) prefix = os.environ.get("SM_HP_S3_PREFIX", None) @@ -20,7 +20,7 @@ def __init__(self): def get_client(self): session = boto3.session.Session() - return session.client('s3', region_name=self.aws_region) + return session.client("s3", region_name=self.aws_region) def _get_s3_key(self, key): return os.path.normpath(self.s3_prefix + "/config/" + key) @@ -39,10 +39,10 @@ def _find_s3_output_path(self): tuple (bucket, prefix) """ module_dir_s3_path = self._required_environment_param("module_dir") - if not module_dir_s3_path.startswith('s3://'): + if not module_dir_s3_path.startswith("s3://"): raise ValueError('Unexpected format for module_dir_s3_path. Expected "s3://...') bucket_prefix = module_dir_s3_path.replace("s3://", "") - bucket, key = bucket_prefix.split('/', 1) + bucket, key = bucket_prefix.split("/", 1) prefix = "/".join(key.split("/")[:-2]) if prefix == "": # {bucket}/{job_name}/source/sourcedir.tar.gz structure not present @@ -51,7 +51,7 @@ def _find_s3_output_path(self): def create_s3_signal(self, signal): s3_client = self.get_client() - s3_client.upload_fileobj(io.BytesIO(b''), self.s3_bucket, self._get_s3_key(signal)) + s3_client.upload_fileobj(io.BytesIO(b""), self.s3_bucket, self._get_s3_key(signal)) def wait_for_signals(self, signals, timeout=600, sleep_time=5): if len(signals) == 0: @@ -69,7 +69,8 @@ def wait_for_signals(self, signals, timeout=600, sleep_time=5): time_elapsed += sleep_time if time_elapsed >= timeout: raise RuntimeError( - "Could not find all the signals: %s for last %s seconds" % (signals, time_elapsed)) + "Could not find all the signals: %s for last %s seconds" % (signals, time_elapsed) + ) else: print("Received all signal[s]: %s" % signals) return @@ -79,7 +80,7 @@ def write_host_config(self, ip, host_name): data = {"IP": ip, "HOST_NAME": host_name} json_blob = json.dumps(data) file_handle = io.BytesIO(json_blob.encode()) - file_handle_done = io.BytesIO(b'done') + file_handle_done = io.BytesIO(b"done") s3_client.upload_fileobj(file_handle, self.s3_bucket, self._get_s3_key(self.ip_key)) s3_client.upload_fileobj(file_handle_done, self.s3_bucket, self._get_s3_key(self.done_file_key)) @@ -87,7 +88,7 @@ def get_master_config(self): s3_client = self.get_client() self._wait_for_ip_upload() try: - s3_client.download_file(self.s3_bucket, self._get_s3_key(self.ip_key), 'ip.json') + s3_client.download_file(self.s3_bucket, self._get_s3_key(self.ip_key), "ip.json") with open("ip.json") as f: json_obj = json.load(f) ip = json_obj["IP"] @@ -122,9 +123,7 @@ def download_file(self, s3_key, local_path): def upload_file(self, s3_key, local_path): s3_client = 
self.get_client() try: - s3_client.upload_file(Filename=local_path, - Bucket=self.s3_bucket, - Key=s3_key) + s3_client.upload_file(Filename=local_path, Bucket=self.s3_bucket, Key=s3_key) return True except Exception as e: return False diff --git a/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py b/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py index 26a1b45f..dd258940 100644 --- a/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py +++ b/09_deploy/common/sagemaker_rl/stable_baselines_launcher.py @@ -20,7 +20,7 @@ def reward(self, _reward): return _reward * self.scale -class SagemakerStableBaselinesLauncher(): +class SagemakerStableBaselinesLauncher: """ Sagemaker's Stable Baselines Launcher. """ @@ -32,23 +32,22 @@ def __init__(self, env, output_path, model, num_timesteps): self._num_timesteps = num_timesteps def _train(self): - """Train the RL model - """ + """Train the RL model""" self._model.learn(total_timesteps=self._num_timesteps) def _predict(self, model, video_path): - """Run predictions on trained RL model. - """ + """Run predictions on trained RL model.""" - vr = VideoRecorder(env=self._env, path="{}/rl_out.mp4".format(video_path, str(MPI.COMM_WORLD.Get_rank())), - enabled=True) + vr = VideoRecorder( + env=self._env, path="{}/rl_out.mp4".format(video_path, str(MPI.COMM_WORLD.Get_rank())), enabled=True + ) obs = self._env.reset() for i in range(1000): action, _states = model.predict(obs) obs, rewards, dones, info = self._env.step(action) if dones: obs = self._env.reset() - self._env.render(mode='rgb_array') + self._env.render(mode="rgb_array") vr.capture_frame() vr.close() self._env.close() @@ -66,33 +65,59 @@ class SagemakerStableBaselinesPPO1Launcher(SagemakerStableBaselinesLauncher): Sagemaker's Stable Baselines PPO1 Launcher. 
""" - def __init__(self, env, output_path, timesteps_per_actorbatch, - clip_param, entcoeff, optim_epochs, - optim_stepsize, optim_batchsize, - gamma, lam, schedule, - verbose, num_timesteps): + def __init__( + self, + env, + output_path, + timesteps_per_actorbatch, + clip_param, + entcoeff, + optim_epochs, + optim_stepsize, + optim_batchsize, + gamma, + lam, + schedule, + verbose, + num_timesteps, + ): print( "Initializing PPO with output_path: {} and Hyper Params [timesteps_per_actorbatch: {},clip_param: {}, " "entcoeff: {}, optim_epochs: {}, optim_stepsize: {}, optim_batchsize: {}, gamma: {}, lam: {}, " - "schedule: {}, verbose: {}, num_timesteps: {}]".format(output_path, timesteps_per_actorbatch, - clip_param, entcoeff, optim_epochs, - optim_stepsize, optim_batchsize, - gamma, lam, schedule, - verbose, num_timesteps)) - super().__init__(env, output_path, - PPO1(policy=MlpPolicy, - env=env, - gamma=gamma, - timesteps_per_actorbatch=timesteps_per_actorbatch, - clip_param=clip_param, - entcoeff=entcoeff, - optim_epochs=optim_epochs, - optim_stepsize=optim_stepsize, - optim_batchsize=optim_batchsize, - lam=lam, - schedule=schedule, - verbose=verbose), - num_timesteps) + "schedule: {}, verbose: {}, num_timesteps: {}]".format( + output_path, + timesteps_per_actorbatch, + clip_param, + entcoeff, + optim_epochs, + optim_stepsize, + optim_batchsize, + gamma, + lam, + schedule, + verbose, + num_timesteps, + ) + ) + super().__init__( + env, + output_path, + PPO1( + policy=MlpPolicy, + env=env, + gamma=gamma, + timesteps_per_actorbatch=timesteps_per_actorbatch, + clip_param=clip_param, + entcoeff=entcoeff, + optim_epochs=optim_epochs, + optim_stepsize=optim_stepsize, + optim_batchsize=optim_batchsize, + lam=lam, + schedule=schedule, + verbose=verbose, + ), + num_timesteps, + ) def create_env(env_id, output_path, seed=0): diff --git a/09_deploy/common/sagemaker_rl/tf_serving_utils.py b/09_deploy/common/sagemaker_rl/tf_serving_utils.py index bf867c48..55ceaafd 100644 --- a/09_deploy/common/sagemaker_rl/tf_serving_utils.py +++ b/09_deploy/common/sagemaker_rl/tf_serving_utils.py @@ -5,12 +5,13 @@ tf = try_import_tf() + def atoi(text): return int(text) if text.isdigit() else text def natural_keys(text): - return [atoi(c) for c in re.split('(\d+)', text)] + return [atoi(c) for c in re.split("(\d+)", text)] def change_permissions_recursive(path, mode): @@ -33,18 +34,16 @@ def export_tf_serving(agent, output_dir): output_signature["actions"] = tf.saved_model.utils.build_tensor_info(policy.sampler) output_signature["logits"] = tf.saved_model.utils.build_tensor_info(policy.logits) - signature_def = ( - tf.saved_model.signature_def_utils.build_signature_def( - input_signature, output_signature, - tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) - signature_def_key = (tf.saved_model.signature_constants. 
- DEFAULT_SERVING_SIGNATURE_DEF_KEY) + signature_def = tf.saved_model.signature_def_utils.build_signature_def( + input_signature, output_signature, tf.saved_model.signature_constants.PREDICT_METHOD_NAME + ) + signature_def_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def_map = {signature_def_key: signature_def} with policy.sess.graph.as_default(): builder = tf.saved_model.builder.SavedModelBuilder(os.path.join(output_dir, "1")) builder.add_meta_graph_and_variables( - policy.sess, [tf.saved_model.tag_constants.SERVING], - signature_def_map=signature_def_map) + policy.sess, [tf.saved_model.tag_constants.SERVING], signature_def_map=signature_def_map + ) builder.save() print("Saved TensorFlow serving model!") diff --git a/09_deploy/src/eval-cfa-vw.py b/09_deploy/src/eval-cfa-vw.py index 022671b4..a28d111d 100644 --- a/09_deploy/src/eval-cfa-vw.py +++ b/09_deploy/src/eval-cfa-vw.py @@ -17,8 +17,8 @@ def main(): """ Evaluate a Vowpal Wabbit (VW) model by performing counterfactual analysis (CFA) """ - channel_names = json.loads(os.environ['SM_CHANNELS']) - hyperparameters = json.loads(os.environ['SM_HPS']) + channel_names = json.loads(os.environ["SM_CHANNELS"]) + hyperparameters = json.loads(os.environ["SM_HPS"]) local_mode_manifest = bool(hyperparameters.get("local_mode_manifest", False)) num_arms = int(hyperparameters.get("num_arms", 0)) cfa_type = hyperparameters.get("cfa_type", "dr") @@ -33,8 +33,7 @@ def main(): model_folder = os.environ[f"SM_CHANNEL_{MODEL_CHANNEL.upper()}"] _, weights_path = extract_model(model_folder) vw_load_model_args = f"-i {weights_path}" - vw_model = VWModel(cli_args=f"{vw_load_model_args}", - model_path=None, test_only=False, quiet_mode=False) + vw_model = VWModel(cli_args=f"{vw_load_model_args}", model_path=None, test_only=False, quiet_mode=False) vw_model.start() # Different CFA policies in VW @@ -42,13 +41,12 @@ def main(): if cfa_type not in cfa_type_candidate: raise ValueError(f"Customer Error: Counterfactual algorithm must be in {cfa_type_candidate}.") if cfa_type == "dm": - logging.warning(f"Direct method can not be used for evaluation -- it is biased." - "Resetting to dr.") + logging.warning(f"Direct method can not be used for evaluation -- it is biased." "Resetting to dr.") cfa_type = "dr" vw_cfa_args = f"--cb {num_arms} --eval --cb_type {cfa_type}" # Set test_only=False as VW differentiates "test" with "evaluation" - vw_cfa = VWModel(cli_args=f"{vw_cfa_args}", test_only=False, quiet_mode=False) + vw_cfa = VWModel(cli_args=f"{vw_cfa_args}", test_only=False, quiet_mode=False) vw_cfa.start() if EVAL_CHANNEL not in channel_names: @@ -65,16 +63,16 @@ def main(): manifest_file = files[0] logging.info(f"Trying to download files using manifest file {manifest_file}.") download_manifest_data(manifest_file, eval_data_dir) - + eval_files = [i for i in eval_data_dir.rglob("*") if i.is_file() and i.suffix == ".csv"] logging.info("Processing evaluation data: %s" % eval_files) - + data_reader = CSVReader(input_files=eval_files) data_iterator = data_reader.get_iterator() - + if MODEL_CHANNEL not in channel_names: raise ValueError("No model to be evaluated. 
Should at least provide current model.") - + # Perform counterfactual analysis count = 0 for experience in data_iterator: @@ -85,20 +83,22 @@ def main(): predicted_action_probs = vw_model.predict(context_vector=experience_context) n_choices = len(predicted_action_probs) predicted_action = np.random.choice(n_choices, p=predicted_action_probs) + 1 - - vw_cfa.evaluate(context_vector=experience_context, - action=experience["action"], - cost=1 - experience["reward"], - probability=experience["action_prob"], - label=predicted_action) + + vw_cfa.evaluate( + context_vector=experience_context, + action=experience["action"], + cost=1 - experience["reward"], + probability=experience["action_prob"], + label=predicted_action, + ) count += 1 vw_model.close(prediction_only=True) stdout = vw_cfa.close() print(stdout.decode()) - + logging.info(f"Model evaluated using {count} data instances.") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/09_deploy/src/io_utils.py b/09_deploy/src/io_utils.py index 0db9bd48..3009eae6 100644 --- a/09_deploy/src/io_utils.py +++ b/09_deploy/src/io_utils.py @@ -21,7 +21,7 @@ def validate_experience(experience): return True -class CSVReader(): +class CSVReader: """Reader object that loads experiences from CSV file chunks. The input files will be read from in an random order.""" @@ -38,7 +38,7 @@ def get_iterator(self): yield line_dict -class JsonLinesReader(): +class JsonLinesReader: """Reader object that loads experiences from JSON file chunks. The input files will be read from in an random order.""" @@ -58,7 +58,7 @@ def get_experience(self): return experience def _try_parse(self, line): - if line is None or line.strip() == '': + if line is None or line.strip() == "": return None try: line_json = json.loads(line.strip()) @@ -68,8 +68,7 @@ def _try_parse(self, line): assert "prob" in line_json, "prob not found in record" return line_json except Exception: - logger.exception("Ignoring corrupt json record in {}: {}".format( - self.cur_file, line)) + logger.exception("Ignoring corrupt json record in {}: {}".format(self.cur_file, line)) return None def _next_line(self): @@ -89,8 +88,7 @@ def _next_line(self): if not line: logger.debug("Ignoring empty file {}".format(self.cur_file)) if not line: - raise ValueError("Failed to read next line from files: {}".format( - self.files)) + raise ValueError("Failed to read next line from files: {}".format(self.files)) return line def _next_file(self): @@ -143,7 +141,7 @@ def download_manifest_data(manifest_file_path, output_dir): with open(manifest_file_path.as_posix()) as f: manifest = json.load(f) s3_prefix = manifest[0]["prefix"] - s3 = boto3.client('s3') + s3 = boto3.client("s3") for file in manifest[1:]: s3_uri = os.path.join(s3_prefix, file) bucket, key, file_name = parse_s3_uri(s3_uri) diff --git a/09_deploy/src/train-vw.py b/09_deploy/src/train-vw.py index 0808364f..7ce5d3c4 100644 --- a/09_deploy/src/train-vw.py +++ b/09_deploy/src/train-vw.py @@ -15,9 +15,9 @@ def main(): """ Train a Vowpal Wabbit (VW) model through C++ process. 
""" - - channel_names = json.loads(os.environ['SM_CHANNELS']) - hyperparameters = json.loads(os.environ['SM_HPS']) + + channel_names = json.loads(os.environ["SM_CHANNELS"]) + hyperparameters = json.loads(os.environ["SM_HPS"]) num_arms = int(hyperparameters.get("num_arms", 0)) num_policies = int(hyperparameters.get("num_policies", 3)) exploration_policy = hyperparameters.get("exploration_policy", "egreedy").lower() @@ -33,7 +33,7 @@ def main(): valid_policies = ["egreedy", "bag", "cover"] if exploration_policy not in valid_policies: raise ValueError(f"Customer Error: exploration_policy must be one of {valid_policies}.") - + if exploration_policy == "egreedy": vw_args_base = f"--cb_explore {num_arms} --epsilon {epsilon}" else: @@ -42,28 +42,35 @@ def main(): # No training data. Initialize and save a random model if TRAIN_CHANNEL not in channel_names: logging.info("No training data found. Saving a randomly initialized model!") - vw_model = VWModel(cli_args=f"{vw_args_base} -f {MODEL_OUTPUT_PATH}", - model_path=None, test_only=False, quiet_mode=False) + vw_model = VWModel( + cli_args=f"{vw_args_base} -f {MODEL_OUTPUT_PATH}", model_path=None, test_only=False, quiet_mode=False + ) vw_model.start() vw_model.close() save_vw_metadata(meta=vw_args_base) - + # If training data is present else: if MODEL_CHANNEL not in channel_names: - logging.info(f"No pre-trained model has been specified in channel {MODEL_CHANNEL}." - f"Training will start from scratch.") + logging.info( + f"No pre-trained model has been specified in channel {MODEL_CHANNEL}." + f"Training will start from scratch." + ) vw_args = f"{vw_args_base}" else: # Load the pre-trained model for training. - model_folder = os.environ[f'SM_CHANNEL_{MODEL_CHANNEL.upper()}'] + model_folder = os.environ[f"SM_CHANNEL_{MODEL_CHANNEL.upper()}"] _, weights_path = extract_model(model_folder) logging.info(f"Loading model from {weights_path}") vw_args = f"{vw_args_base} -i {weights_path}" - + # Init a class that communicates with C++ VW process using pipes - vw_model = VWModel(cli_args=f"{vw_args} -f {MODEL_OUTPUT_PATH} --save_resume", - model_path=None, test_only=False, quiet_mode=False) + vw_model = VWModel( + cli_args=f"{vw_args} -f {MODEL_OUTPUT_PATH} --save_resume", + model_path=None, + test_only=False, + quiet_mode=False, + ) vw_model.start() # Load training data @@ -79,17 +86,19 @@ def main(): is_valid = validate_experience(experience) if not is_valid: continue - vw_model.learn(context_vector=json.loads(experience["observation"]), - action=experience["action"], - cost=1 - experience["reward"], - probability=experience["action_prob"]) + vw_model.learn( + context_vector=json.loads(experience["observation"]), + action=experience["action"], + cost=1 - experience["reward"], + probability=experience["action_prob"], + ) count += 1 - + stdout = vw_model.close() print(stdout.decode()) save_vw_metadata(meta=vw_args_base) logging.info(f"Model learned using {count} training experiences.") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/09_deploy/src/vw_model.py b/09_deploy/src/vw_model.py index 3294f82f..39b688e3 100644 --- a/09_deploy/src/vw_model.py +++ b/09_deploy/src/vw_model.py @@ -24,7 +24,7 @@ def __init__(self): class VWModel: def __init__(self, model_path=None, cli_args="", test_only=True, quiet_mode=True): - """ VWModel object starts a VW CLI process and communicates with it using pipes + """VWModel object starts a VW CLI process and communicates with it using pipes Args: model_path (str): location of the model weights 
cli_args (str): additional args to pass to VW @@ -72,11 +72,14 @@ def start(self): # note bufsize=1 will make sure we immediately flush each output # line so that we can keep scoring the model. # bufsize=1 means line buffered. - self.current_proc = subprocess.Popen(self.cmd, bufsize=1, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=False) + self.current_proc = subprocess.Popen( + self.cmd, + bufsize=1, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=False, + ) self.logger.info("Started VW process!") @@ -89,7 +92,7 @@ def start(self): raise VWError("Cannot load the model with the provided arguments: %s" % e) def learn(self, context_vector, action, cost, probability): - """ Learn on a given experience + """Learn on a given experience Args: context_vector (list or np.array): A vector of context features action (int): The action ID that was taken (starts with 1) @@ -101,7 +104,7 @@ def learn(self, context_vector, action, cost, probability): parsed_example = self.parse_example(context_vector) + "\n" parsed_example = f"{action}:{cost}:{probability} {parsed_example}" - + if self.current_proc is None: raise VWError("trying to learn model when current_proc is None") @@ -134,11 +137,11 @@ def predict(self, context_vector): self.current_proc.stdout.flush() scores = np.array(list(map(float, self.current_proc.stdout.readline().split()))) - scores = (scores / scores.sum()) + scores = scores / scores.sum() return scores - + def evaluate(self, context_vector, action, cost, probability, label): - """ Used when evaluating a policy offline using logged bandits dataset + """Used when evaluating a policy offline using logged bandits dataset Args: context_vector (list or np.array): A vector of context features action (int): The action ID that was taken (starts with 1) by the old policy @@ -148,7 +151,7 @@ def evaluate(self, context_vector, action, cost, probability, label): """ parsed_example = self.parse_example(context_vector) + "\n" parsed_example = f"{label} {action}:{cost}:{probability} {parsed_example}" - + # TODO: Error handling in parsing the given example if self.current_proc is None: raise VWError("trying to score model when current_proc is None") @@ -157,7 +160,7 @@ def evaluate(self, context_vector, action, cost, probability, label): raise VWModelDown() self.current_proc.stdin.write(parsed_example.encode()) - + # we need to flush to score & collect the score # otherwise one needs to wait for the process to end self.current_proc.stdin.flush() @@ -166,10 +169,10 @@ def evaluate(self, context_vector, action, cost, probability, label): # VW will make a prediction on each eval instance. 
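# --- Editor's note: for readers following learn()/evaluate() above, the line
# written to the VW process's stdin is plain text in VW's contextual-bandit
# format: "<label> <action>:<cost>:<probability> | <idx>:<value> ...", where
# parse_example() renders the context vector as "| index:value" pairs. An
# illustrative (made-up) example, using the cost = 1 - reward convention from
# train-vw.py and eval-cfa-vw.py:
#
#     2 1:0.25:0.80 | 0:0.13 1:0.42 2:0.07
#
# i.e. the logged policy took action 1 with probability 0.80 and observed
# reward 0.75, while the candidate policy would have chosen action 2. ---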
# To avoid PIPE overflow self.current_proc.stdout.readline() - + @staticmethod def parse_example(context_vector): - """ Parses the list of context features + """Parses the list of context features Args: context_vector (list or np.array): A vector of context features Returns: @@ -181,15 +184,13 @@ def parse_example(context_vector): @staticmethod def load_vw_model(metadata_loc, weights_loc, test_only=True, quiet_mode=True): - """ Initialize vw model with given metadata and weights locations - """ + """Initialize vw model with given metadata and weights locations""" with open(metadata_loc) as f: metadata = f.read().strip() return VWModel(model_path=weights_loc, cli_args=metadata, test_only=test_only, quiet_mode=quiet_mode) def close(self, prediction_only=False): - """ Close the VW process - """ + """Close the VW process""" training_info = "" if self.current_proc is not None: self.current_proc.stdin.close() diff --git a/09_deploy/src/vw_utils.py b/09_deploy/src/vw_utils.py index e41b11a8..ffe1a2d0 100644 --- a/09_deploy/src/vw_utils.py +++ b/09_deploy/src/vw_utils.py @@ -4,7 +4,7 @@ TRAIN_CHANNEL = "training" EVAL_CHANNEL = "evaluation" MODEL_CHANNEL = "pretrained_model" -MODEL_OUTPUT_DIR = os.environ.get('SM_MODEL_DIR', "/opt/ml/model") +MODEL_OUTPUT_DIR = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") MODEL_OUTPUT_PATH = os.path.join(MODEL_OUTPUT_DIR, "vw.model") diff --git a/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb b/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb index 13462c1b..912e9bd2 100644 --- a/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb +++ b/10_pipeline/01_Create_SageMaker_Pipeline_BERT_Reviews.ipynb @@ -32,12 +32,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -54,9 +54,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "pipeline_name = 'BERT-pipeline-{}'.format(timestamp)" + "pipeline_name = \"BERT-pipeline-{}\".format(timestamp)" ] }, { @@ -78,12 +79,13 @@ "from smexperiments.experiment import Experiment\n", "\n", "pipeline_experiment = Experiment.create(\n", - " experiment_name=pipeline_name,\n", - " description='Amazon Customer Reviews BERT Pipeline Experiment', \n", - " sagemaker_boto_client=sm)\n", + " experiment_name=pipeline_name,\n", + " description=\"Amazon Customer Reviews BERT Pipeline Experiment\",\n", + " sagemaker_boto_client=sm,\n", + ")\n", "\n", "pipeline_experiment_name = pipeline_experiment.experiment_name\n", - "print('Pipeline experiment name: {}'.format(pipeline_experiment_name))" + "print(\"Pipeline experiment name: {}\".format(pipeline_experiment_name))" ] }, { @@ -111,12 +113,12 @@ "import time\n", "from smexperiments.trial import Trial\n", "\n", - "pipeline_trial = Trial.create(trial_name='trial-{}'.format(timestamp),\n", - " experiment_name=pipeline_experiment_name,\n", - " sagemaker_boto_client=sm)\n", + "pipeline_trial = Trial.create(\n", + " trial_name=\"trial-{}\".format(timestamp), experiment_name=pipeline_experiment_name, sagemaker_boto_client=sm\n", + ")\n", "\n", "pipeline_trial_name = pipeline_trial.trial_name\n", - "print('Trial name: {}'.format(pipeline_trial_name))" + 
"print(\"Trial name: {}\".format(pipeline_trial_name))" ] }, { @@ -221,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n", + "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n", "print(raw_input_data_s3_uri)" ] }, @@ -241,6 +243,7 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", "input_data = ParameterString(\n", @@ -248,15 +251,9 @@ " default_value=raw_input_data_s3_uri,\n", ")\n", "\n", - "processing_instance_count = ParameterInteger(\n", - " name=\"ProcessingInstanceCount\",\n", - " default_value=1\n", - ")\n", + "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n", "\n", - "processing_instance_type = ParameterString(\n", - " name=\"ProcessingInstanceType\",\n", - " default_value=\"ml.c5.2xlarge\"\n", - ")\n", + "processing_instance_type = ParameterString(name=\"ProcessingInstanceType\", default_value=\"ml.c5.2xlarge\")\n", "\n", "max_seq_length = ParameterInteger(\n", " name=\"MaxSeqLength\",\n", @@ -267,7 +264,7 @@ " name=\"BalanceDataset\",\n", " default_value=\"True\",\n", ")\n", - " \n", + "\n", "train_split_percentage = ParameterFloat(\n", " name=\"TrainSplitPercentage\",\n", " default_value=0.90,\n", @@ -288,10 +285,7 @@ " default_value=\"reviews-feature-store-\" + str(timestamp),\n", ")\n", "\n", - "feature_group_name = ParameterString(\n", - " name=\"FeatureGroupName\",\n", - " default_value=\"reviews-feature-group-\" + str(timestamp)\n", - ")" + "feature_group_name = ParameterString(name=\"FeatureGroupName\", default_value=\"reviews-feature-group-\" + str(timestamp))" ] }, { @@ -324,12 +318,13 @@ "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " env={'AWS_DEFAULT_REGION': region}, \n", - " )" + "processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " env={\"AWS_DEFAULT_REGION\": region},\n", + ")" ] }, { @@ -341,45 +336,56 @@ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "from sagemaker.workflow.steps import ProcessingStep\n", "\n", - "processing_inputs=[\n", - " ProcessingInput(\n", - " input_name='raw-input-data',\n", - " source=input_data,\n", - " destination='/opt/ml/processing/input/data/',\n", - " s3_data_distribution_type='ShardedByS3Key'\n", - " )\n", + "processing_inputs = [\n", + " ProcessingInput(\n", + " input_name=\"raw-input-data\",\n", + " source=input_data,\n", + " destination=\"/opt/ml/processing/input/data/\",\n", + " s3_data_distribution_type=\"ShardedByS3Key\",\n", + " )\n", "]\n", "\n", - "processing_outputs=[\n", - " ProcessingOutput(output_name='bert-train',\n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/bert/train',\n", - " ),\n", - " ProcessingOutput(output_name='bert-validation',\n", - " s3_upload_mode='EndOfJob', \n", - " source='/opt/ml/processing/output/bert/validation',\n", - " ),\n", - " ProcessingOutput(output_name='bert-test',\n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/bert/test',\n", - " ),\n", - "] \n", + "processing_outputs = [\n", + " ProcessingOutput(\n", + " output_name=\"bert-train\",\n", + " 
s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/train\",\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-validation\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/validation\",\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"bert-test\",\n", + " s3_upload_mode=\"EndOfJob\",\n", + " source=\"/opt/ml/processing/output/bert/test\",\n", + " ),\n", + "]\n", "\n", "processing_step = ProcessingStep(\n", - " name='Processing', \n", - " code='preprocess-scikit-text-to-bert-feature-store.py',\n", + " name=\"Processing\",\n", + " code=\"preprocess-scikit-text-to-bert-feature-store.py\",\n", " processor=processor,\n", " inputs=processing_inputs,\n", " outputs=processing_outputs,\n", - " job_arguments=['--train-split-percentage', str(train_split_percentage.default_value), \n", - " '--validation-split-percentage', str(validation_split_percentage.default_value),\n", - " '--test-split-percentage', str(test_split_percentage.default_value),\n", - " '--max-seq-length', str(max_seq_length.default_value),\n", - " '--balance-dataset', str(balance_dataset.default_value),\n", - " '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),\n", - " '--feature-group-name', str(feature_group_name.default_value)\n", - " ]\n", - ") \n", + " job_arguments=[\n", + " \"--train-split-percentage\",\n", + " str(train_split_percentage.default_value),\n", + " \"--validation-split-percentage\",\n", + " str(validation_split_percentage.default_value),\n", + " \"--test-split-percentage\",\n", + " str(test_split_percentage.default_value),\n", + " \"--max-seq-length\",\n", + " str(max_seq_length.default_value),\n", + " \"--balance-dataset\",\n", + " str(balance_dataset.default_value),\n", + " \"--feature-store-offline-prefix\",\n", + " str(feature_store_offline_prefix.default_value),\n", + " \"--feature-group-name\",\n", + " str(feature_group_name.default_value),\n", + " ],\n", + ")\n", "\n", "print(processing_step)" ] @@ -422,15 +428,9 @@ "metadata": {}, "outputs": [], "source": [ - "train_instance_type = ParameterString(\n", - " name=\"TrainInstanceType\",\n", - " default_value=\"ml.c5.9xlarge\"\n", - ")\n", + "train_instance_type = ParameterString(name=\"TrainInstanceType\", default_value=\"ml.c5.9xlarge\")\n", "\n", - "train_instance_count = ParameterInteger(\n", - " name=\"TrainInstanceCount\",\n", - " default_value=1\n", - ")" + "train_instance_count = ParameterInteger(name=\"TrainInstanceCount\", default_value=1)" ] }, { @@ -447,56 +447,26 @@ "metadata": {}, "outputs": [], "source": [ - "epochs = ParameterInteger(\n", - " name=\"Epochs\",\n", - " default_value=1\n", - ")\n", - " \n", - "learning_rate = ParameterFloat(\n", - " name=\"LearningRate\",\n", - " default_value=0.00001\n", - ") \n", - " \n", - "epsilon = ParameterFloat(\n", - " name=\"Epsilon\",\n", - " default_value=0.00000001\n", - ")\n", - " \n", - "train_batch_size = ParameterInteger(\n", - " name=\"TrainBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "validation_batch_size = ParameterInteger(\n", - " name=\"ValidationBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "test_batch_size = ParameterInteger(\n", - " name=\"TestBatchSize\",\n", - " default_value=128\n", - ")\n", - " \n", - "train_steps_per_epoch = ParameterInteger(\n", - " name=\"TrainStepsPerEpoch\",\n", - " default_value=50\n", - ")\n", - " \n", - "validation_steps = ParameterInteger(\n", - " name=\"ValidationSteps\",\n", - " default_value=50\n", - ")\n", 
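# --- Editor's note: a minimal sketch of the argparse front-end that
# preprocess-scikit-text-to-bert-feature-store.py would need in order to consume
# the job_arguments listed above (flag names are copied from the ProcessingStep;
# the parser itself is illustrative, the real script may differ). ---
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train-split-percentage", type=float, default=0.90)
parser.add_argument("--validation-split-percentage", type=float, default=0.05)
parser.add_argument("--test-split-percentage", type=float, default=0.05)
parser.add_argument("--max-seq-length", type=int, default=64)
parser.add_argument("--balance-dataset", type=str, default="True")  # passed as a string, not a bool
parser.add_argument("--feature-store-offline-prefix", type=str, default=None)
parser.add_argument("--feature-group-name", type=str, default=None)
args, _ = parser.parse_known_args()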
- " \n", - "test_steps = ParameterInteger(\n", - " name=\"TestSteps\",\n", - " default_value=50\n", - ")\n", - " \n", - "train_volume_size = ParameterInteger(\n", - " name=\"TrainVolumeSize\",\n", - " default_value=1024\n", - ") \n", - " \n", + "epochs = ParameterInteger(name=\"Epochs\", default_value=1)\n", + "\n", + "learning_rate = ParameterFloat(name=\"LearningRate\", default_value=0.00001)\n", + "\n", + "epsilon = ParameterFloat(name=\"Epsilon\", default_value=0.00000001)\n", + "\n", + "train_batch_size = ParameterInteger(name=\"TrainBatchSize\", default_value=128)\n", + "\n", + "validation_batch_size = ParameterInteger(name=\"ValidationBatchSize\", default_value=128)\n", + "\n", + "test_batch_size = ParameterInteger(name=\"TestBatchSize\", default_value=128)\n", + "\n", + "train_steps_per_epoch = ParameterInteger(name=\"TrainStepsPerEpoch\", default_value=50)\n", + "\n", + "validation_steps = ParameterInteger(name=\"ValidationSteps\", default_value=50)\n", + "\n", + "test_steps = ParameterInteger(name=\"TestSteps\", default_value=50)\n", + "\n", + "train_volume_size = ParameterInteger(name=\"TrainVolumeSize\", default_value=1024)\n", + "\n", "use_xla = ParameterString(\n", " name=\"UseXLA\",\n", " default_value=\"True\",\n", @@ -506,7 +476,7 @@ " name=\"UseAMP\",\n", " default_value=\"True\",\n", ")\n", - " \n", + "\n", "freeze_bert_layer = ParameterString(\n", " name=\"FreezeBERTLayer\",\n", " default_value=\"False\",\n", @@ -516,7 +486,7 @@ " name=\"EnableSageMakerDebugger\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "enable_checkpointing = ParameterString(\n", " name=\"EnableCheckpointing\",\n", " default_value=\"False\",\n", @@ -526,7 +496,7 @@ " name=\"EnableTensorboard\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "input_mode = ParameterString(\n", " name=\"InputMode\",\n", " default_value=\"File\",\n", @@ -541,7 +511,7 @@ " name=\"RunTest\",\n", " default_value=\"False\",\n", ")\n", - " \n", + "\n", "run_sample_predictions = ParameterString(\n", " name=\"RunSamplePredictions\",\n", " default_value=\"False\",\n", @@ -562,10 +532,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -601,36 +571,39 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size, \n", - " py_version='py37',\n", - " framework_version='2.3.1',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 
'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " metric_definitions=metrics_definitions,\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py37\",\n", + " framework_version=\"2.3.1\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " metric_definitions=metrics_definitions,\n", + ")" ] }, { @@ -652,27 +625,21 @@ "from sagemaker.workflow.steps import TrainingStep\n", "\n", "training_step = TrainingStep(\n", - " name='Train',\n", + " name=\"Train\",\n", " estimator=estimator,\n", " inputs={\n", - " 'train': TrainingInput(\n", - " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-train'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", + " \"train\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-train\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " \"validation\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-validation\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", " ),\n", - " 'validation': TrainingInput(\n", - " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-validation'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", + " \"test\": TrainingInput(\n", + " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\"bert-test\"].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", " ),\n", - " 'test': TrainingInput(\n", - " s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[\n", - " 'bert-test'\n", - " ].S3Output.S3Uri,\n", - " content_type='text/csv'\n", - " ) \n", " },\n", ")\n", "\n", @@ -726,12 +693,14 @@ "source": 
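# --- Editor's note: the .properties references wired into TrainingStep above are
# resolved only at execution time; that deferred reference is what lets SageMaker
# Pipelines infer the Processing -> Train dependency edge. They cannot be printed
# eagerly; to inspect the wiring before a run, render the pipeline definition
# instead (a sketch, usable once the pipeline object is defined further down):
#
#     import json
#     print(json.dumps(json.loads(pipeline.definition())["Steps"], indent=2))
# ---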
[ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", - "evaluation_processor = SKLearnProcessor(framework_version='0.23-1',\n", - " role=role,\n", - " instance_type=processing_instance_type,\n", - " instance_count=processing_instance_count,\n", - " env={'AWS_DEFAULT_REGION': region},\n", - " max_runtime_in_seconds=7200)" + "evaluation_processor = SKLearnProcessor(\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " env={\"AWS_DEFAULT_REGION\": region},\n", + " max_runtime_in_seconds=7200,\n", + ")" ] }, { @@ -742,7 +711,7 @@ }, "outputs": [], "source": [ - "!pygmentize evaluate_model_metrics.py\n" + "!pygmentize evaluate_model_metrics.py" ] }, { @@ -762,11 +731,7 @@ "source": [ "from sagemaker.workflow.properties import PropertyFile\n", "\n", - "evaluation_report = PropertyFile(\n", - " name='EvaluationReport',\n", - " output_name='metrics',\n", - " path='evaluation.json'\n", - ")" + "evaluation_report = PropertyFile(name=\"EvaluationReport\", output_name=\"metrics\", path=\"evaluation.json\")" ] }, { @@ -776,27 +741,28 @@ "outputs": [], "source": [ "evaluation_step = ProcessingStep(\n", - " name='EvaluateModel',\n", + " name=\"EvaluateModel\",\n", " processor=evaluation_processor,\n", - " code='evaluate_model_metrics.py',\n", + " code=\"evaluate_model_metrics.py\",\n", " inputs=[\n", " ProcessingInput(\n", " source=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n", - " destination='/opt/ml/processing/input/model'\n", + " destination=\"/opt/ml/processing/input/model\",\n", " ),\n", " ProcessingInput(\n", - " source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri,\n", - " destination='/opt/ml/processing/input/data'\n", - " )\n", + " source=processing_step.properties.ProcessingInputs[\"raw-input-data\"].S3Input.S3Uri,\n", + " destination=\"/opt/ml/processing/input/data\",\n", + " ),\n", " ],\n", " outputs=[\n", - " ProcessingOutput(output_name='metrics', \n", - " s3_upload_mode='EndOfJob',\n", - " source='/opt/ml/processing/output/metrics/'),\n", + " ProcessingOutput(\n", + " output_name=\"metrics\", s3_upload_mode=\"EndOfJob\", source=\"/opt/ml/processing/output/metrics/\"\n", + " ),\n", " ],\n", " job_arguments=[\n", - " '--max-seq-length', str(max_seq_length.default_value),\n", - " ],\n", + " \"--max-seq-length\",\n", + " str(max_seq_length.default_value),\n", + " ],\n", " property_files=[evaluation_report],\n", ")" ] @@ -814,14 +780,14 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.model_metrics import MetricsSource, ModelMetrics \n", + "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", "\n", "model_metrics = ModelMetrics(\n", " model_statistics=MetricsSource(\n", " s3_uri=\"{}/evaluation.json\".format(\n", " evaluation_step.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", " ),\n", - " content_type=\"application/json\"\n", + " content_type=\"application/json\",\n", " )\n", ")\n", "\n", @@ -853,20 +819,11 @@ "metadata": {}, "outputs": [], "source": [ - "model_approval_status = ParameterString(\n", - " name=\"ModelApprovalStatus\",\n", - " default_value=\"PendingManualApproval\"\n", - ")\n", + "model_approval_status = ParameterString(name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\")\n", "\n", - "deploy_instance_type = ParameterString(\n", - " name=\"DeployInstanceType\",\n", - " default_value=\"ml.m5.4xlarge\"\n", - ")\n", + 
"deploy_instance_type = ParameterString(name=\"DeployInstanceType\", default_value=\"ml.m5.4xlarge\")\n", "\n", - "deploy_instance_count = ParameterInteger(\n", - " name=\"DeployInstanceCount\",\n", - " default_value=1\n", - ")" + "deploy_instance_count = ParameterInteger(name=\"DeployInstanceCount\", default_value=1)" ] }, { @@ -892,7 +849,7 @@ " version=\"2.3.1\",\n", " py_version=\"py37\",\n", " instance_type=deploy_instance_type,\n", - " image_scope=\"inference\"\n", + " image_scope=\"inference\",\n", ")\n", "print(inference_image_uri)" ] @@ -907,10 +864,10 @@ "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", - "# entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n", - "# source_dir='src',\n", + " # entry_point='inference.py', # Adds a Repack Step: https://github.com/aws/sagemaker-python-sdk/blob/01c6ee3a9ec1831e935e86df58cf70bc92ed1bbe/src/sagemaker/workflow/_utils.py#L44\n", + " # source_dir='src',\n", " estimator=estimator,\n", - " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n", + " image_uri=inference_image_uri, # we have to specify, by default it's using training image\n", " model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " content_types=[\"application/jsonlines\"],\n", " response_types=[\"application/jsonlines\"],\n", @@ -918,7 +875,7 @@ " transform_instances=[\"ml.c5.18xlarge\"],\n", " model_package_group_name=model_package_group_name,\n", " approval_status=model_approval_status,\n", - " model_metrics=model_metrics\n", + " model_metrics=model_metrics,\n", ")" ] }, @@ -939,7 +896,7 @@ "source": [ "from sagemaker.model import Model\n", "\n", - "model_name = 'bert-model-{}'.format(timestamp)\n", + "model_name = \"bert-model-{}\".format(timestamp)\n", "\n", "model = Model(\n", " name=model_name,\n", @@ -959,7 +916,7 @@ "from sagemaker.inputs import CreateModelInput\n", "\n", "create_inputs = CreateModelInput(\n", - " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n", + " instance_type=deploy_instance_type, # \"ml.m5.4xlarge\",\n", ")" ] }, @@ -1001,10 +958,7 @@ "metadata": {}, "outputs": [], "source": [ - "min_accuracy_value = ParameterFloat(\n", - " name=\"MinAccuracyValue\",\n", - " default_value=0.01\n", - ")" + "min_accuracy_value = ParameterFloat(name=\"MinAccuracyValue\", default_value=0.01)" ] }, { @@ -1025,14 +979,14 @@ " property_file=evaluation_report,\n", " json_path=\"metrics.accuracy.value\",\n", " ),\n", - " right=min_accuracy_value # accuracy\n", + " right=min_accuracy_value, # accuracy\n", ")\n", "\n", "minimum_accuracy_condition_step = ConditionStep(\n", " name=\"AccuracyCondition\",\n", " conditions=[minimum_accuracy_condition],\n", - " if_steps=[register_step, create_step], # success, continue with model registration\n", - " else_steps=[], # fail, end the pipeline\n", + " if_steps=[register_step, create_step], # success, continue with model registration\n", + " else_steps=[], # fail, end the pipeline\n", ")" ] }, @@ -1108,7 +1062,7 @@ " min_accuracy_value,\n", " model_approval_status,\n", " deploy_instance_type,\n", - " deploy_instance_count\n", + " deploy_instance_count,\n", " ],\n", " steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],\n", " sagemaker_session=sess,\n", @@ -1196,16 +1150,16 @@ " parameters=dict(\n", " InputData=raw_input_data_s3_uri,\n", " ProcessingInstanceCount=1,\n", - " 
ProcessingInstanceType='ml.c5.2xlarge',\n", + " ProcessingInstanceType=\"ml.c5.2xlarge\",\n", " MaxSeqLength=64,\n", - " BalanceDataset='True',\n", + " BalanceDataset=\"True\",\n", " TrainSplitPercentage=0.9,\n", " ValidationSplitPercentage=0.05,\n", " TestSplitPercentage=0.05,\n", - " FeatureStoreOfflinePrefix='reviews-feature-store-'+str(timestamp),\n", - " FeatureGroupName='reviews-feature-group-'+str(timestamp),\n", + " FeatureStoreOfflinePrefix=\"reviews-feature-store-\" + str(timestamp),\n", + " FeatureGroupName=\"reviews-feature-group-\" + str(timestamp),\n", " LearningRate=0.000012,\n", - " TrainInstanceType='ml.c5.9xlarge',\n", + " TrainInstanceType=\"ml.c5.9xlarge\",\n", " TrainInstanceCount=1,\n", " Epochs=1,\n", " Epsilon=0.00000001,\n", @@ -1216,20 +1170,20 @@ " ValidationSteps=50,\n", " TestSteps=50,\n", " TrainVolumeSize=1024,\n", - " UseXLA='True',\n", - " UseAMP='True',\n", - " FreezeBERTLayer='False',\n", - " EnableSageMakerDebugger='False',\n", - " EnableCheckpointing='False',\n", - " EnableTensorboard='False',\n", - " InputMode='File',\n", - " RunValidation='True',\n", - " RunTest='False',\n", - " RunSamplePredictions='False', \n", + " UseXLA=\"True\",\n", + " UseAMP=\"True\",\n", + " FreezeBERTLayer=\"False\",\n", + " EnableSageMakerDebugger=\"False\",\n", + " EnableCheckpointing=\"False\",\n", + " EnableTensorboard=\"False\",\n", + " InputMode=\"File\",\n", + " RunValidation=\"True\",\n", + " RunTest=\"False\",\n", + " RunSamplePredictions=\"False\",\n", " MinAccuracyValue=0.01,\n", - " ModelApprovalStatus='PendingManualApproval', \n", - " DeployInstanceType='ml.m5.4xlarge',\n", - " DeployInstanceCount=1 \n", + " ModelApprovalStatus=\"PendingManualApproval\",\n", + " DeployInstanceType=\"ml.m5.4xlarge\",\n", + " DeployInstanceCount=1,\n", " )\n", ")\n", "\n", @@ -1270,7 +1224,7 @@ "metadata": {}, "outputs": [], "source": [ - "execution_run_name = execution_run['PipelineExecutionDisplayName']\n", + "execution_run_name = execution_run[\"PipelineExecutionDisplayName\"]\n", "print(execution_run_name)" ] }, @@ -1280,7 +1234,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = execution_run['PipelineExecutionArn']\n", + "pipeline_execution_arn = execution_run[\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -1345,20 +1299,20 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", - "# print('Executions for our pipeline...')\n", - "# print(pipeline_execution_status)\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", + " # print('Executions for our pipeline...')\n", + " # print(pipeline_execution_status)\n", " except 
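# --- Editor's note: the polling loop above works, but the SDK also exposes a
# blocking helper on the execution object returned by pipeline.start(); a sketch
# (wait() raises if the execution fails rather than returning a status):
#
#     execution = pipeline.start(parameters={...})
#     execution.wait()  # blocks until the pipeline finishes
#     print(execution.describe()["PipelineExecutionStatus"])
# ---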
Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -1377,7 +1331,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -1387,7 +1341,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -1411,7 +1365,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -1441,8 +1395,8 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_name=None\n", - "training_job_name=None" + "processing_job_name = None\n", + "training_job_name = None" ] }, { @@ -1456,15 +1410,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -1486,7 +1440,7 @@ "outputs": [], "source": [ "# -aws-processing-job is the default name assigned by ProcessingJob\n", - "processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)\n", + "processing_job_tc = \"{}-aws-processing-job\".format(processing_job_name)\n", "print(processing_job_tc)" ] }, @@ -1514,10 +1468,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = sm.associate_trial_component(\n", - " TrialComponentName=processing_job_tc,\n", - " TrialName=pipeline_trial_name\n", - ")" + "response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=pipeline_trial_name)" ] }, { @@ -1527,7 +1478,7 @@ "outputs": [], "source": [ "# -aws-training-job is the default name assigned by TrainingJob\n", - "training_job_tc = '{}-aws-training-job'.format(training_job_name)\n", + "training_job_tc = \"{}-aws-training-job\".format(training_job_name)\n", "print(training_job_tc)" ] }, @@ -1537,10 +1488,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = sm.associate_trial_component(\n", - " TrialComponentName=training_job_tc,\n", - " TrialName=pipeline_trial_name\n", - ")" + "response = 
sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=pipeline_trial_name)" ] }, { @@ -1560,9 +1508,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"balance_dataset\": str(balance_dataset), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"balance_dataset\": str(balance_dataset),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1574,9 +1524,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"train_split_percentage\": str(train_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"train_split_percentage\": str(train_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1588,9 +1540,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"validation_split_percentage\": str(validation_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"validation_split_percentage\": str(validation_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1602,9 +1556,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"test_split_percentage\": str(test_split_percentage), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"test_split_percentage\": str(test_split_percentage),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1616,9 +1572,11 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_tracker.log_parameters({\n", - " \"max_seq_length\": str(max_seq_length), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"max_seq_length\": str(max_seq_length),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1630,11 +1588,13 @@ "metadata": {}, "outputs": [], "source": [ - "time.sleep(5) # avoid throttling exception \n", + "time.sleep(5) # avoid throttling exception\n", "\n", - "processing_job_tracker.log_parameters({\n", - " \"feature_store_offline_prefix\": str(feature_store_offline_prefix), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"feature_store_offline_prefix\": str(feature_store_offline_prefix),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1646,11 +1606,13 @@ "metadata": {}, "outputs": [], "source": [ - "time.sleep(5) # avoid throttling exception \n", + "time.sleep(5) # avoid throttling exception\n", "\n", - "processing_job_tracker.log_parameters({\n", - " \"feature_group_name\": str(feature_group_name), \n", - "})\n", + "processing_job_tracker.log_parameters(\n", + " {\n", + " \"feature_group_name\": str(feature_group_name),\n", + " }\n", + ")\n", "\n", "# must save after logging\n", "processing_job_tracker.trial_component.save()" @@ -1671,9 +1633,10 @@ "source": [ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", - "time.sleep(30) # avoid throttling exception\n", + "time.sleep(30) # avoid throttling exception\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", diff --git 
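# --- Editor's note: the fixed time.sleep(5) calls below guard against SageMaker
# API throttling; a small retry helper with exponential backoff is a more robust
# version of the same idea (illustrative, not part of this patch; throttling
# error codes vary slightly across AWS services):
import time

from botocore.exceptions import ClientError

THROTTLE_CODES = ("ThrottlingException", "Throttling", "TooManyRequestsException")


def call_with_backoff(fn, *args, max_retries=5, base_delay=1.0, **kwargs):
    # Retry throttled boto3/SageMaker calls, doubling the delay on each attempt.
    for attempt in range(max_retries):
        try:
            return fn(*args, **kwargs)
        except ClientError as e:
            if e.response["Error"]["Code"] not in THROTTLE_CODES or attempt == max_retries - 1:
                raise
            time.sleep(base_delay * (2 ** attempt))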
a/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb b/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb index 2225889e..8293053f 100644 --- a/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb +++ b/10_pipeline/02_Evaluate_Pipeline_Execution.ipynb @@ -24,12 +24,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -79,20 +79,20 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", - "# print('Executions for our pipeline...')\n", - "# print(pipeline_execution_status)\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", + " # print('Executions for our pipeline...')\n", + " # print(pipeline_execution_status)\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -109,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -119,7 +119,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -149,14 +149,16 @@ "metadata": {}, "outputs": [], "source": [ - "#for execution_step in reversed(execution.list_steps()):\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", - " if execution_step['StepName'] == 'EvaluateModel':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + "# for execution_step in reversed(execution.list_steps()):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", + " if execution_step[\"StepName\"] == \"EvaluateModel\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", "\n", "describe_evaluation_processing_job_response = sm.describe_processing_job(ProcessingJobName=processing_job_name)\n", "\n", - "evaluation_metrics_s3_uri = 
describe_evaluation_processing_job_response['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n", + "evaluation_metrics_s3_uri = describe_evaluation_processing_job_response[\"ProcessingOutputConfig\"][\"Outputs\"][0][\n", + " \"S3Output\"\n", + "][\"S3Uri\"]\n", "evaluation_metrics_s3_uri" ] }, @@ -169,9 +171,7 @@ "import json\n", "from pprint import pprint\n", "\n", - "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(\n", - " evaluation_metrics_s3_uri\n", - "))\n", + "evaluation_json = sagemaker.s3.S3Downloader.read_file(\"{}/evaluation.json\".format(evaluation_metrics_s3_uri))\n", "\n", "pprint(json.loads(evaluation_json))" ] @@ -189,15 +189,15 @@ "metadata": {}, "outputs": [], "source": [ - "training_job_arn=None\n", + "training_job_arn = None\n", "\n", - "for execution_step in steps['PipelineExecutionSteps']:\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", " if execution_step[\"StepName\"] == \"Train\":\n", " training_job_arn = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"]\n", - " \n", + "\n", " break\n", - " \n", - "training_job_name = training_job_arn.split('/')[-1]\n", + "\n", + "training_job_name = training_job_arn.split(\"/\")[-1]\n", "print(training_job_name)" ] }, @@ -207,7 +207,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)['ModelArtifacts']['S3ModelArtifacts']" + "model_tar_s3_uri = sm.describe_training_job(TrainingJobName=training_job_name)[\"ModelArtifacts\"][\"S3ModelArtifacts\"]" ] }, { @@ -225,8 +225,8 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -p ./model \n", - "!tar -zxvf model.tar.gz -C ./model " + "!mkdir -p ./model\n", + "!tar -zxvf model.tar.gz -C ./model" ] }, { @@ -263,8 +263,8 @@ "metadata": {}, "outputs": [], "source": [ - "processing_job_name=None\n", - "training_job_name=None" + "processing_job_name = None\n", + "training_job_name = None" ] }, { @@ -278,15 +278,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -309,9 +309,10 @@ "source": [ "from sagemaker.analytics import ExperimentAnalytics\n", "\n", - "time.sleep(30) # avoid throttling exception\n", + "time.sleep(30) # avoid throttling exception\n", "\n", "import pandas as pd\n", + "\n", "pd.set_option(\"max_colwidth\", 500)\n", "\n", "experiment_analytics = ExperimentAnalytics(\n", diff --git 
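# --- Editor's note: the "!tar -zxvf model.tar.gz" cell above assumes the artifact
# has already been copied locally (presumably in an unchanged cell this diff does
# not show); a sketch of fetching it from model_tar_s3_uri with the SDK:
#
#     sagemaker.s3.S3Downloader.download(model_tar_s3_uri, ".")
# ---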
a/10_pipeline/03_Register_Deploy_Model.ipynb b/10_pipeline/03_Register_Deploy_Model.ipynb index 66cb0842..fe59ccdc 100644 --- a/10_pipeline/03_Register_Deploy_Model.ipynb +++ b/10_pipeline/03_Register_Deploy_Model.ipynb @@ -28,12 +28,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -65,20 +65,20 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", - "# print('Executions for our pipeline...')\n", - "# print(pipeline_execution_status)\n", + " executions_response = sm.list_pipeline_executions(PipelineName=pipeline_name)[\"PipelineExecutionSummaries\"]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", + " # print('Executions for our pipeline...')\n", + " # print(pipeline_execution_status)\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -135,9 +135,9 @@ "metadata": {}, "outputs": [], "source": [ - "for execution_step in steps['PipelineExecutionSteps']:\n", - " if execution_step['StepName'] == 'RegisterModel':\n", - " model_package_arn = execution_step['Metadata']['RegisterModel']['Arn']\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", + " if execution_step[\"StepName\"] == \"RegisterModel\":\n", + " model_package_arn = execution_step[\"Metadata\"][\"RegisterModel\"][\"Arn\"]\n", " break\n", "print(model_package_arn)" ] @@ -150,7 +150,7 @@ "source": [ "model_package_update_response = sm.update_model_package(\n", " ModelPackageArn=model_package_arn,\n", - " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n", + " ModelApprovalStatus=\"Approved\", # Other options are Rejected and PendingManualApproval\n", ")" ] }, @@ -167,13 +167,13 @@ "metadata": {}, "outputs": [], "source": [ - "for execution_step 
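# --- Editor's note: after the update_model_package call above flips the package
# to "Approved", the change can be verified before deploying; a sketch using the
# same boto3 client and model_package_arn from this notebook:
#
#     status = sm.describe_model_package(ModelPackageName=model_package_arn)[
#         "ModelApprovalStatus"
#     ]
#     assert status == "Approved", status
# ---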
in steps['PipelineExecutionSteps']:\n", - " if execution_step['StepName'] == 'CreateModel':\n", - " model_arn = execution_step['Metadata']['Model']['Arn']\n", + "for execution_step in steps[\"PipelineExecutionSteps\"]:\n", + " if execution_step[\"StepName\"] == \"CreateModel\":\n", + " model_arn = execution_step[\"Metadata\"][\"Model\"][\"Arn\"]\n", " break\n", "print(model_arn)\n", "\n", - "model_name = model_arn.split('/')[-1]\n", + "model_name = model_arn.split(\"/\")[-1]\n", "print(model_name)" ] }, @@ -192,13 +192,14 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "model_from_registry_name = 'bert-model-from-registry-{}'.format(timestamp)\n", + "model_from_registry_name = \"bert-model-from-registry-{}\".format(timestamp)\n", "print(\"Model from registry name : {}\".format(model_from_registry_name))\n", "\n", "model_registry_package_container = {\n", - " 'ModelPackageName': model_package_arn,\n", + " \"ModelPackageName\": model_package_arn,\n", "}" ] }, @@ -208,12 +209,10 @@ "metadata": {}, "outputs": [], "source": [ - "from pprint import pprint \n", + "from pprint import pprint\n", "\n", "create_model_from_registry_respose = sm.create_model(\n", - " ModelName = model_from_registry_name,\n", - " ExecutionRoleArn = role,\n", - " PrimaryContainer = model_registry_package_container\n", + " ModelName=model_from_registry_name, ExecutionRoleArn=role, PrimaryContainer=model_registry_package_container\n", ")\n", "pprint(create_model_from_registry_respose)" ] @@ -224,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_from_registry_arn = create_model_from_registry_respose['ModelArn']\n", + "model_from_registry_arn = create_model_from_registry_respose[\"ModelArn\"]\n", "model_from_registry_arn" ] }, @@ -234,17 +233,21 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint_config_name = 'bert-model-from-registry-epc-{}'.format(timestamp)\n", + "endpoint_config_name = \"bert-model-from-registry-epc-{}\".format(timestamp)\n", "print(endpoint_config_name)\n", "\n", "create_endpoint_config_response = sm.create_endpoint_config(\n", - " EndpointConfigName = endpoint_config_name,\n", - " ProductionVariants=[{\n", - " 'InstanceType':'ml.m5.4xlarge',\n", - " 'InitialVariantWeight':1,\n", - " 'InitialInstanceCount':1,\n", - " 'ModelName': model_name,\n", - " 'VariantName':'AllTraffic'}])" + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"InstanceType\": \"ml.m5.4xlarge\",\n", + " \"InitialVariantWeight\": 1,\n", + " \"InitialInstanceCount\": 1,\n", + " \"ModelName\": model_name,\n", + " \"VariantName\": \"AllTraffic\",\n", + " }\n", + " ],\n", + ")" ] }, { @@ -253,13 +256,13 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_endpoint_name = 'bert-model-from-registry-ep-{}'.format(timestamp)\n", + "pipeline_endpoint_name = \"bert-model-from-registry-ep-{}\".format(timestamp)\n", "print(\"EndpointName={}\".format(pipeline_endpoint_name))\n", "\n", "create_endpoint_response = sm.create_endpoint(\n", - " EndpointName=pipeline_endpoint_name,\n", - " EndpointConfigName=endpoint_config_name)\n", - "print(create_endpoint_response['EndpointArn'])" + " EndpointName=pipeline_endpoint_name, EndpointConfigName=endpoint_config_name\n", + ")\n", + "print(create_endpoint_response[\"EndpointArn\"])" ] }, { @@ -270,7 +273,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker REST Endpoint'.format(region, pipeline_endpoint_name)))\n" + "display(\n", + " 
HTML(\n", + " 'Review SageMaker REST Endpoint'.format(\n", + " region, pipeline_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -288,7 +297,7 @@ "source": [ "%%time\n", "\n", - "waiter = sm.get_waiter('endpoint_in_service')\n", + "waiter = sm.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=pipeline_endpoint_name)" ] }, @@ -317,20 +326,20 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", " display(viz.show(pipeline_execution_step=execution_step))\n", - " time.sleep(5)\n" + " time.sleep(5)" ] }, { @@ -351,14 +360,16 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=pipeline_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer()) " + "predictor = TensorFlowPredictor(\n", + " endpoint_name=pipeline_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -374,15 +385,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ -393,12 +401,14 @@ "source": [ "import csv\n", "\n", - "df_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df_reviews = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "\n", - "df_sample_reviews = df_reviews[['review_body', 
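# --- Editor's note: the TensorFlowPredictor above wraps the low-level runtime
# API; an equivalent raw call for readers who want to invoke the endpoint without
# the SDK (a sketch; region and pipeline_endpoint_name come from this notebook,
# and the jsonlines payload mirrors the inputs used below):
import json

import boto3

runtime = boto3.client("sagemaker-runtime", region_name=region)
response = runtime.invoke_endpoint(
    EndpointName=pipeline_endpoint_name,
    ContentType="application/jsonlines",
    Accept="application/jsonlines",
    Body=json.dumps({"features": ["This is great!"]}),
)
print(response["Body"].read().decode())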
'star_rating']].sample(n=50)\n", + "df_sample_reviews = df_reviews[[\"review_body\", \"star_rating\"]].sample(n=50)\n", "df_sample_reviews = df_sample_reviews.reset_index(drop=True)\n", "df_sample_reviews.shape" ] @@ -420,14 +430,14 @@ "source": [ "import pandas as pd\n", "\n", + "\n", "def predict(review_body):\n", - " inputs = [\n", - " {\"features\": [review_body]}\n", - " ]\n", + " inputs = [{\"features\": [review_body]}]\n", " predicted_classes = predictor.predict(inputs)\n", - " return predicted_classes[0]['predicted_label']\n", - " \n", - "df_sample_reviews['predicted_class'] = df_sample_reviews['review_body'].map(predict)\n", + " return predicted_classes[0][\"predicted_label\"]\n", + "\n", + "\n", + "df_sample_reviews[\"predicted_class\"] = df_sample_reviews[\"review_body\"].map(predict)\n", "df_sample_reviews.head(5)" ] }, diff --git a/10_pipeline/airflow/00_Create_S3_Bucket.ipynb b/10_pipeline/airflow/00_Create_S3_Bucket.ipynb index 0c8274b9..96981663 100644 --- a/10_pipeline/airflow/00_Create_S3_Bucket.ipynb +++ b/10_pipeline/airflow/00_Create_S3_Bucket.ipynb @@ -17,21 +17,21 @@ "\n", "session = boto3.session.Session()\n", "region = session.region_name\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", - "airflow_bucket_name = 'airflow-'+region+'-'+account_id\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", + "airflow_bucket_name = \"airflow-\" + region + \"-\" + account_id\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n", "s3.create_bucket(Bucket=airflow_bucket_name)\n", "\n", "response = s3.put_public_access_block(\n", - " Bucket = airflow_bucket_name,\n", + " Bucket=airflow_bucket_name,\n", " PublicAccessBlockConfiguration={\n", - " 'BlockPublicAcls': True,\n", - " 'IgnorePublicAcls': True,\n", - " 'BlockPublicPolicy': True,\n", - " 'RestrictPublicBuckets': True\n", - " }\n", - ")\n" + " \"BlockPublicAcls\": True,\n", + " \"IgnorePublicAcls\": True,\n", + " \"BlockPublicPolicy\": True,\n", + " \"RestrictPublicBuckets\": True,\n", + " },\n", + ")" ] }, { @@ -40,9 +40,9 @@ "metadata": {}, "outputs": [], "source": [ - "s3_mwaa_private_path = 's3://{}'.format(airflow_bucket_name)\n", - "s3_mwaa_dags_private_path = 's3://{}/dags'.format(airflow_bucket_name)\n", - "s3_mwaa_pipeline_private_path = 's3://{}/dags/pipeline'.format(airflow_bucket_name)\n", + "s3_mwaa_private_path = \"s3://{}\".format(airflow_bucket_name)\n", + "s3_mwaa_dags_private_path = \"s3://{}/dags\".format(airflow_bucket_name)\n", + "s3_mwaa_pipeline_private_path = \"s3://{}/dags/pipeline\".format(airflow_bucket_name)\n", "print(s3_mwaa_private_path)" ] }, @@ -52,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_s3_bucket_passed=False" + "setup_s3_bucket_passed = False" ] }, { @@ -61,7 +61,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Default bucket: {}'.format(airflow_bucket_name))" + "print(\"Default bucket: {}\".format(airflow_bucket_name))" ] }, { @@ -105,9 +105,9 @@ "try:\n", " response = s3.head_bucket(Bucket=airflow_bucket_name)\n", " print(response)\n", - " setup_s3_bucket_passed=True\n", + " setup_s3_bucket_passed = True\n", "except ClientError as e:\n", - " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))" + " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))" ] }, { diff --git 
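# --- Editor's note: s3.create_bucket(Bucket=...) as written below only succeeds
# in us-east-1; every other region requires an explicit location constraint. A
# sketch of the region-aware variant:
#
#     if region == "us-east-1":
#         s3.create_bucket(Bucket=airflow_bucket_name)
#     else:
#         s3.create_bucket(
#             Bucket=airflow_bucket_name,
#             CreateBucketConfiguration={"LocationConstraint": region},
#         )
# ---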
a/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb b/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb index 9a07c5f3..74af3409 100644 --- a/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb +++ b/10_pipeline/airflow/01_Setup_Airflow_Dependencies.ipynb @@ -20,17 +20,17 @@ "session = boto3.session.Session()\n", "region = session.region_name\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", "\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n", "\n", - "airflow_env_name = 'mwaa-'+region+'-'+account_id\n", - "airflow_vpc_name = 'mwaa-vpc'+region+'-'+account_id" + "airflow_env_name = \"mwaa-\" + region + \"-\" + account_id\n", + "airflow_vpc_name = \"mwaa-vpc-\" + region + \"-\" + account_id" ] }, { @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_s3_bucket_passed=False\n", + "setup_s3_bucket_passed = False\n", "%store -r airflow_bucket_name\n", "%store airflow_env_name\n", "%store airflow_vpc_name" ] }, { @@ -58,9 +58,9 @@ "try:\n", " response = s3.head_bucket(Bucket=airflow_bucket_name)\n", " print(response)\n", - " setup_s3_bucket_passed=True\n", + " setup_s3_bucket_passed = True\n", "except ClientError as e:\n", - " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))" + " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))" ] }, { @@ -85,13 +85,13 @@ "metadata": {}, "outputs": [], "source": [ - "with open('./dags/config.py', 'r') as f:\n", " lines = f.readlines()\n", "\n", - "with open('./dags/config.py', 'w') as f:\n", + "with open(\"./dags/config.py\", \"r\") as f:\n", " lines = f.readlines()\n", "\n", + "with open(\"./dags/config.py\", \"w\") as f:\n", " for line in lines:\n", - " line = line.replace('{0}', region)\n", - " line = line.replace('{1}', bucket)\n", + " line = line.replace(\"{0}\", region)\n", + " line = line.replace(\"{1}\", bucket)\n", " f.write(line)" ] }, { @@ -148,20 +148,24 @@ "metadata": {}, "outputs": [], "source": [ - "#Check number of policies attached to TeamRole, we need to have nine\n", - "iam = boto3.resource('iam')\n", - "iam_client = boto3.client('iam')\n", - "team_role_arn = iam.Role('TeamRole').arn\n", + "# Check number of policies attached to TeamRole, we need at most nine\n", + "iam = boto3.resource(\"iam\")\n", + "iam_client = boto3.client(\"iam\")\n", + "team_role_arn = iam.Role(\"TeamRole\").arn\n", "\n", - "team_role = iam.Role('TeamRole')\n", + "team_role = iam.Role(\"TeamRole\")\n", "\n", - "aws_managed_policies = [p for p in team_role.attached_policies.all() ] \n", + "aws_managed_policies = [p for p in team_role.attached_policies.all()]\n", "\n", - "if(len(aws_managed_policies) >= 10): \n", - " print('You have: {} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA VPC Creation Policy.'.format(len(aws_managed_policies)))\n", + "if len(aws_managed_policies) >= 10:\n", + " print(\n", + " \"You have: {} policies attached to TeamRole, you need to downsize to 9 Policies so that we can add an MWAA VPC 
Creation Policy.\".format(\n", + " len(aws_managed_policies)\n", + " )\n", + " )\n", " print(\"Please do NOT continue unless until you run this and get a Success message\")\n", "else:\n", - " print(\"Success! Please Continue...\") " + " print(\"Success! Please Continue...\")" ] }, { @@ -170,9 +174,9 @@ "metadata": {}, "outputs": [], "source": [ - "mwaa_vpc_policy_json = open('./src/mwaa_vpc_policy.json', 'r').read()\n", - "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace('{0}',region)\n", - "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace('{1}',account_id)" + "mwaa_vpc_policy_json = open(\"./src/mwaa_vpc_policy.json\", \"r\").read()\n", + "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace(\"{0}\", region)\n", + "mwaa_vpc_policy_json = mwaa_vpc_policy_json.replace(\"{1}\", account_id)" ] }, { @@ -190,10 +194,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam_client.create_policy(\n", - " PolicyName='mwaa_vpc_policy',\n", - " PolicyDocument=mwaa_vpc_policy_json\n", - ")\n", + "response = iam_client.create_policy(PolicyName=\"mwaa_vpc_policy\", PolicyDocument=mwaa_vpc_policy_json)\n", "\n", "mwaa_vpc_policy_arn = response[\"Policy\"][\"Arn\"]" ] @@ -211,36 +212,33 @@ "metadata": {}, "outputs": [], "source": [ - "cloudformation = boto3.resource('cloudformation')\n", + "cloudformation = boto3.resource(\"cloudformation\")\n", "\n", - "mwaa_vpc_template_yaml = open('./cfn/mwaa_vpc_template.yaml', 'r').read()\n", + "mwaa_vpc_template_yaml = open(\"./cfn/mwaa_vpc_template.yaml\", \"r\").read()\n", "\n", "response = cloudformation.create_stack(\n", - " StackName='mwaa-vpc-stack',\n", + " StackName=\"mwaa-vpc-stack\",\n", " TemplateBody=mwaa_vpc_template_yaml,\n", " Parameters=[\n", - " {\n", - " 'ParameterKey': 'EnvironmentName',\n", - " 'ParameterValue': airflow_vpc_name\n", - " },\n", + " {\"ParameterKey\": \"EnvironmentName\", \"ParameterValue\": airflow_vpc_name},\n", " ],\n", " ResourceTypes=[\n", - " 'AWS::EC2::VPC',\n", + " \"AWS::EC2::VPC\",\n", " ],\n", - " OnFailure='ROLLBACK',\n", - " EnableTerminationProtection=False\n", + " OnFailure=\"ROLLBACK\",\n", + " EnableTerminationProtection=False,\n", ")\n", "\n", - "stack_status = 'IN_PROGRESS'\n", + "stack_status = \"IN_PROGRESS\"\n", "\n", - "print ('Starting deployment of VPC {}. \\n'.format(airflow_vpc_name))\n", + "print(\"Starting deployment of VPC {}. \\n\".format(airflow_vpc_name))\n", "\n", - "while stack_status != 'CREATE_COMPLETE':\n", - " stack_status = cloudformation.Stack('mwaa-vpc-stack').stack_status\n", + "while stack_status != \"CREATE_COMPLETE\":\n", + " stack_status = cloudformation.Stack(\"mwaa-vpc-stack\").stack_status\n", " time.sleep(30)\n", " print(\"Still waiting....\")\n", "\n", - "print ('\\n Sucess! VPC {} has been deployed sucessfully.'.format(airflow_vpc_name))" + "print(\"\\n Sucess! 
VPC {} has been deployed successfully.\".format(airflow_vpc_name))" ] }, { @@ -249,21 +247,21 @@ "metadata": {}, "outputs": [], "source": [ - "vpc_outputs = cloudformation.Stack('mwaa-vpc-stack').outputs\n", + "vpc_outputs = cloudformation.Stack(\"mwaa-vpc-stack\").outputs\n", "\n", "airflow_sg_id = None\n", - "for output in vpc_outputs: \n", - " if output['OutputKey'] == 'IngressSecurityGroup': \n", - " airflow_sg_id = output['OutputValue'] \n", + "for output in vpc_outputs:\n", + " if output[\"OutputKey\"] == \"IngressSecurityGroup\":\n", + " airflow_sg_id = output[\"OutputValue\"]\n", " break\n", - " \n", - "subnet_index_list = ['PrivateSubnet1', 'PrivateSubnet2']\n", + "\n", + "subnet_index_list = [\"PrivateSubnet1\", \"PrivateSubnet2\"]\n", "airflow_subnet_ids = []\n", "\n", "for output in vpc_outputs:\n", " for subnet_index in subnet_index_list:\n", - " if output['OutputKey'] == subnet_index: \n", - " airflow_subnet_ids.append(output['OutputValue']) \n" + " if output[\"OutputKey\"] == subnet_index:\n", + " airflow_subnet_ids.append(output[\"OutputValue\"])" ] }, { @@ -289,11 +287,8 @@ "metadata": {}, "outputs": [], "source": [ - "#Remove MWAA VPC Policy only needed for VPC Creation\n", - "response = iam_client.detach_role_policy(\n", - " RoleName=\"TeamRole\",\n", - " PolicyArn=mwaa_vpc_policy_arn\n", - ")" + "# Remove MWAA VPC Policy only needed for VPC Creation\n", + "response = iam_client.detach_role_policy(RoleName=\"TeamRole\", PolicyArn=mwaa_vpc_policy_arn)" ] }, { @@ -302,13 +297,17 @@ "metadata": {}, "outputs": [], "source": [ - "#Check number of policies attached to TeamRole, we need to have nine\n", - "team_role = iam.Role('TeamRole')\n", + "# Check number of policies attached to TeamRole, we need at most nine\n", + "team_role = iam.Role(\"TeamRole\")\n", "\n", - "aws_managed_policies = [p for p in team_role.attached_policies.all() ] \n", + "aws_managed_policies = [p for p in team_role.attached_policies.all()]\n", "\n", - "if(len(aws_managed_policies) >= 10): \n", - " print('You have: {0} policies attached to TeamRole, you need downsize to 9 Policies so that we can add an MWAA Policy.'.format(len(aws_managed_policies)))\n", + "if len(aws_managed_policies) >= 10:\n", + " print(\n", + " \"You have: {0} policies attached to TeamRole, you need to downsize to 9 Policies so that we can add an MWAA Policy.\".format(\n", + " len(aws_managed_policies)\n", + " )\n", + " )\n", + " print(\"Please do NOT continue unless until you run this and get a Success message\")\n", "else:\n", " print(\"Success! 
Please Continue...\")" @@ -320,13 +319,13 @@ "metadata": {}, "outputs": [], "source": [ - "mwaa_policy_json = open('./src/mwaa_policy.json', 'r').read()\n", - "mwaa_policy_json = mwaa_policy_json.replace('{0}',region)\n", - "mwaa_policy_json = mwaa_policy_json.replace('{1}',account_id)\n", - "mwaa_policy_json = mwaa_policy_json.replace('{2}',airflow_env_name)\n", - "mwaa_policy_json = mwaa_policy_json.replace('{3}',airflow_bucket_name)\n", + "mwaa_policy_json = open(\"./src/mwaa_policy.json\", \"r\").read()\n", + "mwaa_policy_json = mwaa_policy_json.replace(\"{0}\", region)\n", + "mwaa_policy_json = mwaa_policy_json.replace(\"{1}\", account_id)\n", + "mwaa_policy_json = mwaa_policy_json.replace(\"{2}\", airflow_env_name)\n", + "mwaa_policy_json = mwaa_policy_json.replace(\"{3}\", airflow_bucket_name)\n", "\n", - "mwaa_assume_policy_json = open('./src/mwaa_assume_policy.json', 'r').read()" + "mwaa_assume_policy_json = open(\"./src/mwaa_assume_policy.json\", \"r\").read()" ] }, { @@ -335,20 +334,11 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam_client.create_policy(\n", - " PolicyName='mwaa_policy',\n", - " PolicyDocument=mwaa_policy_json\n", - ")\n", + "response = iam_client.create_policy(PolicyName=\"mwaa_policy\", PolicyDocument=mwaa_policy_json)\n", "\n", - "response = iam_client.attach_role_policy(\n", - " RoleName=\"TeamRole\",\n", - " PolicyArn=response[\"Policy\"][\"Arn\"]\n", - ")\n", + "response = iam_client.attach_role_policy(RoleName=\"TeamRole\", PolicyArn=response[\"Policy\"][\"Arn\"])\n", "\n", - "response = iam_client.update_assume_role_policy(\n", - " RoleName=\"TeamRole\",\n", - " PolicyDocument=mwaa_assume_policy_json\n", - ")" + "response = iam_client.update_assume_role_policy(RoleName=\"TeamRole\", PolicyDocument=mwaa_assume_policy_json)" ] }, { @@ -420,4 +410,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb b/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb index 58ac8cce..8253620f 100644 --- a/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb +++ b/10_pipeline/airflow/02_Create_Airflow_Environment.ipynb @@ -18,9 +18,9 @@ "\n", "session = boto3.session.Session()\n", "region = session.region_name\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)" + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)" ] }, { @@ -29,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "setup_s3_bucket_passed=False\n", + "setup_s3_bucket_passed = False\n", "%store -r airflow_bucket_name\n", "%store -r s3_mwaa_private_path\n", "%store -r s3_mwaa_dags_private_path\n", @@ -62,9 +62,9 @@ "try:\n", " response = s3.head_bucket(Bucket=airflow_bucket_name)\n", " print(response)\n", - " setup_s3_bucket_passed=True\n", + " setup_s3_bucket_passed = True\n", "except ClientError as e:\n", - " print('[ERROR] Cannot find bucket {} in {} due to {}.'.format(airflow_bucket_name, response, e))" + " print(\"[ERROR] Cannot find bucket {} in {} due to {}.\".format(airflow_bucket_name, response, e))" ] }, { @@ -89,49 +89,34 @@ "metadata": {}, "outputs": [], "source": [ - "mwaa = boto3.client('mwaa')\n", + "mwaa = boto3.client(\"mwaa\")\n", "\n", - "s3_mwaa_bucket_arn= 'arn:aws:s3:::{}'.format(airflow_bucket_name)\n", + "s3_mwaa_bucket_arn = 
\"arn:aws:s3:::{}\".format(airflow_bucket_name)\n", "\n", "airflow_env_arn = mwaa.create_environment(\n", - " DagS3Path='dags',\n", + " DagS3Path=\"dags\",\n", " ExecutionRoleArn=team_role_arn,\n", - " AirflowVersion='1.10.12',\n", - " WebserverAccessMode='PUBLIC_ONLY',\n", + " AirflowVersion=\"1.10.12\",\n", + " WebserverAccessMode=\"PUBLIC_ONLY\",\n", " LoggingConfiguration={\n", - " 'DagProcessingLogs': {\n", - " 'Enabled': True,\n", - " 'LogLevel': 'ERROR'\n", - " },\n", - " 'SchedulerLogs': {\n", - " 'Enabled': True,\n", - " 'LogLevel': 'ERROR'\n", - " },\n", - " 'TaskLogs': {\n", - " 'Enabled': True,\n", - " 'LogLevel': 'INFO'\n", - " },\n", - " 'WebserverLogs': {\n", - " 'Enabled': True,\n", - " 'LogLevel': 'ERROR'\n", - " },\n", - " 'WorkerLogs': {\n", - " 'Enabled': True,\n", - " 'LogLevel': 'ERROR'\n", - " }\n", + " \"DagProcessingLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n", + " \"SchedulerLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n", + " \"TaskLogs\": {\"Enabled\": True, \"LogLevel\": \"INFO\"},\n", + " \"WebserverLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n", + " \"WorkerLogs\": {\"Enabled\": True, \"LogLevel\": \"ERROR\"},\n", " },\n", " MaxWorkers=3,\n", " Name=airflow_env_name,\n", " NetworkConfiguration={\n", - " 'SecurityGroupIds': [\n", + " \"SecurityGroupIds\": [\n", " airflow_sg_id,\n", " ],\n", - " 'SubnetIds': airflow_subnet_ids\n", + " \"SubnetIds\": airflow_subnet_ids,\n", " },\n", - " RequirementsS3ObjectVersion='latest',\n", - " RequirementsS3Path='requirements.txt',\n", + " RequirementsS3ObjectVersion=\"latest\",\n", + " RequirementsS3Path=\"requirements.txt\",\n", " SourceBucketArn=s3_mwaa_bucket_arn,\n", - " EnvironmentClass='mw1.small'\n", + " EnvironmentClass=\"mw1.small\",\n", ")\n", "\n", "%store airflow_env_arn" @@ -151,30 +136,29 @@ "outputs": [], "source": [ "def get_airflow_check():\n", - " response = mwaa.get_environment(\n", - " Name=airflow_env_name\n", - " )\n", + " response = mwaa.get_environment(Name=airflow_env_name)\n", " mwaa_status = response[\"Environment\"][\"Status\"]\n", " return mwaa_status\n", "\n", + "\n", "mwaa_status = \"CREATING\"\n", "\n", - "print('Checking to see if MWAA Env: {} is ready.'.format(airflow_env_name))\n", + "print(\"Checking to see if MWAA Env: {} is ready.\".format(airflow_env_name))\n", "\n", - "while (get_airflow_check() != 'AVAILABLE'):\n", + "while get_airflow_check() != \"AVAILABLE\":\n", " mwaa_status\n", " time.sleep(60)\n", " print(\"Still waiting for MWAA Environment...\")\n", "\n", - "print('Sucess! MWAA Env: {} is ready!'.format(airflow_env_name)) \n" + "print(\"Sucess! 
MWAA Env: {} is ready!\".format(airflow_env_name))" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "# PLEASE MAKE SURE THAT THE ABOVE COMMAND RAN SUCESSFULLY BEFORE CONTINUING" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", @@ -262,4 +246,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb b/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb index 3d922e5c..8625352b 100644 --- a/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb +++ b/10_pipeline/airflow/03_Trigger_Airflow_Environment.ipynb @@ -18,9 +18,9 @@ "\n", "session = boto3.session.Session()\n", "region = session.region_name\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", "\n", - "dag_name = 'bert_reviews'" + "dag_name = \"bert_reviews\"" ] }, { @@ -40,30 +40,29 @@ "metadata": {}, "outputs": [], "source": [ - "mwaa = boto3.client('mwaa')\n", + "mwaa = boto3.client(\"mwaa\")\n", "mwaa_status = \"\"\n", "\n", + "\n", "def get_airflow_check():\n", - " response = mwaa.get_environment(\n", - " Name=airflow_env_name\n", - " )\n", + " response = mwaa.get_environment(Name=airflow_env_name)\n", " mwaa_status = response[\"Environment\"][\"Status\"]\n", " return mwaa_status\n", "\n", "\n", "mwaa_status = get_airflow_check()\n", - "if(mwaa_status != 'AVAILABLE'):\n", - " print('[ERROR] Cannot find MWAA {}.'.format(airflow_env_name))\n", - "else: \n", - " print('Sucess! {} is ready!'.format(airflow_env_name))" + "if mwaa_status != \"AVAILABLE\":\n", + " print(\"[ERROR] Cannot find MWAA {}.\".format(airflow_env_name))\n", + "else:\n", + " print(\"Sucess! {} is ready!\".format(airflow_env_name))" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "# PLEASE MAKE SURE THAT THE ABOVE COMMAND RAN SUCESSFULLY BEFORE CONTINUING" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "markdown", @@ -78,28 +77,21 @@ "metadata": {}, "outputs": [], "source": [ - "mwaa_cli_token = mwaa.create_cli_token(\n", - " Name=airflow_env_name\n", - ")\n", + "mwaa_cli_token = mwaa.create_cli_token(Name=airflow_env_name)\n", "\n", - "cli_token = 'Bearer ' + mwaa_cli_token['CliToken']\n", - "mwaa_web_server_hostname = 'https://' + mwaa_cli_token['WebServerHostname'] + '/aws_mwaa/cli'\n", + "cli_token = \"Bearer \" + mwaa_cli_token[\"CliToken\"]\n", + "mwaa_web_server_hostname = \"https://\" + mwaa_cli_token[\"WebServerHostname\"] + \"/aws_mwaa/cli\"\n", "\n", - "raw_data = 'trigger_dag {}'.format(dag_name)\n", + "raw_data = \"trigger_dag {}\".format(dag_name)\n", "\n", "response = requests.post(\n", - " mwaa_web_server_hostname,\n", - " headers={\n", - " 'Authorization': cli_token,\n", - " 'Content-Type': 'text/plain'\n", - " },\n", - " data=raw_data\n", - " )\n", + " mwaa_web_server_hostname, headers={\"Authorization\": cli_token, \"Content-Type\": \"text/plain\"}, data=raw_data\n", + ")\n", "\n", - "if (response.status_code != 200):\n", - " print('ERROR: DAG: {} failed to get triggered!'.format(dag_name))\n", + "if response.status_code != 200:\n", + " print(\"ERROR: DAG: {} failed to get triggered!\".format(dag_name))\n", "else:\n", - " print('Sucess! DAG: {} was triggered successfuly'.format(dag_name))" + " print(\"Sucess! 
DAG: {} was triggered successfully\".format(dag_name))" ] }, { @@ -171,4 +163,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/10_pipeline/airflow/dags/bert_reviews.py b/10_pipeline/airflow/dags/bert_reviews.py index 81e1f680..c597eca1 100644 --- a/10_pipeline/airflow/dags/bert_reviews.py +++ b/10_pipeline/airflow/dags/bert_reviews.py @@ -5,7 +5,7 @@ import sys -sys.path.append('./airflow/dags/') +sys.path.append("./airflow/dags/") # airflow operators import airflow @@ -16,12 +16,9 @@ from airflow.operators.python_operator import PythonOperator # airflow sagemaker operators -from airflow.contrib.operators.sagemaker_training_operator \ - import SageMakerTrainingOperator -from airflow.contrib.operators.sagemaker_tuning_operator \ - import SageMakerTuningOperator -from airflow.contrib.operators.sagemaker_transform_operator \ - import SageMakerTransformOperator +from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator +from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator +from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator from airflow.contrib.hooks.aws_hook import AwsHook # sagemaker sdk @@ -46,11 +43,9 @@ def is_hpo_enabled(): - """check if hyper-parameter optimization is enabled in the config - """ + """check if hyper-parameter optimization is enabled in the config""" hpo_enabled = False - if "job_level" in config and \ - "run_hyperparameter_opt" in config["job_level"]: + if "job_level" in config and "run_hyperparameter_opt" in config["job_level"]: run_hpo_config = config["job_level"]["run_hyperparameter_opt"] if run_hpo_config.lower() == "yes": hpo_enabled = True @@ -58,10 +53,11 @@ def is_hpo_enabled(): def get_sagemaker_role_arn(role_name, region_name): - iam = boto3.client('iam', region_name=region_name) + iam = boto3.client("iam", region_name=region_name) response = iam.get_role(RoleName=role_name) return response["Role"]["Arn"] + # ============================================================================= # setting up training, tuning and transform configuration # ============================================================================= @@ -71,13 +67,11 @@ def get_sagemaker_role_arn(role_name, region_name): config = cfg.config # set configuration for tasks -hook = AwsHook(aws_conn_id='airflow-sagemaker') +hook = AwsHook(aws_conn_id="airflow-sagemaker") region = config["job_level"]["region_name"] sess = hook.get_session(region_name=region) -role = get_sagemaker_role_arn( - config["train_model"]["sagemaker_role"], - sess.region_name) -container = get_image_uri(sess.region_name, 'factorization-machines') +role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"], sess.region_name) +container = get_image_uri(sess.region_name, "factorization-machines") hpo_enabled = is_hpo_enabled() # create estimator @@ -89,20 +83,13 @@ def get_sagemaker_role_arn(role_name, region_name): ) # train_config specifies SageMaker training configuration -train_config = training_config( - estimator=fm_estimator, - inputs=config["train_model"]["inputs"]) +train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"]) # create tuner -fm_tuner = HyperparameterTuner( - estimator=fm_estimator, - **config["tune_model"]["tuner_config"] -) +fm_tuner = HyperparameterTuner(estimator=fm_estimator, **config["tune_model"]["tuner_config"]) # create tuning config -tuner_config = tuning_config( - tuner=fm_tuner, - 
inputs=config["tune_model"]["inputs"]) +tuner_config = tuning_config(tuner=fm_tuner, inputs=config["tune_model"]["inputs"]) # create transform config transform_config = transform_config_from_estimator( @@ -118,56 +105,54 @@ def get_sagemaker_role_arn(role_name, region_name): # define airflow DAG -args = { - 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) -} +args = {"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)} dag = DAG( - dag_id='bert_reviews', + dag_id="bert_reviews", default_args=args, schedule_interval=None, concurrency=1, max_active_runs=1, - user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)} + user_defined_filters={"tojson": lambda s: json.JSONEncoder().encode(s)}, ) # set the tasks in the DAG # dummy operator -init = DummyOperator( - task_id='start', - dag=dag -) +init = DummyOperator(task_id="start", dag=dag) # preprocess the data -process_task= PythonOperator( - task_id='process', +process_task = PythonOperator( + task_id="process", dag=dag, provide_context=False, python_callable=preprocess.preprocess, - op_kwargs=config["preprocess_data"]) + op_kwargs=config["preprocess_data"], +) -train_task= PythonOperator( - task_id='train', +train_task = PythonOperator( + task_id="train", dag=dag, provide_context=False, python_callable=preprocess.preprocess, - op_kwargs=config["preprocess_data"]) + op_kwargs=config["preprocess_data"], +) -model_task= PythonOperator( - task_id='model', +model_task = PythonOperator( + task_id="model", dag=dag, provide_context=False, python_callable=preprocess.preprocess, - op_kwargs=config["preprocess_data"]) + op_kwargs=config["preprocess_data"], +) -deploy_task= PythonOperator( - task_id='deploy', +deploy_task = PythonOperator( + task_id="deploy", dag=dag, provide_context=False, python_callable=preprocess.preprocess, - op_kwargs=config["preprocess_data"]) + op_kwargs=config["preprocess_data"], +) # set the dependencies between tasks diff --git a/10_pipeline/airflow/dags/config.py b/10_pipeline/airflow/dags/config.py index 9a2f38aa..0b726bdf 100644 --- a/10_pipeline/airflow/dags/config.py +++ b/10_pipeline/airflow/dags/config.py @@ -3,24 +3,21 @@ config = {} -config["job_level"] = { - "region_name": "us-east-1", - "run_hyperparameter_opt": "no" -} +config["job_level"] = {"region_name": "us-east-1", "run_hyperparameter_opt": "no"} config["preprocess_data"] = { "s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", - "s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace + "s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace "s3_out_prefix": "preprocess/", - "delimiter": "\t" + "delimiter": "\t", } config["prepare_data"] = { - "s3_in_bucket": "sagemaker-us-east-1-835319576252", # replace + "s3_in_bucket": "sagemaker-us-east-1-835319576252", # replace "s3_in_prefix": "preprocess/", "s3_out_bucket": "sagemaker-us-east-1-835319576252", # replace "s3_out_prefix": "prepare/", - "delimiter": "\t" + "delimiter": "\t", } config["train_model"] = { @@ -37,12 +34,12 @@ "epochs": "10", "mini_batch_size": "200", "num_factors": "64", - "predictor_type": 'regressor' - } + "predictor_type": "regressor", + }, }, "inputs": { "train": "s3://sagemaker-us-east-1-835319576252/prepare/train/train.protobuf", # replace - } + }, } config["tune_model"] = { @@ -51,16 +48,16 @@ "objective_type": "Minimize", "hyperparameter_ranges": { "factors_lr": ContinuousParameter(0.0001, 0.2), - "factors_init_sigma": ContinuousParameter(0.0001, 1) + "factors_init_sigma": 
ContinuousParameter(0.0001, 1), }, "max_jobs": 20, "max_parallel_jobs": 2, - "base_tuning_job_name": "hpo-recommender" + "base_tuning_job_name": "hpo-recommender", }, "inputs": { "train": "s3://sagemaker-us-east-1-835319576252/prepare/train/train.protobuf", # replace - "test": "s3://sagemaker-us-east-1-835319576252/prepare/validate/validate.protobuf" # replace - } + "test": "s3://sagemaker-us-east-1-835319576252/prepare/validate/validate.protobuf", # replace + }, } config["batch_transform"] = { @@ -71,6 +68,6 @@ "data_type": "S3Prefix", "content_type": "application/x-recordio-protobuf", "strategy": "MultiRecord", - "output_path": "s3://sagemaker-us-east-1-835319576252/transform/" + "output_path": "s3://sagemaker-us-east-1-835319576252/transform/", } } diff --git a/10_pipeline/airflow/dags/pipeline/prepare.py b/10_pipeline/airflow/dags/pipeline/prepare.py index ad82fed5..2f2a243a 100644 --- a/10_pipeline/airflow/dags/pipeline/prepare.py +++ b/10_pipeline/airflow/dags/pipeline/prepare.py @@ -20,20 +20,20 @@ def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products): # extract customers and ratings df_X = df_val[:, 0:2] # Features are one-hot encoded in a sparse matrix - X = lil_matrix((nb_rows, nb_cols)).astype('float32') + X = lil_matrix((nb_rows, nb_cols)).astype("float32") df_X[:, 1] = nb_customer + df_X[:, 1] coords = df_X[:, 0:2] X[np.arange(nb_rows), coords[:, 0]] = 1 X[np.arange(nb_rows), coords[:, 1]] = 1 # create label with ratings - Y = df_val[:, 2].astype('float32') + Y = df_val[:, 2].astype("float32") # validate size and shape print(X.shape) print(Y.shape) assert X.shape == (nb_rows, nb_cols) - assert Y.shape == (nb_rows, ) + assert Y.shape == (nb_rows,) return X, Y @@ -60,24 +60,19 @@ def save_as_protobuf(X, Y, bucket, key): buf = io.BytesIO() smac.write_spmatrix_to_sparse_tensor(buf, X, Y) buf.seek(0) - obj = '{}'.format(key) - boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf) - return 's3://{}/{}'.format(bucket, obj) + obj = "{}".format(key) + boto3.resource("s3").Bucket(bucket).Object(obj).upload_fileobj(buf) + return "s3://{}/{}".format(bucket, obj) def chunk(x, batch_size): - """split array into chunks of batch_size - """ + """split array into chunks of batch_size""" chunk_range = range(0, x.shape[0], batch_size) - chunks = [x[p: p + batch_size] for p in chunk_range] + chunks = [x[p : p + batch_size] for p in chunk_range] return chunks -def prepare(s3_in_bucket, - s3_in_prefix, - s3_out_bucket, - s3_out_prefix, - delimiter=","): +def prepare(s3_in_bucket, s3_in_prefix, s3_out_bucket, s3_out_prefix, delimiter=","): """Prepare data for training with Sagemaker algorithms - Read preprocessed data and converts to ProtoBuf format to prepare for @@ -114,47 +109,36 @@ def prepare(s3_in_bucket, # prepare training data set if s3_in_prefix[-1] == "/": s3_in_prefix = s3_in_prefix[:-1] - s3_train_url = "s3://{}/{}/{}".format( - s3_in_bucket, s3_in_prefix, 'train/train.csv') - train_df = pd.read_csv(s3_train_url, - sep=str(','), error_bad_lines=False) + s3_train_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "train/train.csv") + train_df = pd.read_csv(s3_train_url, sep=str(","), error_bad_lines=False) # prepare validateion dataset - s3_validate_url = "s3://{}/{}/{}".format( - s3_in_bucket, s3_in_prefix, 'validate/validate.csv') - validate_df = pd.read_csv(s3_validate_url, - sep=str(','), error_bad_lines=False) + s3_validate_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "validate/validate.csv") + validate_df = pd.read_csv(s3_validate_url, 
sep=str(","), error_bad_lines=False) # prepare test dataset - s3_test_url = "s3://{}/{}/{}".format( - s3_in_bucket, s3_in_prefix, 'test/test.csv') - test_df = pd.read_csv(s3_test_url, - sep=str(','), error_bad_lines=False) + s3_test_url = "s3://{}/{}/{}".format(s3_in_bucket, s3_in_prefix, "test/test.csv") + test_df = pd.read_csv(s3_test_url, sep=str(","), error_bad_lines=False) # get feature dimension all_df = pd.concat([train_df, validate_df, test_df]) - nb_customer = np.unique(all_df['customer'].values).shape[0] - nb_products = np.unique(all_df['product'].values).shape[0] + nb_customer = np.unique(all_df["customer"].values).shape[0] + nb_products = np.unique(all_df["product"].values).shape[0] feature_dim = nb_customer + nb_products print(nb_customer, nb_products, feature_dim) - train_X, train_Y = convert_sparse_matrix( - train_df, train_df.shape[0], nb_customer, nb_products) - validate_X, validate_Y = convert_sparse_matrix( - validate_df, validate_df.shape[0], nb_customer, nb_products) - test_X, test_Y = convert_sparse_matrix( - test_df, test_df.shape[0], nb_customer, nb_products) + train_X, train_Y = convert_sparse_matrix(train_df, train_df.shape[0], nb_customer, nb_products) + validate_X, validate_Y = convert_sparse_matrix(validate_df, validate_df.shape[0], nb_customer, nb_products) + test_X, test_Y = convert_sparse_matrix(test_df, test_df.shape[0], nb_customer, nb_products) # write train and test in protobuf format to s3 if s3_out_prefix[-1] == "/": s3_out_prefix = s3_out_prefix[:-1] - train_data = save_as_protobuf( - train_X, train_Y, s3_out_bucket, - s3_out_prefix + "/" + "train/train.protobuf") + train_data = save_as_protobuf(train_X, train_Y, s3_out_bucket, s3_out_prefix + "/" + "train/train.protobuf") print(train_data) validate_data = save_as_protobuf( - validate_X, validate_Y, s3_out_bucket, - s3_out_prefix + "/" + "validate/validate.protobuf") + validate_X, validate_Y, s3_out_bucket, s3_out_prefix + "/" + "validate/validate.protobuf" + ) print(validate_data) # chunk test data to avoid payload size issues when batch transforming @@ -166,7 +150,8 @@ def prepare(s3_in_bucket, test_x_chunks[i], test_y_chunks[i], s3_out_bucket, - s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf") + s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf", + ) print(test_data) return "SUCCESS" diff --git a/10_pipeline/airflow/dags/pipeline/preprocess.py b/10_pipeline/airflow/dags/pipeline/preprocess.py index 5a30bfab..07256487 100644 --- a/10_pipeline/airflow/dags/pipeline/preprocess.py +++ b/10_pipeline/airflow/dags/pipeline/preprocess.py @@ -3,10 +3,7 @@ import s3fs -def preprocess(s3_in_url, - s3_out_bucket, - s3_out_prefix, - delimiter=","): +def preprocess(s3_in_url, s3_out_bucket, s3_out_prefix, delimiter=","): """Preprocesses data based on business logic - Reads delimited file passed as s3_url and preprocess data by filtering @@ -42,64 +39,52 @@ def preprocess(s3_in_url, # limit dataframe to customer_id, product_id, and star_rating # `product_title` will be useful validating recommendations - df = df[['customer_id', 'product_id', 'star_rating', 'product_title']] + df = df[["customer_id", "product_id", "star_rating", "product_title"]] # clean out the long tail because most people haven't seen most videos, # and people rate fewer videos than they actually watch - customers = df['customer_id'].value_counts() - products = df['product_id'].value_counts() + customers = df["customer_id"].value_counts() + products = df["product_id"].value_counts() # based on data exploration only about 5% of 
customers have rated 5 or # more videos, and only 25% of videos have been rated by 9+ customers customers = customers[customers >= 5] products = products[products >= 10] print("# of rows before the long tail = {:10d}".format(df.shape[0])) - reduced_df = df \ - .merge(pd.DataFrame({'customer_id': customers.index})) \ - .merge(pd.DataFrame({'product_id': products.index})) - print("# of rows after the long tail = {:10d}".format( - reduced_df.shape[0])) - reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id']) - print("# of rows after removing duplicates = {:10d}".format( - reduced_df.shape[0])) + reduced_df = df.merge(pd.DataFrame({"customer_id": customers.index})).merge( + pd.DataFrame({"product_id": products.index}) + ) + print("# of rows after the long tail = {:10d}".format(reduced_df.shape[0])) + reduced_df = reduced_df.drop_duplicates(["customer_id", "product_id"]) + print("# of rows after removing duplicates = {:10d}".format(reduced_df.shape[0])) # recreate customer and product lists since there are customers with # more than 5 reviews, but all of their reviews are on products with # less than 5 reviews (and vice versa) - customers = reduced_df['customer_id'].value_counts() - products = reduced_df['product_id'].value_counts() + customers = reduced_df["customer_id"].value_counts() + products = reduced_df["product_id"].value_counts() # sequentially index each user and item to hold the sparse format where # the indices indicate the row and column in our ratings matrix - customer_index = pd.DataFrame({ - 'customer_id': customers.index, - 'customer': np.arange(customers.shape[0])}) - product_index = pd.DataFrame({ - 'product_id': products.index, - 'product': np.arange(products.shape[0])}) - reduced_df = reduced_df \ - .merge(customer_index) \ - .merge(product_index) - - nb_customer = reduced_df['customer'].max() + 1 - nb_products = reduced_df['product'].max() + 1 + customer_index = pd.DataFrame({"customer_id": customers.index, "customer": np.arange(customers.shape[0])}) + product_index = pd.DataFrame({"product_id": products.index, "product": np.arange(products.shape[0])}) + reduced_df = reduced_df.merge(customer_index).merge(product_index) + + nb_customer = reduced_df["customer"].max() + 1 + nb_products = reduced_df["product"].max() + 1 feature_dim = nb_customer + nb_products print(nb_customer, nb_products, feature_dim) - product_df = reduced_df[['customer', 'product', 'star_rating']] + product_df = reduced_df[["customer", "product", "star_rating"]] # split into train, validation and test data sets train_df, validate_df, test_df = np.split( - product_df.sample(frac=1), - [int(.6*len(product_df)), int(.8*len(product_df))] + product_df.sample(frac=1), [int(0.6 * len(product_df)), int(0.8 * len(product_df))] ) - print("# of rows train data set = {:10d}".format( - train_df.shape[0])) - print("# of rows validation data set = {:10d}".format( - validate_df.shape[0])) - print("# of rows test data set = {:10d}".format( - test_df.shape[0])) + print("# of rows train data set = {:10d}".format(train_df.shape[0])) + print("# of rows validation data set = {:10d}".format(validate_df.shape[0])) + print("# of rows test data set = {:10d}".format(test_df.shape[0])) # select columns required for training the model # excluding columns "customer_id", "product_id", "product_title" to @@ -111,25 +96,21 @@ def preprocess(s3_in_url, # write output to s3 as delimited file fs = s3fs.S3FileSystem(anon=False) - s3_out_prefix = s3_out_prefix[:-1] \ - if s3_out_prefix[-1] == "/" else s3_out_prefix - 
s3_out_train = "s3://{}/{}/{}".format( - s3_out_bucket, s3_out_prefix, "train/train.csv") + s3_out_prefix = s3_out_prefix[:-1] if s3_out_prefix[-1] == "/" else s3_out_prefix + s3_out_train = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "train/train.csv") print("writing training data to {}".format(s3_out_train)) with fs.open(s3_out_train, "w") as f: - train_df.to_csv(f, sep=str(','), index=False) + train_df.to_csv(f, sep=str(","), index=False) - s3_out_validate = "s3://{}/{}/{}".format( - s3_out_bucket, s3_out_prefix, "validate/validate.csv") + s3_out_validate = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "validate/validate.csv") print("writing test data to {}".format(s3_out_validate)) with fs.open(s3_out_validate, "w") as f: - validate_df.to_csv(f, sep=str(','), index=False) + validate_df.to_csv(f, sep=str(","), index=False) - s3_out_test = "s3://{}/{}/{}".format( - s3_out_bucket, s3_out_prefix, "test/test.csv") + s3_out_test = "s3://{}/{}/{}".format(s3_out_bucket, s3_out_prefix, "test/test.csv") print("writing test data to {}".format(s3_out_test)) with fs.open(s3_out_test, "w") as f: - test_df.to_csv(f, sep=str(','), index=False) + test_df.to_csv(f, sep=str(","), index=False) print("preprocessing completed") return "SUCCESS" diff --git a/10_pipeline/airflow/src/config.py b/10_pipeline/airflow/src/config.py index 5aeb92c5..77ae4a8c 100644 --- a/10_pipeline/airflow/src/config.py +++ b/10_pipeline/airflow/src/config.py @@ -3,24 +3,21 @@ config = {} -config["job_level"] = { - "region_name": "{0}", - "run_hyperparameter_opt": "no" -} +config["job_level"] = {"region_name": "{0}", "run_hyperparameter_opt": "no"} config["preprocess_data"] = { "s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz", - "s3_out_bucket": "{1}", # replace + "s3_out_bucket": "{1}", # replace "s3_out_prefix": "preprocess/", - "delimiter": "\t" + "delimiter": "\t", } config["prepare_data"] = { - "s3_in_bucket": "{1}", # replace + "s3_in_bucket": "{1}", # replace "s3_in_prefix": "preprocess/", "s3_out_bucket": "{1}", # replace "s3_out_prefix": "prepare/", - "delimiter": "\t" + "delimiter": "\t", } config["train_model"] = { @@ -37,12 +34,12 @@ "epochs": "10", "mini_batch_size": "200", "num_factors": "64", - "predictor_type": 'regressor' - } + "predictor_type": "regressor", + }, }, "inputs": { "train": "s3://{1}/prepare/train/train.protobuf", # replace - } + }, } config["tune_model"] = { @@ -51,16 +48,16 @@ "objective_type": "Minimize", "hyperparameter_ranges": { "factors_lr": ContinuousParameter(0.0001, 0.2), - "factors_init_sigma": ContinuousParameter(0.0001, 1) + "factors_init_sigma": ContinuousParameter(0.0001, 1), }, "max_jobs": 20, "max_parallel_jobs": 2, - "base_tuning_job_name": "hpo-recommender" + "base_tuning_job_name": "hpo-recommender", }, "inputs": { "train": "s3://{1}/prepare/train/train.protobuf", # replace - "test": "s3://{1}/prepare/validate/validate.protobuf" # replace - } + "test": "s3://{1}/prepare/validate/validate.protobuf", # replace + }, } config["batch_transform"] = { @@ -71,6 +68,6 @@ "data_type": "S3Prefix", "content_type": "application/x-recordio-protobuf", "strategy": "MultiRecord", - "output_path": "s3://{1}/transform/" + "output_path": "s3://{1}/transform/", } } diff --git a/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py b/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py index c0265d09..0324b855 100644 --- a/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py +++ 
b/10_pipeline/airflow/src/dag_ml_pipeline_amazon_video_reviews.py @@ -5,7 +5,7 @@ import sys -sys.path.append('/Users/cfregly/airflow/dags/') +sys.path.append("/Users/cfregly/airflow/dags/") # airflow operators import airflow @@ -16,12 +16,9 @@ from airflow.operators.python_operator import PythonOperator # airflow sagemaker operators -from airflow.contrib.operators.sagemaker_training_operator \ - import SageMakerTrainingOperator -from airflow.contrib.operators.sagemaker_tuning_operator \ - import SageMakerTuningOperator -from airflow.contrib.operators.sagemaker_transform_operator \ - import SageMakerTransformOperator +from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator +from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator +from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator from airflow.contrib.hooks.aws_hook import AwsHook # sagemaker sdk @@ -46,11 +43,9 @@ def is_hpo_enabled(): - """check if hyper-parameter optimization is enabled in the config - """ + """check if hyper-parameter optimization is enabled in the config""" hpo_enabled = False - if "job_level" in config and \ - "run_hyperparameter_opt" in config["job_level"]: + if "job_level" in config and "run_hyperparameter_opt" in config["job_level"]: run_hpo_config = config["job_level"]["run_hyperparameter_opt"] if run_hpo_config.lower() == "yes": hpo_enabled = True @@ -58,10 +53,11 @@ def is_hpo_enabled(): def get_sagemaker_role_arn(role_name, region_name): - iam = boto3.client('iam', region_name=region_name) + iam = boto3.client("iam", region_name=region_name) response = iam.get_role(RoleName=role_name) return response["Role"]["Arn"] + # ============================================================================= # setting up training, tuning and transform configuration # ============================================================================= @@ -71,13 +67,11 @@ def get_sagemaker_role_arn(role_name, region_name): config = cfg.config # set configuration for tasks -hook = AwsHook(aws_conn_id='airflow-sagemaker') +hook = AwsHook(aws_conn_id="airflow-sagemaker") region = config["job_level"]["region_name"] sess = hook.get_session(region_name=region) -role = get_sagemaker_role_arn( - config["train_model"]["sagemaker_role"], - sess.region_name) -container = get_image_uri(sess.region_name, 'factorization-machines') +role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"], sess.region_name) +container = get_image_uri(sess.region_name, "factorization-machines") hpo_enabled = is_hpo_enabled() # create estimator @@ -89,20 +83,13 @@ def get_sagemaker_role_arn(role_name, region_name): ) # train_config specifies SageMaker training configuration -train_config = training_config( - estimator=fm_estimator, - inputs=config["train_model"]["inputs"]) +train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"]) # create tuner -fm_tuner = HyperparameterTuner( - estimator=fm_estimator, - **config["tune_model"]["tuner_config"] -) +fm_tuner = HyperparameterTuner(estimator=fm_estimator, **config["tune_model"]["tuner_config"]) # create tuning config -tuner_config = tuning_config( - tuner=fm_tuner, - inputs=config["tune_model"]["inputs"]) +tuner_config = tuning_config(tuner=fm_tuner, inputs=config["tune_model"]["inputs"]) # create transform config transform_config = transform_config_from_estimator( @@ -118,84 +105,76 @@ def get_sagemaker_role_arn(role_name, region_name): # define airflow DAG 
-args = { - 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) -} +args = {"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)} dag = DAG( - dag_id='sagemaker-ml-pipeline', + dag_id="sagemaker-ml-pipeline", default_args=args, schedule_interval=None, concurrency=1, max_active_runs=1, - user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)} + user_defined_filters={"tojson": lambda s: json.JSONEncoder().encode(s)}, ) # set the tasks in the DAG # dummy operator -init = DummyOperator( - task_id='start', - dag=dag -) +init = DummyOperator(task_id="start", dag=dag) # preprocess the data preprocess_task = PythonOperator( - task_id='preprocessing', + task_id="preprocessing", dag=dag, provide_context=False, python_callable=preprocess.preprocess, - op_kwargs=config["preprocess_data"]) + op_kwargs=config["preprocess_data"], +) # prepare the data for training prepare_task = PythonOperator( - task_id='preparing', + task_id="preparing", dag=dag, provide_context=False, python_callable=prepare.prepare, - op_kwargs=config["prepare_data"] + op_kwargs=config["prepare_data"], ) branching = BranchPythonOperator( - task_id='branching', - dag=dag, - python_callable=lambda: "model_tuning" if hpo_enabled else "model_training") + task_id="branching", dag=dag, python_callable=lambda: "model_tuning" if hpo_enabled else "model_training" +) # launch sagemaker training job and wait until it completes train_model_task = SageMakerTrainingOperator( - task_id='model_training', + task_id="model_training", dag=dag, config=train_config, - aws_conn_id='airflow-sagemaker', + aws_conn_id="airflow-sagemaker", wait_for_completion=True, - check_interval=30 + check_interval=30, ) # launch sagemaker hyperparameter job and wait until it completes tune_model_task = SageMakerTuningOperator( - task_id='model_tuning', + task_id="model_tuning", dag=dag, config=tuner_config, - aws_conn_id='airflow-sagemaker', + aws_conn_id="airflow-sagemaker", wait_for_completion=True, - check_interval=30 + check_interval=30, ) # launch sagemaker batch transform job and wait until it completes batch_transform_task = SageMakerTransformOperator( - task_id='predicting', + task_id="predicting", dag=dag, config=transform_config, - aws_conn_id='airflow-sagemaker', + aws_conn_id="airflow-sagemaker", wait_for_completion=True, check_interval=30, - trigger_rule=TriggerRule.ONE_SUCCESS + trigger_rule=TriggerRule.ONE_SUCCESS, ) -cleanup_task = DummyOperator( - task_id='cleaning_up', - dag=dag) +cleanup_task = DummyOperator(task_id="cleaning_up", dag=dag) # set the dependencies between tasks diff --git a/10_pipeline/evaluate_model_metrics.py b/10_pipeline/evaluate_model_metrics.py index 024afdec..f3523174 100644 --- a/10_pipeline/evaluate_model_metrics.py +++ b/10_pipeline/evaluate_model_metrics.py @@ -4,13 +4,16 @@ from datetime import datetime import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 
'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) import pandas as pd import os import re @@ -33,99 +36,99 @@ from sklearn.utils import resample -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] -config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) +config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, +) def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs to need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--input-model', type=str, - default='/opt/ml/processing/input/model', + parser.add_argument( + "--input-model", + type=str, + default="/opt/ml/processing/input/model", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - + ) + return parser.parse_args() - + def process(args): - print('Current host: {}'.format(args.current_host)) - - print('input_data: {}'.format(args.input_data)) - print('input_model: {}'.format(args.input_model)) - - print('Listing contents of input model dir: {}'.format(args.input_model)) + print("Current host: {}".format(args.current_host)) + + print("input_data: {}".format(args.input_data)) + print("input_model: {}".format(args.input_model)) + + print("Listing contents of input model dir: {}".format(args.input_model)) input_files = os.listdir(args.input_model) for file in 
input_files: print(file) - model_tar_path = '{}/model.tar.gz'.format(args.input_model) + model_tar_path = "{}/model.tar.gz".format(args.input_model) model_tar = tarfile.open(model_tar_path) model_tar.extractall(args.input_model) - model_tar.close() + model_tar.close() - model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model)) + model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model)) print(model) - + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=args.max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -133,81 +136,86 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) - + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) ########################################################################################### # TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz # - ########################################################################################### -# evaluation_data_path = '/opt/ml/processing/input/data/' - - print('Listing contents of input data dir: {}'.format(args.input_data)) + ########################################################################################### + # evaluation_data_path = '/opt/ml/processing/input/data/' + + print("Listing contents of input data dir: {}".format(args.input_data)) input_files = os.listdir(args.input_data) - test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data) - print('Using only {} to evaluate.'.format(test_data_path)) - df_test_reviews = pd.read_csv(test_data_path, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data) + print("Using only {} to evaluate.".format(test_data_path)) + df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[ + ["review_body", "star_rating"] + ] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - y_test = df_test_reviews['review_body'].map(predict) + y_test = df_test_reviews["review_body"].map(predict) y_test - y_actual = df_test_reviews['star_rating'] + y_actual = df_test_reviews["star_rating"] y_actual print(classification_report(y_true=y_test, y_pred=y_actual)) - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) def plot_conf_mat(cm, classes, title, cmap): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. 
+ fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') + plt.ylabel("True label") + plt.xlabel("Predicted label") cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=CLASSES, - title='Confusion Matrix', - cmap=plt.cm.Greens) + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens) - # Save the confusion matrix + # Save the confusion matrix plt.show() - # Model Output - metrics_path = os.path.join(args.output_data, 'metrics/') + # Model Output + metrics_path = os.path.join(args.output_data, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) report_dict = { "metrics": { @@ -220,26 +228,26 @@ evaluation_path = "{}/evaluation.json".format(metrics_path) with open(evaluation_path, "w") as f: f.write(json.dumps(report_dict)) - - print('Listing contents of output dir: {}'.format(args.output_data)) + + print("Listing contents of output dir: {}".format(args.output_data)) output_files = os.listdir(args.output_data) for file in output_files: print(file) - print('Listing contents of output/metrics dir: {}'.format(metrics_path)) - output_files = os.listdir('{}'.format(metrics_path)) + print("Listing contents of output/metrics dir: {}".format(metrics_path)) + output_files = os.listdir("{}".format(metrics_path)) for file in output_files: print(file) - print('Complete') - - + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) - process(args) + process(args) diff --git a/10_pipeline/human/00_Overview.ipynb b/10_pipeline/human/00_Overview.ipynb index cf576e4a..fa851c30 100644 --- a/10_pipeline/human/00_Overview.ipynb +++ b/10_pipeline/human/00_Overview.ipynb @@ -66,7 +66,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] }, diff --git a/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb b/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb index 5c195013..fd0e09da 100644 --- a/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb +++ b/10_pipeline/human/01_Setup_Augmented_AI_Workflow.ipynb @@ -34,7 +34,7 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" @@ -54,10 +54,10 @@ "import botocore\n", "\n", "# Amazon Python SDK clients\n", - "sagemaker = boto3.client('sagemaker', region)\n", - "comprehend = boto3.client('comprehend', region)\n", - "a2i = boto3.client('sagemaker-a2i-runtime')\n", - "s3 = boto3.client('s3', region)" + "sagemaker = boto3.client(\"sagemaker\", region)\n", + "comprehend = 
+    "a2i = boto3.client(\"sagemaker-a2i-runtime\")\n",
+    "s3 = boto3.client(\"s3\", region)"
   ]
  },
  {
@@ -73,7 +73,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "output_path = f's3://{bucket}/a2i-comprehend-star-rating-results'\n",
+    "output_path = f\"s3://{bucket}/a2i-comprehend-star-rating-results\"\n",
     "print(output_path)"
   ]
  },
@@ -94,7 +94,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print('https://{}.console.aws.amazon.com/sagemaker/groundtruth?region={}#/labeling-workforces/create'.format(region, region))"
+    "print(\n",
+    "    \"https://{}.console.aws.amazon.com/sagemaker/groundtruth?region={}#/labeling-workforces/create\".format(\n",
+    "        region, region\n",
+    "    )\n",
+    ")"
   ]
  },
  {
@@ -147,9 +151,9 @@
    "source": [
     "import boto3\n",
     "\n",
-    "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
+    "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
     "\n",
-    "augmented_ai_workteam_arn = 'arn:aws:sagemaker:{}:{}:workteam/private-crowd/dsoaws'.format(region, account_id)\n",
+    "augmented_ai_workteam_arn = \"arn:aws:sagemaker:{}:{}:workteam/private-crowd/dsoaws\".format(region, account_id)\n",
     "\n",
     "print(augmented_ai_workteam_arn)"
   ]
@@ -218,13 +222,11 @@
    "outputs": [],
    "source": [
     "# Task UI name - this value is unique per account and region. You can also provide your own value here.\n",
-    "task_ui_name = 'ui-comprehend-' + str(uuid.uuid4()) \n",
+    "task_ui_name = \"ui-comprehend-\" + str(uuid.uuid4())\n",
     "\n",
     "# Create a Human Task UI resource.\n",
-    "human_task_ui_response = sagemaker.create_human_task_ui(\n",
-    "    HumanTaskUiName=task_ui_name,\n",
-    "    UiTemplate={'Content': template})\n",
-    "human_task_ui_arn = human_task_ui_response['HumanTaskUiArn']\n",
+    "human_task_ui_response = sagemaker.create_human_task_ui(HumanTaskUiName=task_ui_name, UiTemplate={\"Content\": template})\n",
+    "human_task_ui_arn = human_task_ui_response[\"HumanTaskUiArn\"]\n",
     "print(human_task_ui_arn)"
   ]
  },
@@ -260,24 +262,22 @@
     "import uuid\n",
     "\n",
     "# Flow definition name - this value is unique per account and region. You can also provide your own value here.\n",
-    "flow_definition_name = 'fd-dsoaws-comprehend-' + str(uuid.uuid4()) \n",
+    "flow_definition_name = \"fd-dsoaws-comprehend-\" + str(uuid.uuid4())\n",
     "\n",
     "create_workflow_definition_response = sagemaker.create_flow_definition(\n",
-    "        FlowDefinitionName=flow_definition_name,\n",
-    "        RoleArn=role,\n",
-    "        HumanLoopConfig={\n",
-    "            'WorkteamArn': augmented_ai_workteam_arn,\n",
-    "            'HumanTaskUiArn': human_task_ui_arn,\n",
-    "            'TaskCount': 1,\n",
-    "            'TaskDescription': 'Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)',\n",
-    "            'TaskTitle': 'Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)'\n",
-    "        },\n",
-    "        OutputConfig={\n",
-    "            'S3OutputPath' : output_path\n",
-    "        }\n",
-    "    )\n",
+    "    FlowDefinitionName=flow_definition_name,\n",
+    "    RoleArn=role,\n",
+    "    HumanLoopConfig={\n",
+    "        \"WorkteamArn\": augmented_ai_workteam_arn,\n",
+    "        \"HumanTaskUiArn\": human_task_ui_arn,\n",
+    "        \"TaskCount\": 1,\n",
+    "        \"TaskDescription\": \"Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)\",\n",
+    "        \"TaskTitle\": \"Classify Reviews into Star Ratings Between 1 (Worst) and 5 (Best)\",\n",
+    "    },\n",
+    "    OutputConfig={\"S3OutputPath\": output_path},\n",
+    ")\n",
     "\n",
-    "augmented_ai_flow_definition_arn = create_workflow_definition_response['FlowDefinitionArn']"
+    "augmented_ai_flow_definition_arn = create_workflow_definition_response[\"FlowDefinitionArn\"]"
   ]
  },
  {
@@ -298,8 +298,8 @@
     "# Describe flow definition - status should turn to \"active\"\n",
     "for x in range(60):\n",
     "    describeFlowDefinitionResponse = sagemaker.describe_flow_definition(FlowDefinitionName=flow_definition_name)\n",
-    "    print(describeFlowDefinitionResponse['FlowDefinitionStatus'])\n",
-    "    if (describeFlowDefinitionResponse['FlowDefinitionStatus'] == 'Active'):\n",
+    "    print(describeFlowDefinitionResponse[\"FlowDefinitionStatus\"])\n",
+    "    if describeFlowDefinitionResponse[\"FlowDefinitionStatus\"] == \"Active\":\n",
     "        print(\"Flow Definition is active\")\n",
     "        break\n",
     "    time.sleep(2)"
@@ -330,7 +330,7 @@
    "outputs": [],
    "source": [
     "%%javascript\n",
-    "Jupyter.notebook.save_checkpoint();\n",
+    "Jupyter.notebook.save_checkpoint()\n",
     "Jupyter.notebook.session.delete();"
   ]
  }
diff --git a/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb b/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
index 51c5a1ff..69e0a4f6 100644
--- a/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
+++ b/10_pipeline/human/02_Fix_Poor_Predictions_From_Comprehend_Custom_Text_Classifier.ipynb
@@ -17,7 +17,7 @@
     "import sagemaker\n",
     "import pandas as pd\n",
     "\n",
-    "sess = sagemaker.Session()\n",
+    "sess = sagemaker.Session()\n",
     "bucket = sess.default_bucket()\n",
     "role = sagemaker.get_execution_role()\n",
     "region = boto3.Session().region_name"
@@ -37,10 +37,10 @@
     "import botocore\n",
     "\n",
     "# Amazon Python SDK clients\n",
-    "sagemaker = boto3.client('sagemaker', region)\n",
-    "comprehend = boto3.client('comprehend', region)\n",
-    "a2i = boto3.client('sagemaker-a2i-runtime')\n",
-    "s3 = boto3.client('s3', region)"
+    "sagemaker = boto3.client(\"sagemaker\", region)\n",
+    "comprehend = boto3.client(\"comprehend\", region)\n",
+    "a2i = boto3.client(\"sagemaker-a2i-runtime\")\n",
+    "s3 = boto3.client(\"s3\", region)"
   ]
  },
  {
@@ -99,11 +99,11 @@
  },
    "outputs": [],
    "source": [
-    "try: \n",
+    "try:\n",
     "    comprehend_endpoint_arn\n",
     "except NameError:\n",
-    "    print('*** PLEASE WAIT FOR THE Comprehend JOB TO FINISH IN THE PREVIOUS SECTION BEFORE CONTINUING ***')\n",
-    "    print('*** YOU WILL NEED TO RESTART THIS NOTEBOOK ONCE THE JOB FINISHES ***')"
+    "    print(\"*** PLEASE WAIT FOR THE Comprehend JOB TO FINISH IN THE PREVIOUS SECTION BEFORE CONTINUING ***\")\n",
+    "    print(\"*** YOU WILL NEED TO RESTART THIS NOTEBOOK ONCE THE JOB FINISHES ***\")"
   ]
  },
  {
@@ -136,12 +136,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sample_reviews = [\n",
-    "    'I enjoy this product', \n",
-    "    'I am unhappy with this product', \n",
-    "    'It is okay', \n",
-    "    'sometimes it works'\n",
-    "    ]"
+    "sample_reviews = [\"I enjoy this product\", \"I am unhappy with this product\", \"It is okay\", \"sometimes it works\"]"
   ]
  },
  {
@@ -166,37 +161,35 @@
     "\n",
     "for sample_review in sample_reviews:\n",
     "    # Call the Comprehend Custom model that we trained earlier\n",
-    "    response = comprehend.classify_document(Text=sample_review, \n",
-    "                                            EndpointArn=comprehend_endpoint_arn)\n",
+    "    response = comprehend.classify_document(Text=sample_review, EndpointArn=comprehend_endpoint_arn)\n",
+    "\n",
+    "    star_rating = response[\"Classes\"][0][\"Name\"]\n",
+    "    confidence_score = response[\"Classes\"][0][\"Score\"]\n",
+    "\n",
+    "    print(f'Processing sample_review: \"{sample_review}\"')\n",
     "\n",
-    "    star_rating = response['Classes'][0]['Name']\n",
-    "    confidence_score = response['Classes'][0]['Score']\n",
-    "    \n",
-    "    print(f'Processing sample_review: \\\"{sample_review}\\\"')\n",
-    "    \n",
     "    # Our condition for when we want to engage a human for review\n",
-    "    if (confidence_score < CONFIDENCE_SCORE_THRESHOLD):\n",
-    "    \n",
+    "    if confidence_score < CONFIDENCE_SCORE_THRESHOLD:\n",
+    "\n",
     "        humanLoopName = str(uuid.uuid4())\n",
-    "        inputContent = {\n",
-    "            'initialValue': star_rating,\n",
-    "            'taskObject': sample_review\n",
-    "        }\n",
+    "        inputContent = {\"initialValue\": star_rating, \"taskObject\": sample_review}\n",
     "        start_loop_response = a2i.start_human_loop(\n",
     "            HumanLoopName=humanLoopName,\n",
     "            FlowDefinitionArn=augmented_ai_flow_definition_arn,\n",
-    "            HumanLoopInput={\n",
-    "                'InputContent': json.dumps(inputContent)\n",
-    "            }\n",
+    "            HumanLoopInput={\"InputContent\": json.dumps(inputContent)},\n",
     "        )\n",
     "\n",
     "        human_loops_started.append(humanLoopName)\n",
     "\n",
-    "        print(f'Confidence score of {confidence_score} for star rating of {star_rating} is less than the threshold of {CONFIDENCE_SCORE_THRESHOLD}')\n",
-    "        print(f'*** ==> Starting human loop with name: {humanLoopName} \\n')\n",
+    "        print(\n",
+    "            f\"Confidence score of {confidence_score} for star rating of {star_rating} is less than the threshold of {CONFIDENCE_SCORE_THRESHOLD}\"\n",
+    "        )\n",
+    "        print(f\"*** ==> Starting human loop with name: {humanLoopName} \\n\")\n",
     "    else:\n",
-    "        print(f'Confidence score of {confidence_score} for star rating of {star_rating} is above threshold of {CONFIDENCE_SCORE_THRESHOLD}')\n",
-    "        print('No human loop created. \\n')"
+    "        print(\n",
+    "            f\"Confidence score of {confidence_score} for star rating of {star_rating} is above threshold of {CONFIDENCE_SCORE_THRESHOLD}\"\n",
+    "        )\n",
+    "        print(\"No human loop created. \\n\")"
   ]
  },
  {
@@ -215,12 +208,12 @@
     "completed_human_loops = []\n",
     "for human_loop_name in human_loops_started:\n",
     "    resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
-    "    print(f'HumanLoop Name: {human_loop_name}')\n",
+    "    print(f\"HumanLoop Name: {human_loop_name}\")\n",
     "    print(f'HumanLoop Status: {resp[\"HumanLoopStatus\"]}')\n",
     "    print(f'HumanLoop Output Destination: {resp[\"HumanLoopOutput\"]}')\n",
-    "    print('')\n",
-    "    \n",
-    "    if resp['HumanLoopStatus'] == 'Completed':\n",
+    "    print(\"\")\n",
+    "\n",
+    "    if resp[\"HumanLoopStatus\"] == \"Completed\":\n",
     "        completed_human_loops.append(resp)"
   ]
  },
@@ -257,13 +250,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "workteam_name = augmented_ai_workteam_arn[augmented_ai_workteam_arn.rfind('/') + 1:]\n",
+    "workteam_name = augmented_ai_workteam_arn[augmented_ai_workteam_arn.rfind(\"/\") + 1 :]\n",
     "print(workteam_name)\n",
-    "print('Navigate to the private worker portal and complete the human loop.')\n",
-    "print('Make sure you have invited yourself to the workteam and received the signup email.')\n",
-    "print('Note: Check your spam filter if you have not received the email.')\n",
-    "print('')\n",
-    "print('https://' + sagemaker.describe_workteam(WorkteamName=workteam_name)['Workteam']['SubDomain'])"
+    "print(\"Navigate to the private worker portal and complete the human loop.\")\n",
+    "print(\"Make sure you have invited yourself to the workteam and received the signup email.\")\n",
+    "print(\"Note: Check your spam filter if you have not received the email.\")\n",
+    "print(\"\")\n",
+    "print(\"https://\" + sagemaker.describe_workteam(WorkteamName=workteam_name)[\"Workteam\"][\"SubDomain\"])"
   ]
  },
  {
@@ -311,18 +304,18 @@
     "completed_human_loops = []\n",
     "for human_loop_name in human_loops_started:\n",
     "    resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
-    "    print(f'HumanLoop Name: {human_loop_name}')\n",
+    "    print(f\"HumanLoop Name: {human_loop_name}\")\n",
     "    print(f'HumanLoop Status: {resp[\"HumanLoopStatus\"]}')\n",
     "    print(f'HumanLoop Output Destination: {resp[\"HumanLoopOutput\"]}')\n",
-    "    print('')\n",
+    "    print(\"\")\n",
     "    while resp[\"HumanLoopStatus\"] != \"Completed\":\n",
-    "        print(f'Waiting for HumanLoop to complete.') \n",
+    "        print(f\"Waiting for HumanLoop to complete.\")\n",
     "        time.sleep(10)\n",
     "        resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)\n",
     "        if resp[\"HumanLoopStatus\"] == \"Completed\":\n",
     "            completed_human_loops.append(resp)\n",
-    "            print(f'Completed!')\n",
-    "            print('')"
+    "            print(f\"Completed!\")\n",
+    "            print(\"\")"
   ]
  },
  {
@@ -355,17 +348,17 @@
     "fixed_items = []\n",
     "\n",
     "for resp in completed_human_loops:\n",
-    "    split_string = re.split('s3://' + bucket + '/', resp['HumanLoopOutput']['OutputS3Uri'])\n",
+    "    split_string = re.split(\"s3://\" + bucket + \"/\", resp[\"HumanLoopOutput\"][\"OutputS3Uri\"])\n",
    "    output_bucket_key = split_string[1]\n",
     "\n",
     "    response = s3.get_object(Bucket=bucket, Key=output_bucket_key)\n",
-    "    content = response['Body'].read().decode('utf-8')\n",
+    "    content = response[\"Body\"].read().decode(\"utf-8\")\n",
     "    json_output = json.loads(content)\n",
     "    print(json_output)\n",
     "\n",
-    "    input_content = json_output['inputContent']\n",
-    "    human_answer = json_output['humanAnswers'][0]['answerContent']\n",
-    "    fixed_item = {'input_content': input_content, 'human_answer': human_answer}\n",
+    "    input_content = json_output[\"inputContent\"]\n",
+    "    human_answer = json_output[\"humanAnswers\"][0][\"answerContent\"]\n",
+    "    fixed_item = {\"input_content\": input_content, \"human_answer\": human_answer}\n",
     "    fixed_items.append(fixed_item)"
   ]
  },
@@ -417,7 +410,7 @@
    "outputs": [],
    "source": [
     "%%javascript\n",
-    "Jupyter.notebook.save_checkpoint();\n",
+    "Jupyter.notebook.save_checkpoint()\n",
     "Jupyter.notebook.session.delete();"
   ]
  }
diff --git a/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb b/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
index 8d0eba29..38f6617d 100644
--- a/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
+++ b/10_pipeline/kubeflow/00_00_Setup_EKS.ipynb
@@ -27,9 +27,9 @@
    "outputs": [],
    "source": [
     "!wget https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz\n",
-    " \n",
+    "\n",
     "!tar -xzvf eksctl_$(uname -s)_amd64.tar.gz -C /tmp\n",
-    " \n",
+    "\n",
     "!mv /tmp/eksctl /usr/local/bin\n",
     "\n",
     "!eksctl version"
@@ -41,8 +41,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!wget https://amazon-eks.s3.us-west-2.amazonaws.com/1.15.10/2020-02-22/bin/linux/amd64/kubectl \n",
-    " \n",
+    "!wget https://amazon-eks.s3.us-west-2.amazonaws.com/1.15.10/2020-02-22/bin/linux/amd64/kubectl\n",
+    "\n",
     "!chmod +x ./kubectl\n",
     "\n",
     "!mv ./kubectl /usr/local/bin\n",
diff --git a/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb b/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
index 41a1bfca..97bd7912 100644
--- a/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
+++ b/10_pipeline/kubeflow/00_05_Launch_Kubeflow_Jupyter_Notebook.ipynb
@@ -252,7 +252,7 @@
     "\n",
     "# Shutting down your kernel for this notebook to release resources.\n",
     "# \n",
-    " \n",
+    "\n",
     "# "
   ]
  },
diff --git a/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb b/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
index 49a44db3..a739d211 100644
--- a/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
+++ b/10_pipeline/kubeflow/02_Kubeflow_Pipeline_Simple.ipynb
@@ -70,6 +70,7 @@
    "source": [
     "# Restart the kernel to pick up pip installed libraries\n",
     "from IPython.core.display import HTML\n",
+    "\n",
     "HTML(\"<script>Jupyter.notebook.kernel.restart()</script>\")"
   ]
  },
@@ -129,23 +130,22 @@
     "import kfp\n",
     "from kfp import dsl\n",
     "\n",
+    "\n",
     "def add_two_numbers(a, b):\n",
     "    return dsl.ContainerOp(\n",
-    "        name='calculate_sum',\n",
-    "        image='python:3.6.8',\n",
-    "        command=['python', '-c'],\n",
+    "        name=\"calculate_sum\",\n",
+    "        image=\"python:3.6.8\",\n",
+    "        command=[\"python\", \"-c\"],\n",
     "        arguments=['with open(\"/tmp/results.txt\", \"a\") as file: file.write(str({} + {}))'.format(a, b)],\n",
     "        file_outputs={\n",
-    "            'data': '/tmp/results.txt',\n",
-    "        }\n",
+    "            \"data\": \"/tmp/results.txt\",\n",
+    "        },\n",
     "    )\n",
     "\n",
+    "\n",
     "def echo_op(text):\n",
     "    return dsl.ContainerOp(\n",
-    "        name='echo',\n",
-    "        image='library/bash:4.4.23',\n",
-    "        command=['sh', '-c'],\n",
-    "        arguments=['echo \"Result: {}\"'.format(text)]\n",
+    "        name=\"echo\", image=\"library/bash:4.4.23\", command=[\"sh\", \"-c\"], arguments=['echo \"Result: {}\"'.format(text)]\n",
     "    )"
   ]
  },
@@ -164,16 +164,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@dsl.pipeline(\n",
-    "    name='Calculate sum pipeline',\n",
-    "    description='Calculate sum of numbers and prints the result.'\n",
-    ")\n",
-    "def calculate_sum(\n",
-    "    a=7,\n",
-    "    b=10,\n",
-    "    c=4,\n",
-    "    d=7\n",
-    "):\n",
+    "@dsl.pipeline(name=\"Calculate sum pipeline\", description=\"Calculate sum of numbers and prints the result.\")\n",
+    "def calculate_sum(a=7, b=10, c=4, d=7):\n",
     "    \"\"\"A four-step pipeline with first two running in parallel.\"\"\"\n",
     "\n",
     "    sum1 = add_two_numbers(a, b)\n",
@@ -198,7 +190,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "kfp.compiler.Compiler().compile(calculate_sum, 'calculate-sum-pipeline.zip')"
+    "kfp.compiler.Compiler().compile(calculate_sum, \"calculate-sum-pipeline.zip\")"
   ]
  },
  {
@@ -247,11 +239,9 @@
    "source": [
     "client = kfp.Client()\n",
     "\n",
-    "experiment = client.create_experiment(name='kubeflow')\n",
+    "experiment = client.create_experiment(name=\"kubeflow\")\n",
     "\n",
-    "my_run = client.run_pipeline(experiment.id, \n",
-    "                             'calculate-sum-pipeline', \n",
-    "                             'calculate-sum-pipeline.zip')"
+    "my_run = client.run_pipeline(experiment.id, \"calculate-sum-pipeline\", \"calculate-sum-pipeline.zip\")"
   ]
  },
  {
diff --git a/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb b/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
index e3b32b6e..9fe4a191 100644
--- a/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
+++ b/10_pipeline/kubeflow/03_Kubeflow_Pipeline_Reviews_BERT_SageMaker.ipynb
@@ -50,6 +50,7 @@
    "source": [
     "# Restart the kernel to pick up pip installed libraries\n",
     "from IPython.core.display import HTML\n",
+    "\n",
     "HTML(\"<script>Jupyter.notebook.kernel.restart()</script>\")"
   ]
  },
@@ -85,13 +86,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "iam_roles = boto3.client('iam').list_roles()['Roles']\n",
+    "iam_roles = boto3.client(\"iam\").list_roles()[\"Roles\"]\n",
     "\n",
     "for iam_role in iam_roles:\n",
-    "    if 'SageMakerExecutionRole' in iam_role['RoleName']:\n",
-    "        role = iam_role['Arn']\n",
+    "    if \"SageMakerExecutionRole\" in iam_role[\"RoleName\"]:\n",
+    "        role = iam_role[\"Arn\"]\n",
     "        break\n",
-    "print('Role: {}'.format(role))"
+    "print(\"Role: {}\".format(role))"
   ]
  },
  {
@@ -107,7 +108,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'"
+    "s3_public_path_tsv = \"s3://amazon-reviews-pds/tsv\""
   ]
  },
  {
@@ -116,7 +117,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)\n",
+    "s3_private_path_tsv = \"s3://{}/amazon-reviews-pds/tsv\".format(bucket)\n",
     "print(s3_private_path_tsv)"
   ]
  },
@@ -137,7 +138,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)"
+    "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)"
   ]
  },
  {
@@ -165,7 +166,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sagemaker_process_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml')"
+    "sagemaker_process_op = components.load_component_from_url(\n",
+    "    \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml\"\n",
+    ")"
   ]
  },
  {
@@ -174,7 +177,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml')"
+    "sagemaker_train_op = components.load_component_from_url(\n",
+    "    \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml\"\n",
+    ")"
   ]
  },
  {
@@ -183,7 +188,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml')"
+    "sagemaker_model_op = components.load_component_from_url(\n",
+    "    \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml\"\n",
+    ")"
   ]
  },
  {
@@ -192,7 +199,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml')"
+    "sagemaker_deploy_op = components.load_component_from_url(\n",
+    "    \"https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml\"\n",
+    ")"
   ]
  },
  {
@@ -210,7 +219,7 @@
  },
    "outputs": [],
    "source": [
-    "processing_code_s3_uri = 's3://{}/processing_code/preprocess-scikit-text-to-bert-feature-store.py'.format(bucket)\n",
+    "processing_code_s3_uri = \"s3://{}/processing_code/preprocess-scikit-text-to-bert-feature-store.py\".format(bucket)\n",
     "print(processing_code_s3_uri)\n",
     "\n",
     "!aws s3 cp ./preprocess-scikit-text-to-bert-feature-store.py $processing_code_s3_uri"
@@ -238,7 +247,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "training_code_s3_uri = 's3://{}/training_code/sourcedir.tar.gz'.format(bucket)\n",
+    "training_code_s3_uri = \"s3://{}/training_code/sourcedir.tar.gz\".format(bucket)\n",
     "print(training_code_s3_uri)\n",
     "\n",
     "!aws s3 cp sourcedir.tar.gz $training_code_s3_uri"
@@ -262,14 +271,11 @@
     "        },\n",
     "    }\n",
     "\n",
+    "\n",
     "def processing_output(output_name, s3_uri, local_path, s3_upload_mode):\n",
     "    return {\n",
     "        \"OutputName\": output_name,\n",
-    "        \"S3Output\": {\n",
-    "            \"LocalPath\": local_path, \n",
-    "            \"S3Uri\": s3_uri,\n",
-    "            \"S3UploadMode\": s3_upload_mode\n",
-    "        },\n",
+    "        \"S3Output\": {\"LocalPath\": local_path, \"S3Uri\": s3_uri, \"S3UploadMode\": s3_upload_mode},\n",
     "    }"
   ]
  },
@@ -284,9 +290,9 @@
     "        \"ChannelName\": input_name,\n",
     "        \"DataSource\": {\n",
     "            \"S3DataSource\": {\n",
-    "                \"S3Uri\": s3_uri, \n",
+    "                \"S3Uri\": s3_uri,\n",
     "                \"S3DataType\": \"S3Prefix\",\n",
-    "                \"S3DataDistributionType\": s3_data_distribution_type \n",
+    "                \"S3DataDistributionType\": s3_data_distribution_type,\n",
     "            }\n",
     "        },\n",
     "    }"
   ]
@@ -309,48 +315,45 @@
     "    name=\"BERT Pipeline\",\n",
     "    description=\"BERT Pipeline\",\n",
     ")\n",
-    "def bert_pipeline(role=role, \n",
-    "                  bucket=bucket, \n",
-    "                  region=region,\n",
-    "                  raw_input_data_s3_uri=raw_input_data_s3_uri):\n",
+    "def bert_pipeline(role=role, bucket=bucket, region=region, raw_input_data_s3_uri=raw_input_data_s3_uri):\n",
     "\n",
     "    import time\n",
     "    import json\n",
-    "    \n",
-    "    pipeline_name = 'kubeflow-pipeline-sagemaker-{}'.format(int(time.time()))\n",
     "\n",
-    "    network_isolation=False\n",
-    "    \n",
+    "    pipeline_name = \"kubeflow-pipeline-sagemaker-{}\".format(int(time.time()))\n",
+    "\n",
+    "    network_isolation = False\n",
+    "\n",
     "    ########################\n",
     "    # FEATURE ENGINEERING\n",
-    "    ######################## \n",
-    "    \n",
-    "    max_seq_length=64\n",
-    "    train_split_percentage=0.90\n",
-    "    validation_split_percentage=0.05\n",
-    "    test_split_percentage=0.05\n",
-    "    balance_dataset=True\n",
-    "\n",
-    "    processed_train_data_s3_uri = 's3://{}/{}/processing/output/bert-train'.format(bucket, pipeline_name)\n",
-    "    processed_validation_data_s3_uri = 's3://{}/{}/processing/output/bert-validation'.format(bucket, pipeline_name)\n",
-    "    processed_test_data_s3_uri = 's3://{}/{}/processing/output/bert-test'.format(bucket, pipeline_name)\n",
-    "\n",
-    "    processing_instance_type = 'ml.c5.2xlarge'\n",
+    "    ########################\n",
+    "\n",
+    "    max_seq_length = 64\n",
+    "    train_split_percentage = 0.90\n",
+    "    validation_split_percentage = 0.05\n",
+    "    test_split_percentage = 0.05\n",
+    "    balance_dataset = True\n",
+    "\n",
+    "    processed_train_data_s3_uri = \"s3://{}/{}/processing/output/bert-train\".format(bucket, pipeline_name)\n",
+    "    processed_validation_data_s3_uri = \"s3://{}/{}/processing/output/bert-validation\".format(bucket, pipeline_name)\n",
+    "    processed_test_data_s3_uri = \"s3://{}/{}/processing/output/bert-test\".format(bucket, pipeline_name)\n",
+    "\n",
+    "    processing_instance_type = \"ml.c5.2xlarge\"\n",
     "    processing_instance_count = 2\n",
-    "    \n",
+    "\n",
     "    timestamp = int(time.time())\n",
     "\n",
-    "    feature_store_offline_prefix = 'reviews-feature-store-' + str(timestamp)\n",
-    "    feature_group_name = 'reviews-feature-group-' + str(timestamp)\n",
+    "    feature_store_offline_prefix = \"reviews-feature-store-\" + str(timestamp)\n",
+    "    feature_group_name = \"reviews-feature-group-\" + str(timestamp)\n",
     "\n",
     "    # hard-coding to avoid the wrong ECR account id with create_image_uri()\n",
-    "    processing_image = '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3'\n",
-    "# import sagemaker\n",
-    "# processing_image = sagemaker.fw_utils.create_image_uri(framework='scikit-learn',\n",
-    "#                                                        framework_version='0.23-1',\n",
-    "#                                                        py_version='py3',\n",
-    "#                                                        instance_type='ml.c5.9xlarge',\n",
-    "#                                                        region='us-east-1') # hard-coding to avoid serialization issue\n",
+    "    processing_image = \"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\"\n",
+    "    # import sagemaker\n",
+    "    # processing_image = sagemaker.fw_utils.create_image_uri(framework='scikit-learn',\n",
+    "    #                                                        framework_version='0.23-1',\n",
+    "    #                                                        py_version='py3',\n",
+    "    #                                                        instance_type='ml.c5.9xlarge',\n",
+    "    #                                                        region='us-east-1') # hard-coding to avoid serialization issue\n",
     "\n",
     "    process = sagemaker_process_op(\n",
     "        role=role,\n",
@@ -359,15 +362,23 @@
     "        network_isolation=network_isolation,\n",
     "        instance_type=processing_instance_type,\n",
     "        instance_count=processing_instance_count,\n",
-    "        container_arguments=['--train-split-percentage', str(train_split_percentage),\n",
-    "                             '--validation-split-percentage', str(validation_split_percentage),\n",
-    "                             '--test-split-percentage', str(test_split_percentage),\n",
-    "                             '--max-seq-length', str(max_seq_length),\n",
-    "                             '--balance-dataset', str(balance_dataset),\n",
-    "                             '--feature-store-offline-prefix', str(feature_store_offline_prefix),\n",
-    "                             '--feature-group-name', str(feature_group_name)\n",
-    "                             ], \n",
-    "        environment={'AWS_DEFAULT_REGION': 'us-east-1'}, # hard-coding to avoid serialization issue\n",
+    "        container_arguments=[\n",
+    "            \"--train-split-percentage\",\n",
+    "            str(train_split_percentage),\n",
+    "            \"--validation-split-percentage\",\n",
+    "            str(validation_split_percentage),\n",
+    "            \"--test-split-percentage\",\n",
+    "            str(test_split_percentage),\n",
+    "            \"--max-seq-length\",\n",
+    "            str(max_seq_length),\n",
+    "            \"--balance-dataset\",\n",
+    "            str(balance_dataset),\n",
+    "            \"--feature-store-offline-prefix\",\n",
+    "            str(feature_store_offline_prefix),\n",
+    "            \"--feature-group-name\",\n",
+    "            str(feature_group_name),\n",
+    "        ],\n",
+    "        environment={\"AWS_DEFAULT_REGION\": \"us-east-1\"},  # hard-coding to avoid serialization issue\n",
     "        container_entrypoint=[\n",
     "            \"python3\",\n",
     "            \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert-feature-store.py\",\n",
@@ -377,13 +388,13 @@
     "                input_name=\"raw-input-data\",\n",
     "                s3_uri=\"{}\".format(raw_input_data_s3_uri),\n",
     "                local_path=\"/opt/ml/processing/input/data/\",\n",
-    "                s3_data_distribution_type=\"ShardedByS3Key\"\n",
+    "                s3_data_distribution_type=\"ShardedByS3Key\",\n",
     "            ),\n",
     "            processing_input(\n",
     "                input_name=\"code\",\n",
     "                s3_uri=\"{}\".format(processing_code_s3_uri),\n",
     "                local_path=\"/opt/ml/processing/input/code\",\n",
-    "                s3_data_distribution_type=\"FullyReplicated\"\n",
+    "                s3_data_distribution_type=\"FullyReplicated\",\n",
     "            ),\n",
     "        ],\n",
     "        output_config=[\n",
@@ -391,150 +402,146 @@
     "                output_name=\"bert-train\",\n",
     "                s3_uri=\"{}\".format(processed_train_data_s3_uri),\n",
     "                local_path=\"/opt/ml/processing/output/bert/train\",\n",
-    "                s3_upload_mode=\"EndOfJob\"\n",
+    "                s3_upload_mode=\"EndOfJob\",\n",
     "            ),\n",
     "            processing_output(\n",
     "                output_name=\"bert-validation\",\n",
     "                s3_uri=\"{}\".format(processed_validation_data_s3_uri),\n",
     "                local_path=\"/opt/ml/processing/output/bert/validation\",\n",
-    "                s3_upload_mode=\"EndOfJob\"\n",
+    "                s3_upload_mode=\"EndOfJob\",\n",
     "            ),\n",
     "            processing_output(\n",
     "                output_name=\"bert-test\",\n",
     "                s3_uri=\"{}\".format(processed_test_data_s3_uri),\n",
     "                local_path=\"/opt/ml/processing/output/bert/test\",\n",
-    "                s3_upload_mode=\"EndOfJob\"\n",
+    "                s3_upload_mode=\"EndOfJob\",\n",
     "            ),\n",
     "        ],\n",
     "    )\n",
     "\n",
-    "\n",
     "    ########################\n",
     "    # TRAIN\n",
     "    ########################\n",
-    "    \n",
+    "\n",
     "    train_channels = [\n",
-    "        training_input(input_name=\"train\", \n",
-    "                       s3_uri=processed_train_data_s3_uri,\n",
-    "                       s3_data_distribution_type=\"ShardedByS3Key\"\n",
+    "        training_input(\n",
+    "            input_name=\"train\", s3_uri=processed_train_data_s3_uri, s3_data_distribution_type=\"ShardedByS3Key\"\n",
+    "        ),\n",
+    "        training_input(\n",
+    "            input_name=\"validation\",\n",
+    "            s3_uri=processed_validation_data_s3_uri,\n",
+    "            s3_data_distribution_type=\"ShardedByS3Key\",\n",
+    "        ),\n",
+    "        training_input(\n",
+    "            input_name=\"test\", s3_uri=processed_test_data_s3_uri, s3_data_distribution_type=\"ShardedByS3Key\"\n",
     "        ),\n",
-    "        training_input(input_name=\"validation\", \n",
-    "                       s3_uri=processed_validation_data_s3_uri,\n",
-    "                       s3_data_distribution_type=\"ShardedByS3Key\"\n",
-    "                       ), \n",
-    "        training_input(input_name=\"test\", \n",
-    "                       s3_uri=processed_test_data_s3_uri,\n",
-    "                       s3_data_distribution_type=\"ShardedByS3Key\"\n",
-    "                       )\n",
     "    ]\n",
     "\n",
-    "    epochs=1\n",
-    "    learning_rate=0.00001\n",
-    "    epsilon=0.00000001\n",
-    "    train_batch_size=128\n",
-    "    validation_batch_size=128\n",
-    "    test_batch_size=128\n",
-    "    train_steps_per_epoch=100\n",
-    "    validation_steps=100\n",
-    "    test_steps=100\n",
-    "    train_volume_size=1024\n",
-    "    use_xla=True\n",
-    "    use_amp=True\n",
-    "    freeze_bert_layer=False\n",
-    "    enable_sagemaker_debugger=False\n",
-    "    enable_checkpointing=False\n",
-    "    enable_tensorboard=False\n",
-    "    input_mode='File'\n",
-    "    run_validation=True\n",
-    "    run_test=True\n",
-    "    run_sample_predictions=True\n",
-    "\n",
-    "    train_instance_type='ml.c5.9xlarge' \n",
-    "    train_instance_count=1\n",
+    "    epochs = 1\n",
+    "    learning_rate = 0.00001\n",
+    "    epsilon = 0.00000001\n",
+    "    train_batch_size = 128\n",
+    "    validation_batch_size = 128\n",
+    "    test_batch_size = 128\n",
+    "    train_steps_per_epoch = 100\n",
+    "    validation_steps = 100\n",
+    "    test_steps = 100\n",
+    "    train_volume_size = 1024\n",
+    "    use_xla = True\n",
+    "    use_amp = True\n",
+    "    freeze_bert_layer = False\n",
+    "    enable_sagemaker_debugger = False\n",
+    "    enable_checkpointing = False\n",
+    "    enable_tensorboard = False\n",
+    "    input_mode = \"File\"\n",
+    "    run_validation = True\n",
+    "    run_test = True\n",
+    "    run_sample_predictions = True\n",
+    "\n",
+    "    train_instance_type = \"ml.c5.9xlarge\"\n",
+    "    train_instance_count = 1\n",
     "\n",
     "    train_output_location = \"s3://{}/{}/output\".format(bucket, pipeline_name)\n",
-    "    \n",
-    "    hyperparameters={\n",
-    "        'epochs': '{}'.format(epochs),\n",
-    "        'learning_rate': '{}'.format(learning_rate),\n",
-    "        'epsilon': '{}'.format(epsilon),\n",
-    "        'train_batch_size': '{}'.format(train_batch_size),\n",
-    "        'validation_batch_size': '{}'.format(validation_batch_size),\n",
-    "        'test_batch_size': '{}'.format(test_batch_size), \n",
-    "        'train_steps_per_epoch': '{}'.format(train_steps_per_epoch),\n",
-    "        'validation_steps': '{}'.format(validation_steps),\n",
-    "        'test_steps': '{}'.format(test_steps),\n",
-    "        'use_xla': '{}'.format(use_xla),\n",
-    "        'use_amp': '{}'.format(use_amp), \n",
-    "        'max_seq_length': '{}'.format(max_seq_length),\n",
-    "        'freeze_bert_layer': '{}'.format(freeze_bert_layer),\n",
-    "        'enable_sagemaker_debugger': '{}'.format(enable_sagemaker_debugger),\n",
-    "        'enable_checkpointing': '{}'.format(enable_checkpointing),\n",
-    "        'enable_tensorboard': '{}'.format(enable_tensorboard), \n",
-    "        'run_validation': '{}'.format(run_validation),\n",
-    "        'run_test': '{}'.format(run_test),\n",
-    "        'run_sample_predictions': '{}'.format(run_sample_predictions),\n",
-    "        'model_dir': '{}'.format(train_output_location),\n",
-    "        'sagemaker_program': 'tf_bert_reviews.py',\n",
-    "        'sagemaker_region': '{}'.format(region),\n",
-    "        'sagemaker_submit_directory': training_code_s3_uri\n",
+    "\n",
+    "    hyperparameters = {\n",
+    "        \"epochs\": \"{}\".format(epochs),\n",
+    "        \"learning_rate\": \"{}\".format(learning_rate),\n",
+    "        \"epsilon\": \"{}\".format(epsilon),\n",
+    "        \"train_batch_size\": \"{}\".format(train_batch_size),\n",
+    "        \"validation_batch_size\": \"{}\".format(validation_batch_size),\n",
+    "        \"test_batch_size\": \"{}\".format(test_batch_size),\n",
+    "        \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n",
+    "        \"validation_steps\": \"{}\".format(validation_steps),\n",
+    "        \"test_steps\": \"{}\".format(test_steps),\n",
+    "        \"use_xla\": \"{}\".format(use_xla),\n",
+    "        \"use_amp\": \"{}\".format(use_amp),\n",
+    "        \"max_seq_length\": \"{}\".format(max_seq_length),\n",
+    "        \"freeze_bert_layer\": \"{}\".format(freeze_bert_layer),\n",
+    "        \"enable_sagemaker_debugger\": \"{}\".format(enable_sagemaker_debugger),\n",
+    "        \"enable_checkpointing\": \"{}\".format(enable_checkpointing),\n",
+    "        \"enable_tensorboard\": \"{}\".format(enable_tensorboard),\n",
+    "        \"run_validation\": \"{}\".format(run_validation),\n",
+    "        \"run_test\": \"{}\".format(run_test),\n",
+    "        \"run_sample_predictions\": \"{}\".format(run_sample_predictions),\n",
+    "        \"model_dir\": \"{}\".format(train_output_location),\n",
+    "        \"sagemaker_program\": \"tf_bert_reviews.py\",\n",
+    "        \"sagemaker_region\": \"{}\".format(region),\n",
+    "        \"sagemaker_submit_directory\": training_code_s3_uri,\n",
     "    }\n",
     "    hyperparameters_json = json.dumps(hyperparameters)\n",
-    "    \n",
+    "\n",
     "    # metric_definitions='{\"val_acc\": \"val_accuracy: ([0-9\\\\\\\\.]+)\"}',\n",
     "    metrics_definitions = [\n",
-    "        {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n",
-    "        {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n",
-    "        {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n",
-    "        {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n",
+    "        {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
+    "        {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
+    "        {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+    "        {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
     "    ]\n",
     "    metrics_definitions_json = json.dumps(metrics_definitions)\n",
     "    print(metrics_definitions_json)\n",
     "\n",
-    "\n",
     "    # .after(process) is explicitly appended below\n",
-    "    train_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04'.format(region) \n",
+    "    train_image = \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04\".format(region)\n",
     "    training = sagemaker_train_op(\n",
     "        region=region,\n",
     "        image=train_image,\n",
-    "        network_isolation=network_isolation, \n",
+    "        network_isolation=network_isolation,\n",
     "        instance_type=train_instance_type,\n",
     "        instance_count=train_instance_count,\n",
     "        hyperparameters=hyperparameters_json,\n",
-    "        training_input_mode=input_mode, \n",
-    "        channels=train_channels, \n",
+    "        training_input_mode=input_mode,\n",
+    "        channels=train_channels,\n",
     "        model_artifact_path=train_output_location,\n",
     "        # metric_definitions=metrics_definitions_json,\n",
     "        # TODO: Add rules\n",
-    "        role=role \n",
+    "        role=role,\n",
     "    ).after(process)\n",
     "\n",
-    "\n",
     "    ########################\n",
     "    # DEPLOY\n",
     "    ########################\n",
-    "    \n",
+    "\n",
     "    # .after(training) is implied because we depend on training.outputs[]\n",
-    "    serve_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.3.1-cpu'.format(region)\n",
+    "    serve_image = \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.3.1-cpu\".format(region)\n",
     "    create_model = sagemaker_model_op(\n",
     "        region=region,\n",
     "        model_name=training.outputs[\"job_name\"],\n",
     "        image=serve_image,\n",
-    "        network_isolation=network_isolation, \n",
+    "        network_isolation=network_isolation,\n",
     "        model_artifact_url=training.outputs[\"model_artifact_url\"],\n",
-    "        role=role\n",
+    "        role=role,\n",
     "    )\n",
     "\n",
-    "    deploy_instance_type='ml.c5.9xlarge'\n",
-    "    deploy_instance_count=1\n",
+    "    deploy_instance_type = \"ml.c5.9xlarge\"\n",
+    "    deploy_instance_count = 1\n",
     "\n",
     "    # .after(create_model) is implied because we depend on create_model.outputs\n",
     "    deploy_model = sagemaker_deploy_op(\n",
     "        region=region,\n",
-    "        variant_name_1='AllTraffic',\n",
+    "        variant_name_1=\"AllTraffic\",\n",
     "        model_name_1=create_model.output,\n",
     "        instance_type_1=deploy_instance_type,\n",
-    "        initial_instance_count_1=deploy_instance_count \n",
+    "        initial_instance_count_1=deploy_instance_count,\n",
     "    )"
   ]
  },
@@ -551,7 +558,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "kfp.compiler.Compiler().compile(bert_pipeline, 'bert-pipeline.zip')"
+    "kfp.compiler.Compiler().compile(bert_pipeline, \"bert-pipeline.zip\")"
   ]
  },
@@ -598,11 +605,9 @@
    "source": [
     "client = kfp.Client()\n",
     "\n",
-    "experiment = client.create_experiment(name='kubeflow')\n",
+    "experiment = client.create_experiment(name=\"kubeflow\")\n",
     "\n",
-    "my_run = client.run_pipeline(experiment.id, \n",
-    "                             'bert-pipeline', \n",
-    "                             'bert-pipeline.zip')"
+    "my_run = client.run_pipeline(experiment.id, \"bert-pipeline\", \"bert-pipeline.zip\")"
   ]
  },
  {
@@ -660,9 +665,9 @@
    "source": [
     "import boto3\n",
     "\n",
-    "sm_runtime = boto3.Session(region_name=region).client('sagemaker-runtime')\n",
+    "sm_runtime = boto3.Session(region_name=region).client(\"sagemaker-runtime\")\n",
     "\n",
-    "endpoint_name = ''"
+    "endpoint_name = \"\""
   ]
  },
  {
@@ -673,28 +678,25 @@
    "source": [
     "import json\n",
     "\n",
-    "inputs = [\n",
-    "    {\"features\": [\"This is great!\"]},\n",
-    "    {\"features\": [\"This is bad.\"]}\n",
-    "] \n",
+    "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n",
     "\n",
     "response = sm_runtime.invoke_endpoint(\n",
-    "    EndpointName=endpoint_name,\n",
-    "    ContentType='application/jsonlines', \n",
-    "    Accept='application/jsonlines', \n",
-    "    Body=json.dumps(inputs).encode('utf-8')\n",
+    "    EndpointName=endpoint_name,\n",
+    "    ContentType=\"application/jsonlines\",\n",
+    "    Accept=\"application/jsonlines\",\n",
+    "    Body=json.dumps(inputs).encode(\"utf-8\"),\n",
     ")\n",
-    "print('response: {}'.format(response))\n",
+    "print(\"response: {}\".format(response))\n",
     "\n",
-    "predicted_classes_str = response['Body'].read().decode()\n",
+    "predicted_classes_str = response[\"Body\"].read().decode()\n",
     "predicted_classes_json = json.loads(predicted_classes_str)\n",
     "\n",
     "predicted_classes = predicted_classes_json.splitlines()\n",
-    "print('predicted_classes: {}'.format(predicted_classes))\n",
+    "print(\"predicted_classes: {}\".format(predicted_classes))\n",
     "\n",
     "for predicted_class_json, input_data in zip(predicted_classes, inputs):\n",
-    "    predicted_class = json.loads(predicted_class_json)['predicted_label']\n",
-    "    print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0])) "
+    "    predicted_class = json.loads(predicted_class_json)[\"predicted_label\"]\n",
+    "    print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0]))"
   ]
  },
  {
diff --git a/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb b/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
index 86a70c1e..8e6b432c 100644
--- a/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
+++ b/10_pipeline/kubeflow/99_DISABLE_PUBLIC_ENDPOINT_TO_AVOID_GETTING_HACKED.ipynb
@@ -42,7 +42,7 @@
    "outputs": [],
    "source": [
     "%%javascript\n",
-    "Jupyter.notebook.save_checkpoint();\n",
+    "Jupyter.notebook.save_checkpoint()\n",
     "Jupyter.notebook.session.delete();"
   ]
  }
diff --git a/10_pipeline/kubeflow/code/inference.py b/10_pipeline/kubeflow/code/inference.py
index 2975dc2d..53196737 100644
--- a/10_pipeline/kubeflow/code/inference.py
+++ b/10_pipeline/kubeflow/code/inference.py
@@ -1,102 +1,97 @@
 import json
 import subprocess
 import sys
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"])
 # Workaround for https://github.com/huggingface/tokenizers/issues/120 and
 # https://github.com/kaushaltrivedi/fast-bert/issues/174
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers'])
 import tensorflow as tf
 from transformers import DistilBertTokenizer
 
-classes=[1, 2, 3, 4, 5]
+classes = [1, 2, 3, 4, 5]
+
+max_seq_length = 64
 
-max_seq_length=64
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
 
 def input_handler(data, context):
-    data_str = data.read().decode('utf-8')
-    print('data_str: {}'.format(data_str))
-    print('type data_str: {}'.format(type(data_str)))
-
+    data_str = data.read().decode("utf-8")
+    print("data_str: {}".format(data_str))
+    print("type data_str: {}".format(type(data_str)))
+
     jsonlines = data_str.split("\n")
-    print('jsonlines: {}'.format(jsonlines))
-    print('type jsonlines: {}'.format(type(jsonlines)))
-
+    print("jsonlines: {}".format(jsonlines))
+    print("type jsonlines: {}".format(type(jsonlines)))
+
     transformed_instances = []
-
+
     for jsonline in jsonlines:
-        print('jsonline: {}'.format(jsonline))
-        print('type jsonline: {}'.format(type(jsonline)))
+        print("jsonline: {}".format(jsonline))
+        print("type jsonline: {}".format(type(jsonline)))
 
         # features[0] is review_body
         # features[1..n] are others (ie. 1: product_category, etc)
         review_body = json.loads(jsonline)["features"][0]
         print("""review_body: {}""".format(review_body))
-
-        encode_plus_tokens = tokenizer.encode_plus(review_body,
-                                                   pad_to_max_length=True,
-                                                   max_length=max_seq_length,
-                                                   truncation=True)
+
+        encode_plus_tokens = tokenizer.encode_plus(
+            review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True
+        )
 
         # Convert the text-based tokens to ids from the pre-trained BERT vocabulary
-        input_ids = encode_plus_tokens['input_ids']
-
+        input_ids = encode_plus_tokens["input_ids"]
+
         # Specifies which tokens BERT should pay attention to (0 or 1)
-        input_mask = encode_plus_tokens['attention_mask']
-
-        transformed_instance = {
-            "input_ids": input_ids,
-            "input_mask": input_mask
-        }
-
+        input_mask = encode_plus_tokens["attention_mask"]
+
+        transformed_instance = {"input_ids": input_ids, "input_mask": input_mask}
+
         transformed_instances.append(transformed_instance)
-
-    transformed_data = {
-        "signature_name":"serving_default",
-        "instances": transformed_instances
-    }
+
+    transformed_data = {"signature_name": "serving_default", "instances": transformed_instances}
     transformed_data_json = json.dumps(transformed_data)
-    print('transformed_data_json: {}'.format(transformed_data_json))
-
+    print("transformed_data_json: {}".format(transformed_data_json))
+
     return transformed_data_json
 
 
 def output_handler(response, context):
-    print('response: {}'.format(response))
+    print("response: {}".format(response))
     response_json = response.json()
-    print('response_json: {}'.format(response_json))
-
+    print("response_json: {}".format(response_json))
+
     log_probabilities = response_json["predictions"]
-    print('log_probabilities: {}'.format(log_probabilities))
-
+    print("log_probabilities: {}".format(log_probabilities))
+
     predicted_classes = []
 
     for log_probability in log_probabilities:
-        print('log_probability in loop: {}'.format(log_probability))
-        print('type(log_probability) in loop: {}'.format(type(log_probability)))
-
-        softmax = tf.nn.softmax(log_probability)
-
-        predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
+        print("log_probability in loop: {}".format(log_probability))
+        print("type(log_probability) in loop: {}".format(type(log_probability)))
+
+        softmax = tf.nn.softmax(log_probability)
+
+        predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
         predicted_class = classes[predicted_class_idx]
-        print('predicted_class: {}'.format(predicted_class))
+        print("predicted_class: {}".format(predicted_class))
 
         prediction_dict = {}
-        prediction_dict['predicted_label'] = predicted_class
-
+        prediction_dict["predicted_label"] = predicted_class
+
         jsonline = json.dumps(prediction_dict)
-        print('jsonline: {}'.format(jsonline))
-
+        print("jsonline: {}".format(jsonline))
+
         predicted_classes.append(jsonline)
-        print('predicted_classes in the loop: {}'.format(predicted_classes))
-
-    predicted_classes_jsonlines = '\n'.join(predicted_classes)
-    print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines))
+        print("predicted_classes in the loop: {}".format(predicted_classes))
+
+    predicted_classes_jsonlines = "\n".join(predicted_classes)
+    print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines))
 
     response_content_type = context.accept_header
-
-    return predicted_classes_jsonlines, response_content_type
\ No newline at end of file
+
+    return predicted_classes_jsonlines, response_content_type
diff --git a/10_pipeline/kubeflow/code/tf_bert_reviews.py b/10_pipeline/kubeflow/code/tf_bert_reviews.py
index 79ae535c..34e1d0a7 100644
--- a/10_pipeline/kubeflow/code/tf_bert_reviews.py
+++ b/10_pipeline/kubeflow/code/tf_bert_reviews.py
@@ -9,96 +9,99 @@
 import sys
 import os
 import csv
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
+
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
 import tensorflow as tf
 import pandas as pd
 import numpy as np
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
-#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"])
 from transformers import DistilBertTokenizer
 from transformers import DistilBertConfig
 from transformers import TFDistilBertModel
-#from transformers import TFBertForSequenceClassification
+
+# from transformers import TFBertForSequenceClassification
 from tensorflow.keras.callbacks import ModelCheckpoint
 from tensorflow.keras.models import load_model
-#from tensorflow.keras.mixed_precision import experimental as mixed_precision
+
+# from tensorflow.keras.mixed_precision import experimental as mixed_precision
 
 CLASSES = [1, 2, 3, 4, 5]
 
 
 def select_data_and_label_from_record(record):
-    x = {
-        'input_ids': record['input_ids'],
-        'input_mask': record['input_mask'],
-        'segment_ids': record['segment_ids']
-    }
+    x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]}
 
-    y = record['label_ids']
+    y = record["label_ids"]
 
     return (x, y)
 
 
-def file_based_input_dataset_builder(channel,
-                                     input_filenames,
-                                     pipe_mode,
-                                     is_training,
-                                     drop_remainder,
-                                     batch_size,
-                                     epochs,
-                                     steps_per_epoch,
-                                     max_seq_length):
+def file_based_input_dataset_builder(
+    channel,
+    input_filenames,
+    pipe_mode,
+    is_training,
+    drop_remainder,
+    batch_size,
+    epochs,
+    steps_per_epoch,
+    max_seq_length,
+):
 
     # For training, we want a lot of parallel reading and shuffling.
     # For eval, we want no shuffling and parallel reading doesn't matter.
 
     if pipe_mode:
-        print('***** Using pipe_mode with channel {}'.format(channel))
+        print("***** Using pipe_mode with channel {}".format(channel))
         from sagemaker_tensorflow import PipeModeDataset
-        dataset = PipeModeDataset(channel=channel,
-                                  record_format='TFRecord')
+
+        dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
     else:
-        print('***** Using input_filenames {}'.format(input_filenames))
+        print("***** Using input_filenames {}".format(input_filenames))
         dataset = tf.data.TFRecordDataset(input_filenames)
 
     dataset = dataset.repeat(epochs * steps_per_epoch * 100)
-#    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+    #    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
 
     name_to_features = {
-        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "label_ids": tf.io.FixedLenFeature([], tf.int64),
+        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "label_ids": tf.io.FixedLenFeature([], tf.int64),
     }
 
     def _decode_record(record, name_to_features):
         """Decodes a record to a TensorFlow example."""
         record = tf.io.parse_single_example(record, name_to_features)
         # TODO: wip/bert/bert_attention_head_view/train.py
-        # Convert input_ids into input_tokens with DistilBert vocabulary 
+        # Convert input_ids into input_tokens with DistilBert vocabulary
         #   if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
         #       hook._write_raw_tensor_simple("input_tokens", input_tokens)
         return record
-
+
     dataset = dataset.apply(
         tf.data.experimental.map_and_batch(
-            lambda record: _decode_record(record, name_to_features),
-            batch_size=batch_size,
-            drop_remainder=drop_remainder,
-            num_parallel_calls=tf.data.experimental.AUTOTUNE))
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder,
+            num_parallel_calls=tf.data.experimental.AUTOTUNE,
+        )
+    )
 
-#    dataset.cache()
+    #    dataset.cache()
 
-    dataset = dataset.shuffle(buffer_size=1000,
-                              reshuffle_each_iteration=True)
+    dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
 
     row_count = 0
-    print('**************** {} *****************'.format(channel))
+    print("**************** {} *****************".format(channel))
     for row in dataset.as_numpy_iterator():
         print(row)
         if row_count == 5:
@@ -111,236 +114,178 @@ def _decode_record(record, name_to_features):
 def load_checkpoint_model(checkpoint_path):
     import glob
     import os
-
-    glob_pattern = os.path.join(checkpoint_path, '*.h5')
-    print('glob pattern {}'.format(glob_pattern))
+
+    glob_pattern = os.path.join(checkpoint_path, "*.h5")
+    print("glob pattern {}".format(glob_pattern))
 
     list_of_checkpoint_files = glob.glob(glob_pattern)
-    print('List of checkpoint files {}'.format(list_of_checkpoint_files))
-
+    print("List of checkpoint files {}".format(list_of_checkpoint_files))
+
     latest_checkpoint_file = max(list_of_checkpoint_files)
-    print('Latest checkpoint file {}'.format(latest_checkpoint_file))
+    print("Latest checkpoint file {}".format(latest_checkpoint_file))
 
-    initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0]
+    initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0]
     initial_epoch_number = int(initial_epoch_number_str)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
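# --- Editor's aside (hedged sketch, not part of the patch): the pipe_mode check
# above is a substring search over the raw SM_INPUT_DATA_CONFIG value. A stricter
# variant could parse the channel config as JSON; the per-channel
# "TrainingInputMode" key is assumed from the SageMaker training environment,
# and the helper name is hypothetical.
import json
import os

def detect_pipe_mode(env=os.environ):
    """Return True if any input channel is configured for Pipe mode."""
    raw = env.get("SM_INPUT_DATA_CONFIG", "")
    try:
        channels = json.loads(raw) if raw else {}
    except json.JSONDecodeError:
        # Fall back to the substring heuristic used in the script above.
        return "Pipe" in raw
    return any(channel.get("TrainingInputMode") == "Pipe" for channel in channels.values())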
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Successfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}. 
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path 
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + print(classification_report(y_true=y_test, y_pred=y_actual)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/10_pipeline/kubeflow/evaluate_model_metrics.py b/10_pipeline/kubeflow/evaluate_model_metrics.py index 024afdec..f3523174 100644 --- a/10_pipeline/kubeflow/evaluate_model_metrics.py +++ b/10_pipeline/kubeflow/evaluate_model_metrics.py @@ -4,13 +4,16 @@ from datetime import datetime import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", 
"tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) import pandas as pd import os import re @@ -33,99 +36,99 @@ from sklearn.utils import resample -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] -config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) +config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, +) def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs to need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. 
current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--input-model', type=str, - default='/opt/ml/processing/input/model', + parser.add_argument( + "--input-model", + type=str, + default="/opt/ml/processing/input/model", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - + ) + return parser.parse_args() - + def process(args): - print('Current host: {}'.format(args.current_host)) - - print('input_data: {}'.format(args.input_data)) - print('input_model: {}'.format(args.input_model)) - - print('Listing contents of input model dir: {}'.format(args.input_model)) + print("Current host: {}".format(args.current_host)) + + print("input_data: {}".format(args.input_data)) + print("input_model: {}".format(args.input_model)) + + print("Listing contents of input model dir: {}".format(args.input_model)) input_files = os.listdir(args.input_model) for file in input_files: print(file) - model_tar_path = '{}/model.tar.gz'.format(args.input_model) + model_tar_path = "{}/model.tar.gz".format(args.input_model) model_tar = tarfile.open(model_tar_path) model_tar.extractall(args.input_model) - model_tar.close() + model_tar.close() - model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model)) + model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model)) print(model) - + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=args.max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. 
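# --- Editor's aside (illustrative, not part of the patch): for a short review
# padded to a hypothetical max_seq_length of 8, the two vectors described here
# look roughly like:
#
#   input_ids      = [101, 1045, 3866, 2009, 999, 102, 0, 0]  # "[CLS] i loved it ! [SEP]" + padding
#   attention_mask = [1, 1, 1, 1, 1, 1, 0, 0]                 # 1 = real token, 0 = padding
#
# The exact ids depend on the distilbert-base-uncased vocabulary shipped with
# the pinned transformers release; the values above are only an example.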
+ input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -133,81 +136,86 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) - + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. I hope they don't make this anymore."""), + ) ########################################################################################### # TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz # - ########################################################################################### -# evaluation_data_path = '/opt/ml/processing/input/data/' - - print('Listing contents of input data dir: {}'.format(args.input_data)) + ########################################################################################### + # evaluation_data_path = '/opt/ml/processing/input/data/' + + print("Listing contents of input data dir: {}".format(args.input_data)) input_files = os.listdir(args.input_data) - test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data) - print('Using only {} to evaluate.'.format(test_data_path)) - df_test_reviews = pd.read_csv(test_data_path, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data) + print("Using only {} to evaluate.".format(test_data_path)) + df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[ + ["review_body", "star_rating"] + ] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - y_test = df_test_reviews['review_body'].map(predict) + y_test = df_test_reviews["review_body"].map(predict) y_test - y_actual = df_test_reviews['star_rating'] + y_actual = df_test_reviews["star_rating"] y_actual print(classification_report(y_true=y_test, y_pred=y_actual)) - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) def plot_conf_mat(cm, classes, title, cmap): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. 
+ fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') + plt.ylabel("True label") + plt.xlabel("Predicted label") cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=CLASSES, - title='Confusion Matrix', - cmap=plt.cm.Greens) + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens) - # Save the confusion matrix + # Save the confusion matrix plt.show() - # Model Output - metrics_path = os.path.join(args.output_data, 'metrics/') + # Model Output + metrics_path = os.path.join(args.output_data, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) report_dict = { "metrics": { @@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap): evaluation_path = "{}/evaluation.json".format(metrics_path) with open(evaluation_path, "w") as f: f.write(json.dumps(report_dict)) - - print('Listing contents of output dir: {}'.format(args.output_data)) + + print("Listing contents of output dir: {}".format(args.output_data)) output_files = os.listdir(args.output_data) for file in output_files: print(file) - print('Listing contents of output/metrics dir: {}'.format(metrics_path)) - output_files = os.listdir('{}'.format(metrics_path)) + print("Listing contents of output/metrics dir: {}".format(metrics_path)) + output_files = os.listdir("{}".format(metrics_path)) for file in output_files: print(file) - print('Complete') - - + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) - process(args) + process(args) diff --git a/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py index 1211ba85..7e1cd385 100644 --- a/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py +++ b/10_pipeline/kubeflow/preprocess-scikit-text-to-bert-feature-store.py @@ -20,16 +20,18 @@ import subprocess ## PIP INSTALLS ## -# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to +# This is 2.3.0 (vs. 
2.3.1 everywhere else) because we need to # use anaconda and anaconda only supports 2.3.0 at this time -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"]) import pandas as pd import re import sagemaker @@ -40,51 +42,55 @@ FeatureTypeEnum, ) -region = os.environ['AWS_DEFAULT_REGION'] -print('Region: {}'.format(region)) +region = os.environ["AWS_DEFAULT_REGION"] +print("Region: {}".format(region)) ############################# ## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc. ## Role and Bucket are malformed if we do this later. -sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region) +sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region) caller_identity = sts.get_caller_identity() -print('caller_identity: {}'.format(caller_identity)) +print("caller_identity: {}".format(caller_identity)) -assumed_role_arn = caller_identity['Arn'] -print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn)) +assumed_role_arn = caller_identity["Arn"] +print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn)) -assumed_role_name = assumed_role_arn.split('/')[-2] +assumed_role_name = assumed_role_arn.split("/")[-2] -iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region) -get_role_response = iam.get_role(RoleName=assumed_role_name) -print('get_role_response {}'.format(get_role_response)) -role = get_role_response['Role']['Arn'] -print('role {}'.format(role)) +iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region) +get_role_response = iam.get_role(RoleName=assumed_role_name) +print("get_role_response {}".format(get_role_response)) +role = get_role_response["Role"]["Arn"] +print("role {}".format(role)) bucket = sagemaker.Session().default_bucket() -print('The DEFAULT BUCKET is {}'.format(bucket)) +print("The DEFAULT BUCKET is {}".format(bucket)) ############################# -sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region) +sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region) -featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region) +featurestore_runtime = boto3.Session(region_name=region).client( + service_name="sagemaker-featurestore-runtime", region_name=region +) -s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region) +s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region) -sagemaker_session = 
sagemaker.Session(boto_session=boto3.Session(region_name=region), - sagemaker_client=sm, - sagemaker_featurestore_runtime_client=featurestore_runtime) +sagemaker_session = sagemaker.Session( + boto_session=boto3.Session(region_name=region), + sagemaker_client=sm, + sagemaker_featurestore_runtime_client=featurestore_runtime, +) -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -REVIEW_BODY_COLUMN = 'review_body' -REVIEW_ID_COLUMN = 'review_id' +REVIEW_BODY_COLUMN = "review_body" +REVIEW_ID_COLUMN = "review_id" # DATE_COLUMN = 'date' -LABEL_COLUMN = 'star_rating' +LABEL_COLUMN = "star_rating" LABEL_VALUES = [1, 2, 3, 4, 5] - + label_map = {} for (i, label) in enumerate(LABEL_VALUES): label_map[label] = i @@ -92,94 +98,88 @@ def cast_object_to_string(data_frame): for label in data_frame.columns: - if data_frame.dtypes[label] == 'object': + if data_frame.dtypes[label] == "object": data_frame[label] = data_frame[label].astype("str").astype("string") return data_frame - + def wait_for_feature_group_creation_complete(feature_group): try: status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) if status != "Created": - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.") except: - print('No feature group created yet.') - - + print("No feature group created yet.") + + def create_or_load_feature_group(prefix, feature_group_name): # Feature Definitions for our records - feature_definitions= [ - FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL), -# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) + feature_definitions = [ + FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL), + # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="split_type", 
feature_type=FeatureTypeEnum.STRING), ] - + feature_group = FeatureGroup( - name=feature_group_name, - feature_definitions=feature_definitions, - sagemaker_session=sagemaker_session) - - print('Feature Group: {}'.format(feature_group)) - - try: - print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...') + name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session + ) + + print("Feature Group: {}".format(feature_group)) + + try: + print( + "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..." + ) wait_for_feature_group_creation_complete(feature_group) except Exception as e: - print('Before CREATE FG wait exception: {}'.format(e)) -# pass - + print("Before CREATE FG wait exception: {}".format(e)) + # pass + try: record_identifier_feature_name = "review_id" event_time_feature_name = "date" - - print('Creating Feature Group with role {}...'.format(role)) + + print("Creating Feature Group with role {}...".format(role)) feature_group.create( s3_uri=f"s3://{bucket}/{prefix}", record_identifier_name=record_identifier_feature_name, event_time_feature_name=event_time_feature_name, role_arn=role, - enable_online_store=True + enable_online_store=True, ) - print('Creating Feature Group. Completed.') - - print('Waiting for new Feature Group to become available...') + print("Creating Feature Group. Completed.") + + print("Waiting for new Feature Group to become available...") wait_for_feature_group_creation_complete(feature_group) - print('Feature Group available.') + print("Feature Group available.") feature_group.describe() - + except Exception as e: - print('Exception: {}'.format(e)) - + print("Exception: {}".format(e)) + return feature_group - + class InputFeatures(object): - """BERT feature vectors.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - review_id, - date, - label): -# review_body): + """BERT feature vectors.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label): + # review_body): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids @@ -187,36 +187,38 @@ def __init__(self, self.review_id = review_id self.date = date self.label = label + + # self.review_body = review_body - - + + class Input(object): - """A single training/test input for sequence classification.""" - - def __init__(self, text, review_id, date, label=None): - """Constructs an Input. - Args: - text: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
+ """ + self.text = text + self.review_id = review_id + self.date = date + self.label = label + + def convert_input(the_input, max_seq_length): # First, we need to preprocess our data so that it matches the data BERT was trained on: # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) - # + # # Fortunately, the Transformers tokenizer does this for us! # - tokens = tokenizer.tokenize(the_input.text) + tokens = tokenizer.tokenize(the_input.text) # Next, we need to do the following: # @@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length): # # Again, the Transformers tokenizer does this for us! # - encode_plus_tokens = tokenizer.encode_plus(the_input.text, - pad_to_max_length=True, - max_length=max_seq_length, -# truncation=True - ) + encode_plus_tokens = tokenizer.encode_plus( + the_input.text, + pad_to_max_length=True, + max_length=max_seq_length, + # truncation=True + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] - - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + input_ids = encode_plus_tokens["input_ids"] + + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction. 
segment_ids = [0] * max_seq_length @@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length): label_id=label_id, review_id=the_input.review_id, date=the_input.date, - label=the_input.label) -# review_body=the_input.text) - -# print('**input_ids**\n{}\n'.format(features.input_ids)) -# print('**input_mask**\n{}\n'.format(features.input_mask)) -# print('**segment_ids**\n{}\n'.format(features.segment_ids)) -# print('**label_id**\n{}\n'.format(features.label_id)) -# print('**review_id**\n{}\n'.format(features.review_id)) -# print('**date**\n{}\n'.format(features.date)) -# print('**label**\n{}\n'.format(features.label)) -# print('**review_body**\n{}\n'.format(features.review_body)) + label=the_input.label, + ) + # review_body=the_input.text) + + # print('**input_ids**\n{}\n'.format(features.input_ids)) + # print('**input_mask**\n{}\n'.format(features.input_mask)) + # print('**segment_ids**\n{}\n'.format(features.segment_ids)) + # print('**label_id**\n{}\n'.format(features.label_id)) + # print('**review_id**\n{}\n'.format(features.review_id)) + # print('**date**\n{}\n'.format(features.date)) + # print('**label**\n{}\n'.format(features.label)) + # print('**review_body**\n{}\n'.format(features.review_body)) return features -def transform_inputs_to_tfrecord(inputs, - output_file, - max_seq_length): +def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length): """Convert a set of `Input`s to a TFRecord file.""" records = [] tf_record_writer = tf.io.TFRecordWriter(output_file) - + for (input_idx, the_input) in enumerate(inputs): if input_idx % 10000 == 0: - print('Writing input {} of {}\n'.format(input_idx, len(inputs))) + print("Writing input {} of {}\n".format(input_idx, len(inputs))) features = convert_input(the_input, max_seq_length) all_features = collections.OrderedDict() - all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) - all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) - all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) - all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) + all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) + all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) + all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) + all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) tf_record = tf.train.Example(features=tf.train.Features(feature=all_features)) tf_record_writer.write(tf_record.SerializeToString()) - records.append({#'tf_record': tf_record.SerializeToString(), - 'input_ids': features.input_ids, - 'input_mask': features.input_mask, - 'segment_ids': features.segment_ids, - 'label_id': features.label_id, - 'review_id': the_input.review_id, - 'date': the_input.date, - 'label': features.label, -# 'review_body': features.review_body - }) + records.append( + { #'tf_record': tf_record.SerializeToString(), + "input_ids": features.input_ids, + "input_mask": features.input_mask, + "segment_ids": features.segment_ids, + "label_id": features.label_id, + "review_id": the_input.review_id, + "date": the_input.date, + "label": features.label, + # 'review_body': features.review_body + } + ) ##################################### ####### TODO: REMOVE THIS BREAK 
####### - ##################################### + ##################################### # break - + tf_record_writer.close() - + return records - + def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') - pass # Ignore + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--train-split-percentage', type=float, + parser.add_argument( + "--train-split-percentage", + type=float, default=0.90, ) - parser.add_argument('--validation-split-percentage', type=float, - default=0.05, - ) - parser.add_argument('--test-split-percentage', type=float, + parser.add_argument( + "--validation-split-percentage", + type=float, default=0.05, ) - parser.add_argument('--balance-dataset', type=eval, - default=True + parser.add_argument( + "--test-split-percentage", + type=float, + default=0.05, ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument("--balance-dataset", type=eval, default=True) + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - parser.add_argument('--feature-store-offline-prefix', type=str, + ) + parser.add_argument( + "--feature-store-offline-prefix", + type=str, default=None, - ) - parser.add_argument('--feature-group-name', type=str, + ) + parser.add_argument( + "--feature-group-name", + type=str, default=None, - ) - + ) + return parser.parse_args() - -def _transform_tsv_to_tfrecord(file, - max_seq_length, - balance_dataset, - prefix, - feature_group_name): - print('file {}'.format(file)) - print('max_seq_length {}'.format(max_seq_length)) - print('balance_dataset {}'.format(balance_dataset)) - print('prefix {}'.format(prefix)) - print('feature_group_name {}'.format(feature_group_name)) + +def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name): + print("file {}".format(file)) + 
print("max_seq_length {}".format(max_seq_length)) + print("balance_dataset {}".format(balance_dataset)) + print("prefix {}".format(prefix)) + print("feature_group_name {}".format(feature_group_name)) # need to re-load since we can't pass feature_group object in _partial functions for some reason feature_group = create_or_load_feature_group(prefix, feature_group_name) - + filename_without_extension = Path(Path(file).stem).stem - df = pd.read_csv(file, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip') + df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip") df.isna().values.any() df = df.dropna() df = df.reset_index(drop=True) - print('Shape of dataframe {}'.format(df.shape)) + print("Shape of dataframe {}".format(df.shape)) - if balance_dataset: + if balance_dataset: # Balance the dataset down to the minority class from sklearn.utils import resample - five_star_df = df.query('star_rating == 5') - four_star_df = df.query('star_rating == 4') - three_star_df = df.query('star_rating == 3') - two_star_df = df.query('star_rating == 2') - one_star_df = df.query('star_rating == 1') - - minority_count = min(five_star_df.shape[0], - four_star_df.shape[0], - three_star_df.shape[0], - two_star_df.shape[0], - one_star_df.shape[0]) - - five_star_df = resample(five_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - four_star_df = resample(four_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - three_star_df = resample(three_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - two_star_df = resample(two_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - one_star_df = resample(one_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) + five_star_df = df.query("star_rating == 5") + four_star_df = df.query("star_rating == 4") + three_star_df = df.query("star_rating == 3") + two_star_df = df.query("star_rating == 2") + one_star_df = df.query("star_rating == 1") + + minority_count = min( + five_star_df.shape[0], + four_star_df.shape[0], + three_star_df.shape[0], + two_star_df.shape[0], + one_star_df.shape[0], + ) + + five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27) + + four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27) + + three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27) + + two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27) + + one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27) df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df]) - df_balanced = df_balanced.reset_index(drop=True) - print('Shape of balanced dataframe {}'.format(df_balanced.shape)) - print(df_balanced['star_rating'].head(100)) + df_balanced = df_balanced.reset_index(drop=True) + print("Shape of balanced dataframe {}".format(df_balanced.shape)) + print(df_balanced["star_rating"].head(100)) df = df_balanced - - print('Shape of dataframe before splitting {}'.format(df.shape)) - - print('train split percentage {}'.format(args.train_split_percentage)) - print('validation split percentage {}'.format(args.validation_split_percentage)) - print('test split percentage {}'.format(args.test_split_percentage)) - + + print("Shape of dataframe before splitting {}".format(df.shape)) + + print("train split 
percentage {}".format(args.train_split_percentage)) + print("validation split percentage {}".format(args.validation_split_percentage)) + print("test split percentage {}".format(args.test_split_percentage)) + holdout_percentage = 1.00 - args.train_split_percentage - print('holdout percentage {}'.format(holdout_percentage)) - df_train, df_holdout = train_test_split(df, - test_size=holdout_percentage, - stratify=df['star_rating']) + print("holdout percentage {}".format(holdout_percentage)) + df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"]) test_holdout_percentage = args.test_split_percentage / holdout_percentage - print('test holdout percentage {}'.format(test_holdout_percentage)) - df_validation, df_test = train_test_split(df_holdout, - test_size=test_holdout_percentage, - stratify=df_holdout['star_rating']) - + print("test holdout percentage {}".format(test_holdout_percentage)) + df_validation, df_test = train_test_split( + df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"] + ) + df_train = df_train.reset_index(drop=True) df_validation = df_validation.reset_index(drop=True) df_test = df_test.reset_index(drop=True) - print('Shape of train dataframe {}'.format(df_train.shape)) - print('Shape of validation dataframe {}'.format(df_validation.shape)) - print('Shape of test dataframe {}'.format(df_test.shape)) + print("Shape of train dataframe {}".format(df_train.shape)) + print("Shape of validation dataframe {}".format(df_validation.shape)) + print("Shape of test dataframe {}".format(df_test.shape)) timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) - train_inputs = df_train.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - validation_inputs = df_validation.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - test_inputs = df_test.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) + train_inputs = df_train.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + validation_inputs = df_validation.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + test_inputs = df_test.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): - # - # + # + # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) # 4. Map our words to indexes using a vocab file that BERT provides # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert)) # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)) - # + # # We don't have to worry about these details. The Transformers tokenizer does this for us. 
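# --- Editor's aside (a sketch of steps 1-3 above, not part of the patch),
# using the tokenizer already loaded at module scope:
#
#   tokenizer.tokenize("Sally says hi, calling")
#   # lowercases and splits into WordPieces, e.g.
#   # ['sally', 'says', 'hi', ',', 'call', '##ing']
#
# The exact pieces depend on the vocabulary of the pinned transformers==3.5.1
# release; the example mirrors the comment above.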
- # - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) + # + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow. - train_records = transform_inputs_to_tfrecord(train_inputs, - '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension), - max_seq_length) - - validation_records = transform_inputs_to_tfrecord(validation_inputs, - '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension), - max_seq_length) - - test_records = transform_inputs_to_tfrecord(test_inputs, - '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension), - max_seq_length) - + train_records = transform_inputs_to_tfrecord( + train_inputs, + "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + validation_records = transform_inputs_to_tfrecord( + validation_inputs, + "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + test_records = transform_inputs_to_tfrecord( + test_inputs, + "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension), + max_seq_length, + ) + df_train_records = pd.DataFrame.from_dict(train_records) - df_train_records['split_type'] = 'train' - df_train_records.head() - + df_train_records["split_type"] = "train" + df_train_records.head() + df_validation_records = pd.DataFrame.from_dict(validation_records) - df_validation_records['split_type'] = 'validation' - df_validation_records.head() + df_validation_records["split_type"] = "validation" + df_validation_records.head() df_test_records = pd.DataFrame.from_dict(test_records) - df_test_records['split_type'] = 'test' - df_test_records.head() - - # Add record to feature store + df_test_records["split_type"] = "test" + df_test_records.head() + + # Add record to feature store df_fs_train_records = cast_object_to_string(df_train_records) df_fs_validation_records = cast_object_to_string(df_validation_records) df_fs_test_records = cast_object_to_string(df_test_records) - print('Ingesting Features...') - feature_group.ingest( - data_frame=df_fs_train_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_validation_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_test_records, max_workers=3, wait=True - ) - print('Feature ingest completed.') + print("Ingesting Features...") + feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True) + print("Feature ingest completed.") def process(args): - print('Current host: {}'.format(args.current_host)) - - feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) + print("Current host: {}".format(args.current_host)) + + feature_group = create_or_load_feature_group( + prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name + ) feature_group.describe() - + 
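# For illustration, a minimal, self-contained sketch of the fan-out pattern used below:
# functools.partial binds only picklable primitives, and each worker re-creates the
# feature group itself (per the comment in _transform_tsv_to_tfrecord above). All names
# and file names here are illustrative, not from the repository:
#
#     import functools
#     import multiprocessing
#
#     def transform(file, prefix, feature_group_name):
#         # Re-create heavyweight/non-picklable objects (e.g. the feature group)
#         # inside the worker instead of binding them into the partial.
#         print("processing {} into {}/{}".format(file, prefix, feature_group_name))
#
#     if __name__ == "__main__":
#         transform_fn = functools.partial(
#             transform, prefix="reviews-feature-store", feature_group_name="reviews-feature-group"
#         )
#         with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
#             pool.map(transform_fn, ["part-0.tsv.gz", "part-1.tsv.gz"])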
print(feature_group.as_hive_ddl()) - - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) - - transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord, - max_seq_length=args.max_seq_length, - balance_dataset=args.balance_dataset, - prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) - - input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data)) + + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) + + transform_tsv_to_tfrecord = functools.partial( + _transform_tsv_to_tfrecord, + max_seq_length=args.max_seq_length, + balance_dataset=args.balance_dataset, + prefix=args.feature_store_offline_prefix, + feature_group_name=args.feature_group_name, + ) + + input_files = glob.glob("{}/*.tsv.gz".format(args.input_data)) num_cpus = multiprocessing.cpu_count() - print('num_cpus {}'.format(num_cpus)) + print("num_cpus {}".format(num_cpus)) p = multiprocessing.Pool(num_cpus) p.map(transform_tsv_to_tfrecord, input_files) - print('Listing contents of {}'.format(args.output_data)) + print("Listing contents of {}".format(args.output_data)) dirs_output = os.listdir(args.output_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(train_data)) + print("Listing contents of {}".format(train_data)) dirs_output = os.listdir(train_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(validation_data)) + print("Listing contents of {}".format(validation_data)) dirs_output = os.listdir(validation_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(test_data)) + print("Listing contents of {}".format(test_data)) dirs_output = os.listdir(test_data) for file in dirs_output: print(file) - + offline_store_contents = None - while (offline_store_contents is None): - objects_in_bucket = s3.list_objects(Bucket=bucket, - Prefix=args.feature_store_offline_prefix) - if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1): - offline_store_contents = objects_in_bucket['Contents'] + while offline_store_contents is None: + objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix) + if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1: + offline_store_contents = objects_in_bucket["Contents"] else: - print('Waiting for data in offline store...\n') + print("Waiting for data in offline store...\n") sleep(60) - print('Data available.') - - print('Complete') - - + print("Data available.") + + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) process(args) diff --git a/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb b/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb index cfef442d..1914e62f 100644 --- a/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb +++ b/10_pipeline/mlops/01_Create_SageMaker_Pipeline_BERT_Reviews_MLOps.ipynb @@ -26,16 +26,16 @@ "import pandas as pd\n", "from pprint import pprint\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = 
sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "sc = boto3.Session().client(service_name='servicecatalog', region_name=region)\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "codepipeline = boto3.Session().client('codepipeline', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "sc = boto3.Session().client(service_name=\"servicecatalog\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "codepipeline = boto3.Session().client(\"codepipeline\", region_name=region)" ] }, { @@ -45,15 +45,10 @@ "outputs": [], "source": [ "search_response = sc.search_products(\n", - " Filters={\n", - " 'FullTextSearch': \n", - " [\n", - " 'MLOps template for model building, training, and deployment'\n", - " ]\n", - " }\n", + " Filters={\"FullTextSearch\": [\"MLOps template for model building, training, and deployment\"]}\n", ")\n", "\n", - "sagemaker_pipeline_product_id = search_response['ProductViewSummaries'][0]['ProductId']\n", + "sagemaker_pipeline_product_id = search_response[\"ProductViewSummaries\"][0][\"ProductId\"]\n", "print(sagemaker_pipeline_product_id)" ] }, { @@ -65,7 +60,7 @@ "source": [ "describe_response = sc.describe_product(Id=sagemaker_pipeline_product_id)\n", "\n", - "sagemaker_pipeline_product_provisioning_artifact_id = describe_response['ProvisioningArtifacts'][0]['Id']" + "sagemaker_pipeline_product_provisioning_artifact_id = describe_response[\"ProvisioningArtifacts\"][0][\"Id\"]" ] }, { @@ -101,22 +96,22 @@ "metadata": {}, "outputs": [], "source": [ - "sagemaker_project_name = 'dsoaws-{}'.format(timestamp)\n", + "sagemaker_project_name = \"dsoaws-{}\".format(timestamp)\n", "\n", "create_response = sm.create_project(\n", " ProjectName=sagemaker_project_name,\n", - " ProjectDescription='dsoaws-{}'.format(timestamp),\n", + " ProjectDescription=\"dsoaws-{}\".format(timestamp),\n", " ServiceCatalogProvisioningDetails={\n", - " 'ProductId': sagemaker_pipeline_product_id,\n", - " 'ProvisioningArtifactId': sagemaker_pipeline_product_provisioning_artifact_id\n", - " }\n", + " \"ProductId\": sagemaker_pipeline_product_id,\n", + " \"ProvisioningArtifactId\": sagemaker_pipeline_product_provisioning_artifact_id,\n", + " },\n", ")\n", "\n", - "sagemaker_project_id = create_response['ProjectId']\n", - "sagemaker_project_arn = create_response['ProjectArn']\n", + "sagemaker_project_id = create_response[\"ProjectId\"]\n", + "sagemaker_project_arn = create_response[\"ProjectArn\"]\n", "\n", - "print('Project ID {}'.format(sagemaker_project_id))\n", - "print('Project ARN {}'.format(sagemaker_project_arn))" + "print(\"Project ID {}\".format(sagemaker_project_id))\n", + "print(\"Project ARN {}\".format(sagemaker_project_arn))" ] }, { @@ -125,9 +120,9 @@ "metadata": {}, "outputs": [], "source": [ - "sagemaker_project_name_and_id = '{}-{}'.format(sagemaker_project_name, sagemaker_project_id)\n", + "sagemaker_project_name_and_id = \"{}-{}\".format(sagemaker_project_name, sagemaker_project_id)\n", "\n", - "print('Combined Project ID and ARN combined: {}'.format(sagemaker_project_name_and_id))" + "print(\"Combined project name and ID: {}\".format(sagemaker_project_name_and_id))" ] }, { @@ -149,26
+144,26 @@ "\n", "try:\n", " describe_project_response = sm.describe_project(ProjectName=sagemaker_project_name)\n", - " project_status = describe_project_response['ProjectStatus']\n", - " print('Creating Project...')\n", + " project_status = describe_project_response[\"ProjectStatus\"]\n", + " print(\"Creating Project...\")\n", "\n", - " while project_status in ['Pending', 'CreateInProgress']:\n", - " print('Please wait...')\n", + " while project_status in [\"Pending\", \"CreateInProgress\"]:\n", + " print(\"Please wait...\")\n", " time.sleep(30)\n", " describe_project_response = sm.describe_project(ProjectName=sagemaker_project_name)\n", - " project_status = describe_project_response['ProjectStatus']\n", - " print('Project status: {}'.format(project_status))\n", + " project_status = describe_project_response[\"ProjectStatus\"]\n", + " print(\"Project status: {}\".format(project_status))\n", "\n", - " if project_status == 'CreateCompleted': \n", - " print('Project {}'.format(project_status))\n", + " if project_status == \"CreateCompleted\":\n", + " print(\"Project {}\".format(project_status))\n", "\n", " else:\n", - " print('Project status: {}'.format(project_status))\n", - " raise Exception('Project not created.')\n", - " \n", + " print(\"Project status: {}\".format(project_status))\n", + " raise Exception(\"Project not created.\")\n", + "\n", "except Exception as e:\n", " print(e)\n", - " \n", + "\n", "print(describe_project_response)" ] }, @@ -193,7 +188,7 @@ "metadata": {}, "outputs": [], "source": [ - "sc_role_name='AmazonSageMakerServiceCatalogProductsUseRole'" + "sc_role_name = \"AmazonSageMakerServiceCatalogProductsUseRole\"" ] }, { @@ -202,7 +197,7 @@ "metadata": {}, "outputs": [], "source": [ - "account_id = sts.get_caller_identity()['Account']\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "print(account_id)" ] }, @@ -212,10 +207,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.attach_role_policy(\n", - " RoleName=sc_role_name,\n", - " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n", - ")\n", + "response = iam.attach_role_policy(RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/AmazonSageMakerFullAccess\")\n", "\n", "print(response)" ] @@ -229,8 +221,7 @@ "outputs": [], "source": [ "response = iam.attach_role_policy(\n", - " RoleName=sc_role_name,\n", - " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFeatureStoreAccess'\n", + " RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/AmazonSageMakerFeatureStoreAccess\"\n", ")\n", "\n", "print(response)" @@ -242,10 +233,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.attach_role_policy(\n", - " RoleName=sc_role_name,\n", - " PolicyArn='arn:aws:iam::aws:policy/IAMFullAccess'\n", - ")\n", + "response = iam.attach_role_policy(RoleName=sc_role_name, PolicyArn=\"arn:aws:iam::aws:policy/IAMFullAccess\")\n", "\n", "print(response)" ] @@ -264,7 +252,9 @@ "metadata": {}, "outputs": [], "source": [ - "sample_abalone_pipeline_execution_arn = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries'][0]['PipelineExecutionArn']\n", + "sample_abalone_pipeline_execution_arn = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n", + " \"PipelineExecutionSummaries\"\n", + "][0][\"PipelineExecutionArn\"]\n", "\n", "print(sample_abalone_pipeline_execution_arn)" ] @@ -287,25 +277,29 @@ "%%time\n", "\n", "try:\n", - " describe_pipeline_execution_response = 
sm.describe_pipeline_execution(PipelineExecutionArn=sample_abalone_pipeline_execution_arn)\n", - " pipeline_execution_status = describe_pipeline_execution_response['PipelineExecutionStatus']\n", + " describe_pipeline_execution_response = sm.describe_pipeline_execution(\n", + " PipelineExecutionArn=sample_abalone_pipeline_execution_arn\n", + " )\n", + " pipeline_execution_status = describe_pipeline_execution_response[\"PipelineExecutionStatus\"]\n", "\n", - " while pipeline_execution_status not in ['Stopped', 'Failed']:\n", - " print('Please wait...')\n", + " while pipeline_execution_status not in [\"Stopped\", \"Failed\"]:\n", + " print(\"Please wait...\")\n", " time.sleep(30)\n", - " describe_pipeline_execution_response = sm.describe_pipeline_execution(PipelineExecutionArn=sample_abalone_pipeline_execution_arn)\n", - " pipeline_execution_status = describe_pipeline_execution_response['PipelineExecutionStatus']\n", - " print('Pipeline execution status: {}'.format(pipeline_execution_status))\n", - "\n", - " if pipeline_execution_status in ['Stopped', 'Failed']: \n", - " print('Pipeline execution status {}'.format(pipeline_execution_status))\n", + " describe_pipeline_execution_response = sm.describe_pipeline_execution(\n", + " PipelineExecutionArn=sample_abalone_pipeline_execution_arn\n", + " )\n", + " pipeline_execution_status = describe_pipeline_execution_response[\"PipelineExecutionStatus\"]\n", + " print(\"Pipeline execution status: {}\".format(pipeline_execution_status))\n", + "\n", + " if pipeline_execution_status in [\"Stopped\", \"Failed\"]:\n", + " print(\"Pipeline execution status {}\".format(pipeline_execution_status))\n", " else:\n", - " print('Pipeline execution status: {}'.format(pipeline_execution_status))\n", - " raise Exception('Pipeline execution not deleted.')\n", - " \n", + " print(\"Pipeline execution status: {}\".format(pipeline_execution_status))\n", + " raise Exception(\"Pipeline execution not deleted.\")\n", + "\n", "except Exception as e:\n", " print(e)\n", - " \n", + "\n", "print(describe_pipeline_execution_response)" ] }, @@ -333,8 +327,8 @@ "source": [ "import os\n", "\n", - "sm_studio_root_path='/root/' \n", - "sm_notebooks_root_path='/home/ec2-user/SageMaker/'\n", + "sm_studio_root_path = \"/root/\"\n", + "sm_notebooks_root_path = \"/home/ec2-user/SageMaker/\"\n", "\n", "root_path = sm_notebooks_root_path if os.path.isdir(sm_notebooks_root_path) else sm_studio_root_path\n", "\n", @@ -356,7 +350,9 @@ "metadata": {}, "outputs": [], "source": [ - "code_commit_repo1 = 'https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modelbuild'.format(region, sagemaker_project_name_and_id)\n", + "code_commit_repo1 = \"https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modelbuild\".format(\n", + " region, sagemaker_project_name_and_id\n", + ")\n", "print(code_commit_repo1)" ] }, @@ -366,7 +362,9 @@ "metadata": {}, "outputs": [], "source": [ - "sagemaker_mlops_build_code = '{}{}/sagemaker-{}-modelbuild'.format(root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id)\n", + "sagemaker_mlops_build_code = \"{}{}/sagemaker-{}-modelbuild\".format(\n", + " root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id\n", + ")\n", "print(sagemaker_mlops_build_code)" ] }, @@ -376,7 +374,9 @@ "metadata": {}, "outputs": [], "source": [ - "code_commit_repo2 = 'https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modeldeploy'.format(region, sagemaker_project_name_and_id)\n", + "code_commit_repo2 = 
\"https://git-codecommit.{}.amazonaws.com/v1/repos/sagemaker-{}-modeldeploy\".format(\n", + " region, sagemaker_project_name_and_id\n", + ")\n", "print(code_commit_repo2)" ] }, @@ -386,7 +386,9 @@ "metadata": {}, "outputs": [], "source": [ - "sagemaker_mlops_deploy_code = '{}{}/sagemaker-{}-modeldeploy'.format(root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id)\n", + "sagemaker_mlops_deploy_code = \"{}{}/sagemaker-{}-modeldeploy\".format(\n", + " root_path, sagemaker_project_name_and_id, sagemaker_project_name_and_id\n", + ")\n", "print(sagemaker_mlops_deploy_code)" ] }, @@ -447,7 +449,7 @@ "metadata": {}, "outputs": [], "source": [ - "workshop_project_build_code='{}workshop/10_pipeline/mlops/sagemaker-project-modelbuild'.format(root_path)\n", + "workshop_project_build_code = \"{}workshop/10_pipeline/mlops/sagemaker-project-modelbuild\".format(root_path)\n", "print(workshop_project_build_code)" ] }, @@ -457,7 +459,7 @@ "metadata": {}, "outputs": [], "source": [ - "workshop_project_deploy_code='{}workshop/10_pipeline/mlops/sagemaker-project-modeldeploy'.format(root_path)\n", + "workshop_project_deploy_code = \"{}workshop/10_pipeline/mlops/sagemaker-project-modeldeploy\".format(root_path)\n", "print(workshop_project_deploy_code)" ] }, @@ -579,13 +581,15 @@ "\n", "while True:\n", " try:\n", - " print('Listing executions for our pipeline...')\n", - " list_executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n", - " break;\n", + " print(\"Listing executions for our pipeline...\")\n", + " list_executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n", + " \"PipelineExecutionSummaries\"\n", + " ]\n", + " break\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(list_executions_response)" ] }, @@ -595,7 +599,7 @@ "metadata": {}, "outputs": [], "source": [ - "build_pipeline_name = 'sagemaker-{}-modelbuild'.format(sagemaker_project_name_and_id)" + "build_pipeline_name = \"sagemaker-{}-modelbuild\".format(sagemaker_project_name_and_id)" ] }, { @@ -606,7 +610,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Check ModelBuild Pipeline'.format(build_pipeline_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Check ModelBuild Pipeline'.format(\n", + " build_pipeline_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -629,20 +639,24 @@ "import time\n", "from pprint import pprint\n", "\n", - "executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n", - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n", + " \"PipelineExecutionSummaries\"\n", + "]\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)\n", "\n", - "while pipeline_execution_status=='Executing':\n", + "while pipeline_execution_status == \"Executing\":\n", " try:\n", - " executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)['PipelineExecutionSummaries']\n", - " pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", - "# print('Executions for our pipeline...')\n", - "# print(pipeline_execution_status)\n", + " 
executions_response = sm.list_pipeline_executions(PipelineName=sagemaker_project_name_and_id)[\n", + " \"PipelineExecutionSummaries\"\n", + " ]\n", + " pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", + " # print('Executions for our pipeline...')\n", + " # print(pipeline_execution_status)\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(executions_response)" ] }, @@ -666,7 +680,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_status = executions_response[0]['PipelineExecutionStatus']\n", + "pipeline_execution_status = executions_response[0][\"PipelineExecutionStatus\"]\n", "print(pipeline_execution_status)" ] }, @@ -676,7 +690,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_execution_arn = executions_response[0]['PipelineExecutionArn']\n", + "pipeline_execution_arn = executions_response[0][\"PipelineExecutionArn\"]\n", "print(pipeline_execution_arn)" ] }, @@ -713,15 +727,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -754,13 +768,13 @@ "\n", "while True:\n", " try:\n", - " print('Executions for our pipeline...')\n", + " print(\"Executions for our pipeline...\")\n", " list_model_packages_response = sm.list_model_packages(ModelPackageGroupName=sagemaker_project_name_and_id)\n", - " break;\n", + " break\n", " except Exception as e:\n", - " print('Please wait...')\n", - " time.sleep(30) \n", - " \n", + " print(\"Please wait...\")\n", + " time.sleep(30)\n", + "\n", "pprint(list_model_packages_response)" ] }, @@ -772,7 +786,7 @@ "source": [ "time.sleep(30)\n", "\n", - "model_package_arn = list_model_packages_response['ModelPackageSummaryList'][0]['ModelPackageArn']\n", + "model_package_arn = list_model_packages_response[\"ModelPackageSummaryList\"][0][\"ModelPackageArn\"]\n", "print(model_package_arn)" ] }, @@ -798,7 +812,7 @@ "source": [ "time.sleep(30)\n", "\n", - "model_name = sm.list_models()['Models'][0]['ModelName']\n", + "model_name = sm.list_models()[\"Models\"][0][\"ModelName\"]\n", "print(model_name)" ] }, @@ -810,7 +824,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Model'.format(region, model_name)))" + "display(\n", + " HTML(\n", + " 'Review Model'.format(\n", + " region, model_name\n", + " )\n", + " )\n", + ")" ] }, { @@ 
-819,7 +839,7 @@ "metadata": {}, "outputs": [], "source": [ - "deploy_pipeline_name = 'sagemaker-{}-modeldeploy'.format(sagemaker_project_name_and_id)" + "deploy_pipeline_name = \"sagemaker-{}-modeldeploy\".format(sagemaker_project_name_and_id)" ] }, { @@ -830,7 +850,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Check ModelDeploy Pipeline'.format(deploy_pipeline_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Check ModelDeploy Pipeline'.format(\n", + " deploy_pipeline_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -839,7 +865,7 @@ "metadata": {}, "outputs": [], "source": [ - "staging_endpoint_name='{}-staging'.format(sagemaker_project_name)" + "staging_endpoint_name = \"{}-staging\".format(sagemaker_project_name)" ] }, { @@ -850,7 +876,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker Staging REST Endpoint'.format(region, staging_endpoint_name)))" + "display(\n", + " HTML(\n", + " 'Review SageMaker Staging REST Endpoint'.format(\n", + " region, staging_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -869,16 +901,16 @@ "%%time\n", "\n", "while True:\n", - " try: \n", - " waiter = sm.get_waiter('endpoint_in_service')\n", - " print('Waiting for staging endpoint to be in `InService`...')\n", + " try:\n", + " waiter = sm.get_waiter(\"endpoint_in_service\")\n", + " print(\"Waiting for staging endpoint to be in `InService`...\")\n", " waiter.wait(EndpointName=staging_endpoint_name)\n", - " break;\n", + " break\n", " except:\n", - " print('Waiting for staging endpoint to be in `Creating`...')\n", + " print(\"Waiting for staging endpoint to be in `Creating`...\")\n", " time.sleep(30)\n", - " \n", - "print('Staging endpoint deployed.')" + "\n", + "print(\"Staging endpoint deployed.\")" ] }, { @@ -917,15 +949,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", @@ -951,14 +983,16 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=staging_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer()) " + "predictor = 
TensorFlowPredictor(\n", + " endpoint_name=staging_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -967,15 +1001,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ -993,7 +1024,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Deploy to Production Pipeline '.format(sagemaker_project_name_and_id, region)))" + "display(\n", + " HTML(\n", + " 'Review Deploy to Production Pipeline '.format(\n", + " sagemaker_project_name_and_id, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -1002,8 +1039,8 @@ "metadata": {}, "outputs": [], "source": [ - "stage_name = 'DeployStaging'\n", - "action_name = 'ApproveDeployment'" + "stage_name = \"DeployStaging\"\n", + "action_name = \"ApproveDeployment\"" ] }, { @@ -1014,15 +1051,15 @@ "source": [ "time.sleep(30)\n", "\n", - "stage_states = codepipeline.get_pipeline_state(name=deploy_pipeline_name)['stageStates'] \n", + "stage_states = codepipeline.get_pipeline_state(name=deploy_pipeline_name)[\"stageStates\"]\n", "\n", "for stage_state in stage_states:\n", "\n", - " if stage_state['stageName'] == stage_name:\n", - " for action_state in stage_state['actionStates']:\n", - " if action_state['actionName'] == action_name:\n", - " token = action_state['latestExecution']['token']\n", - " \n", + " if stage_state[\"stageName\"] == stage_name:\n", + " for action_state in stage_state[\"actionStates\"]:\n", + " if action_state[\"actionName\"] == action_name:\n", + " token = action_state[\"latestExecution\"][\"token\"]\n", + "\n", "print(token)" ] }, @@ -1036,11 +1073,8 @@ " pipelineName=deploy_pipeline_name,\n", " stageName=stage_name,\n", " actionName=action_name,\n", - " result={\n", - " 'summary': 'Approve from Staging to Production',\n", - " 'status': 'Approved'\n", - " },\n", - " token=token\n", + " result={\"summary\": \"Approve from Staging to Production\", \"status\": \"Approved\"},\n", + " token=token,\n", ")" ] }, @@ -1059,7 +1093,7 @@ "source": [ "time.sleep(30)\n", "\n", - "production_endpoint_name='{}-prod'.format(sagemaker_project_name)" + "production_endpoint_name = \"{}-prod\".format(sagemaker_project_name)" ] }, { @@ -1070,7 +1104,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review SageMaker Production REST Endpoint'.format(region, production_endpoint_name)))\n" + "display(\n", + " HTML(\n", + " 'Review SageMaker Production REST Endpoint'.format(\n", + " region, production_endpoint_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -1089,16 +1129,16 @@ "%%time\n", "\n", "while True:\n", - " try: \n", - " waiter = sm.get_waiter('endpoint_in_service')\n", - " print('Waiting for production endpoint to be in `InService`...')\n", + " try:\n", + " waiter = sm.get_waiter(\"endpoint_in_service\")\n", + " print(\"Waiting for production endpoint to be in 
`InService`...\")\n", " waiter.wait(EndpointName=production_endpoint_name)\n", - " break;\n", + " break\n", " except:\n", - " print('Waiting for production endpoint to be in `Creating`...')\n", + " print(\"Waiting for production endpoint to be in `Creating`...\")\n", " time.sleep(30)\n", - " \n", - "print('Production endpoint deployed.')" + "\n", + "print(\"Production endpoint deployed.\")" ] }, { @@ -1119,14 +1159,16 @@ "from sagemaker.serializers import JSONLinesSerializer\n", "from sagemaker.deserializers import JSONLinesDeserializer\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=production_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0,\n", - " content_type='application/jsonlines',\n", - " accept_type='application/jsonlines',\n", - " serializer=JSONLinesSerializer(),\n", - " deserializer=JSONLinesDeserializer()) " + "predictor = TensorFlowPredictor(\n", + " endpoint_name=production_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + " content_type=\"application/jsonlines\",\n", + " accept_type=\"application/jsonlines\",\n", + " serializer=JSONLinesSerializer(),\n", + " deserializer=JSONLinesDeserializer(),\n", + ")" ] }, { @@ -1135,15 +1177,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "]\n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "predicted_classes = predictor.predict(inputs)\n", "\n", "for predicted_class in predicted_classes:\n", - " print('Predicted star_rating: {}'.format(predicted_class))" + " print(\"Predicted star_rating: {}\".format(predicted_class))" ] }, { @@ -1175,15 +1214,15 @@ "\n", "viz = LineageTableVisualizer(sagemaker.session.Session())\n", "\n", - "for execution_step in reversed(steps['PipelineExecutionSteps']):\n", + "for execution_step in reversed(steps[\"PipelineExecutionSteps\"]):\n", " print(execution_step)\n", " # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step\n", - " if execution_step['StepName'] == 'Processing':\n", - " processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]\n", + " if execution_step[\"StepName\"] == \"Processing\":\n", + " processing_job_name = execution_step[\"Metadata\"][\"ProcessingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(processing_job_name)\n", " display(viz.show(processing_job_name=processing_job_name))\n", - " elif execution_step['StepName'] == 'Train':\n", - " training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1]\n", + " elif execution_step[\"StepName\"] == \"Train\":\n", + " training_job_name = execution_step[\"Metadata\"][\"TrainingJob\"][\"Arn\"].split(\"/\")[-1]\n", " print(training_job_name)\n", " display(viz.show(training_job_name=training_job_name))\n", " else:\n", diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py index 024afdec..f3523174 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/evaluate_model_metrics.py @@ -4,13 +4,16 @@ from datetime import datetime import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'conda', 
'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) import pandas as pd import os import re @@ -33,99 +36,99 @@ from sklearn.utils import resample -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] -config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) +config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, +) def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') + print("/opt/ml/config/resourceconfig.json not found. 
current_host is unknown.") + pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--input-model', type=str, - default='/opt/ml/processing/input/model', + parser.add_argument( + "--input-model", + type=str, + default="/opt/ml/processing/input/model", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - + ) + return parser.parse_args() - + def process(args): - print('Current host: {}'.format(args.current_host)) - - print('input_data: {}'.format(args.input_data)) - print('input_model: {}'.format(args.input_model)) - - print('Listing contents of input model dir: {}'.format(args.input_model)) + print("Current host: {}".format(args.current_host)) + + print("input_data: {}".format(args.input_data)) + print("input_model: {}".format(args.input_model)) + + print("Listing contents of input model dir: {}".format(args.input_model)) input_files = os.listdir(args.input_model) for file in input_files: print(file) - model_tar_path = '{}/model.tar.gz'.format(args.input_model) + model_tar_path = "{}/model.tar.gz".format(args.input_model) model_tar = tarfile.open(model_tar_path) model_tar.extractall(args.input_model) - model_tar.close() + model_tar.close() - model = keras.models.load_model('{}/tensorflow/saved_model/0'.format(args.input_model)) + model = keras.models.load_model("{}/tensorflow/saved_model/0".format(args.input_model)) print(model) - + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=args.max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=args.max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. 
+ input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -133,81 +136,86 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) - + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. I hope they don't make this anymore."""), + ) ########################################################################################### # TODO: Replace this with glob for all files and remove test_data/ from the model.tar.gz # - ########################################################################################### -# evaluation_data_path = '/opt/ml/processing/input/data/' - - print('Listing contents of input data dir: {}'.format(args.input_data)) + ########################################################################################### + # evaluation_data_path = '/opt/ml/processing/input/data/' + + print("Listing contents of input data dir: {}".format(args.input_data)) input_files = os.listdir(args.input_data) - test_data_path = '{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'.format(args.input_data) - print('Using only {} to evaluate.'.format(test_data_path)) - df_test_reviews = pd.read_csv(test_data_path, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + test_data_path = "{}/amazon_reviews_us_Digital_Software_v1_00.tsv.gz".format(args.input_data) + print("Using only {} to evaluate.".format(test_data_path)) + df_test_reviews = pd.read_csv(test_data_path, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")[ + ["review_body", "star_rating"] + ] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - y_test = df_test_reviews['review_body'].map(predict) + y_test = df_test_reviews["review_body"].map(predict) y_test - y_actual = df_test_reviews['star_rating'] + y_actual = df_test_reviews["star_rating"] y_actual print(classification_report(y_true=y_actual, y_pred=y_test)) - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) + accuracy = accuracy_score(y_true=y_actual, y_pred=y_test) + print("Test accuracy: ", accuracy) def plot_conf_mat(cm, classes, title, cmap): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. 
+ fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="black" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') + plt.ylabel("True label") + plt.xlabel("Predicted label") cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=CLASSES, - title='Confusion Matrix', - cmap=plt.cm.Greens) + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=CLASSES, title="Confusion Matrix", cmap=plt.cm.Greens) - # Save the confusion matrix + # Save the confusion matrix plt.show() - # Model Output - metrics_path = os.path.join(args.output_data, 'metrics/') + # Model Output + metrics_path = os.path.join(args.output_data, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) report_dict = { "metrics": { @@ -220,26 +228,26 @@ def plot_conf_mat(cm, classes, title, cmap): evaluation_path = "{}/evaluation.json".format(metrics_path) with open(evaluation_path, "w") as f: f.write(json.dumps(report_dict)) - - print('Listing contents of output dir: {}'.format(args.output_data)) + + print("Listing contents of output dir: {}".format(args.output_data)) output_files = os.listdir(args.output_data) for file in output_files: print(file) - print('Listing contents of output/metrics dir: {}'.format(metrics_path)) - output_files = os.listdir('{}'.format(metrics_path)) + print("Listing contents of output/metrics dir: {}".format(metrics_path)) + output_files = os.listdir("{}".format(metrics_path)) for file in output_files: print(file) - print('Complete') - - + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) - process(args) + process(args) diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py index 2975dc2d..53196737 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/inference.py @@ -1,102 +1,97 @@ import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = 
DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: {}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (ie. 1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - 
prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py index 20561f85..07b2b0ef 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/pipeline.py @@ -35,20 +35,12 @@ ScriptProcessor, ) -from sagemaker.workflow.parameters import ( - ParameterInteger, - ParameterString, - ParameterFloat -) +from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat from sagemaker.workflow.pipeline import Pipeline -from sagemaker.workflow.steps import ( - ProcessingStep, - TrainingStep, - CreateModelStep -) +from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep -from sagemaker.model_metrics import MetricsSource, ModelMetrics +from sagemaker.model_metrics import MetricsSource, ModelMetrics from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo from sagemaker.workflow.condition_step import ( ConditionStep, @@ -62,22 +54,15 @@ from sagemaker.inputs import CreateModelInput -sess = sagemaker.Session() +sess = sagemaker.Session() bucket = sess.default_bucket() timestamp = int(time.time()) BASE_DIR = os.path.dirname(os.path.realpath(__file__)) -print('BASE_DIR: {}'.format(BASE_DIR)) +print("BASE_DIR: {}".format(BASE_DIR)) -def get_pipeline( - region, - role, - default_bucket, - pipeline_name, - model_package_group_name, - base_job_prefix -): +def get_pipeline(region, role, default_bucket, pipeline_name, model_package_group_name, base_job_prefix): """Gets a SageMaker ML Pipeline instance working with BERT. 
Args: @@ -91,23 +76,17 @@ def get_pipeline( Returns: an instance of a pipeline """ - - sm = boto3.Session().client(service_name='sagemaker', region_name=region) - + + sm = boto3.Session().client(service_name="sagemaker", region_name=region) + input_data = ParameterString( name="InputDataUrl", default_value="s3://{}/amazon-reviews-pds/tsv/".format(bucket), ) - - processing_instance_count = ParameterInteger( - name="ProcessingInstanceCount", - default_value=1 - ) - processing_instance_type = ParameterString( - name="ProcessingInstanceType", - default_value="ml.c5.2xlarge" - ) + processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) + + processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.c5.2xlarge") max_seq_length = ParameterInteger( name="MaxSeqLength", @@ -140,58 +119,53 @@ def get_pipeline( ) feature_group_name = ParameterString( - name="FeatureGroupName", - default_value="reviews-feature-group-" + str(timestamp) - ) - - train_instance_type = ParameterString( - name="TrainingInstanceType", - default_value="ml.c5.9xlarge" + name="FeatureGroupName", default_value="reviews-feature-group-" + str(timestamp) ) - train_instance_count = ParameterInteger( - name="TrainingInstanceCount", - default_value=1 - ) - - - + train_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.c5.9xlarge") + + train_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1) + ######################### # PROCESSING STEP ######################### - + processor = SKLearnProcessor( - framework_version='0.23-1', + framework_version="0.23-1", role=role, instance_type=processing_instance_type, instance_count=processing_instance_count, - env={'AWS_DEFAULT_REGION': region}, - max_runtime_in_seconds=7200) - - processing_inputs=[ + env={"AWS_DEFAULT_REGION": region}, + max_runtime_in_seconds=7200, + ) + + processing_inputs = [ ProcessingInput( - input_name='raw-input-data', + input_name="raw-input-data", source=input_data, - destination='/opt/ml/processing/input/data/', - s3_data_distribution_type='ShardedByS3Key' + destination="/opt/ml/processing/input/data/", + s3_data_distribution_type="ShardedByS3Key", ) ] - - processing_outputs=[ - ProcessingOutput(output_name='bert-train', - s3_upload_mode='EndOfJob', - source='/opt/ml/processing/output/bert/train', - ), - ProcessingOutput(output_name='bert-validation', - s3_upload_mode='EndOfJob', - source='/opt/ml/processing/output/bert/validation', - ), - ProcessingOutput(output_name='bert-test', - s3_upload_mode='EndOfJob', - source='/opt/ml/processing/output/bert/test', - ), + + processing_outputs = [ + ProcessingOutput( + output_name="bert-train", + s3_upload_mode="EndOfJob", + source="/opt/ml/processing/output/bert/train", + ), + ProcessingOutput( + output_name="bert-validation", + s3_upload_mode="EndOfJob", + source="/opt/ml/processing/output/bert/validation", + ), + ProcessingOutput( + output_name="bert-test", + s3_upload_mode="EndOfJob", + source="/opt/ml/processing/output/bert/test", + ), ] - + # TODO: Figure out why the Parameters are not resolving properly to their native type when used here. 
# We shouldn't be using `default_value` processing_step = ProcessingStep( @@ -200,72 +174,48 @@ def get_pipeline( inputs=processing_inputs, outputs=processing_outputs, job_arguments=[ - '--train-split-percentage', str(train_split_percentage.default_value), - '--validation-split-percentage', str(validation_split_percentage.default_value), - '--test-split-percentage', str(test_split_percentage.default_value), - '--max-seq-length', str(max_seq_length.default_value), - '--balance-dataset', str(balance_dataset.default_value), - '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value), - '--feature-group-name', str(feature_group_name.default_value) + "--train-split-percentage", + str(train_split_percentage.default_value), + "--validation-split-percentage", + str(validation_split_percentage.default_value), + "--test-split-percentage", + str(test_split_percentage.default_value), + "--max-seq-length", + str(max_seq_length.default_value), + "--balance-dataset", + str(balance_dataset.default_value), + "--feature-store-offline-prefix", + str(feature_store_offline_prefix.default_value), + "--feature-group-name", + str(feature_group_name.default_value), ], - code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py") + code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py"), ) - - + ######################### # TRAINING STEP ######################### - - epochs = ParameterInteger( - name="Epochs", - default_value=1 - ) - - learning_rate = ParameterFloat( - name="LearningRate", - default_value=0.00001 - ) - - epsilon = ParameterFloat( - name="Epsilon", - default_value=0.00000001 - ) - - train_batch_size = ParameterInteger( - name="TrainBatchSize", - default_value=128 - ) - - validation_batch_size = ParameterInteger( - name="ValidationBatchSize", - default_value=128 - ) - - test_batch_size = ParameterInteger( - name="TestBatchSize", - default_value=128 - ) - - train_steps_per_epoch = ParameterInteger( - name="TrainStepsPerEpoch", - default_value=50 - ) - - validation_steps = ParameterInteger( - name="ValidationSteps", - default_value=50 - ) - - test_steps = ParameterInteger( - name="TestSteps", - default_value=50 - ) - - train_volume_size = ParameterInteger( - name="TrainVolumeSize", - default_value=1024 - ) - + + epochs = ParameterInteger(name="Epochs", default_value=1) + + learning_rate = ParameterFloat(name="LearningRate", default_value=0.00001) + + epsilon = ParameterFloat(name="Epsilon", default_value=0.00000001) + + train_batch_size = ParameterInteger(name="TrainBatchSize", default_value=128) + + validation_batch_size = ParameterInteger(name="ValidationBatchSize", default_value=128) + + test_batch_size = ParameterInteger(name="TestBatchSize", default_value=128) + + train_steps_per_epoch = ParameterInteger(name="TrainStepsPerEpoch", default_value=50) + + validation_steps = ParameterInteger(name="ValidationSteps", default_value=50) + + test_steps = ParameterInteger(name="TestSteps", default_value=50) + + train_volume_size = ParameterInteger(name="TrainVolumeSize", default_value=1024) + use_xla = ParameterString( name="UseXLA", default_value="True", @@ -275,7 +225,7 @@ def get_pipeline( name="UseAMP", default_value="True", ) - + freeze_bert_layer = ParameterString( name="FreezeBERTLayer", default_value="False", @@ -285,7 +235,7 @@ def get_pipeline( name="EnableSageMakerDebugger", default_value="False", ) - + enable_checkpointing = ParameterString( name="EnableCheckpointing", default_value="False", @@ -295,7 +245,7 @@ def get_pipeline( 
name="EnableTensorboard", default_value="False", ) - + input_mode = ParameterString( name="InputMode", default_value="File", @@ -305,188 +255,171 @@ def get_pipeline( name="RunValidation", default_value="True", ) - + run_test = ParameterString( name="RunTest", default_value="False", ) - + run_sample_predictions = ParameterString( name="RunSamplePredictions", default_value="False", ) - - + metrics_definitions = [ - {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'}, - {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'}, - {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'}, - {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'} + {"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"}, + {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"}, + {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"}, + {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"}, ] - - train_src=os.path.join(BASE_DIR, "src") + + train_src = os.path.join(BASE_DIR, "src") model_path = f"s3://{default_bucket}/{base_job_prefix}/output/model" - + estimator = TensorFlow( - entry_point='tf_bert_reviews.py', + entry_point="tf_bert_reviews.py", source_dir=BASE_DIR, role=role, output_path=model_path, instance_count=train_instance_count, instance_type=train_instance_type, volume_size=train_volume_size, - py_version='py37', - framework_version='2.3.1', + py_version="py37", + framework_version="2.3.1", hyperparameters={ - 'epochs': epochs, - 'learning_rate': learning_rate, - 'epsilon': epsilon, - 'train_batch_size': train_batch_size, - 'validation_batch_size': validation_batch_size, - 'test_batch_size': test_batch_size, - 'train_steps_per_epoch': train_steps_per_epoch, - 'validation_steps': validation_steps, - 'test_steps': test_steps, - 'use_xla': use_xla, - 'use_amp': use_amp, - 'max_seq_length': max_seq_length, - 'freeze_bert_layer': freeze_bert_layer, - 'enable_sagemaker_debugger': enable_sagemaker_debugger, - 'enable_checkpointing': enable_checkpointing, - 'enable_tensorboard': enable_tensorboard, - 'run_validation': run_validation, - 'run_test': run_test, - 'run_sample_predictions': run_sample_predictions}, + "epochs": epochs, + "learning_rate": learning_rate, + "epsilon": epsilon, + "train_batch_size": train_batch_size, + "validation_batch_size": validation_batch_size, + "test_batch_size": test_batch_size, + "train_steps_per_epoch": train_steps_per_epoch, + "validation_steps": validation_steps, + "test_steps": test_steps, + "use_xla": use_xla, + "use_amp": use_amp, + "max_seq_length": max_seq_length, + "freeze_bert_layer": freeze_bert_layer, + "enable_sagemaker_debugger": enable_sagemaker_debugger, + "enable_checkpointing": enable_checkpointing, + "enable_tensorboard": enable_tensorboard, + "run_validation": run_validation, + "run_test": run_test, + "run_sample_predictions": run_sample_predictions, + }, input_mode=input_mode, metric_definitions=metrics_definitions, -# max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute - ) + # max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute + ) training_step = TrainingStep( - name='Train', + name="Train", estimator=estimator, inputs={ - 'train': TrainingInput( - s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[ - 'bert-train' - ].S3Output.S3Uri, - content_type='text/csv' + "train": TrainingInput( + s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-train"].S3Output.S3Uri, + content_type="text/csv", + ), + "validation": 
TrainingInput( + s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-validation"].S3Output.S3Uri, + content_type="text/csv", ), - 'validation': TrainingInput( - s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[ - 'bert-validation' - ].S3Output.S3Uri, - content_type='text/csv' + "test": TrainingInput( + s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["bert-test"].S3Output.S3Uri, + content_type="text/csv", ), - 'test': TrainingInput( - s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[ - 'bert-test' - ].S3Output.S3Uri, - content_type='text/csv' - ) - } - ) - - + }, + ) + ######################### # EVALUATION STEP ######################### - - evaluation_processor = SKLearnProcessor(framework_version='0.23-1', - role=role, - instance_type=processing_instance_type, - instance_count=processing_instance_count, - env={'AWS_DEFAULT_REGION': region}, - max_runtime_in_seconds=7200) - - evaluation_report = PropertyFile( - name='EvaluationReport', - output_name='metrics', - path='evaluation.json' - ) - + + evaluation_processor = SKLearnProcessor( + framework_version="0.23-1", + role=role, + instance_type=processing_instance_type, + instance_count=processing_instance_count, + env={"AWS_DEFAULT_REGION": region}, + max_runtime_in_seconds=7200, + ) + + evaluation_report = PropertyFile(name="EvaluationReport", output_name="metrics", path="evaluation.json") + evaluation_step = ProcessingStep( - name='EvaluateModel', + name="EvaluateModel", processor=evaluation_processor, code=os.path.join(BASE_DIR, "evaluate_model_metrics.py"), inputs=[ ProcessingInput( source=training_step.properties.ModelArtifacts.S3ModelArtifacts, - destination='/opt/ml/processing/input/model' + destination="/opt/ml/processing/input/model", ), ProcessingInput( - source=processing_step.properties.ProcessingInputs['raw-input-data'].S3Input.S3Uri, - destination='/opt/ml/processing/input/data' - ) + source=processing_step.properties.ProcessingInputs["raw-input-data"].S3Input.S3Uri, + destination="/opt/ml/processing/input/data", + ), ], outputs=[ - ProcessingOutput(output_name='metrics', - s3_upload_mode='EndOfJob', - source='/opt/ml/processing/output/metrics/'), + ProcessingOutput( + output_name="metrics", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/metrics/" + ), ], job_arguments=[ - '--max-seq-length', str(max_seq_length.default_value), - ], + "--max-seq-length", + str(max_seq_length.default_value), + ], property_files=[evaluation_report], # these cause deserialization issues - ) - + ) + model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), - content_type="application/json" + content_type="application/json", ) - ) - - - ######################### - ## REGISTER TRAINED MODEL STEP + ) + + ######################### + ## REGISTER TRAINED MODEL STEP ######################### - - model_approval_status = ParameterString( - name="ModelApprovalStatus", - default_value="PendingManualApproval" - ) - - deploy_instance_type = ParameterString( - name="DeployInstanceType", - default_value="ml.m5.4xlarge" - ) - - deploy_instance_count = ParameterInteger( - name="DeployInstanceCount", - default_value=1 - ) - + + model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval") + + deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.4xlarge") + + 
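    # Note on the evaluation plumbing above: JsonGet in the condition step further below resolves
    # json_path="metrics.accuracy.value" against the EvaluationReport PropertyFile, which implies
    # (an assumption -- the evaluation script's output is not shown in this diff) that
    # evaluate_model_metrics.py writes /opt/ml/processing/output/metrics/evaluation.json shaped roughly like:
    #     {"metrics": {"accuracy": {"value": 0.84}}}   # the value shown is illustrative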
deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1) + inference_image_uri = sagemaker.image_uris.retrieve( framework="tensorflow", region=region, version="2.3.1", py_version="py37", instance_type=deploy_instance_type, - image_scope="inference" + image_scope="inference", ) print(inference_image_uri) register_step = RegisterModel( name="RegisterModel", estimator=estimator, - image_uri=inference_image_uri, # we have to specify, by default it's using training image + image_uri=inference_image_uri, # we must specify this; by default RegisterModel uses the training image model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], - inference_instances=[deploy_instance_type], # The JSON spec must be within these instance types or we will see "Instance Type Not Allowed" Exception + inference_instances=[ + deploy_instance_type + ], # the deployment instance type must be listed here or we will see an "Instance Type Not Allowed" exception transform_instances=[deploy_instance_type], model_package_group_name=model_package_group_name, approval_status=model_approval_status, ) - - + ######################### ## CREATE MODEL FOR DEPLOYMENT STEP ######################### - + model = Model( image_uri=inference_image_uri, model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts, @@ -503,38 +436,33 @@ def get_pipeline( model=model, inputs=create_inputs, ) - ######################### ## CONDITION STEP: EVALUATE THE MODEL ######################### - - min_accuracy_value = ParameterFloat( - name="MinAccuracyValue", - default_value=0.01 - ) - + + min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.01) + minimum_accuracy_condition = ConditionGreaterThanOrEqualTo( left=JsonGet( step=evaluation_step, property_file=evaluation_report, json_path="metrics.accuracy.value", ), - right=min_accuracy_value # accuracy + right=min_accuracy_value, # accuracy ) minimum_accuracy_condition_step = ConditionStep( name="AccuracyCondition", conditions=[minimum_accuracy_condition], - if_steps=[register_step, create_step], # success, continue with model registration - else_steps=[], # fail, end the pipeline + if_steps=[register_step, create_step], # success, continue with model registration + else_steps=[], # fail, end the pipeline ) - ######################### ## CREATE PIPELINE ######################### - + pipeline = Pipeline( name=pipeline_name, parameters=[ @@ -569,20 +497,18 @@ def get_pipeline( input_mode, run_validation, run_test, - run_sample_predictions, + run_sample_predictions, min_accuracy_value, model_approval_status, deploy_instance_type, - deploy_instance_count + deploy_instance_count, ], - steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step], - sagemaker_session=sess + steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step], + sagemaker_session=sess, ) - - - ######################### + + ######################### ## RETURN PIPELINE ######################### - - return pipeline + return pipeline diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py index 1211ba85..7e1cd385 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py +++
b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/preprocess-scikit-text-to-bert-feature-store.py @@ -20,16 +20,18 @@ import subprocess ## PIP INSTALLS ## -# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to +# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to # use anaconda and anaconda only supports 2.3.0 at this time -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"]) import pandas as pd import re import sagemaker @@ -40,51 +42,55 @@ FeatureTypeEnum, ) -region = os.environ['AWS_DEFAULT_REGION'] -print('Region: {}'.format(region)) +region = os.environ["AWS_DEFAULT_REGION"] +print("Region: {}".format(region)) ############################# ## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc. ## Role and Bucket are malformed if we do this later. -sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region) +sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region) caller_identity = sts.get_caller_identity() -print('caller_identity: {}'.format(caller_identity)) +print("caller_identity: {}".format(caller_identity)) -assumed_role_arn = caller_identity['Arn'] -print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn)) +assumed_role_arn = caller_identity["Arn"] +print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn)) -assumed_role_name = assumed_role_arn.split('/')[-2] +assumed_role_name = assumed_role_arn.split("/")[-2] -iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region) -get_role_response = iam.get_role(RoleName=assumed_role_name) -print('get_role_response {}'.format(get_role_response)) -role = get_role_response['Role']['Arn'] -print('role {}'.format(role)) +iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region) +get_role_response = iam.get_role(RoleName=assumed_role_name) +print("get_role_response {}".format(get_role_response)) +role = get_role_response["Role"]["Arn"] +print("role {}".format(role)) bucket = sagemaker.Session().default_bucket() -print('The DEFAULT BUCKET is {}'.format(bucket)) +print("The DEFAULT BUCKET is {}".format(bucket)) ############################# -sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region) +sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region) -featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region) +featurestore_runtime = boto3.Session(region_name=region).client( + 
service_name="sagemaker-featurestore-runtime", region_name=region +) -s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region) +s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region) -sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region), - sagemaker_client=sm, - sagemaker_featurestore_runtime_client=featurestore_runtime) +sagemaker_session = sagemaker.Session( + boto_session=boto3.Session(region_name=region), + sagemaker_client=sm, + sagemaker_featurestore_runtime_client=featurestore_runtime, +) -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -REVIEW_BODY_COLUMN = 'review_body' -REVIEW_ID_COLUMN = 'review_id' +REVIEW_BODY_COLUMN = "review_body" +REVIEW_ID_COLUMN = "review_id" # DATE_COLUMN = 'date' -LABEL_COLUMN = 'star_rating' +LABEL_COLUMN = "star_rating" LABEL_VALUES = [1, 2, 3, 4, 5] - + label_map = {} for (i, label) in enumerate(LABEL_VALUES): label_map[label] = i @@ -92,94 +98,88 @@ def cast_object_to_string(data_frame): for label in data_frame.columns: - if data_frame.dtypes[label] == 'object': + if data_frame.dtypes[label] == "object": data_frame[label] = data_frame[label].astype("str").astype("string") return data_frame - + def wait_for_feature_group_creation_complete(feature_group): try: status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) if status != "Created": - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.") except: - print('No feature group created yet.') - - + print("No feature group created yet.") + + def create_or_load_feature_group(prefix, feature_group_name): # Feature Definitions for our records - feature_definitions= [ - FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL), -# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) + feature_definitions = [ + FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING), + 
FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL), + # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING), ] - + feature_group = FeatureGroup( - name=feature_group_name, - feature_definitions=feature_definitions, - sagemaker_session=sagemaker_session) - - print('Feature Group: {}'.format(feature_group)) - - try: - print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...') + name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session + ) + + print("Feature Group: {}".format(feature_group)) + + try: + print( + "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..." + ) wait_for_feature_group_creation_complete(feature_group) except Exception as e: - print('Before CREATE FG wait exeption: {}'.format(e)) -# pass - + print("Before CREATE FG wait exeption: {}".format(e)) + # pass + try: record_identifier_feature_name = "review_id" event_time_feature_name = "date" - - print('Creating Feature Group with role {}...'.format(role)) + + print("Creating Feature Group with role {}...".format(role)) feature_group.create( s3_uri=f"s3://{bucket}/{prefix}", record_identifier_name=record_identifier_feature_name, event_time_feature_name=event_time_feature_name, role_arn=role, - enable_online_store=True + enable_online_store=True, ) - print('Creating Feature Group. Completed.') - - print('Waiting for new Feature Group to become available...') + print("Creating Feature Group. Completed.") + + print("Waiting for new Feature Group to become available...") wait_for_feature_group_creation_complete(feature_group) - print('Feature Group available.') + print("Feature Group available.") feature_group.describe() - + except Exception as e: - print('Exception: {}'.format(e)) - + print("Exception: {}".format(e)) + return feature_group - + class InputFeatures(object): - """BERT feature vectors.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - review_id, - date, - label): -# review_body): + """BERT feature vectors.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label): + # review_body): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids @@ -187,36 +187,38 @@ def __init__(self, self.review_id = review_id self.date = date self.label = label + + # self.review_body = review_body - - + + class Input(object): - """A single training/test input for sequence classification.""" - - def __init__(self, text, review_id, date, label=None): - """Constructs an Input. - Args: - text: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.text = text - self.review_id = review_id - self.date = date - self.label = label - - + """A single training/test input for sequence classification.""" + + def __init__(self, text, review_id, date, label=None): + """Constructs an Input. + Args: + text: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. 
The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.text = text + self.review_id = review_id + self.date = date + self.label = label + + def convert_input(the_input, max_seq_length): # First, we need to preprocess our data so that it matches the data BERT was trained on: # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) - # + # # Fortunately, the Transformers tokenizer does this for us! # - tokens = tokenizer.tokenize(the_input.text) + tokens = tokenizer.tokenize(the_input.text) # Next, we need to do the following: # @@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length): # # Again, the Transformers tokenizer does this for us! # - encode_plus_tokens = tokenizer.encode_plus(the_input.text, - pad_to_max_length=True, - max_length=max_seq_length, -# truncation=True - ) + encode_plus_tokens = tokenizer.encode_plus( + the_input.text, + pad_to_max_length=True, + max_length=max_seq_length, + # truncation=True + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] - - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + input_ids = encode_plus_tokens["input_ids"] + + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction. 
segment_ids = [0] * max_seq_length @@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length): label_id=label_id, review_id=the_input.review_id, date=the_input.date, - label=the_input.label) -# review_body=the_input.text) - -# print('**input_ids**\n{}\n'.format(features.input_ids)) -# print('**input_mask**\n{}\n'.format(features.input_mask)) -# print('**segment_ids**\n{}\n'.format(features.segment_ids)) -# print('**label_id**\n{}\n'.format(features.label_id)) -# print('**review_id**\n{}\n'.format(features.review_id)) -# print('**date**\n{}\n'.format(features.date)) -# print('**label**\n{}\n'.format(features.label)) -# print('**review_body**\n{}\n'.format(features.review_body)) + label=the_input.label, + ) + # review_body=the_input.text) + + # print('**input_ids**\n{}\n'.format(features.input_ids)) + # print('**input_mask**\n{}\n'.format(features.input_mask)) + # print('**segment_ids**\n{}\n'.format(features.segment_ids)) + # print('**label_id**\n{}\n'.format(features.label_id)) + # print('**review_id**\n{}\n'.format(features.review_id)) + # print('**date**\n{}\n'.format(features.date)) + # print('**label**\n{}\n'.format(features.label)) + # print('**review_body**\n{}\n'.format(features.review_body)) return features -def transform_inputs_to_tfrecord(inputs, - output_file, - max_seq_length): +def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length): """Convert a set of `Input`s to a TFRecord file.""" records = [] tf_record_writer = tf.io.TFRecordWriter(output_file) - + for (input_idx, the_input) in enumerate(inputs): if input_idx % 10000 == 0: - print('Writing input {} of {}\n'.format(input_idx, len(inputs))) + print("Writing input {} of {}\n".format(input_idx, len(inputs))) features = convert_input(the_input, max_seq_length) all_features = collections.OrderedDict() - all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) - all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) - all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) - all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) + all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) + all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) + all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) + all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) tf_record = tf.train.Example(features=tf.train.Features(feature=all_features)) tf_record_writer.write(tf_record.SerializeToString()) - records.append({#'tf_record': tf_record.SerializeToString(), - 'input_ids': features.input_ids, - 'input_mask': features.input_mask, - 'segment_ids': features.segment_ids, - 'label_id': features.label_id, - 'review_id': the_input.review_id, - 'date': the_input.date, - 'label': features.label, -# 'review_body': features.review_body - }) + records.append( + { #'tf_record': tf_record.SerializeToString(), + "input_ids": features.input_ids, + "input_mask": features.input_mask, + "segment_ids": features.segment_ids, + "label_id": features.label_id, + "review_id": the_input.review_id, + "date": the_input.date, + "label": features.label, + # 'review_body': features.review_body + } + ) ##################################### ####### TODO: REMOVE THIS BREAK 
####### - ##################################### + ##################################### # break - + tf_record_writer.close() - + return records - + def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--train-split-percentage', type=float, + parser.add_argument( + "--train-split-percentage", + type=float, default=0.90, ) - parser.add_argument('--validation-split-percentage', type=float, - default=0.05, - ) - parser.add_argument('--test-split-percentage', type=float, + parser.add_argument( + "--validation-split-percentage", + type=float, default=0.05, ) - parser.add_argument('--balance-dataset', type=eval, - default=True + parser.add_argument( + "--test-split-percentage", + type=float, + default=0.05, ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument("--balance-dataset", type=eval, default=True) + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - parser.add_argument('--feature-store-offline-prefix', type=str, + ) + parser.add_argument( + "--feature-store-offline-prefix", + type=str, default=None, - ) - parser.add_argument('--feature-group-name', type=str, + ) + parser.add_argument( + "--feature-group-name", + type=str, default=None, - ) - + ) + return parser.parse_args() - -def _transform_tsv_to_tfrecord(file, - max_seq_length, - balance_dataset, - prefix, - feature_group_name): - print('file {}'.format(file)) - print('max_seq_length {}'.format(max_seq_length)) - print('balance_dataset {}'.format(balance_dataset)) - print('prefix {}'.format(prefix)) - print('feature_group_name {}'.format(feature_group_name)) + +def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name): + print("file {}".format(file)) +
print("max_seq_length {}".format(max_seq_length)) + print("balance_dataset {}".format(balance_dataset)) + print("prefix {}".format(prefix)) + print("feature_group_name {}".format(feature_group_name)) # need to re-load since we can't pass feature_group object in _partial functions for some reason feature_group = create_or_load_feature_group(prefix, feature_group_name) - + filename_without_extension = Path(Path(file).stem).stem - df = pd.read_csv(file, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip') + df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip") df.isna().values.any() df = df.dropna() df = df.reset_index(drop=True) - print('Shape of dataframe {}'.format(df.shape)) + print("Shape of dataframe {}".format(df.shape)) - if balance_dataset: + if balance_dataset: # Balance the dataset down to the minority class from sklearn.utils import resample - five_star_df = df.query('star_rating == 5') - four_star_df = df.query('star_rating == 4') - three_star_df = df.query('star_rating == 3') - two_star_df = df.query('star_rating == 2') - one_star_df = df.query('star_rating == 1') - - minority_count = min(five_star_df.shape[0], - four_star_df.shape[0], - three_star_df.shape[0], - two_star_df.shape[0], - one_star_df.shape[0]) - - five_star_df = resample(five_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - four_star_df = resample(four_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - three_star_df = resample(three_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - two_star_df = resample(two_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - one_star_df = resample(one_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) + five_star_df = df.query("star_rating == 5") + four_star_df = df.query("star_rating == 4") + three_star_df = df.query("star_rating == 3") + two_star_df = df.query("star_rating == 2") + one_star_df = df.query("star_rating == 1") + + minority_count = min( + five_star_df.shape[0], + four_star_df.shape[0], + three_star_df.shape[0], + two_star_df.shape[0], + one_star_df.shape[0], + ) + + five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27) + + four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27) + + three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27) + + two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27) + + one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27) df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df]) - df_balanced = df_balanced.reset_index(drop=True) - print('Shape of balanced dataframe {}'.format(df_balanced.shape)) - print(df_balanced['star_rating'].head(100)) + df_balanced = df_balanced.reset_index(drop=True) + print("Shape of balanced dataframe {}".format(df_balanced.shape)) + print(df_balanced["star_rating"].head(100)) df = df_balanced - - print('Shape of dataframe before splitting {}'.format(df.shape)) - - print('train split percentage {}'.format(args.train_split_percentage)) - print('validation split percentage {}'.format(args.validation_split_percentage)) - print('test split percentage {}'.format(args.test_split_percentage)) - + + print("Shape of dataframe before splitting {}".format(df.shape)) + + print("train split 
percentage {}".format(args.train_split_percentage)) + print("validation split percentage {}".format(args.validation_split_percentage)) + print("test split percentage {}".format(args.test_split_percentage)) + holdout_percentage = 1.00 - args.train_split_percentage - print('holdout percentage {}'.format(holdout_percentage)) - df_train, df_holdout = train_test_split(df, - test_size=holdout_percentage, - stratify=df['star_rating']) + print("holdout percentage {}".format(holdout_percentage)) + df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"]) test_holdout_percentage = args.test_split_percentage / holdout_percentage - print('test holdout percentage {}'.format(test_holdout_percentage)) - df_validation, df_test = train_test_split(df_holdout, - test_size=test_holdout_percentage, - stratify=df_holdout['star_rating']) - + print("test holdout percentage {}".format(test_holdout_percentage)) + df_validation, df_test = train_test_split( + df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"] + ) + df_train = df_train.reset_index(drop=True) df_validation = df_validation.reset_index(drop=True) df_test = df_test.reset_index(drop=True) - print('Shape of train dataframe {}'.format(df_train.shape)) - print('Shape of validation dataframe {}'.format(df_validation.shape)) - print('Shape of test dataframe {}'.format(df_test.shape)) + print("Shape of train dataframe {}".format(df_train.shape)) + print("Shape of validation dataframe {}".format(df_validation.shape)) + print("Shape of test dataframe {}".format(df_test.shape)) timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) - train_inputs = df_train.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - validation_inputs = df_validation.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - test_inputs = df_test.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) + train_inputs = df_train.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + validation_inputs = df_validation.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + test_inputs = df_test.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): - # - # + # + # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) # 4. Map our words to indexes using a vocab file that BERT provides # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert)) # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)) - # + # # We don't have to worry about these details. The Transformers tokenizer does this for us. 
- # - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) + # + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) # Convert our train, validation, and test features to InputFeatures (.tfrecord protobuf) that work with BERT and TensorFlow. - train_records = transform_inputs_to_tfrecord(train_inputs, - '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension), - max_seq_length) - - validation_records = transform_inputs_to_tfrecord(validation_inputs, - '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension), - max_seq_length) - - test_records = transform_inputs_to_tfrecord(test_inputs, - '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension), - max_seq_length) - + train_records = transform_inputs_to_tfrecord( + train_inputs, + "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + validation_records = transform_inputs_to_tfrecord( + validation_inputs, + "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + test_records = transform_inputs_to_tfrecord( + test_inputs, + "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension), + max_seq_length, + ) + df_train_records = pd.DataFrame.from_dict(train_records) - df_train_records['split_type'] = 'train' - df_train_records.head() - + df_train_records["split_type"] = "train" + df_train_records.head() + df_validation_records = pd.DataFrame.from_dict(validation_records) - df_validation_records['split_type'] = 'validation' - df_validation_records.head() + df_validation_records["split_type"] = "validation" + df_validation_records.head() df_test_records = pd.DataFrame.from_dict(test_records) - df_test_records['split_type'] = 'test' - df_test_records.head() - - # Add record to feature store + df_test_records["split_type"] = "test" + df_test_records.head() + + # Add records to the feature store df_fs_train_records = cast_object_to_string(df_train_records) df_fs_validation_records = cast_object_to_string(df_validation_records) df_fs_test_records = cast_object_to_string(df_test_records) - print('Ingesting Features...') - feature_group.ingest( - data_frame=df_fs_train_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_validation_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_test_records, max_workers=3, wait=True - ) - print('Feature ingest completed.') + print("Ingesting Features...") + feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True) + print("Feature ingest completed.") def process(args): - print('Current host: {}'.format(args.current_host)) - - feature_group = create_or_load_feature_group(prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) + print("Current host: {}".format(args.current_host)) + + feature_group = create_or_load_feature_group( + prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name + ) feature_group.describe() - +
print(feature_group.as_hive_ddl()) - - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) - - transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord, - max_seq_length=args.max_seq_length, - balance_dataset=args.balance_dataset, - prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) - - input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data)) + + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) + + transform_tsv_to_tfrecord = functools.partial( + _transform_tsv_to_tfrecord, + max_seq_length=args.max_seq_length, + balance_dataset=args.balance_dataset, + prefix=args.feature_store_offline_prefix, + feature_group_name=args.feature_group_name, + ) + + input_files = glob.glob("{}/*.tsv.gz".format(args.input_data)) num_cpus = multiprocessing.cpu_count() - print('num_cpus {}'.format(num_cpus)) + print("num_cpus {}".format(num_cpus)) p = multiprocessing.Pool(num_cpus) p.map(transform_tsv_to_tfrecord, input_files) - print('Listing contents of {}'.format(args.output_data)) + print("Listing contents of {}".format(args.output_data)) dirs_output = os.listdir(args.output_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(train_data)) + print("Listing contents of {}".format(train_data)) dirs_output = os.listdir(train_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(validation_data)) + print("Listing contents of {}".format(validation_data)) dirs_output = os.listdir(validation_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(test_data)) + print("Listing contents of {}".format(test_data)) dirs_output = os.listdir(test_data) for file in dirs_output: print(file) - + offline_store_contents = None - while (offline_store_contents is None): - objects_in_bucket = s3.list_objects(Bucket=bucket, - Prefix=args.feature_store_offline_prefix) - if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1): - offline_store_contents = objects_in_bucket['Contents'] + while offline_store_contents is None: + objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix) + if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1: + offline_store_contents = objects_in_bucket["Contents"] else: - print('Waiting for data in offline store...\n') + print("Waiting for data in offline store...\n") sleep(60) - print('Data available.') - - print('Complete') - - + print("Data available.") + + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) process(args) diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py index 79ae535c..34e1d0a7 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/dsoaws/tf_bert_reviews.py @@ -9,96 +9,99 @@ import sys import os import csv -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) + +# 
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) import tensorflow as tf import pandas as pd import numpy as np -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"]) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig from transformers import TFDistilBertModel -#from transformers import TFBertForSequenceClassification + +# from transformers import TFBertForSequenceClassification from tensorflow.keras.callbacks import ModelCheckpoint from tensorflow.keras.models import load_model -#from tensorflow.keras.mixed_precision import experimental as mixed_precision + +# from tensorflow.keras.mixed_precision import experimental as mixed_precision CLASSES = [1, 2, 3, 4, 5] def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. 
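    # (For context: in Pipe mode, sagemaker_tensorflow's PipeModeDataset streams the TFRecords
    # from S3 through a channel FIFO instead of reading files staged on local disk, which is why
    # the two branches below construct different dataset objects.)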
if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -111,236 +114,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - 
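    # For example (hypothetical filename): ".../checkpoint_5.h5" -> rsplit("_", 1)[-1] == "5.h5"
    # -> split(".h5")[0] == "5" -> initial_epoch_number == 5. Note that max() above compares
    # paths lexicographically, so epoch numbers would need zero-padding (e.g. "_05") for the
    # "latest" checkpoint to be picked correctly beyond epoch 9.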
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
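For reference, a minimal sketch of the pipe-mode check above: SM_INPUT_DATA_CONFIG holds a JSON map of channel name to channel config, so the substring test can equivalently be written by parsing the JSON and inspecting each channel's TrainingInputMode (the payload shape below is an assumption based on the SageMaker training environment, not taken from this patch):

    import json
    import os

    # SM_INPUT_DATA_CONFIG typically looks like:
    #   {"train": {"TrainingInputMode": "Pipe", ...}, "validation": {...}}
    input_data_config = json.loads(os.environ.get("SM_INPUT_DATA_CONFIG") or "{}")
    pipe_mode = any(
        channel.get("TrainingInputMode") == "Pipe"
        for channel in input_data_config.values()
    )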
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}. 
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path 
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + print(classification_report(y_true=y_actual, y_pred=y_test)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_actual, y_pred=y_test) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_actual, y_pred=y_test) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py index 31951e62..9063ac1e 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/pipelines/run_pipeline.py @@ -26,7 +26,8 @@ from smexperiments import tracker import boto3 -sm = boto3.Session().client(service_name='sagemaker') + +sm = boto3.Session().client(service_name="sagemaker") import 
sagemaker @@ -36,9 +37,7 @@ def main(): # pragma: no cover Creates or updates the pipeline and runs it. """ - parser = argparse.ArgumentParser( - "Creates or updates and runs the pipeline for the pipeline script." - ) + parser = argparse.ArgumentParser("Creates or updates and runs the pipeline for the pipeline script.") parser.add_argument( "-n", @@ -89,9 +88,7 @@ def main(): # pragma: no cover parsed = json.loads(pipeline.definition()) print(json.dumps(parsed, indent=2, sort_keys=True)) - upsert_response = pipeline.upsert( - role_arn=args.role_arn, description=args.description, tags=tags - ) + upsert_response = pipeline.upsert(role_arn=args.role_arn, description=args.description, tags=tags) print("\n###### Created/Updated SageMaker Pipeline: Response received:") print(upsert_response) @@ -100,125 +97,113 @@ def main(): # pragma: no cover # Now we describe execution instance and list the steps in the execution to find out more about the execution. execution_run = execution.describe() - print(execution_run) - - # Create or Load the 'Experiment' + print(execution_run) + + # Create or Load the 'Experiment' try: experiment = Experiment.create( - experiment_name=pipeline.name, - description='Amazon Customer Reviews BERT Pipeline Experiment' - ) - except: - experiment = Experiment.load( - experiment_name=pipeline.name + experiment_name=pipeline.name, description="Amazon Customer Reviews BERT Pipeline Experiment" ) - - print('Experiment name: {}'.format(experiment.experiment_name)) - + except: + experiment = Experiment.load(experiment_name=pipeline.name) + + print("Experiment name: {}".format(experiment.experiment_name)) + # Add Execution Run as Trial to Experiments - execution_run_name = execution_run['PipelineExecutionDisplayName'] + execution_run_name = execution_run["PipelineExecutionDisplayName"] print(execution_run_name) - + # Create the `Trial` timestamp = int(time.time()) - trial = Trial.create(trial_name=execution_run_name, - experiment_name=experiment.experiment_name, - sagemaker_boto_client=sm) + trial = Trial.create( + trial_name=execution_run_name, experiment_name=experiment.experiment_name, sagemaker_boto_client=sm + ) trial_name = trial.trial_name - print('Trial name: {}'.format(trial_name)) - + print("Trial name: {}".format(trial_name)) + ###################################################### ## Parse Pipeline Definition For Processing Job Args ###################################################### - + processing_param_dict = {} - - for step in parsed['Steps']: - print('step: {}'.format(step)) - if step['Name']=='Processing': - print('Step Name is Processing...') - arg_list = step['Arguments']['AppSpecification']['ContainerArguments'] + + for step in parsed["Steps"]: + print("step: {}".format(step)) + if step["Name"] == "Processing": + print("Step Name is Processing...") + arg_list = step["Arguments"]["AppSpecification"]["ContainerArguments"] print(arg_list) num_args = len(arg_list) print(num_args) - - # arguments are (key, value) pairs in this list, so we extract them in pairs + + # arguments are (key, value) pairs in this list, so we extract them in pairs # using [i] and [i+1] indexes and stepping by 2 through the list for i in range(0, num_args, 2): - key = arg_list[i].replace('--', '') - value = arg_list[i+1] - print('arg key: {}'.format(key)) - print('arg value: {}'.format(value)) + key = arg_list[i].replace("--", "") + value = arg_list[i + 1] + print("arg key: {}".format(key)) + print("arg value: {}".format(value)) processing_param_dict[key] = value - 
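For reference, a minimal sketch of the (key, value) pairing performed above: ContainerArguments is a flat list that alternates flag and value, so stepping through it two at a time rebuilds the parameter dict (the argument names below are illustrative only, not taken from this pipeline definition):

    # Hypothetical ContainerArguments list of the shape SageMaker emits
    arg_list = ["--train-split-percentage", "0.90", "--max-seq-length", "64"]

    processing_param_dict = {
        arg_list[i].replace("--", ""): arg_list[i + 1]
        for i in range(0, len(arg_list), 2)
    }
    # -> {"train-split-percentage": "0.90", "max-seq-length": "64"}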
############################## ## Wait For Execution To Finish ############################## - + print("Waiting for the execution to finish...") execution.wait() - print("\n#####Execution completed. Execution step details:") - + print("\n#####Execution completed. Execution step details:") + # List Execution Steps - print(execution.list_steps()) - + print(execution.list_steps()) + # List All Artifacts Generated By The Pipeline - processing_job_name=None - training_job_name=None - + processing_job_name = None + training_job_name = None + from sagemaker.lineage.visualizer import LineageTableVisualizer viz = LineageTableVisualizer(sagemaker.session.Session()) for execution_step in reversed(execution.list_steps()): print(execution_step) # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step - if execution_step['StepName'] == 'Processing': - processing_job_name=execution_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1] + if execution_step["StepName"] == "Processing": + processing_job_name = execution_step["Metadata"]["ProcessingJob"]["Arn"].split("/")[-1] print(processing_job_name) - #display(viz.show(processing_job_name=processing_job_name)) - elif execution_step['StepName'] == 'Train': - training_job_name=execution_step['Metadata']['TrainingJob']['Arn'].split('/')[-1] + # display(viz.show(processing_job_name=processing_job_name)) + elif execution_step["StepName"] == "Train": + training_job_name = execution_step["Metadata"]["TrainingJob"]["Arn"].split("/")[-1] print(training_job_name) - #display(viz.show(training_job_name=training_job_name)) + # display(viz.show(training_job_name=training_job_name)) else: - #display(viz.show(pipeline_execution_step=execution_step)) + # display(viz.show(pipeline_execution_step=execution_step)) time.sleep(5) - # Add Trial Compontents To Experiment Trial - processing_job_tc = '{}-aws-processing-job'.format(processing_job_name) + # Add Trial Components To Experiment Trial + processing_job_tc = "{}-aws-processing-job".format(processing_job_name) print(processing_job_tc) # -aws-processing-job is the default name assigned by ProcessingJob - response = sm.associate_trial_component( - TrialComponentName=processing_job_tc, - TrialName=trial_name - ) - + response = sm.associate_trial_component(TrialComponentName=processing_job_tc, TrialName=trial_name) + # -aws-training-job is the default name assigned by TrainingJob - training_job_tc = '{}-aws-training-job'.format(training_job_name) + training_job_tc = "{}-aws-training-job".format(training_job_name) print(training_job_tc) - response = sm.associate_trial_component( - TrialComponentName=training_job_tc, - TrialName=trial_name - ) - + response = sm.associate_trial_component(TrialComponentName=training_job_tc, TrialName=trial_name) + ############## # Log Additional Parameters within Trial ############## - print('Logging Processing Job Parameters within Experiment Trial...') - processing_job_tracker = tracker.Tracker.load(trial_component_name=processing_job_tc) - + print("Logging Processing Job Parameters within Experiment Trial...") + processing_job_tracker = tracker.Tracker.load(trial_component_name=processing_job_tc) + for key, value in processing_param_dict.items(): - print('key: {}, value: {}'.format(key, value)) - processing_job_tracker.log_parameters({ - key: str(value) - }) + print("key: {}, value: {}".format(key, value)) + processing_job_tracker.log_parameters({key: str(value)}) # must save after logging - 
processing_job_tracker.trial_component.save(); + processing_job_tracker.trial_component.save() except Exception as e: # pylint: disable=W0703 print(f"Exception: {e}") diff --git a/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py b/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py index b6b8b179..224153d5 100644 --- a/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py +++ b/10_pipeline/mlops/sagemaker-project-modelbuild/setup.py @@ -12,7 +12,13 @@ readme = f.read() -required_packages = ["sagemaker==2.24.3", "sagemaker-experiments==0.1.26", "pandas==1.0.1", "boto3==1.17.4", "botocore==1.20.4"] +required_packages = [ + "sagemaker==2.24.3", + "sagemaker-experiments==0.1.26", + "pandas==1.0.1", + "boto3==1.17.4", + "botocore==1.20.4", +] extras = { "test": [ "black", diff --git a/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py b/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py index a9c66cf9..7825f488 100644 --- a/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py +++ b/10_pipeline/mlops/sagemaker-project-modeldeploy/test/test.py @@ -61,9 +61,7 @@ def test_endpoint(endpoint_name): config = json.load(f) # Get the endpoint name from sagemaker project name - endpoint_name = "{}-{}".format( - config["Parameters"]["SageMakerProjectName"], config["Parameters"]["StageName"] - ) + endpoint_name = "{}-{}".format(config["Parameters"]["SageMakerProjectName"], config["Parameters"]["StageName"]) results = test_endpoint(endpoint_name) # Print results and write to file diff --git a/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py b/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py index 1211ba85..7e1cd385 100644 --- a/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py +++ b/10_pipeline/preprocess-scikit-text-to-bert-feature-store.py @@ -20,16 +20,18 @@ import subprocess ## PIP INSTALLS ## -# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to +# This is 2.3.0 (vs. 2.3.1 everywhere else) because we need to # use anaconda and anaconda only supports 2.3.0 at this time -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'anaconda', 'tensorflow==2.3.0', '-y']) +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "anaconda", "tensorflow==2.3.0", "-y"]) import tensorflow as tf from tensorflow import keras -subprocess.check_call([sys.executable, '-m', 'conda', 'install', '-c', 'conda-forge', 'transformers==3.5.1', '-y']) + +subprocess.check_call([sys.executable, "-m", "conda", "install", "-c", "conda-forge", "transformers==3.5.1", "-y"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.24.1"]) import pandas as pd import re import sagemaker @@ -40,51 +42,55 @@ FeatureTypeEnum, ) -region = os.environ['AWS_DEFAULT_REGION'] -print('Region: {}'.format(region)) +region = os.environ["AWS_DEFAULT_REGION"] +print("Region: {}".format(region)) ############################# ## We may need to get the Role and Bucket before setting sm, featurestore_runtime, etc. ## Role and Bucket are malformed if we do this later. 
-sts = boto3.Session(region_name=region).client(service_name='sts', region_name=region) +sts = boto3.Session(region_name=region).client(service_name="sts", region_name=region) caller_identity = sts.get_caller_identity() -print('caller_identity: {}'.format(caller_identity)) +print("caller_identity: {}".format(caller_identity)) -assumed_role_arn = caller_identity['Arn'] -print('(assumed_role) caller_identity_arn: {}'.format(assumed_role_arn)) +assumed_role_arn = caller_identity["Arn"] +print("(assumed_role) caller_identity_arn: {}".format(assumed_role_arn)) -assumed_role_name = assumed_role_arn.split('/')[-2] +assumed_role_name = assumed_role_arn.split("/")[-2] -iam = boto3.Session(region_name=region).client(service_name='iam', region_name=region) -get_role_response = iam.get_role(RoleName=assumed_role_name) -print('get_role_response {}'.format(get_role_response)) -role = get_role_response['Role']['Arn'] -print('role {}'.format(role)) +iam = boto3.Session(region_name=region).client(service_name="iam", region_name=region) +get_role_response = iam.get_role(RoleName=assumed_role_name) +print("get_role_response {}".format(get_role_response)) +role = get_role_response["Role"]["Arn"] +print("role {}".format(role)) bucket = sagemaker.Session().default_bucket() -print('The DEFAULT BUCKET is {}'.format(bucket)) +print("The DEFAULT BUCKET is {}".format(bucket)) ############################# -sm = boto3.Session(region_name=region).client(service_name='sagemaker', region_name=region) +sm = boto3.Session(region_name=region).client(service_name="sagemaker", region_name=region) -featurestore_runtime = boto3.Session(region_name=region).client(service_name='sagemaker-featurestore-runtime', region_name=region) +featurestore_runtime = boto3.Session(region_name=region).client( + service_name="sagemaker-featurestore-runtime", region_name=region +) -s3 = boto3.Session(region_name=region).client(service_name='s3', region_name=region) +s3 = boto3.Session(region_name=region).client(service_name="s3", region_name=region) -sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region), - sagemaker_client=sm, - sagemaker_featurestore_runtime_client=featurestore_runtime) +sagemaker_session = sagemaker.Session( + boto_session=boto3.Session(region_name=region), + sagemaker_client=sm, + sagemaker_featurestore_runtime_client=featurestore_runtime, +) -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -REVIEW_BODY_COLUMN = 'review_body' -REVIEW_ID_COLUMN = 'review_id' +REVIEW_BODY_COLUMN = "review_body" +REVIEW_ID_COLUMN = "review_id" # DATE_COLUMN = 'date' -LABEL_COLUMN = 'star_rating' +LABEL_COLUMN = "star_rating" LABEL_VALUES = [1, 2, 3, 4, 5] - + label_map = {} for (i, label) in enumerate(LABEL_VALUES): label_map[label] = i @@ -92,94 +98,88 @@ def cast_object_to_string(data_frame): for label in data_frame.columns: - if data_frame.dtypes[label] == 'object': + if data_frame.dtypes[label] == "object": data_frame[label] = data_frame[label].astype("str").astype("string") return data_frame - + def wait_for_feature_group_creation_complete(feature_group): try: status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") - print('Feature Group status: 
{}'.format(status)) + print("Feature Group status: {}".format(status)) if status != "Created": - print('Feature Group status: {}'.format(status)) + print("Feature Group status: {}".format(status)) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.") except: - print('No feature group created yet.') - - + print("No feature group created yet.") + + def create_or_load_feature_group(prefix, feature_group_name): # Feature Definitions for our records - feature_definitions= [ - FeatureDefinition(feature_name='input_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='input_mask', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='segment_ids', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label_id', feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition(feature_name='review_id', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='date', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='label', feature_type=FeatureTypeEnum.INTEGRAL), -# FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), - FeatureDefinition(feature_name='split_type', feature_type=FeatureTypeEnum.STRING) + feature_definitions = [ + FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL), + # FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING), ] - + feature_group = FeatureGroup( - name=feature_group_name, - feature_definitions=feature_definitions, - sagemaker_session=sagemaker_session) - - print('Feature Group: {}'.format(feature_group)) - - try: - print('Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...') + name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session + ) + + print("Feature Group: {}".format(feature_group)) + + try: + print( + "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..." + ) wait_for_feature_group_creation_complete(feature_group) except Exception as e: - print('Before CREATE FG wait exeption: {}'.format(e)) -# pass + print("Before CREATE FG wait exception: {}".format(e)) + # pass + try: record_identifier_feature_name = "review_id" event_time_feature_name = "date" - - print('Creating Feature Group with role {}...'.format(role)) + + print("Creating Feature Group with role {}...".format(role)) feature_group.create( s3_uri=f"s3://{bucket}/{prefix}", record_identifier_name=record_identifier_feature_name, event_time_feature_name=event_time_feature_name, role_arn=role, - enable_online_store=True + enable_online_store=True, ) - print('Creating Feature Group. 
Completed.") + + print("Waiting for new Feature Group to become available...") wait_for_feature_group_creation_complete(feature_group) - print('Feature Group available.') + print("Feature Group available.") feature_group.describe() - + except Exception as e: - print('Exception: {}'.format(e)) - + print("Exception: {}".format(e)) + return feature_group - + class InputFeatures(object): - """BERT feature vectors.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - review_id, - date, - label): -# review_body): + """BERT feature vectors.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label): + # review_body): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids @@ -187,36 +187,38 @@ def __init__(self, self.review_id = review_id self.date = date self.label = label + + # self.review_body = review_body - - + + class Input(object): - """A single training/test input for sequence classification.""" - - def __init__(self, text, review_id, date, label=None): - """Constructs an Input. - Args: - text: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.text = text - self.review_id = review_id - self.date = date - self.label = label - - + """A single training/test input for sequence classification.""" + + def __init__(self, text, review_id, date, label=None): + """Constructs an Input. + Args: + text: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.text = text + self.review_id = review_id + self.date = date + self.label = label + + def convert_input(the_input, max_seq_length): # First, we need to preprocess our data so that it matches the data BERT was trained on: # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) - # + # # Fortunately, the Transformers tokenizer does this for us! # - tokens = tokenizer.tokenize(the_input.text) + tokens = tokenizer.tokenize(the_input.text) # Next, we need to do the following: # @@ -226,17 +228,18 @@ def convert_input(the_input, max_seq_length): # # Again, the Transformers tokenizer does this for us! # - encode_plus_tokens = tokenizer.encode_plus(the_input.text, - pad_to_max_length=True, - max_length=max_seq_length, -# truncation=True - ) + encode_plus_tokens = tokenizer.encode_plus( + the_input.text, + pad_to_max_length=True, + max_length=max_seq_length, + # truncation=True + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] - - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + input_ids = encode_plus_tokens["input_ids"] + + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. 
+ input_mask = encode_plus_tokens["attention_mask"] # Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction. segment_ids = [0] * max_seq_length @@ -251,380 +254,376 @@ def convert_input(the_input, max_seq_length): label_id=label_id, review_id=the_input.review_id, date=the_input.date, - label=the_input.label) -# review_body=the_input.text) - -# print('**input_ids**\n{}\n'.format(features.input_ids)) -# print('**input_mask**\n{}\n'.format(features.input_mask)) -# print('**segment_ids**\n{}\n'.format(features.segment_ids)) -# print('**label_id**\n{}\n'.format(features.label_id)) -# print('**review_id**\n{}\n'.format(features.review_id)) -# print('**date**\n{}\n'.format(features.date)) -# print('**label**\n{}\n'.format(features.label)) -# print('**review_body**\n{}\n'.format(features.review_body)) + label=the_input.label, + ) + # review_body=the_input.text) + + # print('**input_ids**\n{}\n'.format(features.input_ids)) + # print('**input_mask**\n{}\n'.format(features.input_mask)) + # print('**segment_ids**\n{}\n'.format(features.segment_ids)) + # print('**label_id**\n{}\n'.format(features.label_id)) + # print('**review_id**\n{}\n'.format(features.review_id)) + # print('**date**\n{}\n'.format(features.date)) + # print('**label**\n{}\n'.format(features.label)) + # print('**review_body**\n{}\n'.format(features.review_body)) return features -def transform_inputs_to_tfrecord(inputs, - output_file, - max_seq_length): +def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length): """Convert a set of `Input`s to a TFRecord file.""" records = [] tf_record_writer = tf.io.TFRecordWriter(output_file) - + for (input_idx, the_input) in enumerate(inputs): if input_idx % 10000 == 0: - print('Writing input {} of {}\n'.format(input_idx, len(inputs))) + print("Writing input {} of {}\n".format(input_idx, len(inputs))) features = convert_input(the_input, max_seq_length) all_features = collections.OrderedDict() - all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) - all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) - all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) - all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) + all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids)) + all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask)) + all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids)) + all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id])) tf_record = tf.train.Example(features=tf.train.Features(feature=all_features)) tf_record_writer.write(tf_record.SerializeToString()) - records.append({#'tf_record': tf_record.SerializeToString(), - 'input_ids': features.input_ids, - 'input_mask': features.input_mask, - 'segment_ids': features.segment_ids, - 'label_id': features.label_id, - 'review_id': the_input.review_id, - 'date': the_input.date, - 'label': features.label, -# 'review_body': features.review_body - }) + records.append( + { #'tf_record': tf_record.SerializeToString(), + "input_ids": features.input_ids, + "input_mask": features.input_mask, + "segment_ids": features.segment_ids, + "label_id": 
features.label_id, + "review_id": the_input.review_id, + "date": the_input.date, + "label": features.label, + # 'review_body': features.review_body + } + ) ##################################### ####### TODO: REMOVE THIS BREAK ####### - ##################################### + ##################################### # break - + tf_record_writer.close() - + return records - + def list_arg(raw_value): """argparse type for a list of strings""" - return str(raw_value).split(',') + return str(raw_value).split(",") def parse_args(): # Unlike SageMaker training jobs (which have `SM_HOSTS` and `SM_CURRENT_HOST` env vars), processing jobs need to parse the resource config file directly resconfig = {} try: - with open('/opt/ml/config/resourceconfig.json', 'r') as cfgfile: + with open("/opt/ml/config/resourceconfig.json", "r") as cfgfile: resconfig = json.load(cfgfile) except FileNotFoundError: - print('/opt/ml/config/resourceconfig.json not found. current_host is unknown.') + print("/opt/ml/config/resourceconfig.json not found. current_host is unknown.") pass # Ignore # Local testing with CLI args - parser = argparse.ArgumentParser(description='Process') + parser = argparse.ArgumentParser(description="Process") - parser.add_argument('--hosts', type=list_arg, - default=resconfig.get('hosts', ['unknown']), - help='Comma-separated list of host names running the job' + parser.add_argument( + "--hosts", + type=list_arg, + default=resconfig.get("hosts", ["unknown"]), + help="Comma-separated list of host names running the job", ) - parser.add_argument('--current-host', type=str, - default=resconfig.get('current_host', 'unknown'), - help='Name of this host running the job' + parser.add_argument( + "--current-host", + type=str, + default=resconfig.get("current_host", "unknown"), + help="Name of this host running the job", ) - parser.add_argument('--input-data', type=str, - default='/opt/ml/processing/input/data', + parser.add_argument( + "--input-data", + type=str, + default="/opt/ml/processing/input/data", ) - parser.add_argument('--output-data', type=str, - default='/opt/ml/processing/output', + parser.add_argument( + "--output-data", + type=str, + default="/opt/ml/processing/output", ) - parser.add_argument('--train-split-percentage', type=float, + parser.add_argument( + "--train-split-percentage", + type=float, default=0.90, ) - parser.add_argument('--validation-split-percentage', type=float, - default=0.05, - ) - parser.add_argument('--test-split-percentage', type=float, + parser.add_argument( + "--validation-split-percentage", + type=float, default=0.05, ) - parser.add_argument('--balance-dataset', type=eval, - default=True + parser.add_argument( + "--test-split-percentage", + type=float, + default=0.05, ) - parser.add_argument('--max-seq-length', type=int, + parser.add_argument("--balance-dataset", type=eval, default=True) + parser.add_argument( + "--max-seq-length", + type=int, default=64, - ) - parser.add_argument('--feature-store-offline-prefix', type=str, + ) + parser.add_argument( + "--feature-store-offline-prefix", + type=str, default=None, - ) - parser.add_argument('--feature-group-name', type=str, + ) + parser.add_argument( + "--feature-group-name", + type=str, default=None, - ) - + ) + return parser.parse_args() - -def _transform_tsv_to_tfrecord(file, - max_seq_length, - balance_dataset, - prefix, - feature_group_name): - print('file {}'.format(file)) - print('max_seq_length {}'.format(max_seq_length)) - print('balance_dataset {}'.format(balance_dataset)) - print('prefix 
{}'.format(prefix)) - print('feature_group_name {}'.format(feature_group_name)) + +def _transform_tsv_to_tfrecord(file, max_seq_length, balance_dataset, prefix, feature_group_name): + print("file {}".format(file)) + print("max_seq_length {}".format(max_seq_length)) + print("balance_dataset {}".format(balance_dataset)) + print("prefix {}".format(prefix)) + print("feature_group_name {}".format(feature_group_name)) # need to re-load since we can't pass feature_group object in _partial functions for some reason feature_group = create_or_load_feature_group(prefix, feature_group_name) - + filename_without_extension = Path(Path(file).stem).stem - df = pd.read_csv(file, - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip') + df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip") df.isna().values.any() df = df.dropna() df = df.reset_index(drop=True) - print('Shape of dataframe {}'.format(df.shape)) + print("Shape of dataframe {}".format(df.shape)) - if balance_dataset: + if balance_dataset: # Balance the dataset down to the minority class from sklearn.utils import resample - five_star_df = df.query('star_rating == 5') - four_star_df = df.query('star_rating == 4') - three_star_df = df.query('star_rating == 3') - two_star_df = df.query('star_rating == 2') - one_star_df = df.query('star_rating == 1') - - minority_count = min(five_star_df.shape[0], - four_star_df.shape[0], - three_star_df.shape[0], - two_star_df.shape[0], - one_star_df.shape[0]) - - five_star_df = resample(five_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - four_star_df = resample(four_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - three_star_df = resample(three_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - two_star_df = resample(two_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) - - one_star_df = resample(one_star_df, - replace = False, - n_samples = minority_count, - random_state = 27) + five_star_df = df.query("star_rating == 5") + four_star_df = df.query("star_rating == 4") + three_star_df = df.query("star_rating == 3") + two_star_df = df.query("star_rating == 2") + one_star_df = df.query("star_rating == 1") + + minority_count = min( + five_star_df.shape[0], + four_star_df.shape[0], + three_star_df.shape[0], + two_star_df.shape[0], + one_star_df.shape[0], + ) + + five_star_df = resample(five_star_df, replace=False, n_samples=minority_count, random_state=27) + + four_star_df = resample(four_star_df, replace=False, n_samples=minority_count, random_state=27) + + three_star_df = resample(three_star_df, replace=False, n_samples=minority_count, random_state=27) + + two_star_df = resample(two_star_df, replace=False, n_samples=minority_count, random_state=27) + + one_star_df = resample(one_star_df, replace=False, n_samples=minority_count, random_state=27) df_balanced = pd.concat([five_star_df, four_star_df, three_star_df, two_star_df, one_star_df]) - df_balanced = df_balanced.reset_index(drop=True) - print('Shape of balanced dataframe {}'.format(df_balanced.shape)) - print(df_balanced['star_rating'].head(100)) + df_balanced = df_balanced.reset_index(drop=True) + print("Shape of balanced dataframe {}".format(df_balanced.shape)) + print(df_balanced["star_rating"].head(100)) df = df_balanced - - print('Shape of dataframe before splitting {}'.format(df.shape)) - - print('train split percentage {}'.format(args.train_split_percentage)) - print('validation split 
percentage {}'.format(args.validation_split_percentage)) - print('test split percentage {}'.format(args.test_split_percentage)) - + + print("Shape of dataframe before splitting {}".format(df.shape)) + + print("train split percentage {}".format(args.train_split_percentage)) + print("validation split percentage {}".format(args.validation_split_percentage)) + print("test split percentage {}".format(args.test_split_percentage)) + holdout_percentage = 1.00 - args.train_split_percentage - print('holdout percentage {}'.format(holdout_percentage)) - df_train, df_holdout = train_test_split(df, - test_size=holdout_percentage, - stratify=df['star_rating']) + print("holdout percentage {}".format(holdout_percentage)) + df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"]) test_holdout_percentage = args.test_split_percentage / holdout_percentage - print('test holdout percentage {}'.format(test_holdout_percentage)) - df_validation, df_test = train_test_split(df_holdout, - test_size=test_holdout_percentage, - stratify=df_holdout['star_rating']) - + print("test holdout percentage {}".format(test_holdout_percentage)) + df_validation, df_test = train_test_split( + df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"] + ) + df_train = df_train.reset_index(drop=True) df_validation = df_validation.reset_index(drop=True) df_test = df_test.reset_index(drop=True) - print('Shape of train dataframe {}'.format(df_train.shape)) - print('Shape of validation dataframe {}'.format(df_validation.shape)) - print('Shape of test dataframe {}'.format(df_test.shape)) + print("Shape of train dataframe {}".format(df_train.shape)) + print("Shape of validation dataframe {}".format(df_validation.shape)) + print("Shape of test dataframe {}".format(df_test.shape)) timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) - train_inputs = df_train.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - validation_inputs = df_validation.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) - - test_inputs = df_test.apply(lambda x: Input( - label = x[LABEL_COLUMN], - text = x[REVIEW_BODY_COLUMN], - review_id = x[REVIEW_ID_COLUMN], - date = timestamp - ), - axis = 1) + train_inputs = df_train.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + validation_inputs = df_validation.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) + + test_inputs = df_test.apply( + lambda x: Input( + label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp + ), + axis=1, + ) # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): - # - # + # + # # 1. Lowercase our text (if we're using a BERT lowercase model) # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"]) # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"]) # 4. Map our words to indexes using a vocab file that BERT provides # 5. 
Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert)) # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)) - # + # # We don't have to worry about these details. The Transformers tokenizer does this for us. - # - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) + # + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow. - train_records = transform_inputs_to_tfrecord(train_inputs, - '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension), - max_seq_length) - - validation_records = transform_inputs_to_tfrecord(validation_inputs, - '{}/part-{}-{}.tfrecord'.format(validation_data, args.current_host, filename_without_extension), - max_seq_length) - - test_records = transform_inputs_to_tfrecord(test_inputs, - '{}/part-{}-{}.tfrecord'.format(test_data, args.current_host, filename_without_extension), - max_seq_length) - + train_records = transform_inputs_to_tfrecord( + train_inputs, + "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + validation_records = transform_inputs_to_tfrecord( + validation_inputs, + "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension), + max_seq_length, + ) + + test_records = transform_inputs_to_tfrecord( + test_inputs, + "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension), + max_seq_length, + ) + df_train_records = pd.DataFrame.from_dict(train_records) - df_train_records['split_type'] = 'train' - df_train_records.head() - + df_train_records["split_type"] = "train" + df_train_records.head() + df_validation_records = pd.DataFrame.from_dict(validation_records) - df_validation_records['split_type'] = 'validation' - df_validation_records.head() + df_validation_records["split_type"] = "validation" + df_validation_records.head() df_test_records = pd.DataFrame.from_dict(test_records) - df_test_records['split_type'] = 'test' - df_test_records.head() - - # Add record to feature store + df_test_records["split_type"] = "test" + df_test_records.head() + + # Add record to feature store df_fs_train_records = cast_object_to_string(df_train_records) df_fs_validation_records = cast_object_to_string(df_validation_records) df_fs_test_records = cast_object_to_string(df_test_records) - print('Ingesting Features...') - feature_group.ingest( - data_frame=df_fs_train_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_validation_records, max_workers=3, wait=True - ) - feature_group.ingest( - data_frame=df_fs_test_records, max_workers=3, wait=True - ) - print('Feature ingest completed.') + print("Ingesting Features...") + feature_group.ingest(data_frame=df_fs_train_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_validation_records, max_workers=3, wait=True) + feature_group.ingest(data_frame=df_fs_test_records, max_workers=3, wait=True) + print("Feature ingest completed.") def process(args): - print('Current host: {}'.format(args.current_host)) - - feature_group = 
create_or_load_feature_group(prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) + print("Current host: {}".format(args.current_host)) + + feature_group = create_or_load_feature_group( + prefix=args.feature_store_offline_prefix, feature_group_name=args.feature_group_name + ) feature_group.describe() - + print(feature_group.as_hive_ddl()) - - train_data = '{}/bert/train'.format(args.output_data) - validation_data = '{}/bert/validation'.format(args.output_data) - test_data = '{}/bert/test'.format(args.output_data) - - transform_tsv_to_tfrecord = functools.partial(_transform_tsv_to_tfrecord, - max_seq_length=args.max_seq_length, - balance_dataset=args.balance_dataset, - prefix=args.feature_store_offline_prefix, - feature_group_name=args.feature_group_name) - - input_files = glob.glob('{}/*.tsv.gz'.format(args.input_data)) + + train_data = "{}/bert/train".format(args.output_data) + validation_data = "{}/bert/validation".format(args.output_data) + test_data = "{}/bert/test".format(args.output_data) + + transform_tsv_to_tfrecord = functools.partial( + _transform_tsv_to_tfrecord, + max_seq_length=args.max_seq_length, + balance_dataset=args.balance_dataset, + prefix=args.feature_store_offline_prefix, + feature_group_name=args.feature_group_name, + ) + + input_files = glob.glob("{}/*.tsv.gz".format(args.input_data)) num_cpus = multiprocessing.cpu_count() - print('num_cpus {}'.format(num_cpus)) + print("num_cpus {}".format(num_cpus)) p = multiprocessing.Pool(num_cpus) p.map(transform_tsv_to_tfrecord, input_files) - print('Listing contents of {}'.format(args.output_data)) + print("Listing contents of {}".format(args.output_data)) dirs_output = os.listdir(args.output_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(train_data)) + print("Listing contents of {}".format(train_data)) dirs_output = os.listdir(train_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(validation_data)) + print("Listing contents of {}".format(validation_data)) dirs_output = os.listdir(validation_data) for file in dirs_output: print(file) - print('Listing contents of {}'.format(test_data)) + print("Listing contents of {}".format(test_data)) dirs_output = os.listdir(test_data) for file in dirs_output: print(file) - + offline_store_contents = None - while (offline_store_contents is None): - objects_in_bucket = s3.list_objects(Bucket=bucket, - Prefix=args.feature_store_offline_prefix) - if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1): - offline_store_contents = objects_in_bucket['Contents'] + while offline_store_contents is None: + objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=args.feature_store_offline_prefix) + if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1: + offline_store_contents = objects_in_bucket["Contents"] else: - print('Waiting for data in offline store...\n') + print("Waiting for data in offline store...\n") sleep(60) - print('Data available.') - - print('Complete') - - + print("Data available.") + + print("Complete") + + if __name__ == "__main__": args = parse_args() - print('Loaded arguments:') + print("Loaded arguments:") print(args) - - print('Environment variables:') + + print("Environment variables:") print(os.environ) process(args)
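A note on the two-stage split in `_transform_tsv_to_tfrecord` above: the second `train_test_split` sees only the holdout, so its `test_size` must be rescaled by the holdout fraction (0.05 / 0.10 = 0.50 for the default 90/5/5 split). A minimal, self-contained sketch of the same arithmetic, using a synthetic dataframe (sizes and values are illustrative only):

import pandas as pd
from sklearn.model_selection import train_test_split

# 100 synthetic reviews, 20 per star_rating class (illustrative only)
df = pd.DataFrame({"star_rating": [1, 2, 3, 4, 5] * 20, "review_body": ["..."] * 100})

train_split_percentage = 0.90
test_split_percentage = 0.05

holdout_percentage = 1.00 - train_split_percentage  # 0.10
df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])

# 0.05 of the full dataset is 0.50 of the 0.10 holdout
test_holdout_percentage = test_split_percentage / holdout_percentage
df_validation, df_test = train_test_split(
    df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"]
)

print(df_train.shape, df_validation.shape, df_test.shape)  # (90, 2) (5, 2) (5, 2)

Stratifying both calls on star_rating keeps the label distribution identical across the three splits, which preserves the effect of the balancing step above.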
diff --git a/10_pipeline/src/inference.py b/10_pipeline/src/inference.py index 2975dc2d..53196737 100644 --- a/10_pipeline/src/inference.py +++ b/10_pipeline/src/inference.py @@ -1,102 +1,97 @@ import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: {}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (i.e.
1: product_category, etc.) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type
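The handlers above define a JSON-Lines contract with TensorFlow Serving: input_handler consumes one {"features": [review_body, ...]} object per line and builds the "serving_default" request, while output_handler turns the returned "predictions" into one {"predicted_label": ...} object per line. A self-contained sketch of that contract with hypothetical values (no SageMaker runtime or model involved; a plain argmax stands in for tf.argmax over the softmax, which selects the same index):

import json

# Request side: the payload a caller would send to input_handler
request_body = "\n".join(json.dumps({"features": [r]}) for r in ["I loved it!", "It's OK."])
for jsonline in request_body.split("\n"):
    review_body = json.loads(jsonline)["features"][0]  # same extraction as input_handler
    print(review_body)

# Response side: what output_handler builds from TF Serving's "predictions"
classes = [1, 2, 3, 4, 5]
log_probabilities = [[0.1, 0.2, 0.1, 0.1, 0.5], [0.6, 0.2, 0.1, 0.05, 0.05]]  # hypothetical scores
predicted_classes = [
    json.dumps({"predicted_label": classes[max(range(len(p)), key=p.__getitem__)]})
    for p in log_probabilities
]
print("\n".join(predicted_classes))  # {"predicted_label": 5} then {"predicted_label": 1}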
diff --git a/10_pipeline/src/tf_bert_reviews.py b/10_pipeline/src/tf_bert_reviews.py index 79ae535c..34e1d0a7 100644 --- a/10_pipeline/src/tf_bert_reviews.py +++ b/10_pipeline/src/tf_bert_reviews.py @@ -9,96 +9,99 @@ import sys import os import csv -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) + +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) import tensorflow as tf import pandas as pd import numpy as np -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==3.5.1']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==3.5.1"]) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) from transformers import DistilBertTokenizer from transformers import DistilBertConfig from transformers import TFDistilBertModel +from transformers import TFDistilBertForSequenceClassification  # used by load_checkpoint_model below -#from transformers import TFBertForSequenceClassification + +# from transformers import TFBertForSequenceClassification from tensorflow.keras.callbacks import ModelCheckpoint from tensorflow.keras.models import load_model -#from tensorflow.keras.mixed_precision import experimental as mixed_precision + +# from tensorflow.keras.mixed_precision import experimental as mixed_precision CLASSES = [1, 2, 3, 4, 5] def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter.
if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -111,236 +114,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - 
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -348,7 +293,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -358,114 +304,106 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically successful_download = False retries = 0 - while (retries < 5 and not successful_download): + while retries < 5 and not successful_download: try: - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - config = DistilBertConfig.from_pretrained('distilbert-base-uncased', - num_labels=len(CLASSES), - id2label={ - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5 - }, - label2id={ - 1: 0, - 2: 1, - 3: 2, - 4: 3, - 5: 4 - }) - - transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', - config=config) - - input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32') - input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(CLASSES), + id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, + ) + + transformer_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=config) + + input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32") + input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32") embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0] - X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer) + X = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1) + )(embedding_layer) X = tf.keras.layers.GlobalMaxPool1D()(X) - X = tf.keras.layers.Dense(50, activation='relu')(X) + X = tf.keras.layers.Dense(50, activation="relu")(X) X = tf.keras.layers.Dropout(0.2)(X) - X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X) + X = tf.keras.layers.Dense(len(CLASSES), activation="sigmoid")(X) - model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X) + model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X) for layer in model.layers[:3]: layer.trainable = not freeze_bert_layer successful_download = True - print('Sucessfully downloaded after {} retries.'.format(retries)) + print("Successfully downloaded after {} retries.".format(retries)) except: retries = retries + 1 random_sleep = random.randint(1, 30) - print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep)) + print("Retry #{}.
Sleeping for {} seconds".format(retries, random_sleep)) time.sleep(random_sleep) callbacks = [] - initial_epoch_number = 0 + initial_epoch_number = 0 if enable_checkpointing: - print('***** Checkpoint enabled *****') - - os.makedirs(checkpoint_path, exist_ok=True) + print("***** Checkpoint enabled *****") + + os.makedirs(checkpoint_path, exist_ok=True) if os.listdir(checkpoint_path): - print('***** Found checkpoint *****') + print("***** Found checkpoint *****") print(checkpoint_path) model, initial_epoch_number = load_checkpoint_model(checkpoint_path) - print('***** Using checkpoint model {} *****'.format(model)) - + print("***** Using checkpoint model {} *****".format(model)) + checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'), - save_weights_only=False, - verbose=1, - monitor='val_accuracy') - print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback)) + filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"), + save_weights_only=False, + verbose=1, + monitor="val_accuracy", + ) + print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback)) callbacks.append(checkpoint_callback) if not tokenizer or not model or not config: - print('Not properly initialized...') + print("Not properly initialized...") optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) - print('** use_amp {}'.format(use_amp)) + print("** use_amp {}".format(use_amp)) if use_amp: # loss scaling is currently required when using mixed precision - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) if enable_sagemaker_debugger: - print('*** DEBUGGING ***') + print("*** DEBUGGING ***") import smdebug.tensorflow as smd + # This assumes that we specified debugger_hook_config debugger_callback = smd.KerasHook.create_from_json_file() - print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback)) + print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback)) callbacks.append(debugger_callback) optimizer = debugger_callback.wrap_optimizer(optimizer) - if enable_tensorboard: - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=tensorboard_logs_path) - print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) -# model.layers[0].trainable = not freeze_bert_layer + print("Compiled model {}".format(model)) + # model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = 
glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -473,34 +411,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -508,52 +451,47 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting test...') - test_history = model.evaluate(test_dataset, - steps=test_steps, - callbacks=callbacks) - - print('Test history {}'.format(test_history)) - + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting test...") + test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks) + + print("Test history {}".format(test_history)) + # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model - print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path)) + print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path)) transformer_model.save_pretrained(transformer_fine_tuned_model_path) - print('Model inputs after save_pretrained: {}'.format(model.inputs)) - + print("Model inputs after save_pretrained: {}".format(model.inputs)) + # Save the TensorFlow SavedModel for Serving Predictions - print('tensorflow_saved_model_path
{}'.format(tensorflow_saved_model_path)) - model.save(tensorflow_saved_model_path, - include_optimizer=False, - overwrite=True, - save_format='tf') - + print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path)) + model.save(tensorflow_saved_model_path, include_optimizer=False, overwrite=True, save_format="tf") + # Copy inference.py and requirements.txt to the code/ directory # Note: This is required for the SageMaker Endpoint to pick them up. # This appears to be hard-coded and must be called code/ - inference_path = os.path.join(local_model_dir, 'code/') - print('Copying inference source files to {}'.format(inference_path)) - os.makedirs(inference_path, exist_ok=True) - os.system('cp inference.py {}'.format(inference_path)) - print(glob(inference_path)) -# os.system('cp requirements.txt {}/code'.format(inference_path)) - + inference_path = os.path.join(local_model_dir, "code/") + print("Copying inference source files to {}".format(inference_path)) + os.makedirs(inference_path, exist_ok=True) + os.system("cp inference.py {}".format(inference_path)) + print(glob(inference_path)) + # os.system('cp requirements.txt {}/code'.format(inference_path)) + # Copy test data for the evaluation step - os.system('cp -R ./test_data/ {}'.format(local_model_dir)) - + os.system("cp -R ./test_data/ {}".format(local_model_dir)) + if run_sample_predictions: + def predict(text): - encode_plus_tokens = tokenizer.encode_plus(text, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True, - return_tensors='tf') + encode_plus_tokens = tokenizer.encode_plus( + text, pad_to_max_length=True, max_length=max_seq_length, truncation=True, return_tensors="tf" + ) # The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than `max_seq_length`) - input_ids = encode_plus_tokens['input_ids'] + input_ids = encode_plus_tokens["input_ids"] - # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. - input_mask = encode_plus_tokens['attention_mask'] + # Specifies which tokens BERT should pay attention to (0 or 1). Padded `input_ids` will have 0 in each of these vector elements. + input_mask = encode_plus_tokens["attention_mask"] outputs = model.predict(x=(input_ids, input_mask)) @@ -561,59 +499,73 @@ def predict(text): prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores] - return prediction[0]['label'] + return prediction[0]["label"] + + print( + """I loved it! I will recommend this to everyone.""", + predict("""I loved it! I will recommend this to everyone."""), + ) - print("""I loved it! I will recommend this to everyone.""", predict("""I loved it! I will recommend this to everyone.""")) - print("""It's OK.""", predict("""It's OK.""")) - print("""Really bad. I hope they don't make this anymore.""", predict("""Really bad. I hope they don't make this anymore.""")) + print( + """Really bad. I hope they don't make this anymore.""", + predict("""Really bad. 
I hope they don't make this anymore."""), + ) - df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', - delimiter='\t', - quoting=csv.QUOTE_NONE, - compression='gzip')[['review_body', 'star_rating']] + df_test_reviews = pd.read_csv( + "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz", + delimiter="\t", + quoting=csv.QUOTE_NONE, + compression="gzip", + )[["review_body", "star_rating"]] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape df_test_reviews.head() - - y_test = df_test_reviews['review_body'].map(predict) + + y_test = df_test_reviews["review_body"].map(predict) y_test - - y_actual = df_test_reviews['star_rating'] + + y_actual = df_test_reviews["star_rating"] y_actual from sklearn.metrics import classification_report + print(classification_report(y_true=y_test, y_pred=y_actual)) - + from sklearn.metrics import accuracy_score - accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) - print('Test accuracy: ', accuracy) - + + accuracy = accuracy_score(y_true=y_test, y_pred=y_actual) + print("Test accuracy: ", accuracy) + import matplotlib.pyplot as plt import pandas as pd - def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): + def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens): print(cm) - plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.imshow(cm, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) - fmt = 'd' - thresh = cm.max() / 2. + fmt = "d" + thresh = cm.max() / 2.0 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="black" if cm[i, j] > thresh else "black") + plt.text( + j, + i, + format(cm[i, j], fmt), + horizontalalignment="center", + color="black" if cm[i, j] > thresh else "black", + ) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - + plt.ylabel("True label") + plt.xlabel("Predicted label") + import itertools import numpy as np from sklearn.metrics import confusion_matrix @@ -622,19 +574,17 @@ def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens): cm = confusion_matrix(y_true=y_test, y_pred=y_actual) plt.figure() - fig, ax = plt.subplots(figsize=(10,5)) - plot_conf_mat(cm, - classes=['1', '2', '3', '4', '5'], - title='Confusion Matrix') + fig, ax = plt.subplots(figsize=(10, 5)) + plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix") - # Save the confusion matrix + # Save the confusion matrix plt.show() - - # Model Output - metrics_path = os.path.join(local_model_dir, 'metrics/') + + # Model Output + metrics_path = os.path.join(local_model_dir, "metrics/") os.makedirs(metrics_path, exist_ok=True) - plt.savefig('{}/confusion_matrix.png'.format(metrics_path)) - + plt.savefig("{}/confusion_matrix.png".format(metrics_path)) + report_dict = { "metrics": { "accuracy": { diff --git a/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb b/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb index ad70633f..1df15a8a 100644 --- a/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb +++ b/10_pipeline/stepfunctions/02_Predict_Pipeline_Reviews_BERT_TensorFlow_REST_Endpoint.ipynb @@ -17,12 +17,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = 
sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -49,11 +49,11 @@ "source": [ "try:\n", " step_functions_pipeline_endpoint_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -80,8 +80,8 @@ "metadata": {}, "outputs": [], "source": [ - "client = boto3.client('sagemaker')\n", - "waiter = client.get_waiter('endpoint_in_service')\n", + "client = boto3.client(\"sagemaker\")\n", + "waiter = client.get_waiter(\"endpoint_in_service\")\n", "waiter.wait(EndpointName=step_functions_pipeline_endpoint_name)" ] }, @@ -101,10 +101,12 @@ "import json\n", "from sagemaker.tensorflow.model import TensorFlowPredictor\n", "\n", - "predictor = TensorFlowPredictor(endpoint_name=step_functions_pipeline_endpoint_name,\n", - " sagemaker_session=sess,\n", - " model_name='saved_model',\n", - " model_version=0)" + "predictor = TensorFlowPredictor(\n", + " endpoint_name=step_functions_pipeline_endpoint_name,\n", + " sagemaker_session=sess,\n", + " model_name=\"saved_model\",\n", + " model_version=0,\n", + ")" ] }, { @@ -118,7 +120,7 @@ "predicted_classes = predictor.predict(reviews)\n", "\n", "for predicted_class, review in zip(predicted_classes, reviews):\n", - " print('[Predicted Star Rating: {}]'.format(predicted_class), review)" + " print(\"[Predicted Star Rating: {}]\".format(predicted_class), review)" ] }, { @@ -137,9 +139,7 @@ }, "outputs": [], "source": [ - "sm.delete_endpoint(\n", - " EndpointName=step_functions_pipeline_endpoint_name\n", - ")" + "sm.delete_endpoint(EndpointName=step_functions_pipeline_endpoint_name)" ] }, { @@ -149,7 +149,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb b/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb index 669539be..f396a696 100644 --- a/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb +++ b/10_pipeline/stepfunctions/03_Automate_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_S3_Trigger.ipynb @@ -38,13 +38,13 @@ "import json\n", "from botocore.exceptions import ClientError\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "account_id = boto3.client('sts').get_caller_identity().get('Account')" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")" ] }, { @@ -71,11 +71,11 @@ "source": [ "try:\n", " stepfunction_arn\n", - " 
print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -104,11 +104,11 @@ "source": [ "try:\n", " stepfunction_name\n", - " print('[OK]') \n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -145,7 +145,7 @@ "metadata": {}, "outputs": [], "source": [ - "watched_bucket = 'dsoaws-test-upload-{}'.format(account_id)\n", + "watched_bucket = \"dsoaws-test-upload-{}\".format(account_id)\n", "print(watched_bucket)" ] }, @@ -173,7 +173,7 @@ "metadata": {}, "outputs": [], "source": [ - "cloudtrail_bucket = 'cloudtrail-dsoaws-{}'.format(account_id)\n", + "cloudtrail_bucket = \"cloudtrail-dsoaws-{}\".format(account_id)\n", "print(cloudtrail_bucket)" ] }, @@ -214,44 +214,30 @@ " {\n", " \"Sid\": \"AWSCloudTrailAclCheck20150319\",\n", " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"cloudtrail.amazonaws.com\"\n", - " },\n", + " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n", " \"Action\": \"s3:GetBucketAcl\",\n", - " \"Resource\": \"arn:aws:s3:::{}\".format(cloudtrail_bucket)\n", + " \"Resource\": \"arn:aws:s3:::{}\".format(cloudtrail_bucket),\n", " },\n", " {\n", " \"Sid\": \"AWSCloudTrailWrite20150319\",\n", " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"cloudtrail.amazonaws.com\"\n", - " },\n", + " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n", " \"Action\": \"s3:PutObject\",\n", " \"Resource\": \"arn:aws:s3:::{}/AWSLogs/{}/*\".format(cloudtrail_bucket, account_id),\n", - " \"Condition\": {\n", - " \"StringEquals\": {\n", - " \"s3:x-amz-acl\": \"bucket-owner-full-control\"\n", - " }\n", - " }\n", + " \"Condition\": {\"StringEquals\": {\"s3:x-amz-acl\": \"bucket-owner-full-control\"}},\n", " },\n", " {\n", " \"Sid\": \"AWSCloudTrailHTTPSOnly20180329\",\n", " \"Effect\": \"Deny\",\n", - " \"Principal\": {\n", - " \"Service\": \"cloudtrail.amazonaws.com\"\n", - " },\n", + " \"Principal\": {\"Service\": \"cloudtrail.amazonaws.com\"},\n", " \"Action\": \"s3:*\",\n", " \"Resource\": [\n", " \"arn:aws:s3:::{}/AWSLogs/{}/*\".format(cloudtrail_bucket, account_id),\n", - " \"arn:aws:s3:::{}\".format(cloudtrail_bucket)\n", + " \"arn:aws:s3:::{}\".format(cloudtrail_bucket),\n", " ],\n", - " \"Condition\": {\n", - " \"Bool\": {\n", - " \"aws:SecureTransport\": \"false\"\n", - " }\n", - " }\n", - " }\n", - " ]\n", + " \"Condition\": {\"Bool\": {\"aws:SecureTransport\": \"false\"}},\n", + " },\n", + " ],\n", "}\n", "\n", "print(policy)" @@ -272,7 +258,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"policy.json\", \"w\") as outfile: \n", + "with open(\"policy.json\", \"w\") as outfile:\n", " json.dump(policy, outfile)" ] }, @@ -307,8 +293,8 @@ "metadata": {}, "outputs": [], "source": [ - "cloudtrail = boto3.client('cloudtrail')\n", - "s3 = boto3.client('s3')" + 
"cloudtrail = boto3.client(\"cloudtrail\")\n", + "s3 = boto3.client(\"s3\")" ] }, { @@ -336,23 +322,23 @@ "outputs": [], "source": [ "try:\n", - " t = cloudtrail.create_trail(Name='dsoaws', S3BucketName=cloudtrail_bucket, IsMultiRegionTrail=True)\n", - " trail_name = t['Name']\n", - " trail_arn = t['TrailARN']\n", + " t = cloudtrail.create_trail(Name=\"dsoaws\", S3BucketName=cloudtrail_bucket, IsMultiRegionTrail=True)\n", + " trail_name = t[\"Name\"]\n", + " trail_arn = t[\"TrailARN\"]\n", " cloudtrail.start_logging(Name=trail_arn)\n", " print(\"Cloud Trail created. Started logging.\")\n", - " print('--------------------------------------')\n", - " print('New Trail name: {}'.format(trail_name))\n", - " print('New Trail arn: {}'.format(trail_arn))\n", + " print(\"--------------------------------------\")\n", + " print(\"New Trail name: {}\".format(trail_name))\n", + " print(\"New Trail arn: {}\".format(trail_arn))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'TrailAlreadyExistsException':\n", + " if e.response[\"Error\"][\"Code\"] == \"TrailAlreadyExistsException\":\n", " print(\"Trail already exists. This is OK.\")\n", - " print('------------------')\n", - " t = cloudtrail.get_trail(Name='dsoaws')\n", - " trail_name = t['Trail']['Name']\n", - " trail_arn = t['Trail']['TrailARN']\n", - " print('Trail name: {}'.format(trail_name))\n", - " print('Trail arn: {}'.format(trail_arn))\n", + " print(\"------------------\")\n", + " t = cloudtrail.get_trail(Name=\"dsoaws\")\n", + " trail_name = t[\"Trail\"][\"Name\"]\n", + " trail_arn = t[\"Trail\"][\"TrailARN\"]\n", + " print(\"Trail name: {}\".format(trail_name))\n", + " print(\"Trail arn: {}\".format(trail_arn))\n", " else:\n", " print(\"Unexpected error: %s\" % e)" ] @@ -370,7 +356,7 @@ "metadata": {}, "outputs": [], "source": [ - "events = boto3.client('events')" + "events = boto3.client(\"events\")" ] }, { @@ -379,9 +365,9 @@ "metadata": {}, "outputs": [], "source": [ - "response = events.describe_event_bus(Name='default')\n", - "eventbus_arn = response['Arn']\n", - "print('Bus {}'.format(eventbus_arn))" + "response = events.describe_event_bus(Name=\"default\")\n", + "eventbus_arn = response[\"Arn\"]\n", + "print(\"Bus {}\".format(eventbus_arn))" ] }, { @@ -406,7 +392,7 @@ "metadata": {}, "outputs": [], "source": [ - "!aws cloudtrail get-event-selectors --trail-name $trail_name\n" + "!aws cloudtrail get-event-selectors --trail-name $trail_name" ] }, { @@ -425,7 +411,11 @@ "metadata": {}, "outputs": [], "source": [ - "event_selector = '\\'[{ \"ReadWriteType\": \"WriteOnly\", \"IncludeManagementEvents\":true, \"DataResources\": [{ \"Type\": \"AWS::S3::Object\", \"Values\": [\"' + watched_bucket_arn + '\"] }] }]\\''\n" + "event_selector = (\n", + " '\\'[{ \"ReadWriteType\": \"WriteOnly\", \"IncludeManagementEvents\":true, \"DataResources\": [{ \"Type\": \"AWS::S3::Object\", \"Values\": [\"'\n", + " + watched_bucket_arn\n", + " + \"\\\"] }] }]'\"\n", + ")" ] }, { @@ -460,27 +450,13 @@ "outputs": [], "source": [ "pattern = {\n", - " \"source\": [\n", - " \"aws.s3\"\n", - " ],\n", - " \"detail-type\": [\n", - " \"AWS API Call via CloudTrail\"\n", - " ],\n", - " \"detail\": {\n", - " \"eventSource\": [\n", - " \"s3.amazonaws.com\"\n", - " ],\n", - " \"eventName\": [\n", - " \"PutObject\",\n", - " \"CompleteMultipartUpload\",\n", - " \"CopyObject\"\n", - " ],\n", - " \"requestParameters\": {\n", - " \"bucketName\": [\n", - " \"{}\".format(watched_bucket)\n", - " ]\n", - " }\n", - " }\n", + " \"source\": [\"aws.s3\"],\n", + " 
\"detail-type\": [\"AWS API Call via CloudTrail\"],\n", + " \"detail\": {\n", + " \"eventSource\": [\"s3.amazonaws.com\"],\n", + " \"eventName\": [\"PutObject\", \"CompleteMultipartUpload\", \"CopyObject\"],\n", + " \"requestParameters\": {\"bucketName\": [\"{}\".format(watched_bucket)]},\n", + " },\n", "}\n", "\n", "pattern_json = json.dumps(pattern)\n", @@ -494,11 +470,11 @@ "outputs": [], "source": [ "response = events.put_rule(\n", - " Name='S3-Trigger',\n", + " Name=\"S3-Trigger\",\n", " EventPattern=pattern_json,\n", - " State='ENABLED',\n", - " Description='Triggers an event on S3 PUT',\n", - " EventBusName='default'\n", + " State=\"ENABLED\",\n", + " Description=\"Triggers an event on S3 PUT\",\n", + " EventBusName=\"default\",\n", ")\n", "print(response)" ] @@ -509,7 +485,7 @@ "metadata": {}, "outputs": [], "source": [ - "rule_arn = response['RuleArn']\n", + "rule_arn = response[\"RuleArn\"]\n", "print(rule_arn)" ] }, @@ -533,7 +509,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam = boto3.client('iam')" + "iam = boto3.client(\"iam\")" ] }, { @@ -542,7 +518,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_name_eventbridge = 'DSOAWS_EventBridge_Invoke_StepFunctions'" + "iam_role_name_eventbridge = \"DSOAWS_EventBridge_Invoke_StepFunctions\"" ] }, { @@ -559,16 +535,8 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"events.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [{\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"events.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}],\n", "}" ] }, @@ -582,10 +550,10 @@ " iam_role_eventbridge = iam.create_role(\n", " RoleName=iam_role_name_eventbridge,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS EventBridge Role'\n", + " Description=\"DSOAWS EventBridge Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)" @@ -605,7 +573,7 @@ "outputs": [], "source": [ "role_eventbridge = iam.get_role(RoleName=iam_role_name_eventbridge)\n", - "iam_role_eventbridge_arn = role_eventbridge['Role']['Arn']\n", + "iam_role_eventbridge_arn = role_eventbridge[\"Role\"][\"Arn\"]\n", "print(iam_role_eventbridge_arn)" ] }, @@ -624,14 +592,7 @@ "source": [ "eventbridge_sfn_policy = {\n", " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Sid\": \"VisualEditor0\",\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": \"states:StartExecution\",\n", - " \"Resource\": \"*\"\n", - " }\n", - " ]\n", + " \"Statement\": [{\"Sid\": \"VisualEditor0\", \"Effect\": \"Allow\", \"Action\": \"states:StartExecution\", \"Resource\": \"*\"}],\n", "}\n", "\n", "print(eventbridge_sfn_policy)" @@ -652,18 +613,16 @@ "source": [ "try:\n", " policy_eventbridge_sfn = iam.create_policy(\n", - " PolicyName='DSOAWS_EventBridgeInvokeStepFunction',\n", - " PolicyDocument=json.dumps(eventbridge_sfn_policy)\n", + " PolicyName=\"DSOAWS_EventBridgeInvokeStepFunction\", PolicyDocument=json.dumps(eventbridge_sfn_policy)\n", " )\n", " print(\"Done.\")\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 
'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " print(\"Policy already exists\")\n", - " policy_eventbridge_sfn_arn = f'arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction'\n", + " policy_eventbridge_sfn_arn = f\"arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction\"\n", " iam.create_policy_version(\n", - " PolicyArn=policy_eventbridge_sfn_arn,\n", - " PolicyDocument=json.dumps(eventbridge_sfn_policy),\n", - " SetAsDefault=True)\n", + " PolicyArn=policy_eventbridge_sfn_arn, PolicyDocument=json.dumps(eventbridge_sfn_policy), SetAsDefault=True\n", + " )\n", " print(\"Policy updated.\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)" @@ -682,7 +641,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_eventbridge_sfn_arn = f'arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction'\n", + "policy_eventbridge_sfn_arn = f\"arn:aws:iam::{account_id}:policy/DSOAWS_EventBridgeInvokeStepFunction\"\n", "print(policy_eventbridge_sfn_arn)" ] }, @@ -700,13 +659,10 @@ "outputs": [], "source": [ "try:\n", - " response = iam.attach_role_policy(\n", - " PolicyArn=policy_eventbridge_sfn_arn,\n", - " RoleName=iam_role_name_eventbridge\n", - " )\n", + " response = iam.attach_role_policy(PolicyArn=policy_eventbridge_sfn_arn, RoleName=iam_role_name_eventbridge)\n", " print(\"Done.\")\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " print(\"Policy is already attached. This is ok.\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)" @@ -725,7 +681,7 @@ "metadata": {}, "outputs": [], "source": [ - "sfn = boto3.client('stepfunctions')" + "sfn = boto3.client(\"stepfunctions\")" ] }, { @@ -742,9 +698,10 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())\n", "\n", - "execution_name = 'run-{}'.format(timestamp)\n", + "execution_name = \"run-{}\".format(timestamp)\n", "print(execution_name)" ] }, @@ -762,7 +719,7 @@ "metadata": {}, "outputs": [], "source": [ - "raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)\n", + "raw_input_data_s3_uri = \"s3://{}/amazon-reviews-pds/tsv/\".format(bucket)\n", "print(raw_input_data_s3_uri)" ] }, @@ -779,13 +736,13 @@ "metadata": {}, "outputs": [], "source": [ - "max_seq_length=64\n", - "train_split_percentage=0.90\n", - "validation_split_percentage=0.05\n", - "test_split_percentage=0.05\n", - "balance_dataset=True\n", - "processing_instance_count=2\n", - "processing_instance_type='ml.c5.2xlarge'" + "max_seq_length = 64\n", + "train_split_percentage = 0.90\n", + "validation_split_percentage = 0.05\n", + "test_split_percentage = 0.05\n", + "balance_dataset = True\n", + "processing_instance_count = 2\n", + "processing_instance_type = \"ml.c5.2xlarge\"" ] }, { @@ -801,31 +758,31 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=False\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='File'\n", - "run_validation=True\n", - 
"run_test=True\n", - "run_sample_predictions=True\n", - "deploy_instance_count=1\n", - "#deploy_instance_type='ml.m5.4xlarge'\n", - "deploy_instance_type='ml.m5.large'" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = False\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True\n", + "deploy_instance_count = 1\n", + "# deploy_instance_type='ml.m5.4xlarge'\n", + "deploy_instance_type = \"ml.m5.large\"" ] }, { @@ -866,10 +823,10 @@ "metadata": {}, "outputs": [], "source": [ - "# You find the regional AWS ECR account IDs storing the docker images here: \n", + "# You find the regional AWS ECR account IDs storing the docker images here:\n", "# https://docs.aws.amazon.com/sagemaker/latest/dg/pre-built-docker-containers-frameworks.html\n", - "account_id_scikit_learn_image_us_east_1 = '683313688378'\n", - "account_id_scikit_learn_image_us_west_2 = '246618743249'" + "account_id_scikit_learn_image_us_east_1 = \"683313688378\"\n", + "account_id_scikit_learn_image_us_west_2 = \"246618743249\"" ] }, { @@ -878,13 +835,13 @@ "metadata": {}, "outputs": [], "source": [ - "account_id_scikit_learn_image = ''\n", - "if region == 'us-east-1':\n", + "account_id_scikit_learn_image = \"\"\n", + "if region == \"us-east-1\":\n", " account_id_scikit_learn_image = account_id_scikit_learn_image_us_east_1\n", - "elif region == 'us-west-2':\n", + "elif region == \"us-west-2\":\n", " account_id_scikit_learn_image = account_id_scikit_learn_image_us_west_2\n", "else:\n", - " print('Please look up the correct AWS ECR Account ID per Link above.')" + " print(\"Please look up the correct AWS ECR Account ID per Link above.\")" ] }, { @@ -903,205 +860,204 @@ "outputs": [], "source": [ "inputs = {\n", - " \"Processing Job\": {\n", - " \"ProcessingJobName\": \"training-pipeline-{}\".format(execution_name), \n", - " \"ProcessingInputs\": [\n", - " {\n", - " \"InputName\": \"raw_input\",\n", - " \"S3Input\": {\n", - "# TODO: Change to watched_bucket + watched_s3_prefix \n", - "# \"S3Uri\": \"s3://{}/{}/\".format(watched_bucket, watched_s3_prefix),\n", - " \"S3Uri\": \"{}\".format(raw_input_data_s3_uri), \n", - " \"LocalPath\": \"/opt/ml/processing/input/data/\",\n", - " \"S3DataType\": \"S3Prefix\",\n", - " \"S3InputMode\": \"File\",\n", - " \"S3DataDistributionType\": \"ShardedByS3Key\",\n", - " \"S3CompressionType\": \"None\"\n", - " }\n", - " },\n", - " {\n", - " \"InputName\": \"code\",\n", - " \"S3Input\": {\n", - " \"S3Uri\": \"s3://{}/{}/preprocess-scikit-text-to-bert.py\".format(bucket, processing_code_s3_prefix),\n", - " \"LocalPath\": \"/opt/ml/processing/input/code\",\n", - " \"S3DataType\": \"S3Prefix\",\n", - " \"S3InputMode\": \"File\",\n", - " \"S3DataDistributionType\": \"FullyReplicated\",\n", - " \"S3CompressionType\": \"None\"\n", - " }\n", - " }\n", - " ],\n", - " \"ProcessingOutputConfig\": {\n", - " \"Outputs\": [\n", - " {\n", - " \"OutputName\": \"bert-train\",\n", - " \"S3Output\": {\n", - " \"S3Uri\": 
\"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n", - " \"LocalPath\": \"/opt/ml/processing/output/bert/train\",\n", - " \"S3UploadMode\": \"EndOfJob\"\n", - " }\n", + " \"Processing Job\": {\n", + " \"ProcessingJobName\": \"training-pipeline-{}\".format(execution_name),\n", + " \"ProcessingInputs\": [\n", + " {\n", + " \"InputName\": \"raw_input\",\n", + " \"S3Input\": {\n", + " # TODO: Change to watched_bucket + watched_s3_prefix\n", + " # \"S3Uri\": \"s3://{}/{}/\".format(watched_bucket, watched_s3_prefix),\n", + " \"S3Uri\": \"{}\".format(raw_input_data_s3_uri),\n", + " \"LocalPath\": \"/opt/ml/processing/input/data/\",\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3InputMode\": \"File\",\n", + " \"S3DataDistributionType\": \"ShardedByS3Key\",\n", + " \"S3CompressionType\": \"None\",\n", + " },\n", + " },\n", + " {\n", + " \"InputName\": \"code\",\n", + " \"S3Input\": {\n", + " \"S3Uri\": \"s3://{}/{}/preprocess-scikit-text-to-bert.py\".format(bucket, processing_code_s3_prefix),\n", + " \"LocalPath\": \"/opt/ml/processing/input/code\",\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3InputMode\": \"File\",\n", + " \"S3DataDistributionType\": \"FullyReplicated\",\n", + " \"S3CompressionType\": \"None\",\n", + " },\n", + " },\n", + " ],\n", + " \"ProcessingOutputConfig\": {\n", + " \"Outputs\": [\n", + " {\n", + " \"OutputName\": \"bert-train\",\n", + " \"S3Output\": {\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n", + " \"LocalPath\": \"/opt/ml/processing/output/bert/train\",\n", + " \"S3UploadMode\": \"EndOfJob\",\n", + " },\n", + " },\n", + " {\n", + " \"OutputName\": \"bert-validation\",\n", + " \"S3Output\": {\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n", + " \"LocalPath\": \"/opt/ml/processing/output/bert/validation\",\n", + " \"S3UploadMode\": \"EndOfJob\",\n", + " },\n", + " },\n", + " {\n", + " \"OutputName\": \"bert-test\",\n", + " \"S3Output\": {\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n", + " \"LocalPath\": \"/opt/ml/processing/output/bert/test\",\n", + " \"S3UploadMode\": \"EndOfJob\",\n", + " },\n", + " },\n", + " ]\n", " },\n", - " {\n", - " \"OutputName\": \"bert-validation\",\n", - " \"S3Output\": {\n", - " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n", - " \"LocalPath\": \"/opt/ml/processing/output/bert/validation\",\n", - " \"S3UploadMode\": \"EndOfJob\"\n", - " }\n", + " \"AppSpecification\": {\n", + " \"ImageUri\": \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\".format(\n", + " account_id_scikit_learn_image, region\n", + " ),\n", + " \"ContainerArguments\": [\n", + " \"--train-split-percentage\",\n", + " \"{}\".format(train_split_percentage),\n", + " \"--validation-split-percentage\",\n", + " \"{}\".format(validation_split_percentage),\n", + " \"--test-split-percentage\",\n", + " \"{}\".format(test_split_percentage),\n", + " \"--max-seq-length\",\n", + " \"{}\".format(max_seq_length),\n", + " \"--balance-dataset\",\n", + " \"{}\".format(balance_dataset),\n", + " ],\n", + " \"ContainerEntrypoint\": [\"python3\", \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py\"],\n", " },\n", - " {\n", - " \"OutputName\": \"bert-test\",\n", - " \"S3Output\": {\n", - " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n", - " \"LocalPath\": 
\"/opt/ml/processing/output/bert/test\",\n", - " \"S3UploadMode\": \"EndOfJob\"\n", - " }\n", - " }\n", - " ]\n", - " },\n", - " \"AppSpecification\": {\n", - " \"ImageUri\": \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3\".format(account_id_scikit_learn_image, region),\n", - " \"ContainerArguments\": [\n", - " \"--train-split-percentage\",\n", - " \"{}\".format(train_split_percentage),\n", - " \"--validation-split-percentage\",\n", - " \"{}\".format(validation_split_percentage),\n", - " \"--test-split-percentage\",\n", - " \"{}\".format(test_split_percentage),\n", - " \"--max-seq-length\",\n", - " \"{}\".format(max_seq_length),\n", - " \"--balance-dataset\",\n", - " \"{}\".format(balance_dataset)\n", - " ],\n", - " \"ContainerEntrypoint\": [\n", - " \"python3\",\n", - " \"/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py\"\n", - " ]\n", - " },\n", - " \"RoleArn\": \"{}\".format(role),\n", - " \"ProcessingResources\": {\n", - " \"ClusterConfig\": {\n", - " \"InstanceCount\": processing_instance_count,\n", - " \"InstanceType\": \"{}\".format(processing_instance_type),\n", - " \"VolumeSizeInGB\": 30\n", - " }\n", - " },\n", - " \"StoppingCondition\": {\n", - " \"MaxRuntimeInSeconds\": 7200\n", - " }\n", - " }, \n", - " \"Training\": {\n", - " \"AlgorithmSpecification\": {\n", - " \"TrainingImage\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04\".format(region),\n", - " \"TrainingInputMode\": \"{}\".format(input_mode)\n", - " },\n", - " \"OutputDataConfig\": {\n", - " \"S3OutputPath\": \"s3://{}/training-pipeline-{}/models\".format(bucket, execution_name)\n", - " },\n", - " \"StoppingCondition\": {\n", - " \"MaxRuntimeInSeconds\": 7200\n", - " },\n", - " \"ResourceConfig\": {\n", - " \"InstanceCount\": train_instance_count,\n", - " \"InstanceType\": \"{}\".format(train_instance_type),\n", - " \"VolumeSizeInGB\": train_volume_size\n", + " \"RoleArn\": \"{}\".format(role),\n", + " \"ProcessingResources\": {\n", + " \"ClusterConfig\": {\n", + " \"InstanceCount\": processing_instance_count,\n", + " \"InstanceType\": \"{}\".format(processing_instance_type),\n", + " \"VolumeSizeInGB\": 30,\n", + " }\n", + " },\n", + " \"StoppingCondition\": {\"MaxRuntimeInSeconds\": 7200},\n", " },\n", - " \"RoleArn\": \"{}\".format(role),\n", - " \"InputDataConfig\": [\n", - " {\n", - " \"DataSource\": {\n", - " \"S3DataSource\": {\n", - " \"S3DataType\": \"S3Prefix\",\n", - " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n", - " \"S3DataDistributionType\": \"ShardedByS3Key\"\n", - " }\n", + " \"Training\": {\n", + " \"AlgorithmSpecification\": {\n", + " \"TrainingImage\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04\".format(\n", + " region\n", + " ),\n", + " \"TrainingInputMode\": \"{}\".format(input_mode),\n", " },\n", - " \"ChannelName\": \"train\"\n", - " },\n", - " {\n", - " \"DataSource\": {\n", - " \"S3DataSource\": {\n", - " \"S3DataType\": \"S3Prefix\",\n", - " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n", - " \"S3DataDistributionType\": \"ShardedByS3Key\"\n", - " }\n", + " \"OutputDataConfig\": {\"S3OutputPath\": \"s3://{}/training-pipeline-{}/models\".format(bucket, execution_name)},\n", + " \"StoppingCondition\": {\"MaxRuntimeInSeconds\": 7200},\n", + " \"ResourceConfig\": {\n", + " \"InstanceCount\": train_instance_count,\n", + " \"InstanceType\": \"{}\".format(train_instance_type),\n", + " 
\"VolumeSizeInGB\": train_volume_size,\n", " },\n", - " \"ChannelName\": \"validation\"\n", - " },\n", - " {\n", - " \"DataSource\": {\n", - " \"S3DataSource\": {\n", - " \"S3DataType\": \"S3Prefix\",\n", - " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n", - " \"S3DataDistributionType\": \"ShardedByS3Key\"\n", - " }\n", + " \"RoleArn\": \"{}\".format(role),\n", + " \"InputDataConfig\": [\n", + " {\n", + " \"DataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-train\".format(bucket, execution_name),\n", + " \"S3DataDistributionType\": \"ShardedByS3Key\",\n", + " }\n", + " },\n", + " \"ChannelName\": \"train\",\n", + " },\n", + " {\n", + " \"DataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-validation\".format(bucket, execution_name),\n", + " \"S3DataDistributionType\": \"ShardedByS3Key\",\n", + " }\n", + " },\n", + " \"ChannelName\": \"validation\",\n", + " },\n", + " {\n", + " \"DataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3Uri\": \"s3://{}/{}/processing/output/bert-test\".format(bucket, execution_name),\n", + " \"S3DataDistributionType\": \"ShardedByS3Key\",\n", + " }\n", + " },\n", + " \"ChannelName\": \"test\",\n", + " },\n", + " ],\n", + " \"HyperParameters\": {\n", + " \"epochs\": \"{}\".format(epochs),\n", + " \"learning_rate\": \"{}\".format(learning_rate),\n", + " \"epsilon\": \"{}\".format(epsilon),\n", + " \"train_batch_size\": \"{}\".format(train_batch_size),\n", + " \"validation_batch_size\": \"{}\".format(validation_batch_size),\n", + " \"test_batch_size\": \"{}\".format(test_batch_size),\n", + " \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n", + " \"validation_steps\": \"{}\".format(validation_steps),\n", + " \"test_steps\": \"{}\".format(test_steps),\n", + " \"use_xla\": \"{}\".format(str(use_xla).lower()),\n", + " \"use_amp\": \"{}\".format(str(use_amp).lower()),\n", + " \"max_seq_length\": \"{}\".format(max_seq_length),\n", + " \"freeze_bert_layer\": \"{}\".format(str(freeze_bert_layer).lower()),\n", + " \"enable_sagemaker_debugger\": \"{}\".format(str(enable_sagemaker_debugger).lower()),\n", + " \"enable_checkpointing\": \"{}\".format(str(enable_checkpointing).lower()),\n", + " \"enable_tensorboard\": \"{}\".format(str(enable_tensorboard).lower()),\n", + " \"run_validation\": \"{}\".format(str(run_validation).lower()),\n", + " \"run_test\": \"{}\".format(str(run_test).lower()),\n", + " \"run_sample_predictions\": \"{}\".format(str(run_sample_predictions).lower()),\n", + " \"sagemaker_submit_directory\": '\"s3://{}/{}/estimator-source/source/sourcedir.tar.gz\"'.format(\n", + " bucket, stepfunction_name\n", + " ),\n", + " \"sagemaker_program\": '\"tf_bert_reviews.py\"',\n", + " \"sagemaker_enable_cloudwatch_metrics\": \"false\",\n", + " \"sagemaker_container_log_level\": \"20\",\n", + " \"sagemaker_job_name\": '\"training-pipeline-{}/estimator-source\"'.format(execution_name),\n", + " \"sagemaker_region\": '\"{}\"'.format(region),\n", + " \"model_dir\": '\"s3://{}/training-pipeline-{}/estimator-source/model\"'.format(bucket, execution_name),\n", " },\n", - " \"ChannelName\": \"test\"\n", - " }\n", - " ],\n", - " \"HyperParameters\": {\n", - " \"epochs\": \"{}\".format(epochs),\n", - " \"learning_rate\": \"{}\".format(learning_rate),\n", - " \"epsilon\": \"{}\".format(epsilon),\n", - " 
\"train_batch_size\": \"{}\".format(train_batch_size),\n", - " \"validation_batch_size\": \"{}\".format(validation_batch_size),\n", - " \"test_batch_size\": \"{}\".format(test_batch_size),\n", - " \"train_steps_per_epoch\": \"{}\".format(train_steps_per_epoch),\n", - " \"validation_steps\": \"{}\".format(validation_steps),\n", - " \"test_steps\": \"{}\".format(test_steps),\n", - " \"use_xla\": \"{}\".format(str(use_xla).lower()),\n", - " \"use_amp\": \"{}\".format(str(use_amp).lower()),\n", - " \"max_seq_length\": \"{}\".format(max_seq_length),\n", - " \"freeze_bert_layer\": \"{}\".format(str(freeze_bert_layer).lower()),\n", - " \"enable_sagemaker_debugger\": \"{}\".format(str(enable_sagemaker_debugger).lower()),\n", - " \"enable_checkpointing\": \"{}\".format(str(enable_checkpointing).lower()),\n", - " \"enable_tensorboard\": \"{}\".format(str(enable_tensorboard).lower()),\n", - " \"run_validation\": \"{}\".format(str(run_validation).lower()),\n", - " \"run_test\": \"{}\".format(str(run_test).lower()),\n", - " \"run_sample_predictions\": \"{}\".format(str(run_sample_predictions).lower()),\n", - " \"sagemaker_submit_directory\": \"\\\"s3://{}/{}/estimator-source/source/sourcedir.tar.gz\\\"\".format(bucket, stepfunction_name),\n", - " \"sagemaker_program\": \"\\\"tf_bert_reviews.py\\\"\",\n", - " \"sagemaker_enable_cloudwatch_metrics\": \"false\",\n", - " \"sagemaker_container_log_level\": \"20\",\n", - " \"sagemaker_job_name\": \"\\\"training-pipeline-{}/estimator-source\\\"\".format(execution_name),\n", - " \"sagemaker_region\": \"\\\"{}\\\"\".format(region),\n", - " \"model_dir\": \"\\\"s3://{}/training-pipeline-{}/estimator-source/model\\\"\".format(bucket, execution_name)\n", - " }, \n", - " \"TrainingJobName\": \"estimator-training-pipeline-{}\".format(execution_name),\n", - " \"DebugHookConfig\": {\n", - " \"S3OutputPath\": \"s3://{}/\".format(bucket)\n", - " }\n", - " },\n", - " \"Create Model\": {\n", - " \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n", - " \"PrimaryContainer\": {\n", - " \"Image\": \"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu-py36-ubuntu18.04\".format(region),\n", - " \"Environment\": {\n", - " \"SAGEMAKER_PROGRAM\": \"null\",\n", - " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"null\",\n", - " \"SAGEMAKER_ENABLE_CLOUDWATCH_METRICS\": \"false\",\n", - " \"SAGEMAKER_CONTAINER_LOG_LEVEL\": \"20\",\n", - " \"SAGEMAKER_REGION\": \"{}\".format(region)\n", - " },\n", - " \"ModelDataUrl\": \"s3://{}/training-pipeline-{}/models/estimator-training-pipeline-{}/output/model.tar.gz\".format(bucket, execution_name, execution_name)\n", + " \"TrainingJobName\": \"estimator-training-pipeline-{}\".format(execution_name),\n", + " \"DebugHookConfig\": {\"S3OutputPath\": \"s3://{}/\".format(bucket)},\n", " },\n", - " \"ExecutionRoleArn\": \"{}\".format(role)\n", - " },\n", - " \"Configure Endpoint\": {\n", - " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n", - " \"ProductionVariants\": [\n", - " {\n", - " \"InitialInstanceCount\": deploy_instance_count,\n", - " \"InstanceType\": \"{}\".format(deploy_instance_type),\n", + " \"Create Model\": {\n", " \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n", - " \"VariantName\": \"AllTraffic\"\n", - " }\n", - " ]\n", - " },\n", - " \"Deploy\": {\n", - " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n", - " \"EndpointName\": \"training-pipeline-{}\".format(execution_name)\n", - " }\n", + " \"PrimaryContainer\": {\n", + " \"Image\": 
\"763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu-py36-ubuntu18.04\".format(\n", + " region\n", + " ),\n", + " \"Environment\": {\n", + " \"SAGEMAKER_PROGRAM\": \"null\",\n", + " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"null\",\n", + " \"SAGEMAKER_ENABLE_CLOUDWATCH_METRICS\": \"false\",\n", + " \"SAGEMAKER_CONTAINER_LOG_LEVEL\": \"20\",\n", + " \"SAGEMAKER_REGION\": \"{}\".format(region),\n", + " },\n", + " \"ModelDataUrl\": \"s3://{}/training-pipeline-{}/models/estimator-training-pipeline-{}/output/model.tar.gz\".format(\n", + " bucket, execution_name, execution_name\n", + " ),\n", + " },\n", + " \"ExecutionRoleArn\": \"{}\".format(role),\n", + " },\n", + " \"Configure Endpoint\": {\n", + " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n", + " \"ProductionVariants\": [\n", + " {\n", + " \"InitialInstanceCount\": deploy_instance_count,\n", + " \"InstanceType\": \"{}\".format(deploy_instance_type),\n", + " \"ModelName\": \"training-pipeline-{}\".format(execution_name),\n", + " \"VariantName\": \"AllTraffic\",\n", + " }\n", + " ],\n", + " },\n", + " \"Deploy\": {\n", + " \"EndpointConfigName\": \"training-pipeline-{}\".format(execution_name),\n", + " \"EndpointName\": \"training-pipeline-{}\".format(execution_name),\n", + " },\n", "}" ] }, @@ -1130,10 +1086,7 @@ "outputs": [], "source": [ "# Check for exsting targets\n", - "targets = events.list_targets_by_rule(\n", - " Rule='S3-Trigger',\n", - " EventBusName='default'\n", - ")" + "targets = events.list_targets_by_rule(Rule=\"S3-Trigger\", EventBusName=\"default\")" ] }, { @@ -1142,18 +1095,13 @@ "metadata": {}, "outputs": [], "source": [ - "number_targets = len(targets['Targets'])\n", + "number_targets = len(targets[\"Targets\"])\n", "\n", "if number_targets > 0:\n", - " for target in targets['Targets']:\n", - " print(target['Id'])\n", - " events.remove_targets(\n", - " Rule='S3-Trigger',\n", - " EventBusName='default',\n", - " Ids=[target['Id']],\n", - " Force=True\n", - ")\n", - " print(\"Target: \" +target['Id']+ \" removed.\")\n", + " for target in targets[\"Targets\"]:\n", + " print(target[\"Id\"])\n", + " events.remove_targets(Rule=\"S3-Trigger\", EventBusName=\"default\", Ids=[target[\"Id\"]], Force=True)\n", + " print(\"Target: \" + target[\"Id\"] + \" removed.\")\n", "else:\n", " print(\"No targets defined yet.\")" ] @@ -1169,16 +1117,9 @@ "target_id = str(uuid.uuid4())\n", "\n", "response = events.put_targets(\n", - " Rule='S3-Trigger',\n", - " EventBusName='default',\n", - " Targets=[\n", - " {\n", - " 'Id': target_id,\n", - " 'Arn': stepfunction_arn,\n", - " 'RoleArn': iam_role_eventbridge_arn,\n", - " 'Input': inputs_json\n", - " }\n", - " ]\n", + " Rule=\"S3-Trigger\",\n", + " EventBusName=\"default\",\n", + " Targets=[{\"Id\": target_id, \"Arn\": stepfunction_arn, \"RoleArn\": iam_role_eventbridge_arn, \"Input\": inputs_json}],\n", ")" ] }, @@ -1206,7 +1147,7 @@ "source": [ "execution_list_before_uploading = sfn.list_executions(stateMachineArn=stepfunction_arn)\n", "\n", - "number_of_executions_before_uploading = len(execution_list_before_uploading['executions'])\n", + "number_of_executions_before_uploading = len(execution_list_before_uploading[\"executions\"])\n", "\n", "print(number_of_executions_before_uploading)" ] @@ -1225,6 +1166,7 @@ "outputs": [], "source": [ "import time\n", + "\n", "time.sleep(15)" ] }, @@ -1234,7 +1176,7 @@ "metadata": {}, "outputs": [], "source": [ - "watched_s3_uri = 's3://{}/watched_input/'.format(watched_bucket)\n", + "watched_s3_uri = 
\"s3://{}/watched_input/\".format(watched_bucket)\n", "\n", "print('Uploading training data to \"{}\" to trigger a new training pipeline.'.format(watched_s3_uri))" ] @@ -1299,7 +1241,7 @@ "metadata": {}, "outputs": [], "source": [ - "number_of_executions_after_uploading = len(execution_list_after_uploading['executions'])\n", + "number_of_executions_after_uploading = len(execution_list_after_uploading[\"executions\"])\n", "\n", "print(number_of_executions_after_uploading)" ] @@ -1310,9 +1252,9 @@ "metadata": {}, "outputs": [], "source": [ - "current_execution = execution_list_after_uploading['executions'][0]\n", + "current_execution = execution_list_after_uploading[\"executions\"][0]\n", "\n", - "current_execution_arn = current_execution['executionArn']\n", + "current_execution_arn = current_execution[\"executionArn\"]\n", "\n", "print(current_execution_arn)" ] @@ -1325,7 +1267,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Step Functions Pipeline'.format(region, current_execution_arn)))\n" + "display(\n", + " HTML(\n", + " 'Review Step Functions Pipeline'.format(\n", + " region, current_execution_arn\n", + " )\n", + " )\n", + ")" ] }, { @@ -1335,7 +1283,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb b/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb index c8b71577..05477e55 100644 --- a/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb +++ b/10_pipeline/tfx/01_Create_Pipeline_Train_and_Deploy_Reviews_BERT_TensorFlow_TFX.ipynb @@ -103,6 +103,7 @@ "source": [ "# Restart the kernel to pick up pip installed libraries\n", "from IPython.core.display import HTML\n", + "\n", "HTML(\"\")" ] }, @@ -133,11 +134,19 @@ "import tensorflow_transform.beam as tft_beam\n", "from tensorflow_transform.beam.tft_beam_io import transform_fn_io\n", "from tensorflow_transform.saved import saved_transform_io\n", - "from tensorflow_transform.tf_metadata import (dataset_metadata, dataset_schema,\n", - " metadata_io, schema_utils)\n", - "from tfx.components import (Evaluator, ExampleValidator, ImportExampleGen,\n", - " ModelValidator, Pusher, ResolverNode, SchemaGen,\n", - " StatisticsGen, Trainer, Transform)\n", + "from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema, metadata_io, schema_utils\n", + "from tfx.components import (\n", + " Evaluator,\n", + " ExampleValidator,\n", + " ImportExampleGen,\n", + " ModelValidator,\n", + " Pusher,\n", + " ResolverNode,\n", + " SchemaGen,\n", + " StatisticsGen,\n", + " Trainer,\n", + " Transform,\n", + ")\n", "from tfx.components.base import executor_spec\n", "from tfx.components.trainer.executor import GenericExecutor\n", "from tfx.dsl.experimental import latest_blessed_model_resolver\n", @@ -150,8 +159,7 @@ "import tensorflow_model_analysis as tfma\n", "import tensorflow_text as text\n", "\n", - "from tfx.orchestration.experimental.interactive.interactive_context import \\\n", - " InteractiveContext\n", + "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\n", "\n", "%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip" ] @@ -189,40 +197,40 @@ "source": [ "def clean_before_download(base_data_dir):\n", " rmtree(base_data_dir)\n", - " \n", 
+ "\n", + "\n", "def delete_unnecessary_files(base_path):\n", " os.remove(base_path + \"dataset_info.json\")\n", " os.remove(base_path + \"label.labels.txt\")\n", - " \n", + "\n", " counter = 2\n", " for f in glob.glob(base_path + \"imdb_reviews-unsupervised.*\"):\n", " os.remove(f)\n", " counter += 1\n", " print(f\"Deleted {counter} files\")\n", "\n", - "def get_dataset(name='imdb_reviews', version=\"1.0.0\"):\n", + "\n", + "def get_dataset(name=\"imdb_reviews\", version=\"1.0.0\"):\n", "\n", " base_data_dir = \"./content/tfds/\"\n", - " config=\"plain_text\"\n", - " version=\"1.0.0\"\n", + " config = \"plain_text\"\n", + " version = \"1.0.0\"\n", "\n", " clean_before_download(base_data_dir)\n", " tfds.disable_progress_bar()\n", - " builder = tfds.text.IMDBReviews(data_dir=base_data_dir, \n", - " config=config, \n", - " version=version)\n", - " download_config = tfds.download.DownloadConfig(\n", - " download_mode=tfds.GenerateMode.FORCE_REDOWNLOAD)\n", + " builder = tfds.text.IMDBReviews(data_dir=base_data_dir, config=config, version=version)\n", + " download_config = tfds.download.DownloadConfig(download_mode=tfds.GenerateMode.FORCE_REDOWNLOAD)\n", " builder.download_and_prepare(download_config=download_config)\n", "\n", " base_tfrecords_filename = os.path.join(base_data_dir, \"imdb_reviews\", config, version, \"\")\n", " train_tfrecords_filename = base_tfrecords_filename + \"imdb_reviews-train*\"\n", " test_tfrecords_filename = base_tfrecords_filename + \"imdb_reviews-test*\"\n", " label_filename = os.path.join(base_tfrecords_filename, \"label.labels.txt\")\n", - " labels = [label.rstrip('\\n') for label in open(label_filename)]\n", + " labels = [label.rstrip(\"\\n\") for label in open(label_filename)]\n", " delete_unnecessary_files(base_tfrecords_filename)\n", " return (train_tfrecords_filename, test_tfrecords_filename), labels\n", "\n", + "\n", "tfrecords_filenames, labels = get_dataset()" ] }, @@ -257,12 +265,11 @@ "\n", "BERT_TFHUB_URL = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\n", "\n", + "\n", "def load_bert_layer(model_url=BERT_TFHUB_URL):\n", " # Load the pre-trained BERT model as layer in Keras\n", - " bert_layer = hub.KerasLayer(\n", - " handle=model_url,\n", - " trainable=True)\n", - " return bert_layer\n" + " bert_layer = hub.KerasLayer(handle=model_url, trainable=True)\n", + " return bert_layer" ] }, { @@ -321,10 +328,13 @@ "outputs": [], "source": [ "output = example_gen_pb2.Output(\n", - " split_config=example_gen_pb2.SplitConfig(splits=[\n", - " example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=45),\n", - " example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=5)\n", - " ]))\n", + " split_config=example_gen_pb2.SplitConfig(\n", + " splits=[\n", + " example_gen_pb2.SplitConfig.Split(name=\"train\", hash_buckets=45),\n", + " example_gen_pb2.SplitConfig.Split(name=\"eval\", hash_buckets=5),\n", + " ]\n", + " )\n", + ")\n", "# Load the data from our prepared TFDS folder\n", "examples = external_input(\"./content/tfds/imdb_reviews/plain_text/1.0.0\")\n", "example_gen = ImportExampleGen(input=examples, output_config=output)\n", @@ -344,7 +354,7 @@ "source": [ "%%skip_for_export\n", "\n", - "for artifact in example_gen.outputs['examples'].get():\n", + "for artifact in example_gen.outputs[\"examples\"].get():\n", " print(artifact.uri)" ] }, @@ -370,11 +380,10 @@ "source": [ "%%skip_for_export\n", "\n", - "statistics_gen = StatisticsGen(\n", - " examples=example_gen.outputs['examples'])\n", + "statistics_gen = 
StatisticsGen(examples=example_gen.outputs[\"examples\"])\n", "context.run(statistics_gen)\n", "\n", - "context.show(statistics_gen.outputs['statistics'])" + "context.show(statistics_gen.outputs[\"statistics\"])" ] }, { @@ -389,12 +398,10 @@ "source": [ "%%skip_for_export\n", "\n", - "schema_gen = SchemaGen(\n", - " statistics=statistics_gen.outputs['statistics'],\n", - " infer_feature_shape=True)\n", + "schema_gen = SchemaGen(statistics=statistics_gen.outputs[\"statistics\"], infer_feature_shape=True)\n", "context.run(schema_gen)\n", "\n", - "context.show(schema_gen.outputs['schema'])" + "context.show(schema_gen.outputs[\"schema\"])" ] }, { @@ -410,7 +417,7 @@ "%%skip_for_export\n", "\n", "# check the data schema for the type of input tensors\n", - "tfdv.load_schema_text(schema_gen.outputs['schema'].get()[0].uri + \"/schema.pbtxt\")" + "tfdv.load_schema_text(schema_gen.outputs[\"schema\"].get()[0].uri + \"/schema.pbtxt\")" ] }, { @@ -426,11 +433,11 @@ "%%skip_for_export\n", "\n", "example_validator = ExampleValidator(\n", - " statistics=statistics_gen.outputs['statistics'],\n", - " schema=schema_gen.outputs['schema'])\n", + " statistics=statistics_gen.outputs[\"statistics\"], schema=schema_gen.outputs[\"schema\"]\n", + ")\n", "context.run(example_validator)\n", "\n", - "context.show(example_validator.outputs['anomalies'])" + "context.show(example_validator.outputs[\"anomalies\"])" ] }, { @@ -466,11 +473,12 @@ "MAX_SEQ_LEN = 64 # max number is 512\n", "do_lower_case = load_bert_layer().resolved_object.do_lower_case.numpy()\n", "\n", + "\n", "def preprocessing_fn(inputs):\n", " \"\"\"Preprocess input column of text into transformed columns of.\n", - " * input token ids\n", - " * input mask\n", - " * input type ids\n", + " * input token ids\n", + " * input mask\n", + " * input type ids\n", " \"\"\"\n", "\n", " CLS_ID = tf.constant(101, dtype=tf.int64)\n", @@ -478,11 +486,11 @@ " PAD_ID = tf.constant(0, dtype=tf.int64)\n", "\n", " vocab_file_path = load_bert_layer().resolved_object.vocab_file.asset_path\n", - " \n", - " bert_tokenizer = text.BertTokenizer(vocab_lookup_table=vocab_file_path, \n", - " token_out_type=tf.int64, \n", - " lower_case=do_lower_case) \n", - " \n", + "\n", + " bert_tokenizer = text.BertTokenizer(\n", + " vocab_lookup_table=vocab_file_path, token_out_type=tf.int64, lower_case=do_lower_case\n", + " )\n", + "\n", " def tokenize_text(text, sequence_length=MAX_SEQ_LEN):\n", " \"\"\"\n", " Perform the BERT preprocessing from text -> input token ids\n", @@ -490,14 +498,14 @@ "\n", " # convert text into token ids\n", " tokens = bert_tokenizer.tokenize(text)\n", - " \n", - " # flatten the output ragged tensors \n", + "\n", + " # flatten the output ragged tensors\n", " tokens = tokens.merge_dims(1, 2)[:, :sequence_length]\n", - " \n", + "\n", " # Add start and end token ids to the id sequence\n", " start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)\n", " end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)\n", - " tokens = tokens[:, :sequence_length - 2]\n", + " tokens = tokens[:, : sequence_length - 2]\n", " tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)\n", "\n", " # truncate sequences greater than MAX_SEQ_LEN\n", @@ -508,8 +516,8 @@ " pad = sequence_length - tf.shape(tokens)[1]\n", " tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=PAD_ID)\n", "\n", - " # and finally reshape the word token ids to fit the output \n", - " # data structure of TFT \n", + " # and finally reshape the word token ids to fit the output\n", + " # data structure of TFT\n", 
" return tf.reshape(tokens, [-1, sequence_length])\n", "\n", " def preprocess_bert_input(text):\n", @@ -519,25 +527,20 @@ " input_word_ids = tokenize_text(text)\n", " input_mask = tf.cast(input_word_ids > 0, tf.int64)\n", " input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])\n", - " \n", + "\n", " zeros_dims = tf.stack(tf.shape(input_mask))\n", " input_type_ids = tf.fill(zeros_dims, 0)\n", " input_type_ids = tf.cast(input_type_ids, tf.int64)\n", "\n", - " return (\n", - " input_word_ids, \n", - " input_mask,\n", - " input_type_ids\n", - " )\n", + " return (input_word_ids, input_mask, input_type_ids)\n", "\n", - " input_word_ids, input_mask, input_type_ids = \\\n", - " preprocess_bert_input(tf.squeeze(inputs['text'], axis=1))\n", + " input_word_ids, input_mask, input_type_ids = preprocess_bert_input(tf.squeeze(inputs[\"text\"], axis=1))\n", "\n", " return {\n", - " 'input_word_ids': input_word_ids,\n", - " 'input_mask': input_mask,\n", - " 'input_type_ids': input_type_ids,\n", - " 'label': inputs['label']\n", + " \"input_word_ids\": input_word_ids,\n", + " \"input_mask\": input_mask,\n", + " \"input_type_ids\": input_type_ids,\n", + " \"label\": inputs[\"label\"],\n", " }" ] }, @@ -552,9 +555,10 @@ "outputs": [], "source": [ "transform = Transform(\n", - " examples=example_gen.outputs['examples'],\n", - " schema=schema_gen.outputs['schema'],\n", - " module_file=os.path.abspath(\"transform.py\"))\n", + " examples=example_gen.outputs[\"examples\"],\n", + " schema=schema_gen.outputs[\"schema\"],\n", + " module_file=os.path.abspath(\"transform.py\"),\n", + ")\n", "context.run(transform)" ] }, @@ -583,7 +587,7 @@ "pp = pprint.PrettyPrinter()\n", "\n", "# Get the URI of the output artifact representing the transformed examples, which is a directory\n", - "train_uri = transform.outputs['transformed_examples'].get()[0].uri\n", + "train_uri = transform.outputs[\"transformed_examples\"].get()[0].uri\n", "\n", "print(train_uri)\n", "\n", @@ -642,30 +646,29 @@ "from tfx.components.trainer.executor import TrainerFnArgs\n", "\n", "\n", - "_LABEL_KEY = 'label'\n", + "_LABEL_KEY = \"label\"\n", "BERT_TFHUB_URL = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\n", "\n", "\n", "def _gzip_reader_fn(filenames):\n", " \"\"\"Small utility returning a record reader that can read gzip'ed files.\"\"\"\n", - " return tf.data.TFRecordDataset(filenames, compression_type='GZIP')\n", + " return tf.data.TFRecordDataset(filenames, compression_type=\"GZIP\")\n", + "\n", "\n", "def load_bert_layer(model_url=BERT_TFHUB_URL):\n", " # Load the pre-trained BERT model as layer in Keras\n", - " bert_layer = hub.KerasLayer(\n", - " handle=model_url,\n", - " trainable=False) # model can be fine-tuned \n", + " bert_layer = hub.KerasLayer(handle=model_url, trainable=False) # model can be fine-tuned\n", " return bert_layer\n", "\n", + "\n", "def get_model(tf_transform_output, max_seq_length=64, num_labels=2):\n", "\n", " # dynamically create inputs for all outputs of our transform graph\n", - " feature_spec = tf_transform_output.transformed_feature_spec() \n", + " feature_spec = tf_transform_output.transformed_feature_spec()\n", " feature_spec.pop(_LABEL_KEY)\n", "\n", " inputs = {\n", - " key: tf.keras.layers.Input(shape=(max_seq_length), name=key, dtype=tf.int64)\n", - " for key in feature_spec.keys()\n", + " key: tf.keras.layers.Input(shape=(max_seq_length), name=key, dtype=tf.int64) for key in feature_spec.keys()\n", " }\n", "\n", " input_word_ids = tf.cast(inputs[\"input_word_ids\"], dtype=tf.int32)\n", @@ 
-673,28 +676,19 @@ " input_type_ids = tf.cast(inputs[\"input_type_ids\"], dtype=tf.int32)\n", "\n", " bert_layer = load_bert_layer()\n", - " pooled_output, _ = bert_layer(\n", - " [input_word_ids, \n", - " input_mask, \n", - " input_type_ids\n", - " ]\n", - " )\n", - " \n", + " pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])\n", + "\n", " # Add additional layers depending on your problem\n", - " x = tf.keras.layers.Dense(256, activation='relu')(pooled_output)\n", - " dense = tf.keras.layers.Dense(64, activation='relu')(x)\n", - " pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)\n", + " x = tf.keras.layers.Dense(256, activation=\"relu\")(pooled_output)\n", + " dense = tf.keras.layers.Dense(64, activation=\"relu\")(x)\n", + " pred = tf.keras.layers.Dense(1, activation=\"sigmoid\")(dense)\n", "\n", " keras_model = tf.keras.Model(\n", - " inputs=[\n", - " inputs['input_word_ids'], \n", - " inputs['input_mask'], \n", - " inputs['input_type_ids']], \n", - " outputs=pred)\n", - " keras_model.compile(loss='binary_crossentropy', \n", - " optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), \n", - " metrics=['accuracy']\n", - " )\n", + " inputs=[inputs[\"input_word_ids\"], inputs[\"input_mask\"], inputs[\"input_type_ids\"]], outputs=pred\n", + " )\n", + " keras_model.compile(\n", + " loss=\"binary_crossentropy\", optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=[\"accuracy\"]\n", + " )\n", " return keras_model\n", "\n", "\n", @@ -713,13 +707,12 @@ " transformed_features = model.tft_layer(parsed_features)\n", "\n", " outputs = model(transformed_features)\n", - " return {'outputs': outputs}\n", + " return {\"outputs\": outputs}\n", "\n", " return serve_tf_examples_fn\n", "\n", - "def _input_fn(file_pattern: Text,\n", - " tf_transform_output: tft.TFTransformOutput,\n", - " batch_size: int = 32) -> tf.data.Dataset:\n", + "\n", + "def _input_fn(file_pattern: Text, tf_transform_output: tft.TFTransformOutput, batch_size: int = 32) -> tf.data.Dataset:\n", " \"\"\"Generates features and label for tuning/training.\n", "\n", " Args:\n", @@ -732,18 +725,19 @@ " A dataset that contains (features, indices) tuple where features is a\n", " dictionary of Tensors, and indices is a single Tensor of label indices.\n", " \"\"\"\n", - " transformed_feature_spec = (\n", - " tf_transform_output.transformed_feature_spec().copy())\n", + " transformed_feature_spec = tf_transform_output.transformed_feature_spec().copy()\n", "\n", " dataset = tf.data.experimental.make_batched_features_dataset(\n", " file_pattern=file_pattern,\n", " batch_size=batch_size,\n", " features=transformed_feature_spec,\n", " reader=_gzip_reader_fn,\n", - " label_key=_LABEL_KEY)\n", + " label_key=_LABEL_KEY,\n", + " )\n", "\n", " return dataset\n", "\n", + "\n", "# TFX Trainer will call this function.\n", "def run_fn(fn_args: TrainerFnArgs):\n", " \"\"\"Train the model based on given args.\n", @@ -764,18 +758,15 @@ " train_dataset,\n", " steps_per_epoch=fn_args.train_steps,\n", " validation_data=eval_dataset,\n", - " validation_steps=fn_args.eval_steps)\n", + " validation_steps=fn_args.eval_steps,\n", + " )\n", "\n", " signatures = {\n", - " 'serving_default':\n", - " _get_serve_tf_examples_fn(model,\n", - " tf_transform_output).get_concrete_function(\n", - " tf.TensorSpec(\n", - " shape=[None],\n", - " dtype=tf.string,\n", - " name='examples')),\n", + " \"serving_default\": _get_serve_tf_examples_fn(model, tf_transform_output).get_concrete_function(\n", + " tf.TensorSpec(shape=[None], 
dtype=tf.string, name=\"examples\")\n", + " ),\n", " }\n", - " model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)\n" + " model.save(fn_args.serving_model_dir, save_format=\"tf\", signatures=signatures)" ] }, { @@ -795,11 +786,12 @@ "trainer = Trainer(\n", " module_file=os.path.abspath(\"trainer.py\"),\n", " custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),\n", - " examples=transform.outputs['transformed_examples'],\n", - " transform_graph=transform.outputs['transform_graph'],\n", - " schema=schema_gen.outputs['schema'],\n", + " examples=transform.outputs[\"transformed_examples\"],\n", + " transform_graph=transform.outputs[\"transform_graph\"],\n", + " schema=schema_gen.outputs[\"schema\"],\n", " train_args=trainer_pb2.TrainArgs(num_steps=TRAINING_STEPS),\n", - " eval_args=trainer_pb2.EvalArgs(num_steps=EVALUATION_STEPS))\n", + " eval_args=trainer_pb2.EvalArgs(num_steps=EVALUATION_STEPS),\n", + ")\n", "context.run(trainer)" ] }, @@ -814,10 +806,11 @@ "outputs": [], "source": [ "model_resolver = ResolverNode(\n", - " instance_name='latest_blessed_model_resolver',\n", + " instance_name=\"latest_blessed_model_resolver\",\n", " resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,\n", " model=Channel(type=Model),\n", - " model_blessing=Channel(type=ModelBlessing))\n", + " model_blessing=Channel(type=ModelBlessing),\n", + ")\n", "\n", "context.run(model_resolver)" ] @@ -843,34 +836,31 @@ "outputs": [], "source": [ "eval_config = tfma.EvalConfig(\n", - " model_specs=[\n", - " tfma.ModelSpec(label_key='label')\n", - " ],\n", + " model_specs=[tfma.ModelSpec(label_key=\"label\")],\n", " metrics_specs=[\n", " tfma.MetricsSpec(\n", - " metrics=[\n", - " tfma.MetricConfig(class_name='ExampleCount')\n", - " ],\n", - " thresholds = {\n", - " 'binary_accuracy': tfma.MetricThreshold(\n", - " value_threshold=tfma.GenericValueThreshold(\n", - " lower_bound={'value': 0.5}),\n", + " metrics=[tfma.MetricConfig(class_name=\"ExampleCount\")],\n", + " thresholds={\n", + " \"binary_accuracy\": tfma.MetricThreshold(\n", + " value_threshold=tfma.GenericValueThreshold(lower_bound={\"value\": 0.5}),\n", " change_threshold=tfma.GenericChangeThreshold(\n", - " direction=tfma.MetricDirection.HIGHER_IS_BETTER,\n", - " absolute={'value': -1e-10}))\n", - " }\n", + " direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={\"value\": -1e-10}\n", + " ),\n", + " )\n", + " },\n", " )\n", " ],\n", " slicing_specs=[\n", " # An empty slice spec means the overall slice, i.e. 
the whole dataset.\n", " tfma.SlicingSpec(),\n", - " ])\n", + " ],\n", + ")\n", "\n", "evaluator = Evaluator(\n", - " examples=example_gen.outputs['examples'],\n", - " model=trainer.outputs['model'],\n", - " baseline_model=model_resolver.outputs['model'],\n", - " eval_config=eval_config\n", + " examples=example_gen.outputs[\"examples\"],\n", + " model=trainer.outputs[\"model\"],\n", + " baseline_model=model_resolver.outputs[\"model\"],\n", + " eval_config=eval_config,\n", ")\n", "\n", "context.run(evaluator)" @@ -915,11 +905,12 @@ "serving_model_dir = \"./content/serving_model_dir\"\n", "\n", "pusher = Pusher(\n", - " model=trainer.outputs['model'],\n", - " model_blessing=evaluator.outputs['blessing'],\n", + " model=trainer.outputs[\"model\"],\n", + " model_blessing=evaluator.outputs[\"blessing\"],\n", " push_destination=pusher_pb2.PushDestination(\n", - " filesystem=pusher_pb2.PushDestination.Filesystem(\n", - " base_directory=serving_model_dir)))\n", + " filesystem=pusher_pb2.PushDestination.Filesystem(base_directory=serving_model_dir)\n", + " ),\n", + ")\n", "\n", "context.run(pusher)" ] @@ -947,14 +938,14 @@ "def _bytes_feature(value):\n", " return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n", "\n", + "\n", "push_uri = pusher.outputs.model_push.get()[0].uri\n", "latest_version = max(os.listdir(push_uri))\n", "latest_version_path = os.path.join(push_uri, latest_version)\n", "loaded_model = tf.saved_model.load(latest_version_path)\n", "\n", "example_str = b\"This is the finest show ever produced for TV. Each episode is a triumph. The casting, the writing, the timing are all second to none. This cast performs miracles.\"\n", - "example = tf.train.Example(features=tf.train.Features(feature={\n", - " 'text': _bytes_feature(example_str)}))\n", + "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n", "\n", "serialized_example = example.SerializeToString()\n", "f = loaded_model.signatures[\"serving_default\"]\n", @@ -974,14 +965,14 @@ "def _bytes_feature(value):\n", " return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n", "\n", + "\n", "push_uri = pusher.outputs.model_push.get()[0].uri\n", "latest_version = max(os.listdir(push_uri))\n", "latest_version_path = os.path.join(push_uri, latest_version)\n", "loaded_model = tf.saved_model.load(latest_version_path)\n", "\n", "example_str = b\"I loved it!\"\n", - "example = tf.train.Example(features=tf.train.Features(feature={\n", - " 'text': _bytes_feature(example_str)}))\n", + "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n", "\n", "serialized_example = example.SerializeToString()\n", "f = loaded_model.signatures[\"serving_default\"]\n", @@ -1001,14 +992,14 @@ "def _bytes_feature(value):\n", " return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n", "\n", + "\n", "push_uri = pusher.outputs.model_push.get()[0].uri\n", "latest_version = max(os.listdir(push_uri))\n", "latest_version_path = os.path.join(push_uri, latest_version)\n", "loaded_model = tf.saved_model.load(latest_version_path)\n", "\n", "example_str = b\"It's OK.\"\n", - "example = tf.train.Example(features=tf.train.Features(feature={\n", - " 'text': _bytes_feature(example_str)}))\n", + "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n", "\n", "serialized_example = example.SerializeToString()\n", "f = loaded_model.signatures[\"serving_default\"]\n", @@ -1028,14 +1019,14 
@@ "def _bytes_feature(value):\n", " return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))\n", "\n", + "\n", "push_uri = pusher.outputs.model_push.get()[0].uri\n", "latest_version = max(os.listdir(push_uri))\n", "latest_version_path = os.path.join(push_uri, latest_version)\n", "loaded_model = tf.saved_model.load(latest_version_path)\n", "\n", "example_str = b\"The worst product ever.\"\n", - "example = tf.train.Example(features=tf.train.Features(feature={\n", - " 'text': _bytes_feature(example_str)}))\n", + "example = tf.train.Example(features=tf.train.Features(feature={\"text\": _bytes_feature(example_str)}))\n", "\n", "serialized_example = example.SerializeToString()\n", "f = loaded_model.signatures[\"serving_default\"]\n", @@ -1048,7 +1039,7 @@ "metadata": {}, "outputs": [], "source": [ - "print('Model has been exported to {}'.format(pusher.outputs.model_push.get()[0].uri))" + "print(\"Model has been exported to {}\".format(pusher.outputs.model_push.get()[0].uri))" ] }, { @@ -1057,7 +1048,7 @@ "metadata": {}, "outputs": [], "source": [ - "for path in os.walk('{}/'.format(pusher.outputs.model_push.get()[0].uri)):\n", + "for path in os.walk(\"{}/\".format(pusher.outputs.model_push.get()[0].uri)):\n", " print(path[0])" ] }, @@ -1068,7 +1059,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] }, diff --git a/11_stream/01_Setup_IAM.ipynb b/11_stream/01_Setup_IAM.ipynb index 5a8d0b2d..240819d9 100644 --- a/11_stream/01_Setup_IAM.ipynb +++ b/11_stream/01_Setup_IAM.ipynb @@ -17,13 +17,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_kinesis_role_name = 'DSOAWS_Kinesis'" + "iam_kinesis_role_name = \"DSOAWS_Kinesis\"" ] }, { @@ -58,31 +58,13 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesis.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"firehose.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesisanalytics.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " } \n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesis.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"firehose.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " ],\n", + "}" ] }, { @@ -100,18 +82,18 @@ " 
iam_role_kinesis = iam.create_role(\n", " RoleName=iam_kinesis_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Kinesis Role'\n", + " Description=\"DSOAWS Kinesis Role\",\n", " )\n", - " print('Role succesfully created.')\n", + " print(\"Role successfully created.\")\n", " iam_kinesis_role_passed = True\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role_kinesis = iam.get_role(RoleName=iam_kinesis_role_name)\n", - " print('Role already exists. That is OK.')\n", + " print(\"Role already exists. That is OK.\")\n", " iam_kinesis_role_passed = True\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " \n", + " print(\"Unexpected error: %s\" % e)\n", + "\n", "time.sleep(30)" ] }, @@ -121,8 +103,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_kinesis_name = iam_role_kinesis['Role']['RoleName']\n", - "print('Role Name: {}'.format(iam_role_kinesis_name))" + "iam_role_kinesis_name = iam_role_kinesis[\"Role\"][\"RoleName\"]\n", + "print(\"Role Name: {}\".format(iam_role_kinesis_name))" ] }, { @@ -131,8 +113,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_kinesis_arn = iam_role_kinesis['Role']['Arn']\n", - "print('Role ARN: {}'.format(iam_role_kinesis_arn))" + "iam_role_kinesis_arn = iam_role_kinesis[\"Role\"][\"Arn\"]\n", + "print(\"Role ARN: {}\".format(iam_role_kinesis_arn))" ] }, { @@ -141,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "account_id = sts.get_caller_identity()['Account']" + "account_id = sts.get_caller_identity()[\"Account\"]" ] }, { @@ -157,7 +139,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream_name = 'dsoaws-kinesis-data-stream'" + "stream_name = \"dsoaws-kinesis-data-stream\"" ] }, { @@ -173,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehose_name = 'dsoaws-kinesis-data-firehose'" + "firehose_name = \"dsoaws-kinesis-data-firehose\"" ] }, { @@ -189,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "lambda_fn_name_cloudwatch = 'DeliverKinesisAnalyticsToCloudWatch'" + "lambda_fn_name_cloudwatch = \"DeliverKinesisAnalyticsToCloudWatch\"" ] }, { @@ -198,7 +180,7 @@ "metadata": {}, "outputs": [], "source": [ - "lambda_fn_name_invoke_sm_endpoint = 'InvokeSageMakerEndpointFromKinesis'" + "lambda_fn_name_invoke_sm_endpoint = \"InvokeSageMakerEndpointFromKinesis\"" ] }, { @@ -207,7 +189,7 @@ "metadata": {}, "outputs": [], "source": [ - "lambda_fn_name_sns = 'PushNotificationToSNS'" + "lambda_fn_name_sns = \"PushNotificationToSNS\"" ] }, { @@ -226,79 +208,54 @@ "outputs": [], "source": [ "kinesis_policy_doc = {\n", - " \n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " { \n", - " \"Effect\": \"Allow\", \n", + " {\n", + " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:AbortMultipartUpload\",\n", " \"s3:GetBucketLocation\",\n", " \"s3:GetObject\",\n", " \"s3:ListBucket\",\n", " \"s3:ListBucketMultipartUploads\",\n", - " \"s3:PutObject\"\n", - " ], \n", - " \"Resource\": [ \n", - " \"arn:aws:s3:::{}\".format(bucket),\n", - " \"arn:aws:s3:::{}/*\".format(bucket)\n", - " ] \n", + " \"s3:PutObject\",\n", + " ],\n", + " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket), \"arn:aws:s3:::{}/*\".format(bucket)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"logs:PutLogEvents\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)\n", - " 
]\n", + " \"Action\": [\"logs:PutLogEvents\"],\n", + " \"Resource\": [\"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"kinesis:*\",\n", " ],\n", - " \"Resource\": [\n", - " \"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)\n", - " ]\n", + " \"Resource\": [\"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"firehose:*\",\n", " ],\n", - " \"Resource\": [\n", - " \"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)\n", - " ]\n", + " \"Resource\": [\"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"kinesisanalytics:*\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ]\n", + " \"Resource\": [\"*\"],\n", " },\n", " {\n", " \"Sid\": \"UseLambdaFunction\",\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"lambda:InvokeFunction\",\n", - " \"lambda:GetFunctionConfiguration\"\n", - " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ] \n", + " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n", + " \"Resource\": [\"*\"],\n", " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": \"iam:PassRole\",\n", - " \"Resource\": [\n", - " \"arn:aws:iam::*:role/service-role/kinesis*\"\n", - " ] \n", - " }\n", - " ]\n", + " {\"Effect\": \"Allow\", \"Action\": \"iam:PassRole\", \"Resource\": [\"arn:aws:iam::*:role/service-role/kinesis*\"]},\n", + " ],\n", "}\n", "\n", "print(json.dumps(kinesis_policy_doc, indent=4, sort_keys=True, default=str))" @@ -320,9 +277,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=iam_role_kinesis_name,\n", - " PolicyName='DSOAWS_KinesisPolicy',\n", - " PolicyDocument=json.dumps(kinesis_policy_doc)\n", + " RoleName=iam_role_kinesis_name, PolicyName=\"DSOAWS_KinesisPolicy\", PolicyDocument=json.dumps(kinesis_policy_doc)\n", ")\n", "\n", "time.sleep(30)" @@ -350,7 +305,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_lambda_role_name = 'DSOAWS_Lambda'" + "iam_lambda_role_name = \"DSOAWS_Lambda\"" ] }, { @@ -371,21 +326,9 @@ "assume_role_policy_doc = {\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"lambda.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesisanalytics.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"lambda.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " ],\n", "}" ] }, @@ -403,18 +346,18 @@ " iam_role_lambda = iam.create_role(\n", " RoleName=iam_lambda_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Lambda Role'\n", + " Description=\"DSOAWS Lambda Role\",\n", " )\n", - " print('Role succesfully created.')\n", + " print(\"Role succesfully created.\")\n", " iam_lambda_role_passed = True\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if 
e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role_lambda = iam.get_role(RoleName=iam_lambda_role_name)\n", - " print('Role already exists. This is OK.')\n", + " print(\"Role already exists. This is OK.\")\n", " iam_lambda_role_passed = True\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " \n", + " print(\"Unexpected error: %s\" % e)\n", + "\n", "time.sleep(30)" ] }, @@ -424,8 +367,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_lambda_name = iam_role_lambda['Role']['RoleName']\n", - "print('Role Name: {}'.format(iam_role_lambda_name))" + "iam_role_lambda_name = iam_role_lambda[\"Role\"][\"RoleName\"]\n", + "print(\"Role Name: {}\".format(iam_role_lambda_name))" ] }, { @@ -434,8 +377,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_lambda_arn = iam_role_lambda['Role']['Arn']\n", - "print('Role ARN: {}'.format(iam_role_lambda_arn))" + "iam_role_lambda_arn = iam_role_lambda[\"Role\"][\"Arn\"]\n", + "print(\"Role ARN: {}\".format(iam_role_lambda_arn))" ] }, { @@ -457,41 +400,23 @@ " {\n", " \"Sid\": \"UseLambdaFunction\",\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"lambda:InvokeFunction\",\n", - " \"lambda:GetFunctionConfiguration\"\n", - " ],\n", - " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id)\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": \"cloudwatch:*\",\n", - " \"Resource\": \"*\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": \"sns:*\",\n", - " \"Resource\": \"*\"\n", + " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n", + " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id),\n", " },\n", + " {\"Effect\": \"Allow\", \"Action\": \"cloudwatch:*\", \"Resource\": \"*\"},\n", + " {\"Effect\": \"Allow\", \"Action\": \"sns:*\", \"Resource\": \"*\"},\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogGroup\",\n", - " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id)\n", + " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id),\n", " },\n", + " {\"Effect\": \"Allow\", \"Action\": \"sagemaker:InvokeEndpoint\", \"Resource\": \"*\"},\n", " {\n", " \"Effect\": \"Allow\",\n", - " \"Action\": \"sagemaker:InvokeEndpoint\",\n", - " \"Resource\": \"*\"\n", - " }, \n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"logs:CreateLogStream\",\n", - " \"logs:PutLogEvents\"\n", - " ],\n", - " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id)\n", - " }\n", - " ]\n", + " \"Action\": [\"logs:CreateLogStream\", \"logs:PutLogEvents\"],\n", + " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id),\n", + " },\n", + " ],\n", "}" ] }, @@ -513,9 +438,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=iam_role_lambda_name,\n", - " PolicyName='DSOAWS_LambdaPolicy',\n", - " PolicyDocument=json.dumps(lambda_policy_doc)\n", + " RoleName=iam_role_lambda_name, PolicyName=\"DSOAWS_LambdaPolicy\", PolicyDocument=json.dumps(lambda_policy_doc)\n", ")\n", "\n", "time.sleep(30)" diff --git a/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb b/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb index 5e83862d..2f0c944b 100644 --- a/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb +++ b/11_stream/02_Create_Lambda_To_Invoke_SageMaker.ipynb @@ -35,14 +35,14 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess 
= sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n", - "lam = boto3.Session().client(service_name='lambda', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n", + "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)" ] }, { @@ -70,9 +70,9 @@ "try:\n", " iam_lambda_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -102,9 +102,9 @@ "try:\n", " iam_lambda_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -123,11 +123,11 @@ "outputs": [], "source": [ "if not iam_lambda_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -148,9 +148,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -175,7 +175,7 @@ "metadata": {}, "outputs": [], "source": [ - "lambda_fn_name_invoke_ep='InvokeSageMakerEndpointFromKinesis'" + "lambda_fn_name_invoke_ep = \"InvokeSageMakerEndpointFromKinesis\"" ] }, { @@ -222,11 +222,11 @@ "source": [ "try:\n", " pytorch_endpoint_name\n", - " print('[OK]')\n", + " print(\"[OK]\")\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -247,13 +247,13 @@ "outputs": [], "source": [ "try:\n", - " waiter = sm.get_waiter('endpoint_in_service')\n", + " waiter = sm.get_waiter(\"endpoint_in_service\")\n", " 
waiter.wait(EndpointName=pytorch_endpoint_name)\n", "except:\n", - " print('###################')\n", - " print('The endpoint is not running.')\n", - " print('Please re-run the model deployment section to deploy the endpoint.')\n", - " print('###################') " + " print(\"###################\")\n", + " print(\"The endpoint is not running.\")\n", + " print(\"Please re-run the model deployment section to deploy the endpoint.\")\n", + " print(\"###################\")" ] }, { @@ -265,30 +265,27 @@ "import json\n", "import boto3\n", "\n", - "runtime = boto3.client('runtime.sagemaker')\n", + "runtime = boto3.client(\"runtime.sagemaker\")\n", "\n", - "inputs = [\n", - " {\"features\": [\"This is great!\"]},\n", - " {\"features\": [\"This is bad.\"]}\n", - "] \n", + "inputs = [{\"features\": [\"This is great!\"]}, {\"features\": [\"This is bad.\"]}]\n", "\n", "response = runtime.invoke_endpoint(\n", - " EndpointName=pytorch_endpoint_name,\n", - " ContentType='application/jsonlines', \n", - " Accept='application/jsonlines', \n", - " Body=json.dumps(inputs).encode('utf-8')\n", + " EndpointName=pytorch_endpoint_name,\n", + " ContentType=\"application/jsonlines\",\n", + " Accept=\"application/jsonlines\",\n", + " Body=json.dumps(inputs).encode(\"utf-8\"),\n", ")\n", - "print('response: {}'.format(response))\n", + "print(\"response: {}\".format(response))\n", "\n", - "predicted_classes_str = response['Body'].read().decode('utf-8')\n", + "predicted_classes_str = response[\"Body\"].read().decode(\"utf-8\")\n", "predicted_classes_json = json.loads(predicted_classes_str)\n", "\n", "predicted_classes = predicted_classes_json.splitlines()\n", - "print('predicted_classes: {}'.format(predicted_classes))\n", + "print(\"predicted_classes: {}\".format(predicted_classes))\n", "\n", "for predicted_class_json, input_data in zip(predicted_classes, inputs):\n", - " predicted_class = json.loads(predicted_class_json)['predicted_label']\n", - " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0])) " + " predicted_class = json.loads(predicted_class_json)[\"predicted_label\"]\n", + " print('Predicted star_rating: {} for review_body \"{}\"'.format(predicted_class, input_data[\"features\"][0]))" ] }, { @@ -313,7 +310,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('src/InvokeSageMakerEndpointFromKinesis.zip', 'rb') as f: \n", + "with open(\"src/InvokeSageMakerEndpointFromKinesis.zip\", \"rb\") as f:\n", " code = f.read()" ] }, @@ -332,33 +329,28 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = lam.create_function(\n", - " FunctionName='{}'.format(lambda_fn_name_invoke_ep),\n", - " Runtime='python3.7',\n", - " Role='{}'.format(iam_role_lambda_arn),\n", - " Handler='src/invoke_sm_endpoint_from_kinesis.lambda_handler',\n", - " Code={\n", - " 'ZipFile': code\n", - " },\n", - " Description='Query SageMaker Endpoint for star rating prediction on review input text.',\n", + " FunctionName=\"{}\".format(lambda_fn_name_invoke_ep),\n", + " Runtime=\"python3.7\",\n", + " Role=\"{}\".format(iam_role_lambda_arn),\n", + " Handler=\"src/invoke_sm_endpoint_from_kinesis.lambda_handler\",\n", + " Code={\"ZipFile\": code},\n", + " Description=\"Query SageMaker Endpoint for star rating prediction on review input text.\",\n", " # max timeout supported by Firehose is 5min\n", " Timeout=300,\n", " MemorySize=128,\n", - " Publish=True\n", + " Publish=True,\n", " )\n", - " print('Lambda Function {} successfully 
created.'.format(lambda_fn_name_invoke_ep))\n", + " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_invoke_ep))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceConflictException':\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n", " response = lam.update_function_code(\n", - " FunctionName='{}'.format(lambda_fn_name_invoke_ep),\n", - " ZipFile=code,\n", - " Publish=True,\n", - " DryRun=False\n", - " ) \n", - " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_invoke_ep)) \n", + " FunctionName=\"{}\".format(lambda_fn_name_invoke_ep), ZipFile=code, Publish=True, DryRun=False\n", + " )\n", + " print(\"Updating existing Lambda Function {}. This is OK.\".format(lambda_fn_name_invoke_ep))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -369,7 +361,7 @@ "source": [ "response = lam.get_function(FunctionName=lambda_fn_name_invoke_ep)\n", "\n", - "lambda_fn_arn_invoke_ep = response['Configuration']['FunctionArn']\n", + "lambda_fn_arn_invoke_ep = response[\"Configuration\"][\"FunctionArn\"]\n", "print(lambda_fn_arn_invoke_ep)" ] }, @@ -396,8 +388,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_invoke_ep)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Function'.format(\n", + " region, lambda_fn_name_invoke_ep\n", + " )\n", + " )\n", + ")" ] }, { @@ -414,13 +412,8 @@ "outputs": [], "source": [ "response = lam.update_function_configuration(\n", - " FunctionName=lambda_fn_name_invoke_ep,\n", - " Environment={\n", - " 'Variables': {\n", - " 'ENDPOINT_NAME': pytorch_endpoint_name\n", - " }\n", - " }\n", - " )" + " FunctionName=lambda_fn_name_invoke_ep, Environment={\"Variables\": {\"ENDPOINT_NAME\": pytorch_endpoint_name}}\n", + ")" ] }, { diff --git a/11_stream/03_Create_Kinesis_Data_Firehose.ipynb b/11_stream/03_Create_Kinesis_Data_Firehose.ipynb index 8491f8bc..49db44c0 100644 --- a/11_stream/03_Create_Kinesis_Data_Firehose.ipynb +++ b/11_stream/03_Create_Kinesis_Data_Firehose.ipynb @@ -27,13 +27,13 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)" ] }, { @@ -54,9 +54,9 @@ "try:\n", " firehose_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -93,9 +93,9 @@ "try:\n", " iam_kinesis_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " 
print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -125,9 +125,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -157,9 +157,9 @@ "try:\n", " iam_kinesis_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -178,11 +178,11 @@ "outputs": [], "source": [ "if not iam_kinesis_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -210,9 +210,9 @@ "try:\n", " lambda_fn_arn_invoke_ep\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -239,69 +239,59 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = firehose.create_delivery_stream(\n", " DeliveryStreamName=firehose_name,\n", - " DeliveryStreamType='DirectPut',\n", + " DeliveryStreamType=\"DirectPut\",\n", " ExtendedS3DestinationConfiguration={\n", - " 'RoleARN': iam_role_kinesis_arn,\n", - " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n", - " 'Prefix': 'kinesis-data-firehose/', \n", - " 'ErrorOutputPrefix': 'kinesis-data-firehose-error/',\n", - " 'BufferingHints': {\n", - " 'SizeInMBs': 1,\n", - " 'IntervalInSeconds': 60\n", + " \"RoleARN\": iam_role_kinesis_arn,\n", + " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n", + " \"Prefix\": \"kinesis-data-firehose/\",\n", + " \"ErrorOutputPrefix\": \"kinesis-data-firehose-error/\",\n", + " \"BufferingHints\": {\"SizeInMBs\": 1, \"IntervalInSeconds\": 60},\n", + " \"CompressionFormat\": \"UNCOMPRESSED\",\n", + " \"CloudWatchLoggingOptions\": {\n", + " \"Enabled\": True,\n", + " \"LogGroupName\": \"/aws/kinesisfirehose/dsoaws-kinesis-data-firehose\",\n", + " \"LogStreamName\": \"S3Delivery\",\n", " },\n", - " 'CompressionFormat': 'UNCOMPRESSED',\n", - " 'CloudWatchLoggingOptions': {\n", - " 'Enabled': True,\n", - " 'LogGroupName': '/aws/kinesisfirehose/dsoaws-kinesis-data-firehose',\n", - " 'LogStreamName': 'S3Delivery'\n", + " 
\"ProcessingConfiguration\": {\n", + " \"Enabled\": True,\n", + " \"Processors\": [\n", + " {\n", + " \"Type\": \"Lambda\",\n", + " \"Parameters\": [\n", + " {\n", + " \"ParameterName\": \"LambdaArn\",\n", + " \"ParameterValue\": \"{}:$LATEST\".format(lambda_fn_arn_invoke_ep),\n", + " },\n", + " {\"ParameterName\": \"BufferSizeInMBs\", \"ParameterValue\": \"1\"},\n", + " {\"ParameterName\": \"BufferIntervalInSeconds\", \"ParameterValue\": \"60\"},\n", + " ],\n", + " }\n", + " ],\n", " },\n", - " 'ProcessingConfiguration': {\n", - " 'Enabled': True,\n", - " 'Processors': [{\n", - " 'Type': 'Lambda',\n", - " 'Parameters': [\n", - " {\n", - " 'ParameterName': 'LambdaArn',\n", - " 'ParameterValue': '{}:$LATEST'.format(lambda_fn_arn_invoke_ep)\n", - " },\n", - " {\n", - " 'ParameterName': 'BufferSizeInMBs',\n", - " 'ParameterValue': '1'\n", - " },\n", - " {\n", - " 'ParameterName': 'BufferIntervalInSeconds',\n", - " 'ParameterValue': '60'\n", - " }, \n", - " ]\n", - " }]\n", + " \"S3BackupMode\": \"Enabled\",\n", + " \"S3BackupConfiguration\": {\n", + " \"RoleARN\": iam_role_kinesis_arn,\n", + " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n", + " \"Prefix\": \"kinesis-data-firehose-source-record/\",\n", + " \"ErrorOutputPrefix\": \"!{firehose:error-output-type}/\",\n", + " \"BufferingHints\": {\"SizeInMBs\": 1, \"IntervalInSeconds\": 60},\n", + " \"CompressionFormat\": \"UNCOMPRESSED\",\n", " },\n", - " 'S3BackupMode': 'Enabled',\n", - " 'S3BackupConfiguration': {\n", - " 'RoleARN': iam_role_kinesis_arn,\n", - " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n", - " 'Prefix': 'kinesis-data-firehose-source-record/', \n", - " 'ErrorOutputPrefix': '!{firehose:error-output-type}/',\n", - " 'BufferingHints': {\n", - " 'SizeInMBs': 1,\n", - " 'IntervalInSeconds': 60\n", - " },\n", - " 'CompressionFormat': 'UNCOMPRESSED'\n", + " \"CloudWatchLoggingOptions\": {\n", + " \"Enabled\": False,\n", " },\n", - " 'CloudWatchLoggingOptions': {\n", - " 'Enabled': False,\n", - " }\n", - " }\n", + " },\n", " )\n", - " print('Delivery stream {} successfully created.'.format(firehose_name))\n", + " print(\"Delivery stream {} successfully created.\".format(firehose_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('Delivery stream {} already exists.'.format(firehose_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Delivery stream {} already exists.\".format(firehose_name))\n", " else:\n", - " print('Unexpected error: %s' % e)" + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -319,14 +309,14 @@ "source": [ "import time\n", "\n", - "status = ''\n", - "while status != 'ACTIVE': \n", + "status = \"\"\n", + "while status != \"ACTIVE\":\n", " r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n", - " description = r.get('DeliveryStreamDescription')\n", - " status = description.get('DeliveryStreamStatus')\n", + " description = r.get(\"DeliveryStreamDescription\")\n", + " status = description.get(\"DeliveryStreamStatus\")\n", " time.sleep(5)\n", - " \n", - "print('Delivery Stream {} is active'.format(firehose_name))" + "\n", + "print(\"Delivery Stream {} is active\".format(firehose_name))" ] }, { @@ -335,7 +325,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehose_arn = r['DeliveryStreamDescription']['DeliveryStreamARN']\n", + "firehose_arn = r[\"DeliveryStreamDescription\"][\"DeliveryStreamARN\"]\n", 
"print(firehose_arn)" ] }, @@ -362,8 +352,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Firehose'.format(region, firehose_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Firehose'.format(\n", + " region, firehose_name\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/11_stream/04_Create_Kinesis_Data_Stream.ipynb b/11_stream/04_Create_Kinesis_Data_Stream.ipynb index b5cdc435..b64d4ab8 100644 --- a/11_stream/04_Create_Kinesis_Data_Stream.ipynb +++ b/11_stream/04_Create_Kinesis_Data_Stream.ipynb @@ -26,14 +26,14 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "kinesis = boto3.Session().client(service_name='kinesis', region_name=region)\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "kinesis = boto3.Session().client(service_name=\"kinesis\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)" ] }, { @@ -68,9 +68,9 @@ "try:\n", " stream_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -99,19 +99,16 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", - " response = kinesis.create_stream(\n", - " StreamName=stream_name, \n", - " ShardCount=shard_count\n", - " )\n", - " print('Data Stream {} successfully created.'.format(stream_name))\n", + "try:\n", + " response = kinesis.create_stream(StreamName=stream_name, ShardCount=shard_count)\n", + " print(\"Data Stream {} successfully created.\".format(stream_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", - " \n", + "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('Data Stream {} already exists.'.format(stream_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Data Stream {} already exists.\".format(stream_name))\n", " else:\n", - " print('Unexpected error: %s' % e)" + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -122,14 +119,14 @@ "source": [ "import time\n", "\n", - "status = ''\n", - "while status != 'ACTIVE': \n", + "status = \"\"\n", + "while status != \"ACTIVE\":\n", " r = kinesis.describe_stream(StreamName=stream_name)\n", - " description = r.get('StreamDescription')\n", - " status = description.get('StreamStatus')\n", + " description = r.get(\"StreamDescription\")\n", + " status = description.get(\"StreamStatus\")\n", " time.sleep(5)\n", - " \n", - "print('Stream {} is active'.format(stream_name))" + "\n", + "print(\"Stream {} is active\".format(stream_name))" ] }, { @@ -145,9 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream_response = kinesis.describe_stream(\n", - " StreamName=stream_name\n", - ")\n", + "stream_response = 
kinesis.describe_stream(StreamName=stream_name)\n", "\n", "print(json.dumps(stream_response, indent=4, sort_keys=True, default=str))" ] @@ -160,7 +155,7 @@ }, "outputs": [], "source": [ - "stream_arn = stream_response['StreamDescription']['StreamARN']\n", + "stream_arn = stream_response[\"StreamDescription\"][\"StreamARN\"]\n", "print(stream_arn)" ] }, @@ -187,8 +182,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Stream'.format(region, stream_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Stream'.format(\n", + " region, stream_name\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb b/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb index 641d3741..1eba16ab 100644 --- a/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb +++ b/11_stream/05_Create_Lambda_Destination_CloudWatch.ipynb @@ -38,16 +38,16 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "account_id = sts.get_caller_identity()['Account']\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "\n", - "lam = boto3.Session().client(service_name='lambda', region_name=region)" + "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)" ] }, { @@ -75,9 +75,9 @@ "try:\n", " lambda_fn_name_cloudwatch\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -114,9 +114,9 @@ "try:\n", " iam_lambda_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -146,9 +146,9 @@ "try:\n", " iam_lambda_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -167,11 +167,11 @@ "outputs": [], "source": [ "if not iam_lambda_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " 
print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -192,9 +192,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -253,7 +253,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('src/DeliverKinesisAnalyticsToCloudWatch.zip', 'rb') as f: \n", + "with open(\"src/DeliverKinesisAnalyticsToCloudWatch.zip\", \"rb\") as f:\n", " code = f.read()" ] }, @@ -272,33 +272,28 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = lam.create_function(\n", - " FunctionName='{}'.format(lambda_fn_name_cloudwatch),\n", - " Runtime='python3.7',\n", - " Role='{}'.format(iam_role_lambda_arn),\n", - " Handler='src/deliver_metrics_to_cloudwatch.lambda_handler',\n", - " Code={\n", - " 'ZipFile': code\n", - " },\n", - " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n", + " FunctionName=\"{}\".format(lambda_fn_name_cloudwatch),\n", + " Runtime=\"python3.7\",\n", + " Role=\"{}\".format(iam_role_lambda_arn),\n", + " Handler=\"src/deliver_metrics_to_cloudwatch.lambda_handler\",\n", + " Code={\"ZipFile\": code},\n", + " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n", " Timeout=900,\n", " MemorySize=128,\n", - " Publish=True\n", + " Publish=True,\n", " )\n", - " print('Lambda Function {} successfully created.'.format(lambda_fn_name_cloudwatch))\n", + " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_cloudwatch))\n", "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceConflictException':\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n", " response = lam.update_function_code(\n", - " FunctionName='{}'.format(lambda_fn_name_cloudwatch),\n", - " ZipFile=code,\n", - " Publish=True,\n", - " DryRun=False\n", - " ) \n", - " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_cloudwatch)) \n", + " FunctionName=\"{}\".format(lambda_fn_name_cloudwatch), ZipFile=code, Publish=True, DryRun=False\n", + " )\n", + " print(\"Updating existing Lambda Function {}. 
This is OK.\".format(lambda_fn_name_cloudwatch))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -309,7 +304,7 @@ "source": [ "response = lam.get_function(FunctionName=lambda_fn_name_cloudwatch)\n", "\n", - "lambda_fn_arn_cloudwatch = response['Configuration']['FunctionArn']\n", + "lambda_fn_arn_cloudwatch = response[\"Configuration\"][\"FunctionArn\"]\n", "print(lambda_fn_arn_cloudwatch)" ] }, @@ -338,8 +333,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_cloudwatch)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Function'.format(\n", + " region, lambda_fn_name_cloudwatch\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/11_stream/06_Create_Lambda_Destination_SNS.ipynb b/11_stream/06_Create_Lambda_Destination_SNS.ipynb index 0d8c6de6..a1b05c27 100644 --- a/11_stream/06_Create_Lambda_Destination_SNS.ipynb +++ b/11_stream/06_Create_Lambda_Destination_SNS.ipynb @@ -38,17 +38,17 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "account_id = sts.get_caller_identity()['Account']\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "\n", - "lam = boto3.Session().client(service_name='lambda', region_name=region)\n", - "sns = boto3.Session().client(service_name='sns', region_name=region)" + "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)\n", + "sns = boto3.Session().client(service_name=\"sns\", region_name=region)" ] }, { @@ -76,9 +76,9 @@ "try:\n", " lambda_fn_name_sns\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -115,9 +115,9 @@ "try:\n", " iam_lambda_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -147,9 +147,9 @@ "try:\n", " iam_lambda_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -168,11 +168,11 @@ "outputs": [], "source": [ "if not 
iam_lambda_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -193,9 +193,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -231,7 +231,7 @@ "outputs": [], "source": [ "response = sns.create_topic(\n", - " Name='review_anomaly_scores',\n", + " Name=\"review_anomaly_scores\",\n", ")\n", "print(response)" ] @@ -242,7 +242,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns_topic_arn = response['TopicArn']\n", + "sns_topic_arn = response[\"TopicArn\"]\n", "print(sns_topic_arn)" ] }, @@ -302,7 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('src/PushNotificationToSNS.zip', 'rb') as f: \n", + "with open(\"src/PushNotificationToSNS.zip\", \"rb\") as f:\n", " code = f.read()" ] }, @@ -321,33 +321,28 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = lam.create_function(\n", - " FunctionName='{}'.format(lambda_fn_name_sns),\n", - " Runtime='python3.7',\n", - " Role='{}'.format(iam_role_lambda_arn),\n", - " Handler='src/push_notification_to_sns.lambda_handler',\n", - " Code={\n", - " 'ZipFile': code\n", - " },\n", - " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n", + " FunctionName=\"{}\".format(lambda_fn_name_sns),\n", + " Runtime=\"python3.7\",\n", + " Role=\"{}\".format(iam_role_lambda_arn),\n", + " Handler=\"src/push_notification_to_sns.lambda_handler\",\n", + " Code={\"ZipFile\": code},\n", + " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n", " Timeout=300,\n", " MemorySize=128,\n", - " Publish=True\n", + " Publish=True,\n", " )\n", - " print('Lambda Function {} successfully created.'.format(lambda_fn_name_sns))\n", + " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name_sns))\n", "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceConflictException':\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n", " response = lam.update_function_code(\n", - " FunctionName='{}'.format(lambda_fn_name_sns),\n", - " ZipFile=code,\n", - " Publish=True,\n", - " DryRun=False\n", - " ) \n", - " print('Updating existing Lambda Function {}. This is OK.'.format(lambda_fn_name_sns)) \n", + " FunctionName=\"{}\".format(lambda_fn_name_sns), ZipFile=code, Publish=True, DryRun=False\n", + " )\n", + " print(\"Updating existing Lambda Function {}. 
This is OK.\".format(lambda_fn_name_sns))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -358,7 +353,7 @@ "source": [ "response = lam.get_function(FunctionName=lambda_fn_name_sns)\n", "\n", - "lambda_fn_arn_sns = response['Configuration']['FunctionArn']\n", + "lambda_fn_arn_sns = response[\"Configuration\"][\"FunctionArn\"]\n", "print(lambda_fn_arn_sns)" ] }, @@ -387,13 +382,8 @@ "outputs": [], "source": [ "response = lam.update_function_configuration(\n", - " FunctionName=lambda_fn_name_sns,\n", - " Environment={\n", - " 'Variables': {\n", - " 'SNS_TOPIC_ARN': sns_topic_arn\n", - " }\n", - " }\n", - " )" + " FunctionName=lambda_fn_name_sns, Environment={\"Variables\": {\"SNS_TOPIC_ARN\": sns_topic_arn}}\n", + ")" ] }, { @@ -410,8 +400,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Function'.format(region, lambda_fn_name_sns)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Function'.format(\n", + " region, lambda_fn_name_sns\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb b/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb index ed473b47..2b8e1f71 100644 --- a/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb +++ b/11_stream/07_Create_Kinesis_Data_Analytics_App.ipynb @@ -41,17 +41,17 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "account_id = sts.get_caller_identity()['Account']\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n", - "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n", + "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)" ] }, { @@ -72,9 +72,9 @@ "try:\n", " firehose_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -104,9 +104,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -136,9 +136,9 @@ "try:\n", " stream_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] 
Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -168,9 +168,9 @@ "try:\n", " lambda_fn_arn_cloudwatch\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -200,9 +200,9 @@ "try:\n", " lambda_fn_arn_sns\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -232,9 +232,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -264,9 +264,9 @@ "try:\n", " lambda_fn_arn_invoke_ep\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -298,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "kinesis_data_analytics_app_name = 'dsoaws-kinesis-data-analytics-sql-app'" + "kinesis_data_analytics_app_name = \"dsoaws-kinesis-data-analytics-sql-app\"" ] }, { @@ -307,7 +307,7 @@ "metadata": {}, "outputs": [], "source": [ - "in_app_stream_name = 'SOURCE_SQL_STREAM_001' # Default\n", + "in_app_stream_name = \"SOURCE_SQL_STREAM_001\" # Default\n", "print(in_app_stream_name)" ] }, @@ -333,7 +333,7 @@ "metadata": {}, "outputs": [], "source": [ - "sql_code = ''' \\\n", + "sql_code = \"\"\" \\\n", " CREATE OR REPLACE STREAM \"AVG_STAR_RATING_SQL_STREAM\" ( \\\n", " avg_star_rating DOUBLE); \\\n", " CREATE OR REPLACE PUMP \"AVG_STAR_RATING_SQL_STREAM_PUMP\" AS \\\n", @@ -364,12 +364,9 @@ " {} \\\n", " ) \\\n", " ); \\\n", - " '''.format(in_app_stream_name,\n", - " in_app_stream_name,\n", - " window_seconds,\n", - " in_app_stream_name,\n", - " in_app_stream_name,\n", - " window_seconds)\n", + " \"\"\".format(\n", + " in_app_stream_name, in_app_stream_name, window_seconds, in_app_stream_name, in_app_stream_name, window_seconds\n", + ")\n", "\n", "print(sql_code)" ] @@ -382,99 +379,73 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = kinesis_analytics.create_application(\n", " 
ApplicationName=kinesis_data_analytics_app_name,\n", " Inputs=[\n", " {\n", - " 'NamePrefix': 'SOURCE_SQL_STREAM',\n", - " 'KinesisFirehoseInput': {\n", - " 'ResourceARN': '{}'.format(firehose_arn),\n", - " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n", + " \"NamePrefix\": \"SOURCE_SQL_STREAM\",\n", + " \"KinesisFirehoseInput\": {\n", + " \"ResourceARN\": \"{}\".format(firehose_arn),\n", + " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n", " },\n", - " 'InputProcessingConfiguration': { \n", - " 'InputLambdaProcessor': { \n", - " 'ResourceARN': '{}'.format(lambda_fn_arn_invoke_ep),\n", - " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n", + " \"InputProcessingConfiguration\": {\n", + " \"InputLambdaProcessor\": {\n", + " \"ResourceARN\": \"{}\".format(lambda_fn_arn_invoke_ep),\n", + " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n", " }\n", - " }, \n", - " 'InputSchema': {\n", - " 'RecordFormat': {\n", - " 'RecordFormatType': 'CSV',\n", - " 'MappingParameters': {\n", - " 'CSVMappingParameters': {\n", - " 'RecordRowDelimiter': '\\n',\n", - " 'RecordColumnDelimiter': '\\t'\n", - " }\n", - " }\n", - " },\n", - " 'RecordColumns': [\n", - " {\n", - " 'Name': 'review_id',\n", - " 'Mapping': 'review_id',\n", - " 'SqlType': 'VARCHAR(14)'\n", - " }, \n", - " {\n", - " 'Name': 'star_rating',\n", - " 'Mapping': 'star_rating',\n", - " 'SqlType': 'INTEGER'\n", + " },\n", + " \"InputSchema\": {\n", + " \"RecordFormat\": {\n", + " \"RecordFormatType\": \"CSV\",\n", + " \"MappingParameters\": {\n", + " \"CSVMappingParameters\": {\"RecordRowDelimiter\": \"\\n\", \"RecordColumnDelimiter\": \"\\t\"}\n", " },\n", - " {\n", - " 'Name': 'product_category',\n", - " 'Mapping': 'product_category',\n", - " 'SqlType': 'VARCHAR(24)'\n", - " }, \n", - " {\n", - " 'Name': 'review_body',\n", - " 'Mapping': 'review_body',\n", - " 'SqlType': 'VARCHAR(65535)'\n", - " } \n", - " ]\n", - " }\n", + " },\n", + " \"RecordColumns\": [\n", + " {\"Name\": \"review_id\", \"Mapping\": \"review_id\", \"SqlType\": \"VARCHAR(14)\"},\n", + " {\"Name\": \"star_rating\", \"Mapping\": \"star_rating\", \"SqlType\": \"INTEGER\"},\n", + " {\"Name\": \"product_category\", \"Mapping\": \"product_category\", \"SqlType\": \"VARCHAR(24)\"},\n", + " {\"Name\": \"review_body\", \"Mapping\": \"review_body\", \"SqlType\": \"VARCHAR(65535)\"},\n", + " ],\n", + " },\n", " },\n", " ],\n", " Outputs=[\n", " {\n", - " 'Name': 'AVG_STAR_RATING_SQL_STREAM',\n", - " 'LambdaOutput': {\n", - " 'ResourceARN': '{}'.format(lambda_fn_arn_cloudwatch),\n", - " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n", + " \"Name\": \"AVG_STAR_RATING_SQL_STREAM\",\n", + " \"LambdaOutput\": {\n", + " \"ResourceARN\": \"{}\".format(lambda_fn_arn_cloudwatch),\n", + " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n", " },\n", - " 'DestinationSchema': {\n", - " 'RecordFormatType': 'CSV'\n", - " }\n", + " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n", " },\n", " {\n", - " 'Name': 'ANOMALY_SCORE_SQL_STREAM', \n", - " 'LambdaOutput': {\n", - " 'ResourceARN': '{}'.format(lambda_fn_arn_sns),\n", - " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n", + " \"Name\": \"ANOMALY_SCORE_SQL_STREAM\",\n", + " \"LambdaOutput\": {\n", + " \"ResourceARN\": \"{}\".format(lambda_fn_arn_sns),\n", + " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n", " },\n", - " 'DestinationSchema': {\n", - " 'RecordFormatType': 'CSV'\n", - " }\n", + " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n", " },\n", " {\n", - " 'Name': 'APPROXIMATE_COUNT_SQL_STREAM', \n", - " 
'KinesisStreamsOutput': {\n", - " 'ResourceARN': '{}'.format(stream_arn),\n", - " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n", + " \"Name\": \"APPROXIMATE_COUNT_SQL_STREAM\",\n", + " \"KinesisStreamsOutput\": {\n", + " \"ResourceARN\": \"{}\".format(stream_arn),\n", + " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n", " },\n", - " 'DestinationSchema': {\n", - " 'RecordFormatType': 'CSV'\n", - " }\n", - " }\n", + " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n", + " },\n", " ],\n", - " ApplicationCode=sql_code\n", + " ApplicationCode=sql_code,\n", " )\n", - " print('SQL application {} successfully created.'.format(kinesis_data_analytics_app_name))\n", + " print(\"SQL application {} successfully created.\".format(kinesis_data_analytics_app_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('SQL App {} already exists.'.format(kinesis_data_analytics_app_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"SQL App {} already exists.\".format(kinesis_data_analytics_app_name))\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " " + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -495,7 +466,7 @@ "metadata": {}, "outputs": [], "source": [ - "input_id = response['ApplicationDetail']['InputDescriptions'][0]['InputId']\n", + "input_id = response[\"ApplicationDetail\"][\"InputDescriptions\"][0][\"InputId\"]\n", "print(input_id)" ] }, @@ -512,24 +483,17 @@ "metadata": {}, "outputs": [], "source": [ - "try: \n", + "try:\n", " response = kinesis_analytics.start_application(\n", " ApplicationName=kinesis_data_analytics_app_name,\n", - " InputConfigurations=[\n", - " {\n", - " 'Id': input_id,\n", - " 'InputStartingPositionConfiguration': {\n", - " 'InputStartingPosition': 'NOW'\n", - " }\n", - " }\n", - " ]\n", + " InputConfigurations=[{\"Id\": input_id, \"InputStartingPositionConfiguration\": {\"InputStartingPosition\": \"NOW\"}}],\n", " )\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('Application {} is already starting.'.format(kinesis_data_analytics_app_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Application {} is already starting.\".format(kinesis_data_analytics_app_name))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -555,8 +519,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -578,17 +548,16 @@ "\n", "import time\n", "\n", - "app_status = response['ApplicationDetail']['ApplicationStatus']\n", - "print('Application status {}'.format(app_status))\n", + "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + "print(\"Application status {}\".format(app_status))\n", "\n", - "while app_status != 'RUNNING':\n", + "while app_status != \"RUNNING\":\n", " time.sleep(5)\n", - " response = kinesis_analytics.describe_application(\n", - " ApplicationName=kinesis_data_analytics_app_name)\n", - " app_status = 
response['ApplicationDetail']['ApplicationStatus']\n", - " print('Application status {}'.format(app_status))\n", + " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n", + " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + " print(\"Application status {}\".format(app_status))\n", "\n", - "print('Application status {}'.format(app_status))" + "print(\"Application status {}\".format(app_status))" ] }, { diff --git a/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb b/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb index 86ffdea6..6fdb3b51 100644 --- a/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb +++ b/11_stream/08_Put_Reviews_On_Kinesis_Data_Firehose.ipynb @@ -25,14 +25,14 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n", - "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)\n" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n", + "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)" ] }, { @@ -53,9 +53,9 @@ "try:\n", " firehose_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -85,9 +85,9 @@ "try:\n", " firehose_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -117,9 +117,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -149,9 +149,9 @@ "try:\n", " kinesis_data_analytics_app_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -181,9 +181,9 @@ "try:\n", " lambda_fn_name_cloudwatch\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please 
run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -201,7 +201,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehoses = firehose.list_delivery_streams(DeliveryStreamType='DirectPut')\n", + "firehoses = firehose.list_delivery_streams(DeliveryStreamType=\"DirectPut\")\n", "\n", "print(json.dumps(firehoses, indent=4, sort_keys=True, default=str))" ] @@ -231,10 +231,12 @@ "import csv\n", "import pandas as pd\n", "\n", - "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df.shape" ] }, @@ -255,14 +257,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_star_rating_and_review_body = df[['review_id', \n", - " 'star_rating', \n", - " 'product_category', \n", - " 'review_body']][0:1]\n", + "df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][0:1]\n", "\n", - "df_star_rating_and_review_body.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)" + "df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)" ] }, { @@ -291,16 +288,15 @@ "\n", "import time\n", "\n", - "app_status = response['ApplicationDetail']['ApplicationStatus']\n", + "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", "\n", - "while app_status != 'RUNNING':\n", + "while app_status != \"RUNNING\":\n", " time.sleep(5)\n", - " response = kinesis_analytics.describe_application(\n", - " ApplicationName=kinesis_data_analytics_app_name)\n", - " app_status = response['ApplicationDetail']['ApplicationStatus']\n", - " print('Application status {}'.format(app_status))\n", + " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n", + " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + " print(\"Application status {}\".format(app_status))\n", "\n", - "print('Application status {}'.format(app_status))" + "print(\"Application status {}\".format(app_status))" ] }, { @@ -331,8 +327,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Logs'.format(region, lambda_fn_name_cloudwatch)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Logs'.format(\n", + " region, lambda_fn_name_cloudwatch\n", + " )\n", + " )\n", + ")" ] }, { @@ -356,8 +358,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML(\"\"\"Review CloudWatch Metrics\"\"\".format(region, region)))\n" + "\n", + "display(\n", + " HTML(\n", + " \"\"\"Review CloudWatch Metrics\"\"\".format(\n", + " region, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -381,8 +389,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ 
-407,9 +421,7 @@ }, "outputs": [], "source": [ - "firehose_response = firehose.describe_delivery_stream(\n", - " DeliveryStreamName=firehose_name\n", - ")\n", + "firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n", "\n", "print(json.dumps(firehose_response, indent=4, sort_keys=True, default=str))" ] @@ -427,22 +439,13 @@ "for start_idx in range(0, 500, step):\n", " end_idx = start_idx + step\n", "\n", - " df_star_rating_and_review_body = df[['review_id', \n", - " 'product_category', \n", - " 'review_body']][start_idx:end_idx]\n", + " df_star_rating_and_review_body = df[[\"review_id\", \"product_category\", \"review_body\"]][start_idx:end_idx]\n", + "\n", + " reviews_tsv = df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)\n", "\n", - " reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)\n", - " \n", " # print(reviews_tsv.encode('utf-8'))\n", - " \n", - " response = firehose.put_record( \n", - " Record={\n", - " 'Data': reviews_tsv.encode('utf-8')\n", - " },\n", - " DeliveryStreamName=firehose_name\n", - " )" + "\n", + " response = firehose.put_record(Record={\"Data\": reviews_tsv.encode(\"utf-8\")}, DeliveryStreamName=firehose_name)" ] }, { @@ -452,8 +455,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -488,8 +497,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review S3 Source Records'.format(bucket, region)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review S3 Source Records'.format(\n", + " bucket, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -520,8 +535,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review S3 Transformed Records'.format(bucket, region)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review S3 Transformed Records'.format(\n", + " bucket, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -552,8 +573,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To UI Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To UI Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -612,8 +639,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -639,21 +672,21 @@ "for start_idx in range(0, 10000, anomaly_step):\n", " timestamp = int(time.time())\n", "\n", - " df_anomalies = pd.DataFrame([\n", - " {'review_id': str(timestamp), \n", - " 'product_category': 'Digital_Software', \n", - " 'review_body': 'This is an awful waste of time.'}, \n", - " ], columns=['review_id', 'star_rating', 'product_category', 'review_body'])\n", + " df_anomalies = pd.DataFrame(\n", + " [\n", + " {\n", + " \"review_id\": 
str(timestamp),\n", + " \"product_category\": \"Digital_Software\",\n", + " \"review_body\": \"This is an awful waste of time.\",\n", + " },\n", + " ],\n", + " columns=[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"],\n", + " )\n", "\n", - " reviews_tsv_anomalies = df_anomalies.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)\n", - " \n", - " response = firehose.put_record( \n", - " Record={\n", - " 'Data': reviews_tsv_anomalies.encode('utf-8')\n", - " },\n", - " DeliveryStreamName=firehose_name\n", + " reviews_tsv_anomalies = df_anomalies.to_csv(sep=\"\\t\", header=None, index=False)\n", + "\n", + " response = firehose.put_record(\n", + " Record={\"Data\": reviews_tsv_anomalies.encode(\"utf-8\")}, DeliveryStreamName=firehose_name\n", " )" ] }, @@ -664,8 +697,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -699,7 +738,7 @@ "\n", "#
Shutting down your kernel for this notebook to release resources.
\n", "# \n", - " \n", + "\n", "# " ] }, diff --git a/11_stream/archive/11_stream.orig/00_Overview.ipynb b/11_stream/archive/11_stream.orig/00_Overview.ipynb index c009d721..59211e9c 100644 --- a/11_stream/archive/11_stream.orig/00_Overview.ipynb +++ b/11_stream/archive/11_stream.orig/00_Overview.ipynb @@ -88,7 +88,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb b/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb index e0750488..4abc7b09 100644 --- a/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb +++ b/11_stream/archive/11_stream.orig/01_Setup_IAM.ipynb @@ -17,13 +17,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_kinesis_role_name = 'DSOAWS_Kinesis'" + "iam_kinesis_role_name = \"DSOAWS_Kinesis\"" ] }, { @@ -58,31 +58,13 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesis.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"firehose.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesisanalytics.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " } \n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesis.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"firehose.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " ],\n", + "}" ] }, { @@ -100,18 +82,18 @@ " iam_role_kinesis = iam.create_role(\n", " RoleName=iam_kinesis_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Kinesis Role'\n", + " Description=\"DSOAWS Kinesis Role\",\n", " )\n", - " print('Role succesfully created.')\n", + " print(\"Role succesfully created.\")\n", " iam_kinesis_role_passed = True\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role_kinesis = iam.get_role(RoleName=iam_kinesis_role_name)\n", - " print('Role already exists. That is OK.')\n", + " print(\"Role already exists. 
That is OK.\")\n", " iam_kinesis_role_passed = True\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " \n", + " print(\"Unexpected error: %s\" % e)\n", + "\n", "time.sleep(30)" ] }, @@ -121,8 +103,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_kinesis_name = iam_role_kinesis['Role']['RoleName']\n", - "print('Role Name: {}'.format(iam_role_kinesis_name))" + "iam_role_kinesis_name = iam_role_kinesis[\"Role\"][\"RoleName\"]\n", + "print(\"Role Name: {}\".format(iam_role_kinesis_name))" ] }, { @@ -131,8 +113,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_kinesis_arn = iam_role_kinesis['Role']['Arn']\n", - "print('Role ARN: {}'.format(iam_role_kinesis_arn))" + "iam_role_kinesis_arn = iam_role_kinesis[\"Role\"][\"Arn\"]\n", + "print(\"Role ARN: {}\".format(iam_role_kinesis_arn))" ] }, { @@ -141,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "account_id = sts.get_caller_identity()['Account']" + "account_id = sts.get_caller_identity()[\"Account\"]" ] }, { @@ -157,7 +139,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream_name = 'dsoaws-kinesis-data-stream'" + "stream_name = \"dsoaws-kinesis-data-stream\"" ] }, { @@ -173,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehose_name = 'dsoaws-kinesis-data-firehose'" + "firehose_name = \"dsoaws-kinesis-data-firehose\"" ] }, { @@ -189,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "lambda_fn_name = 'DeliverKinesisAnalyticsToCloudWatch'" + "lambda_fn_name = \"DeliverKinesisAnalyticsToCloudWatch\"" ] }, { @@ -208,32 +190,27 @@ "outputs": [], "source": [ "kinesis_policy_doc = {\n", - " \n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " { \n", - " \"Effect\": \"Allow\", \n", + " {\n", + " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:AbortMultipartUpload\",\n", " \"s3:GetBucketLocation\",\n", " \"s3:GetObject\",\n", " \"s3:ListBucket\",\n", " \"s3:ListBucketMultipartUploads\",\n", - " \"s3:PutObject\"\n", - " ], \n", - " \"Resource\": [ \n", + " \"s3:PutObject\",\n", + " ],\n", + " \"Resource\": [\n", " \"arn:aws:s3:::{}/kinesis-data-firehose\".format(bucket),\n", - " \"arn:aws:s3:::{}/kinesis-data-firehose/*\".format(bucket)\n", - " ] \n", + " \"arn:aws:s3:::{}/kinesis-data-firehose/*\".format(bucket),\n", + " ],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"logs:PutLogEvents\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)\n", - " ]\n", + " \"Action\": [\"logs:PutLogEvents\"],\n", + " \"Resource\": [\"arn:aws:logs:{}:{}:log-group:/*\".format(region, account_id)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", @@ -243,43 +220,34 @@ " \"kinesis:Put*\",\n", " \"kinesis:List*\",\n", " ],\n", - " \"Resource\": [\n", - " \"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)\n", - " ]\n", + " \"Resource\": [\"arn:aws:kinesis:{}:{}:stream/{}\".format(region, account_id, stream_name)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"firehose:*\",\n", " ],\n", - " \"Resource\": [\n", - " \"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)\n", - " ]\n", + " \"Resource\": [\"arn:aws:firehose:{}:{}:deliverystream/{}\".format(region, account_id, firehose_name)],\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"kinesisanalytics:*\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ]\n", + " \"Resource\": [\"*\"],\n", " },\n", " {\n", " \"Sid\": 
\"UseLambdaFunction\",\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"lambda:InvokeFunction\",\n", - " \"lambda:GetFunctionConfiguration\"\n", - " ],\n", - " \"Resource\": \"arn:aws:lambda:{}:{}:function:{}:$LATEST\".format(region, account_id, lambda_fn_name)\n", + " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n", + " \"Resource\": \"arn:aws:lambda:{}:{}:function:{}:$LATEST\".format(region, account_id, lambda_fn_name),\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"iam:PassRole\",\n", - " \"Resource\": \"arn:aws:iam::*:role/service-role/kinesis-analytics*\"\n", - " }\n", - " ]\n", + " \"Resource\": \"arn:aws:iam::*:role/service-role/kinesis-analytics*\",\n", + " },\n", + " ],\n", "}\n", "\n", "print(json.dumps(kinesis_policy_doc, indent=4, sort_keys=True, default=str))" @@ -301,9 +269,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=iam_role_kinesis_name,\n", - " PolicyName='DSOAWS_KinesisPolicy',\n", - " PolicyDocument=json.dumps(kinesis_policy_doc)\n", + " RoleName=iam_role_kinesis_name, PolicyName=\"DSOAWS_KinesisPolicy\", PolicyDocument=json.dumps(kinesis_policy_doc)\n", ")\n", "\n", "time.sleep(30)" @@ -331,7 +297,7 @@ "metadata": {}, "outputs": [], "source": [ - "iam_lambda_role_name = 'DSOAWS_Lambda'" + "iam_lambda_role_name = \"DSOAWS_Lambda\"" ] }, { @@ -352,21 +318,9 @@ "assume_role_policy_doc = {\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"lambda.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"kinesisanalytics.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"lambda.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"kinesisanalytics.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"},\n", + " ],\n", "}" ] }, @@ -384,18 +338,18 @@ " iam_role_lambda = iam.create_role(\n", " RoleName=iam_lambda_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Lambda Role'\n", + " Description=\"DSOAWS Lambda Role\",\n", " )\n", - " print('Role succesfully created.')\n", + " print(\"Role succesfully created.\")\n", " iam_lambda_role_passed = True\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role_lambda = iam.get_role(RoleName=iam_lambda_role_name)\n", - " print('Role already exists. This is OK.')\n", + " print(\"Role already exists. 
This is OK.\")\n", " iam_lambda_role_passed = True\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " \n", + " print(\"Unexpected error: %s\" % e)\n", + "\n", "time.sleep(30)" ] }, @@ -405,8 +359,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_lambda_name = iam_role_lambda['Role']['RoleName']\n", - "print('Role Name: {}'.format(iam_role_lambda_name))" + "iam_role_lambda_name = iam_role_lambda[\"Role\"][\"RoleName\"]\n", + "print(\"Role Name: {}\".format(iam_role_lambda_name))" ] }, { @@ -415,8 +369,8 @@ "metadata": {}, "outputs": [], "source": [ - "iam_role_lambda_arn = iam_role_lambda['Role']['Arn']\n", - "print('Role ARN: {}'.format(iam_role_lambda_arn))" + "iam_role_lambda_arn = iam_role_lambda[\"Role\"][\"Arn\"]\n", + "print(\"Role ARN: {}\".format(iam_role_lambda_arn))" ] }, { @@ -438,31 +392,21 @@ " {\n", " \"Sid\": \"UseLambdaFunction\",\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"lambda:InvokeFunction\",\n", - " \"lambda:GetFunctionConfiguration\"\n", - " ],\n", - " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id)\n", - " },\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Action\": \"cloudwatch:*\",\n", - " \"Resource\": \"*\"\n", + " \"Action\": [\"lambda:InvokeFunction\", \"lambda:GetFunctionConfiguration\"],\n", + " \"Resource\": \"arn:aws:lambda:{}:{}:function:*\".format(region, account_id),\n", " },\n", + " {\"Effect\": \"Allow\", \"Action\": \"cloudwatch:*\", \"Resource\": \"*\"},\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogGroup\",\n", - " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id)\n", + " \"Resource\": \"arn:aws:logs:{}:{}:*\".format(region, account_id),\n", " },\n", " {\n", " \"Effect\": \"Allow\",\n", - " \"Action\": [\n", - " \"logs:CreateLogStream\",\n", - " \"logs:PutLogEvents\"\n", - " ],\n", - " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id)\n", - " }\n", - " ]\n", + " \"Action\": [\"logs:CreateLogStream\", \"logs:PutLogEvents\"],\n", + " \"Resource\": \"arn:aws:logs:{}:{}:log-group:/aws/lambda/*\".format(region, account_id),\n", + " },\n", + " ],\n", "}" ] }, @@ -484,9 +428,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=iam_role_lambda_name,\n", - " PolicyName='DSOAWS_LambdaPolicy',\n", - " PolicyDocument=json.dumps(lambda_policy_doc)\n", + " RoleName=iam_role_lambda_name, PolicyName=\"DSOAWS_LambdaPolicy\", PolicyDocument=json.dumps(lambda_policy_doc)\n", ")\n", "\n", "time.sleep(30)" @@ -605,7 +547,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb b/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb index 5892eb1d..5f281451 100644 --- a/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb +++ b/11_stream/archive/11_stream.orig/02_Create_Kinesis_Data_Firehose.ipynb @@ -27,13 +27,13 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)" + "sm = 
boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)" ] }, { @@ -54,9 +54,9 @@ "try:\n", " firehose_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -93,9 +93,9 @@ "try:\n", " iam_kinesis_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -125,9 +125,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -157,9 +157,9 @@ "try:\n", " iam_kinesis_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -178,11 +178,11 @@ "outputs": [], "source": [ "if not iam_kinesis_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -200,24 +200,23 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = firehose.create_delivery_stream(\n", " DeliveryStreamName=firehose_name,\n", - " DeliveryStreamType='DirectPut',\n", + " DeliveryStreamType=\"DirectPut\",\n", " S3DestinationConfiguration={\n", - " 'RoleARN': iam_role_kinesis_arn,\n", - " 'BucketARN': 'arn:aws:s3:::{}'.format(bucket),\n", - " 'Prefix': 'kinesis-data-firehose', \n", - " }\n", + " \"RoleARN\": iam_role_kinesis_arn,\n", + " \"BucketARN\": \"arn:aws:s3:::{}\".format(bucket),\n", + " \"Prefix\": \"kinesis-data-firehose\",\n", + " },\n", " )\n", - " print('Delivery stream {} successfully created.'.format(firehose_name))\n", + " print(\"Delivery stream {} successfully created.\".format(firehose_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 
'ResourceInUseException':\n", - " print('Delivery stream {} already exists.'.format(firehose_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Delivery stream {} already exists.\".format(firehose_name))\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " " + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -228,14 +227,14 @@ "source": [ "import time\n", "\n", - "status = ''\n", - "while status != 'ACTIVE': \n", + "status = \"\"\n", + "while status != \"ACTIVE\":\n", " r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n", - " description = r.get('DeliveryStreamDescription')\n", - " status = description.get('DeliveryStreamStatus')\n", + " description = r.get(\"DeliveryStreamDescription\")\n", + " status = description.get(\"DeliveryStreamStatus\")\n", " time.sleep(5)\n", - " \n", - "print('Delivery Stream {} is active'.format(firehose_name))" + "\n", + "print(\"Delivery Stream {} is active\".format(firehose_name))" ] }, { @@ -253,12 +252,12 @@ "source": [ "r = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n", "\n", - "status = description.get('DeliveryStreamStatus')\n", + "status = description.get(\"DeliveryStreamStatus\")\n", "print(status)\n", "\n", "print()\n", "\n", - "description = r.get('DeliveryStreamDescription')\n", + "description = r.get(\"DeliveryStreamDescription\")\n", "print(json.dumps(description, indent=4, sort_keys=True, default=str))" ] }, @@ -268,7 +267,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehose_arn = r['DeliveryStreamDescription']['DeliveryStreamARN']\n", + "firehose_arn = r[\"DeliveryStreamDescription\"][\"DeliveryStreamARN\"]\n", "print(firehose_arn)" ] }, @@ -295,8 +294,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Firehose'.format(region, firehose_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Firehose'.format(\n", + " region, firehose_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -322,7 +327,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb b/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb index d9ba7992..db2bba69 100644 --- a/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb +++ b/11_stream/archive/11_stream.orig/03_Create_Kinesis_Data_Stream.ipynb @@ -26,14 +26,14 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "kinesis = boto3.Session().client(service_name='kinesis', region_name=region)\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "kinesis = boto3.Session().client(service_name=\"kinesis\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)" ] }, { @@ -61,9 +61,9 @@ "try:\n", " stream_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " 
print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -92,19 +92,16 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", - " response = kinesis.create_stream(\n", - " StreamName=stream_name, \n", - " ShardCount=shard_count\n", - " )\n", - " print('Data Stream {} successfully created.'.format(stream_name))\n", + "try:\n", + " response = kinesis.create_stream(StreamName=stream_name, ShardCount=shard_count)\n", + " print(\"Data Stream {} successfully created.\".format(stream_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", - " \n", + "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('Data Stream {} already exists.'.format(stream_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Data Stream {} already exists.\".format(stream_name))\n", " else:\n", - " print('Unexpected error: %s' % e)" + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -115,14 +112,14 @@ "source": [ "import time\n", "\n", - "status = ''\n", - "while status != 'ACTIVE': \n", + "status = \"\"\n", + "while status != \"ACTIVE\":\n", " r = kinesis.describe_stream(StreamName=stream_name)\n", - " description = r.get('StreamDescription')\n", - " status = description.get('StreamStatus')\n", + " description = r.get(\"StreamDescription\")\n", + " status = description.get(\"StreamStatus\")\n", " time.sleep(5)\n", - " \n", - "print('Stream {} is active'.format(stream_name))" + "\n", + "print(\"Stream {} is active\".format(stream_name))" ] }, { @@ -138,9 +135,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream_response = kinesis.describe_stream(\n", - " StreamName=stream_name\n", - ")\n", + "stream_response = kinesis.describe_stream(StreamName=stream_name)\n", "\n", "print(json.dumps(stream_response, indent=4, sort_keys=True, default=str))" ] @@ -153,7 +148,7 @@ }, "outputs": [], "source": [ - "stream_arn = stream_response['StreamDescription']['StreamARN']\n", + "stream_arn = stream_response[\"StreamDescription\"][\"StreamARN\"]\n", "print(stream_arn)" ] }, @@ -180,8 +175,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Stream'.format(region, stream_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Stream'.format(\n", + " region, stream_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -207,7 +208,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb b/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb index 6e99c2c7..7a6faf3f 100644 --- a/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb +++ b/11_stream/archive/11_stream.orig/04_Create_Lambda_Destination.ipynb @@ -38,16 +38,16 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "sts = 
boto3.Session().client(service_name='sts', region_name=region)\n", - "account_id = sts.get_caller_identity()['Account']\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "\n", - "lam = boto3.Session().client(service_name='lambda', region_name=region)" + "lam = boto3.Session().client(service_name=\"lambda\", region_name=region)" ] }, { @@ -75,9 +75,9 @@ "try:\n", " lambda_fn_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -114,9 +114,9 @@ "try:\n", " iam_lambda_role_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -146,9 +146,9 @@ "try:\n", " iam_lambda_role_passed\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -167,11 +167,11 @@ "outputs": [], "source": [ "if not iam_lambda_role_passed:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")\n", "else:\n", - " print('[OK]')" + " print(\"[OK]\")" ] }, { @@ -192,9 +192,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -253,7 +253,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('src/DeliverKinesisAnalyticsToCloudWatch.zip', 'rb') as f: \n", + "with open(\"src/DeliverKinesisAnalyticsToCloudWatch.zip\", \"rb\") as f:\n", " code = f.read()" ] }, @@ -272,27 +272,25 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = lam.create_function(\n", - " FunctionName='{}'.format(lambda_fn_name),\n", - " Runtime='python2.7',\n", - " Role='{}'.format(iam_role_lambda_arn),\n", - " Handler='src/lambda_function.lambda_handler',\n", - " Code={\n", - " 'ZipFile': code\n", - " 
},\n", - " Description='Deliver output records from Kinesis Analytics application to CloudWatch.',\n", + " FunctionName=\"{}\".format(lambda_fn_name),\n", + " Runtime=\"python2.7\",\n", + " Role=\"{}\".format(iam_role_lambda_arn),\n", + " Handler=\"src/lambda_function.lambda_handler\",\n", + " Code={\"ZipFile\": code},\n", + " Description=\"Deliver output records from Kinesis Analytics application to CloudWatch.\",\n", " Timeout=60,\n", " MemorySize=128,\n", - " Publish=True\n", + " Publish=True,\n", " )\n", - " print('Lambda Function {} successfully created.'.format(lambda_fn_name))\n", + " print(\"Lambda Function {} successfully created.\".format(lambda_fn_name))\n", "\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceConflictException':\n", - " print('Lambda Function {} already exists. This is OK.'.format(lambda_fn_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceConflictException\":\n", + " print(\"Lambda Function {} already exists. This is OK.\".format(lambda_fn_name))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -313,7 +311,7 @@ "source": [ "response = lam.get_function(FunctionName=lambda_fn_name)\n", "\n", - "lambda_fn_arn = response['Configuration']['FunctionArn']\n", + "lambda_fn_arn = response[\"Configuration\"][\"FunctionArn\"]\n", "print(lambda_fn_arn)" ] }, @@ -342,8 +340,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Function'.format(region, lambda_fn_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Function'.format(\n", + " region, lambda_fn_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -369,7 +373,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git a/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb b/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb index 7ddc0be0..6c7aae36 100644 --- a/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb +++ b/11_stream/archive/11_stream.orig/05_Create_Kinesis_Data_Analytics_App.ipynb @@ -27,17 +27,17 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sts = boto3.Session().client(service_name='sts', region_name=region)\n", - "account_id = sts.get_caller_identity()['Account']\n", + "sts = boto3.Session().client(service_name=\"sts\", region_name=region)\n", + "account_id = sts.get_caller_identity()[\"Account\"]\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n", - "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n", + "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)" ] }, { @@ -58,9 +58,9 @@ "try:\n", " firehose_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before 
you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -90,9 +90,9 @@ "try:\n", " firehose_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -122,9 +122,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -154,9 +154,9 @@ "try:\n", " stream_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -186,9 +186,9 @@ "try:\n", " lambda_fn_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -218,9 +218,9 @@ "try:\n", " iam_role_lambda_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run all previous notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run all previous notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -252,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "kinesis_data_analytics_app_name = 'dsoaws-kinesis-data-analytics-sql-app'" + "kinesis_data_analytics_app_name = \"dsoaws-kinesis-data-analytics-sql-app\"" ] }, { @@ -261,7 +261,7 @@ "metadata": {}, "outputs": [], "source": [ - "in_app_stream_name = 'firehose_001' # Default" + "in_app_stream_name = \"firehose_001\" # Default" ] }, { @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "sql_code = ''' \\\n", + "sql_code = \"\"\" \\\n", " CREATE OR REPLACE STREAM \"AVG_STAR_RATING_SQL_STREAM\" ( \\\n", " avg_star_rating DOUBLE); \\\n", " CREATE OR REPLACE PUMP \"AVG_STAR_RATING_STREAM_PUMP\" AS \\\n", @@ -317,12 +317,9 @@ " {} \\\n", " ) \\\n", " ); \\\n", - " '''.format(in_app_stream_name, \n", - " in_app_stream_name, \n", - " window_seconds,\n", - " in_app_stream_name, \n", - " in_app_stream_name, \n", - " window_seconds)\n", + " \"\"\".format(\n", 
+ " in_app_stream_name, in_app_stream_name, window_seconds, in_app_stream_name, in_app_stream_name, window_seconds\n", + ")\n", "\n", "print(sql_code)" ] @@ -335,83 +332,59 @@ "source": [ "from botocore.exceptions import ClientError\n", "\n", - "try: \n", + "try:\n", " response = kinesis_analytics.create_application(\n", " ApplicationName=kinesis_data_analytics_app_name,\n", " Inputs=[\n", " {\n", - " 'NamePrefix': 'firehose',\n", - " 'KinesisFirehoseInput': {\n", - " 'ResourceARN': '{}'.format(firehose_arn),\n", - " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n", + " \"NamePrefix\": \"firehose\",\n", + " \"KinesisFirehoseInput\": {\n", + " \"ResourceARN\": \"{}\".format(firehose_arn),\n", + " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n", " },\n", - " 'InputSchema': {\n", - " 'RecordFormat': {\n", - " 'RecordFormatType': 'CSV',\n", - " 'MappingParameters': {\n", - " 'CSVMappingParameters': {\n", - " 'RecordRowDelimiter': '\\n',\n", - " 'RecordColumnDelimiter': '\\t'\n", - " }\n", - " }\n", - " },\n", - " 'RecordColumns': [\n", - " {\n", - " 'Name': 'review_id',\n", - " 'Mapping': 'review_id',\n", - " 'SqlType': 'VARCHAR(14)'\n", - " }, \n", - " {\n", - " 'Name': 'star_rating',\n", - " 'Mapping': 'star_rating',\n", - " 'SqlType': 'INTEGER'\n", + " \"InputSchema\": {\n", + " \"RecordFormat\": {\n", + " \"RecordFormatType\": \"CSV\",\n", + " \"MappingParameters\": {\n", + " \"CSVMappingParameters\": {\"RecordRowDelimiter\": \"\\n\", \"RecordColumnDelimiter\": \"\\t\"}\n", " },\n", - " {\n", - " 'Name': 'product_category',\n", - " 'Mapping': 'product_category',\n", - " 'SqlType': 'VARCHAR(24)'\n", - " }, \n", - " {\n", - " 'Name': 'review_body',\n", - " 'Mapping': 'review_body',\n", - " 'SqlType': 'VARCHAR(65535)'\n", - " } \n", - " ]\n", - " }\n", + " },\n", + " \"RecordColumns\": [\n", + " {\"Name\": \"review_id\", \"Mapping\": \"review_id\", \"SqlType\": \"VARCHAR(14)\"},\n", + " {\"Name\": \"star_rating\", \"Mapping\": \"star_rating\", \"SqlType\": \"INTEGER\"},\n", + " {\"Name\": \"product_category\", \"Mapping\": \"product_category\", \"SqlType\": \"VARCHAR(24)\"},\n", + " {\"Name\": \"review_body\", \"Mapping\": \"review_body\", \"SqlType\": \"VARCHAR(65535)\"},\n", + " ],\n", + " },\n", " },\n", " ],\n", " Outputs=[\n", " {\n", - " 'Name': 'AVG_STAR_RATING_STREAM', \n", - " 'KinesisStreamsOutput': {\n", - " 'ResourceARN': '{}'.format(stream_arn),\n", - " 'RoleARN': '{}'.format(iam_role_kinesis_arn)\n", + " \"Name\": \"AVG_STAR_RATING_STREAM\",\n", + " \"KinesisStreamsOutput\": {\n", + " \"ResourceARN\": \"{}\".format(stream_arn),\n", + " \"RoleARN\": \"{}\".format(iam_role_kinesis_arn),\n", " },\n", - " 'DestinationSchema': {\n", - " 'RecordFormatType': 'CSV'\n", - " }\n", + " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n", " },\n", " {\n", - " 'Name': 'AVG_STAR_RATING_SQL_STREAM',\n", - " 'LambdaOutput': {\n", - " 'ResourceARN': '{}'.format(lambda_fn_arn),\n", - " 'RoleARN': '{}'.format(iam_role_lambda_arn)\n", + " \"Name\": \"AVG_STAR_RATING_SQL_STREAM\",\n", + " \"LambdaOutput\": {\n", + " \"ResourceARN\": \"{}\".format(lambda_fn_arn),\n", + " \"RoleARN\": \"{}\".format(iam_role_lambda_arn),\n", " },\n", - " 'DestinationSchema': {\n", - " 'RecordFormatType': 'CSV'\n", - " }\n", - " }\n", + " \"DestinationSchema\": {\"RecordFormatType\": \"CSV\"},\n", + " },\n", " ],\n", - " ApplicationCode=sql_code\n", + " ApplicationCode=sql_code,\n", " )\n", - " print('SQL application {} successfully created.'.format(kinesis_data_analytics_app_name))\n", + " print(\"SQL 
application {} successfully created.\".format(kinesis_data_analytics_app_name))\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('SQL App {} already exists.'.format(kinesis_data_analytics_app_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"SQL App {} already exists.\".format(kinesis_data_analytics_app_name))\n", " else:\n", - " print('Unexpected error: %s' % e)\n", - " " + " print(\"Unexpected error: %s\" % e)" ] }, { @@ -432,7 +405,7 @@ "metadata": {}, "outputs": [], "source": [ - "input_id = response['ApplicationDetail']['InputDescriptions'][0]['InputId']\n", + "input_id = response[\"ApplicationDetail\"][\"InputDescriptions\"][0][\"InputId\"]\n", "print(input_id)" ] }, @@ -449,24 +422,17 @@ "metadata": {}, "outputs": [], "source": [ - "try: \n", + "try:\n", " response = kinesis_analytics.start_application(\n", " ApplicationName=kinesis_data_analytics_app_name,\n", - " InputConfigurations=[\n", - " {\n", - " 'Id': input_id,\n", - " 'InputStartingPositionConfiguration': {\n", - " 'InputStartingPosition': 'NOW'\n", - " }\n", - " }\n", - " ]\n", + " InputConfigurations=[{\"Id\": input_id, \"InputStartingPositionConfiguration\": {\"InputStartingPosition\": \"NOW\"}}],\n", " )\n", " print(json.dumps(response, indent=4, sort_keys=True, default=str))\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceInUseException':\n", - " print('Application {} is already starting.'.format(kinesis_data_analytics_app_name))\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceInUseException\":\n", + " print(\"Application {} is already starting.\".format(kinesis_data_analytics_app_name))\n", " else:\n", - " print('Error: {}'.format(e))" + " print(\"Error: {}\".format(e))" ] }, { @@ -492,8 +458,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -515,17 +487,16 @@ "\n", "import time\n", "\n", - "app_status = response['ApplicationDetail']['ApplicationStatus']\n", - "print('Application status {}'.format(app_status))\n", + "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + "print(\"Application status {}\".format(app_status))\n", "\n", - "while app_status != 'RUNNING':\n", + "while app_status != \"RUNNING\":\n", " time.sleep(5)\n", - " response = kinesis_analytics.describe_application(\n", - " ApplicationName=kinesis_data_analytics_app_name)\n", - " app_status = response['ApplicationDetail']['ApplicationStatus']\n", - " print('Application status {}'.format(app_status))\n", + " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n", + " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + " print(\"Application status {}\".format(app_status))\n", "\n", - "print('Application status {}'.format(app_status))" + "print(\"Application status {}\".format(app_status))" ] }, { @@ -560,7 +531,7 @@ "outputs": [], "source": [ "%%javascript\n", - "Jupyter.notebook.save_checkpoint();\n", + "Jupyter.notebook.save_checkpoint()\n", "Jupyter.notebook.session.delete();" ] } diff --git 
a/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb b/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb index d2196329..afb6adf5 100644 --- a/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb +++ b/11_stream/archive/11_stream.orig/06_Put_Reviews_On_Kinesis_Data_Firehose.ipynb @@ -25,14 +25,14 @@ "import pandas as pd\n", "import json\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "firehose = boto3.Session().client(service_name='firehose', region_name=region)\n", - "kinesis_analytics = boto3.Session().client(service_name='kinesisanalytics', region_name=region)\n" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "firehose = boto3.Session().client(service_name=\"firehose\", region_name=region)\n", + "kinesis_analytics = boto3.Session().client(service_name=\"kinesisanalytics\", region_name=region)" ] }, { @@ -53,9 +53,9 @@ "try:\n", " firehose_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -85,9 +85,9 @@ "try:\n", " firehose_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -117,9 +117,9 @@ "try:\n", " iam_role_kinesis_arn\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -149,9 +149,9 @@ "try:\n", " kinesis_data_analytics_app_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -181,9 +181,9 @@ "try:\n", " lambda_fn_name\n", "except NameError:\n", - " print('+++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in this section before you continue.')\n", - " print('+++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in this section before you continue.\")\n", + " print(\"+++++++++++++++++++++++++++++++\")" ] }, { @@ -201,7 +201,7 @@ "metadata": {}, "outputs": [], "source": [ - "firehoses = 
firehose.list_delivery_streams(DeliveryStreamType='DirectPut')\n", + "firehoses = firehose.list_delivery_streams(DeliveryStreamType=\"DirectPut\")\n", "\n", "print(json.dumps(firehoses, indent=4, sort_keys=True, default=str))" ] @@ -231,10 +231,12 @@ "import csv\n", "import pandas as pd\n", "\n", - "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', \n", - " delimiter='\\t', \n", - " quoting=csv.QUOTE_NONE,\n", - " compression='gzip')\n", + "df = pd.read_csv(\n", + " \"./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz\",\n", + " delimiter=\"\\t\",\n", + " quoting=csv.QUOTE_NONE,\n", + " compression=\"gzip\",\n", + ")\n", "df.shape" ] }, @@ -255,14 +257,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_star_rating_and_review_body = df[['review_id', \n", - " 'star_rating', \n", - " 'product_category', \n", - " 'review_body']][0:1]\n", + "df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][0:1]\n", "\n", - "df_star_rating_and_review_body.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)" + "df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)" ] }, { @@ -291,16 +288,15 @@ "\n", "import time\n", "\n", - "app_status = response['ApplicationDetail']['ApplicationStatus']\n", + "app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", "\n", - "while app_status != 'RUNNING':\n", + "while app_status != \"RUNNING\":\n", " time.sleep(5)\n", - " response = kinesis_analytics.describe_application(\n", - " ApplicationName=kinesis_data_analytics_app_name)\n", - " app_status = response['ApplicationDetail']['ApplicationStatus']\n", - " print('Application status {}'.format(app_status))\n", + " response = kinesis_analytics.describe_application(ApplicationName=kinesis_data_analytics_app_name)\n", + " app_status = response[\"ApplicationDetail\"][\"ApplicationStatus\"]\n", + " print(\"Application status {}\".format(app_status))\n", "\n", - "print('Application status {}'.format(app_status))" + "print(\"Application status {}\".format(app_status))" ] }, { @@ -331,8 +327,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Lambda Logs'.format(region, lambda_fn_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Lambda Logs'.format(\n", + " region, lambda_fn_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -356,8 +358,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML(\"\"\"Review CloudWatch Metrics\"\"\".format(region, region)))\n" + "\n", + "display(\n", + " HTML(\n", + " \"\"\"Review CloudWatch Metrics\"\"\".format(\n", + " region, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -381,8 +389,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -407,9 +421,7 @@ }, "outputs": [], "source": [ - "firehose_response = firehose.describe_delivery_stream(\n", - " DeliveryStreamName=firehose_name\n", - ")\n", + "firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)\n", "\n", "print(json.dumps(firehose_response, indent=4, sort_keys=True, default=str))" ] @@ -424,23 +436,15 @@ "for start_idx in range(0, 10000, 
step):\n", " end_idx = start_idx + step\n", "\n", - " df_star_rating_and_review_body = df[['review_id', \n", - " 'star_rating', \n", - " 'product_category', \n", - " 'review_body']][start_idx:end_idx]\n", + " df_star_rating_and_review_body = df[[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"]][\n", + " start_idx:end_idx\n", + " ]\n", + "\n", + " reviews_tsv = df_star_rating_and_review_body.to_csv(sep=\"\\t\", header=None, index=False)\n", "\n", - " reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)\n", - " \n", " # print(reviews_tsv.encode('utf-8'))\n", - " \n", - " response = firehose.put_record( \n", - " Record={\n", - " 'Data': reviews_tsv.encode('utf-8')\n", - " },\n", - " DeliveryStreamName=firehose_name\n", - " )" + "\n", + " response = firehose.put_record(Record={\"Data\": reviews_tsv.encode(\"utf-8\")}, DeliveryStreamName=firehose_name)" ] }, { @@ -450,8 +454,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Review Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Review Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -486,8 +496,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To UI Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To UI Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -546,8 +562,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To Kinesis Data Analytics App'.format(region, kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -570,22 +592,22 @@ "for start_idx in range(0, 10000, anomaly_step):\n", " timestamp = int(time.time())\n", "\n", - " df_anomalies = pd.DataFrame([\n", - " {'review_id': str(timestamp), \n", - " 'star_rating': 100, \n", - " 'product_category': 'Digital_Software', \n", - " 'review_body': 'blahblah'}, \n", - " ], columns=['review_id', 'star_rating', 'product_category', 'review_body'])\n", + " df_anomalies = pd.DataFrame(\n", + " [\n", + " {\n", + " \"review_id\": str(timestamp),\n", + " \"star_rating\": 100,\n", + " \"product_category\": \"Digital_Software\",\n", + " \"review_body\": \"blahblah\",\n", + " },\n", + " ],\n", + " columns=[\"review_id\", \"star_rating\", \"product_category\", \"review_body\"],\n", + " )\n", + "\n", + " reviews_tsv_anomalies = df_anomalies.to_csv(sep=\"\\t\", header=None, index=False)\n", "\n", - " reviews_tsv_anomalies = df_anomalies.to_csv(sep='\\t',\n", - " header=None,\n", - " index=False)\n", - " \n", - " response = firehose.put_record( \n", - " Record={\n", - " 'Data': reviews_tsv_anomalies.encode('utf-8')\n", - " },\n", - " DeliveryStreamName=firehose_name\n", + " response = firehose.put_record(\n", + " Record={\"Data\": reviews_tsv_anomalies.encode(\"utf-8\")}, DeliveryStreamName=firehose_name\n", " )" ] }, @@ -596,8 +618,14 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", - " \n", - "display(HTML('Go To Kinesis Data Analytics App'.format(region, 
kinesis_data_analytics_app_name)))\n" + "\n", + "display(\n", + " HTML(\n", + " 'Go To Kinesis Data Analytics App'.format(\n", + " region, kinesis_data_analytics_app_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -621,8 +649,8 @@ "outputs": [], "source": [ "#%%javascript\n", - "#Jupyter.notebook.save_checkpoint();\n", - "#Jupyter.notebook.session.delete();" + "# Jupyter.notebook.save_checkpoint();\n", + "# Jupyter.notebook.session.delete();" ] } ], diff --git a/11_stream/archive/11_stream.orig/src/lambda_function.py b/11_stream/archive/11_stream.orig/src/lambda_function.py index 889896bf..a867f165 100644 --- a/11_stream/archive/11_stream.orig/src/lambda_function.py +++ b/11_stream/archive/11_stream.orig/src/lambda_function.py @@ -10,51 +10,51 @@ logger = logging.getLogger() logger.setLevel(logging.INFO) -client = boto3.client('cloudwatch') +client = boto3.client("cloudwatch") + def lambda_handler(event, context): output = [] success = 0 failure = 0 - for record in event['records']: + for record in event["records"]: try: - #logger.info(f'event: {event}') - payload = base64.b64decode(record['data']) + # logger.info(f'event: {event}') + payload = base64.b64decode(record["data"]) datapoint = float(payload) # logger.info(f'avg_star_rating: {payload}') client.put_metric_data( - Namespace='kinesis/analytics/AVGStarRating', + Namespace="kinesis/analytics/AVGStarRating", MetricData=[ { - 'MetricName': 'AVGStarRating', - 'Dimensions': [ - { - 'Name': 'Product Category', - 'Value': 'All' - }, + "MetricName": "AVGStarRating", + "Dimensions": [ + {"Name": "Product Category", "Value": "All"}, ], - 'Value': datapoint, - 'StorageResolution': 1 + "Value": datapoint, + "StorageResolution": 1, } - ] + ], ) - output.append({'recordId': record['recordId'], 'result': 'Ok'}) + output.append({"recordId": record["recordId"], "result": "Ok"}) success += 1 print(datapoint) - + except Exception as exp: - output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'}) + output.append({"recordId": record["recordId"], "result": "DeliveryFailed"}) failure += 1 exception_type, exception_value, exception_traceback = sys.exc_info() traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback) - err_msg = json.dumps({ - "errorType": exception_type.__name__, - "errorMessage": str(exception_value), - "stackTrace": traceback_string - }) + err_msg = json.dumps( + { + "errorType": exception_type.__name__, + "errorMessage": str(exception_value), + "stackTrace": traceback_string, + } + ) logger.error(err_msg) - print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure)) - return {'records': output} \ No newline at end of file + print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure)) + return {"records": output} diff --git a/11_stream/src/deliver_metrics_to_cloudwatch.py b/11_stream/src/deliver_metrics_to_cloudwatch.py index 889896bf..a867f165 100644 --- a/11_stream/src/deliver_metrics_to_cloudwatch.py +++ b/11_stream/src/deliver_metrics_to_cloudwatch.py @@ -10,51 +10,51 @@ logger = logging.getLogger() logger.setLevel(logging.INFO) -client = boto3.client('cloudwatch') +client = boto3.client("cloudwatch") + def lambda_handler(event, context): output = [] success = 0 failure = 0 - for record in event['records']: + for record in event["records"]: try: - #logger.info(f'event: {event}') - payload = base64.b64decode(record['data']) + # logger.info(f'event: {event}') + payload = 
base64.b64decode(record["data"]) datapoint = float(payload) # logger.info(f'avg_star_rating: {payload}') client.put_metric_data( - Namespace='kinesis/analytics/AVGStarRating', + Namespace="kinesis/analytics/AVGStarRating", MetricData=[ { - 'MetricName': 'AVGStarRating', - 'Dimensions': [ - { - 'Name': 'Product Category', - 'Value': 'All' - }, + "MetricName": "AVGStarRating", + "Dimensions": [ + {"Name": "Product Category", "Value": "All"}, ], - 'Value': datapoint, - 'StorageResolution': 1 + "Value": datapoint, + "StorageResolution": 1, } - ] + ], ) - output.append({'recordId': record['recordId'], 'result': 'Ok'}) + output.append({"recordId": record["recordId"], "result": "Ok"}) success += 1 print(datapoint) - + except Exception as exp: - output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'}) + output.append({"recordId": record["recordId"], "result": "DeliveryFailed"}) failure += 1 exception_type, exception_value, exception_traceback = sys.exc_info() traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback) - err_msg = json.dumps({ - "errorType": exception_type.__name__, - "errorMessage": str(exception_value), - "stackTrace": traceback_string - }) + err_msg = json.dumps( + { + "errorType": exception_type.__name__, + "errorMessage": str(exception_value), + "stackTrace": traceback_string, + } + ) logger.error(err_msg) - print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure)) - return {'records': output} \ No newline at end of file + print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure)) + return {"records": output} diff --git a/11_stream/src/invoke_sm_endpoint_from_kinesis.py b/11_stream/src/invoke_sm_endpoint_from_kinesis.py index 2994ec12..3fc01ed3 100644 --- a/11_stream/src/invoke_sm_endpoint_from_kinesis.py +++ b/11_stream/src/invoke_sm_endpoint_from_kinesis.py @@ -7,26 +7,27 @@ import json # grab environment variables -ENDPOINT_NAME = os.environ['ENDPOINT_NAME'] -print('Endpoint: {}'.format(ENDPOINT_NAME)) -runtime = boto3.client('runtime.sagemaker') +ENDPOINT_NAME = os.environ["ENDPOINT_NAME"] +print("Endpoint: {}".format(ENDPOINT_NAME)) +runtime = boto3.client("runtime.sagemaker") + +print("Loading function") -print('Loading function') def lambda_handler(event, context): outputs = [] - - r = event['records'] - print('records: {}'.format(r)) - print('type_records: {}'.format(type(r))) - + + r = event["records"] + print("records: {}".format(r)) + print("type_records: {}".format(type(r))) + # TODO: Handle batches - for record in event['records']: - print(record['recordId']) - payload = base64.b64decode(record['data']) - print('payload: {}'.format(payload)) + for record in event["records"]: + print(record["recordId"]) + payload = base64.b64decode(record["data"]) + print("payload: {}".format(payload)) text = payload.decode("utf-8") - print('text: {}'.format(text)) + print("text: {}".format(text)) # Do custom processing on the payload here split_inputs = text.split("\t") @@ -34,44 +35,44 @@ def lambda_handler(event, context): print(split_inputs) review_body = split_inputs[2] print(review_body) - - inputs = [ - {"features": [review_body]} - ] + + inputs = [{"features": [review_body]}] response = runtime.invoke_endpoint( - EndpointName=pytorch_endpoint_name, - ContentType='application/jsonlines', - Accept='application/jsonlines', - Body=json.dumps(inputs).encode('utf-8') + EndpointName=ENDPOINT_NAME, + 
ContentType="application/jsonlines", + Accept="application/jsonlines", + Body=json.dumps(inputs).encode("utf-8"), ) - print('response: {}'.format(response)) + print("response: {}".format(response)) - predicted_classes_str = response['Body'].read().decode() + predicted_classes_str = response["Body"].read().decode() predicted_classes_json = json.loads(predicted_classes_str) predicted_classes = predicted_classes_json.splitlines() - print('predicted_classes: {}'.format(predicted_classes)) + print("predicted_classes: {}".format(predicted_classes)) for predicted_class_json, input_data in zip(predicted_classes, inputs): - predicted_class = json.loads(predicted_class_json)['predicted_label'] - print('Predicted star_rating: {} for review_body "{}"'.format(predicted_class, input_data["features"][0])) + predicted_class = json.loads(predicted_class_json)["predicted_label"] + print('Predicted star_rating: {} for review_body "{}"'.format(predicted_class, input_data["features"][0])) # Built output_record # review_id, star_rating, product_category, review_body - output_data = '{}\t{}\t{}\t{}'.format(split_inputs[0], str(predicted_class), split_inputs[1], input_data["review_body"]) - print('output_data: {}'.format(output_data)) - output_data_encoded = output_data.encode('utf-8') + output_data = "{}\t{}\t{}\t{}".format( + split_inputs[0], str(predicted_class), split_inputs[1], input_data["review_body"] + ) + print("output_data: {}".format(output_data)) + output_data_encoded = output_data.encode("utf-8") output_record = { - 'recordId': record['recordId'], - 'result': 'Ok', - 'data': base64.b64encode(output_data_encoded).decode('utf-8') + "recordId": record["recordId"], + "result": "Ok", + "data": base64.b64encode(output_data_encoded).decode("utf-8"), } outputs.append(output_record) - print('Successfully processed {} records.'.format(len(event['records']))) - print('type(output): {}'.format(type(outputs))) - print('Output Length: {} .'.format(len(outputs))) + print("Successfully processed {} records.".format(len(event["records"]))) + print("type(output): {}".format(type(outputs))) + print("Output Length: {} .".format(len(outputs))) - return {'records': outputs} \ No newline at end of file + return {"records": outputs} diff --git a/11_stream/src/push_notification_to_sns.py b/11_stream/src/push_notification_to_sns.py index e2d31cc7..3d9bb587 100644 --- a/11_stream/src/push_notification_to_sns.py +++ b/11_stream/src/push_notification_to_sns.py @@ -3,11 +3,12 @@ import base64 import os -SNS_TOPIC_ARN = os.environ['SNS_TOPIC_ARN'] +SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"] -sns = boto3.client('sns') +sns = boto3.client("sns") + +print("Loading function") -print('Loading function') def lambda_handler(event, context): output = [] @@ -15,30 +16,34 @@ def lambda_handler(event, context): failure = 0 highest_score = 0 - print('event: {}'.format(event)) - r = event['records'] - print('records: {}'.format(r)) - print('type_records: {}'.format(type(r))) - - for record in event['records']: + print("event: {}".format(event)) + r = event["records"] + print("records: {}".format(r)) + print("type_records: {}".format(type(r))) + + for record in event["records"]: try: # Uncomment the below line to publish the decoded data to the SNS topic. 
- payload = base64.b64decode(record['data']) - print('payload: {}'.format(payload)) + payload = base64.b64decode(record["data"]) + print("payload: {}".format(payload)) text = payload.decode("utf-8") - print('text: {}'.format(text)) + print("text: {}".format(text)) score = float(text) if (score != 0) and (score > highest_score): highest_score = score - print('New highest_score: {}'.format(highest_score)) + print("New highest_score: {}".format(highest_score)) # sns.publish(TopicArn=SNS_TOPIC_ARN, Message='New anomaly score: {}'.format(text), Subject='New Reviews Anomaly Score Detected') - output.append({'recordId': record['recordId'], 'result': 'Ok'}) + output.append({"recordId": record["recordId"], "result": "Ok"}) success += 1 except Exception as e: print(e) - output.append({'recordId': record['recordId'], 'result': 'DeliveryFailed'}) + output.append({"recordId": record["recordId"], "result": "DeliveryFailed"}) failure += 1 - if (highest_score != 0): - sns.publish(TopicArn=SNS_TOPIC_ARN, Message='New anomaly score: {}'.format(str(highest_score)), Subject='New Reviews Anomaly Score Detected') - print('Successfully delivered {0} records, failed to deliver {1} records'.format(success, failure)) - return {'records': output} \ No newline at end of file + if highest_score != 0: + sns.publish( + TopicArn=SNS_TOPIC_ARN, + Message="New anomaly score: {}".format(str(highest_score)), + Subject="New Reviews Anomaly Score Detected", + ) + print("Successfully delivered {0} records, failed to deliver {1} records".format(success, failure)) + return {"records": output} diff --git a/12_security/01_Secrets_Manager.ipynb b/12_security/01_Secrets_Manager.ipynb index bffc6d57..5a5e4dc6 100644 --- a/12_security/01_Secrets_Manager.ipynb +++ b/12_security/01_Secrets_Manager.ipynb @@ -19,12 +19,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "secretsmanager = boto3.client('secretsmanager')" + "secretsmanager = boto3.client(\"secretsmanager\")" ] }, { @@ -37,18 +37,15 @@ "\n", "try:\n", " response = secretsmanager.create_secret(\n", - " Name='dsoaws_redshift_login',\n", - " Description='DSOAWS Redshift Login',\n", + " Name=\"dsoaws_redshift_login\",\n", + " Description=\"DSOAWS Redshift Login\",\n", " SecretString='[{\"username\":\"dsoaws\"},{\"password\":\"Password9\"}]',\n", " Tags=[\n", - " {\n", - " 'Key': 'name',\n", - " 'Value': 'dsoaws_redshift_login'\n", - " },\n", - " ]\n", + " {\"Key\": \"name\", \"Value\": \"dsoaws_redshift_login\"},\n", + " ],\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'ResourceExistsException':\n", + " if e.response[\"Error\"][\"Code\"] == \"ResourceExistsException\":\n", " print(\"Secret already exists. 
This is ok.\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)" @@ -62,14 +59,14 @@ "source": [ "import json\n", "\n", - "secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')\n", - "cred = json.loads(secret['SecretString'])\n", + "secret = secretsmanager.get_secret_value(SecretId=\"dsoaws_redshift_login\")\n", + "cred = json.loads(secret[\"SecretString\"])\n", "\n", - "redshift_username = cred[0]['username']\n", - "redshift_pw = cred[1]['password']\n", + "redshift_username = cred[0][\"username\"]\n", + "redshift_pw = cred[1][\"password\"]\n", "\n", - "print('redshift_username: {}'.format(redshift_username))\n", - "print('redshift_pw: {}'.format(redshift_pw))" + "print(\"redshift_username: {}\".format(redshift_username))\n", + "print(\"redshift_pw: {}\".format(redshift_pw))" ] }, { diff --git a/12_security/02_Insecure_DataAccess_S3.ipynb b/12_security/02_Insecure_DataAccess_S3.ipynb index c2b1b38c..26c6559f 100644 --- a/12_security/02_Insecure_DataAccess_S3.ipynb +++ b/12_security/02_Insecure_DataAccess_S3.ipynb @@ -10,12 +10,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { diff --git a/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb b/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb index 2d4588c3..dceedfa9 100644 --- a/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb +++ b/12_security/03_Secure_DataAccess_S3_BucketPolicy_Role.ipynb @@ -17,13 +17,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)" ] }, { @@ -36,7 +36,7 @@ "\n", "timestamp = int(time.time())\n", "\n", - "bucket_secure_name = 'bucket-secure-bucket-policy-role-{}'.format(timestamp)\n", + "bucket_secure_name = \"bucket-secure-bucket-policy-role-{}\".format(timestamp)\n", "print(bucket_secure_name)" ] }, @@ -66,19 +66,17 @@ "source": [ "# Create the bucket policy\n", "bucket_policy_deny = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Deny',\n", - " 'Principal': '*',\n", - " 'Action': [\n", - " 's3:ListBucket'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Sid\": \"\",\n", + " \"Effect\": \"Deny\",\n", + " \"Principal\": \"*\",\n", + " \"Action\": [\"s3:ListBucket\"],\n", + " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n", + " }\n", + " ],\n", + "}" ] }, { diff --git a/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb b/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb index 8db2ebdb..b36dc50c 100644 --- a/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb +++ 
b/12_security/03a_Secure_DataAccess_S3_BucketPolicy_VPC.ipynb @@ -17,14 +17,14 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)\n", - "ec2 = boto3.Session().client(service_name='ec2', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n", + "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)" ] }, { @@ -37,7 +37,7 @@ "\n", "timestamp = int(time.time())\n", "\n", - "bucket_secure_name = 'bucket-secure-bucket-policy-vpc-{}'.format(timestamp)\n", + "bucket_secure_name = \"bucket-secure-bucket-policy-vpc-{}\".format(timestamp)\n", "print(bucket_secure_name)" ] }, @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "different_vpc_id='blah'" + "different_vpc_id = \"blah\"" ] }, { @@ -76,7 +76,7 @@ "source": [ "all_vpcs = ec2.describe_vpcs()\n", "\n", - "vpc_id=all_vpcs['Vpcs'][0]['VpcId']\n", + "vpc_id = all_vpcs[\"Vpcs\"][0][\"VpcId\"]\n", "\n", "print(vpc_id)" ] @@ -89,26 +89,22 @@ "source": [ "# Create the bucket policy\n", "bucket_policy_deny = {\n", - " \"Version\": \"2008-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Deny\",\n", - " \"Principal\": \"*\",\n", - " \"Action\": [\n", - " \"s3:ListBucket\"\n", - " ],\n", - " \"Resource\": [\n", - " \"arn:aws:s3:::{}\".format(bucket_secure_name)\n", - " ],\n", - " \"Condition\": {\n", - " \"StringNotEquals\": {\n", - "# \"aws:sourceVpc\": different_vpc_id\n", - " \"aws:sourceVpc\": vpc_id\n", - " }\n", + " \"Version\": \"2008-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Effect\": \"Deny\",\n", + " \"Principal\": \"*\",\n", + " \"Action\": [\"s3:ListBucket\"],\n", + " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n", + " \"Condition\": {\n", + " \"StringNotEquals\": {\n", + " # \"aws:sourceVpc\": different_vpc_id\n", + " \"aws:sourceVpc\": vpc_id\n", " }\n", - " }\n", - " ]\n", - " }" + " },\n", + " }\n", + " ],\n", + "}" ] }, { @@ -127,8 +123,7 @@ "import json\n", "import time\n", "\n", - "response = s3.put_bucket_policy(Bucket=bucket_secure_name, \n", - " Policy=json.dumps(bucket_policy_deny))\n", + "response = s3.put_bucket_policy(Bucket=bucket_secure_name, Policy=json.dumps(bucket_policy_deny))\n", "\n", "print(response)\n", "\n", diff --git a/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb b/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb index 4bcf0548..e5f7706c 100644 --- a/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb +++ b/12_security/04_Secure_DataAccess_S3_IAMPolicy_Role.ipynb @@ -16,13 +16,13 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -31,7 +31,7 @@ "metadata": 
{}, "outputs": [], "source": [ - "role_name = role.split('/')[-1]\n", + "role_name = role.split(\"/\")[-1]\n", "print(role_name)" ] }, @@ -42,6 +42,7 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())" ] }, @@ -51,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "bucket_secure_name = 'bucket-secure-iam-policy-role-{}'.format(timestamp)\n", + "bucket_secure_name = \"bucket-secure-iam-policy-role-{}\".format(timestamp)\n", "print(bucket_secure_name)" ] }, @@ -79,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_name='DSOAWS_SecureBucket_Policy_IAM_{}'.format(timestamp)\n", + "policy_name = \"DSOAWS_SecureBucket_Policy_IAM_{}\".format(timestamp)\n", "print(policy_name)" ] }, @@ -91,18 +92,16 @@ "source": [ "# Create the IAM policy\n", "iam_policy_deny = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Deny',\n", - " 'Action': [\n", - " 's3:ListBucket'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Sid\": \"\",\n", + " \"Effect\": \"Deny\",\n", + " \"Action\": [\"s3:ListBucket\"],\n", + " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n", + " }\n", + " ],\n", + "}" ] }, { @@ -113,13 +112,9 @@ }, "outputs": [], "source": [ - "import json \n", + "import json\n", "\n", - "response = iam.put_role_policy(\n", - " RoleName=role_name,\n", - " PolicyName=policy_name,\n", - " PolicyDocument=json.dumps(iam_policy_deny)\n", - ")\n", + "response = iam.put_role_policy(RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(iam_policy_deny))\n", "\n", "print(response)\n", "\n", @@ -159,10 +154,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=role_name,\n", - " PolicyName=policy_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=role_name, PolicyName=policy_name)\n", "print(response)\n", "\n", "time.sleep(30)" diff --git a/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb b/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb index fc23205e..66dcc3d3 100644 --- a/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb +++ b/12_security/04a_Secure_DataAccess_S3_IAMPolicy_VPC.ipynb @@ -16,13 +16,13 @@ "import boto3\n", "import sagemaker\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "s3 = boto3.Session().client(service_name='s3', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "s3 = boto3.Session().client(service_name=\"s3\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "role_name = role.split('/')[-1]\n", + "role_name = role.split(\"/\")[-1]\n", "print(role_name)" ] }, @@ -42,6 +42,7 @@ "outputs": [], "source": [ "import time\n", + "\n", "timestamp = int(time.time())" ] }, @@ -51,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "bucket_secure_name = 'bucket-secure-iam-policy-vpc-{}'.format(timestamp)\n", + "bucket_secure_name = \"bucket-secure-iam-policy-vpc-{}\".format(timestamp)\n", "print(bucket_secure_name)" ] }, @@ -79,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"policy_name='DSOAWS_Secure_IAMPolicy_VPC_{}'.format(timestamp)\n", + "policy_name = \"DSOAWS_Secure_IAMPolicy_VPC_{}\".format(timestamp)\n", "print(policy_name)" ] }, @@ -89,7 +90,7 @@ "metadata": {}, "outputs": [], "source": [ - "different_vpc_id='blah'" + "different_vpc_id = \"blah\"" ] }, { @@ -100,24 +101,17 @@ "source": [ "# Create the IAM policy\n", "iam_policy_deny = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Deny',\n", - " 'Action': [\n", - " 's3:ListBucket'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket_secure_name)\n", - " ],\n", - " 'Condition': {\n", - " 'StringNotEquals': {\n", - " 'aws:sourceVpc': different_vpc_id\n", - " }\n", - " }\n", - " \n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Sid\": \"\",\n", + " \"Effect\": \"Deny\",\n", + " \"Action\": [\"s3:ListBucket\"],\n", + " \"Resource\": [\"arn:aws:s3:::{}\".format(bucket_secure_name)],\n", + " \"Condition\": {\"StringNotEquals\": {\"aws:sourceVpc\": different_vpc_id}},\n", + " }\n", + " ],\n", + "}" ] }, { @@ -126,13 +120,9 @@ "metadata": {}, "outputs": [], "source": [ - "import json \n", + "import json\n", "\n", - "response = iam.put_role_policy(\n", - " RoleName=role_name,\n", - " PolicyName=policy_name,\n", - " PolicyDocument=json.dumps(iam_policy_deny)\n", - ")\n", + "response = iam.put_role_policy(RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(iam_policy_deny))\n", "\n", "print(response)\n", "\n", @@ -163,10 +153,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=role_name,\n", - " PolicyName=policy_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=role_name, PolicyName=policy_name)\n", "print(response)\n", "\n", "time.sleep(30)" diff --git a/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb b/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb index 6fc42392..1709f5e9 100644 --- a/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb +++ b/12_security/05_Secure_SageMaker_Notebook_Instance.ipynb @@ -19,17 +19,17 @@ "outputs": [], "source": [ "sm.create_notebook_instance(\n", - " NotebookInstanceName='dsoaws',\n", - " InstanceType='ml.t3.medium',\n", - " SubnetId='',\n", + " NotebookInstanceName=\"dsoaws\",\n", + " InstanceType=\"ml.t3.medium\",\n", + " SubnetId=\"\",\n", " SecurityGroupIds=[\n", - " '',\n", + " \"\",\n", " ],\n", - " RoleArn='arn:aws:iam:::role/service-role/',\n", - " KmsKeyId='',\n", - " DirectInternetAccess='Disabled',\n", + " RoleArn=\"arn:aws:iam:::role/service-role/\",\n", + " KmsKeyId=\"\",\n", + " DirectInternetAccess=\"Disabled\",\n", " VolumeSizeInGB=10,\n", - " RootAccess='Disabled'\n", + " RootAccess=\"Disabled\",\n", ")" ] }, diff --git a/12_security/07_Insecure_Train.ipynb b/12_security/07_Insecure_Train.ipynb index b40992be..6a9439af 100644 --- a/12_security/07_Insecure_Train.ipynb +++ b/12_security/07_Insecure_Train.ipynb @@ -10,12 +10,12 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)" ] }, { @@ -36,9 +36,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " 
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -68,9 +68,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -100,9 +100,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -192,12 +192,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -226,28 +223,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='Pipe'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + 
"train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"Pipe\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -257,10 +254,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -280,38 +277,41 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - "# max_run=7200, # number of seconds\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " 
\"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " # max_run=7200, # number of seconds\n", + ")" ] }, { @@ -327,11 +327,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -341,7 +340,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -352,7 +351,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -363,7 +368,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -374,7 +385,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/08_Secure_Train_IAMPolicy_Role.ipynb b/12_security/08_Secure_Train_IAMPolicy_Role.ipynb index d9ab9bc6..6e4f97b4 100644 --- a/12_security/08_Secure_Train_IAMPolicy_Role.ipynb +++ b/12_security/08_Secure_Train_IAMPolicy_Role.ipynb @@ -10,13 +10,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -37,9 +37,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " 
print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -69,9 +69,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -101,9 +101,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -193,12 +193,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -227,28 +224,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='Pipe'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 
1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"Pipe\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -258,10 +255,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -280,17 +277,11 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"sagemaker.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n", + " ],\n", + "}" ] }, { @@ -310,7 +301,7 @@ "metadata": {}, "outputs": [], "source": [ - "secure_iam_role_name = 'DSOAWS_Secure_Train_Role_{}'.format(timestamp)" + "secure_iam_role_name = \"DSOAWS_Secure_Train_Role_{}\".format(timestamp)" ] }, { @@ -328,12 +319,12 @@ " secure_iam_role = iam.create_role(\n", " RoleName=secure_iam_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Secure Role'\n", + " Description=\"DSOAWS Secure Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role = iam.get_role(RoleName=secure_iam_role_name)\n", - "# print(\"Role already exists\")\n", + " # print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)\n", "\n", @@ -349,18 +340,9 @@ "outputs": [], "source": [ "iam_policy_allow_s3 = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Allow',\n", - " 'Action': [\n", - " 's3:*'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n", + "}" ] }, { @@ -369,7 +351,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)" + "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)" ] }, { @@ -381,9 +363,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name,\n", - " PolicyDocument=json.dumps(iam_policy_allow_s3)\n", + " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n", ")\n", "\n", "print(response)\n", @@ -406,11 +386,9 @@ " \"Action\": 
[\n", " \"sagemaker:CreateTrainingJob\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ]\n", + " \"Resource\": [\"*\"],\n", " }\n", - " ]\n", + " ],\n", "}" ] }, @@ -420,8 +398,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_Role_{}'.format(timestamp)\n", - "\n" + "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_Role_{}\".format(timestamp)" ] }, { @@ -432,38 +409,41 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=secure_iam_role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - "# max_run=7200, # number of seconds\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=secure_iam_role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " # max_run=7200, # number of seconds\n", + ")" ] }, { @@ -479,11 +459,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, 
\n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -493,7 +472,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -504,7 +483,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -515,7 +500,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -526,7 +517,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -576,10 +573,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_deny_create_training_job_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n", "print(response)\n", "\n", "time.sleep(30)" @@ -591,10 +585,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n", "print(response)\n", "\n", "time.sleep(30)" diff --git a/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb b/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb index b4d93d0a..d9c3d6c1 100644 --- a/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb +++ b/12_security/08a_Secure_Train_IAMPolicy_VPC.ipynb @@ -11,14 +11,14 @@ "import pandas as pd\n", "\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "ec2 = boto3.Session().client(service_name='ec2', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)" ] }, { @@ -39,9 +39,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " 
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -71,9 +71,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -103,9 +103,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -195,12 +195,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -229,29 +226,29 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "#input_mode='Pipe'\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps 
= 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "# input_mode='Pipe'\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -261,10 +258,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -283,17 +280,11 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"sagemaker.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n", + " ],\n", + "}" ] }, { @@ -313,7 +304,7 @@ "metadata": {}, "outputs": [], "source": [ - "secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)" + "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)" ] }, { @@ -331,12 +322,12 @@ " secure_iam_role = iam.create_role(\n", " RoleName=secure_iam_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Secure Role'\n", + " Description=\"DSOAWS Secure Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role = iam.get_role(RoleName=secure_iam_role_name)\n", - "# print(\"Role already exists\")\n", + " # print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)\n", "\n", @@ -359,18 +350,9 @@ "outputs": [], "source": [ "iam_policy_allow_s3 = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Allow',\n", - " 'Action': [\n", - " 's3:*'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n", + "}" ] }, { @@ -379,7 +361,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)" + "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)" ] }, { @@ -391,9 +373,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name,\n", - " PolicyDocument=json.dumps(iam_policy_allow_s3)\n", + " RoleName=secure_iam_role_name, 
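Annotation on the `metrics_definitions` hunk above: SageMaker applies each `Regex` to the training container's log stream to extract metric values. A quick local sanity check of those patterns, using a fabricated Keras-style log line for illustration only:

```python
import re

# Hypothetical progress line of the kind Keras emits; not real job output.
sample = "100/100 - 12s - loss: 0.3521 - accuracy: 0.8672 - val_loss: 0.4103 - val_accuracy: 0.8421"

for name, pattern in [
    ("train:loss", r"loss: ([0-9\.]+)"),
    ("validation:accuracy", r"val_accuracy: ([0-9\.]+)"),
]:
    match = re.search(pattern, sample)
    if match:
        # re.search returns the first occurrence, so "train:loss" picks up
        # "loss: 0.3521" rather than the later "val_loss: 0.4103".
        print(name, "->", float(match.group(1)))
```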
PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n", ")\n", "\n", "print(response)\n", @@ -460,17 +440,10 @@ " \"Action\": [\n", " \"sagemaker:CreateTrainingJob\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ],\n", - " \"Condition\": {\n", - " \"Null\": {\n", - " \"sagemaker:VpcSubnets\": \"true\",\n", - " \"sagemaker:VpcSecurityGroupIds\": \"true\"\n", - " }\n", - " }\n", + " \"Resource\": [\"*\"],\n", + " \"Condition\": {\"Null\": {\"sagemaker:VpcSubnets\": \"true\", \"sagemaker:VpcSecurityGroupIds\": \"true\"}},\n", " }\n", - " ]\n", + " ],\n", "}" ] }, @@ -480,7 +453,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)" + "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)" ] }, { @@ -494,7 +467,7 @@ "response = iam.put_role_policy(\n", " RoleName=secure_iam_role_name,\n", " PolicyName=policy_deny_create_training_job_name,\n", - " PolicyDocument=json.dumps(policy_deny_create_training_job)\n", + " PolicyDocument=json.dumps(policy_deny_create_training_job),\n", ")\n", "\n", "print(response)\n", @@ -510,41 +483,44 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - "# role=role,\n", - " role=secure_iam_role_name,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - "# subnets=None,\n", - "# security_group_ids=None,\n", - "# max_run=7200, # number of seconds\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " # role=role,\n", + " role=secure_iam_role_name,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " 
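Annotation on the Deny statement reformatted above: the IAM `Null` condition operator makes the Deny fire whenever `sagemaker:VpcSubnets` and `sagemaker:VpcSecurityGroupIds` are absent from the request, i.e. whenever `CreateTrainingJob` is called without a VPC configuration. One way to exercise such a policy before attaching it is the IAM policy simulator; a sketch, not part of this patch:

```python
import json
import boto3

iam = boto3.client("iam")

deny_outside_vpc = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Deny",
        "Action": "sagemaker:CreateTrainingJob",
        "Resource": "*",
        "Condition": {"Null": {"sagemaker:VpcSubnets": "true"}},
    }],
}

# Simulate the call with NO VpcSubnets context key supplied: the Null
# condition evaluates true, so the Deny statement should match.
result = iam.simulate_custom_policy(
    PolicyInputList=[json.dumps(deny_outside_vpc)],
    ActionNames=["sagemaker:CreateTrainingJob"],
)
print(result["EvaluationResults"][0]["EvalDecision"])  # e.g. "explicitDeny"
```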
\"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " # subnets=None,\n", + " # security_group_ids=None,\n", + " # max_run=7200, # number of seconds\n", + ")" ] }, { @@ -563,11 +539,9 @@ "outputs": [], "source": [ "estimator.fit(\n", - " inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -577,7 +551,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -588,7 +562,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -599,7 +579,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -610,7 +596,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -651,10 +643,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_deny_create_training_job_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n", "print(response)\n", "\n", "time.sleep(30)" @@ -666,10 +655,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n", "print(response)\n", "\n", "time.sleep(30)" @@ -707,19 +693,20 @@ "outputs": [], "source": [ "import json\n", + "\n", "notebook_instance_name = None\n", "\n", "try:\n", - " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n", + " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n", " data = json.load(notebook_info)\n", - " resource_arn = data['ResourceArn']\n", - " region = resource_arn.split(':')[3]\n", - " 
notebook_instance_name = data['ResourceName']\n", - " print('Notebook Instance Name: {}'.format(notebook_instance_name))\n", + " resource_arn = data[\"ResourceArn\"]\n", + " region = resource_arn.split(\":\")[3]\n", + " notebook_instance_name = data[\"ResourceName\"]\n", + " print(\"Notebook Instance Name: {}\".format(notebook_instance_name))\n", "except:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -728,9 +715,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = sm.describe_notebook_instance(\n", - " NotebookInstanceName=notebook_instance_name\n", - ")\n", + "response = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)\n", "\n", "print(response)" ] @@ -741,11 +726,11 @@ "metadata": {}, "outputs": [], "source": [ - "print('SubnetId: {}'.format(response['SubnetId']))\n", - "print('SecurityGroups: {}'.format(response['SecurityGroups']))\n", - "print('IAM Role: {}'.format(response['RoleArn']))\n", - "print('NetworkInterfaceId: {}'.format(response['NetworkInterfaceId']))\n", - "print('DirectInternetAccess: {}'.format(response['DirectInternetAccess']))" + "print(\"SubnetId: {}\".format(response[\"SubnetId\"]))\n", + "print(\"SecurityGroups: {}\".format(response[\"SecurityGroups\"]))\n", + "print(\"IAM Role: {}\".format(response[\"RoleArn\"]))\n", + "print(\"NetworkInterfaceId: {}\".format(response[\"NetworkInterfaceId\"]))\n", + "print(\"DirectInternetAccess: {}\".format(response[\"DirectInternetAccess\"]))" ] }, { @@ -754,7 +739,7 @@ "metadata": {}, "outputs": [], "source": [ - "subnet_id=response['SubnetId']\n", + "subnet_id = response[\"SubnetId\"]\n", "print(subnet_id)" ] }, @@ -764,7 +749,7 @@ "metadata": {}, "outputs": [], "source": [ - "security_group_ids=response['SecurityGroups']\n", + "security_group_ids = response[\"SecurityGroups\"]\n", "print(security_group_ids)" ] }, @@ -776,11 +761,11 @@ "source": [ "from pprint import pprint\n", "\n", - "all_vpcs = ec2.describe_vpcs()['Vpcs']\n", + "all_vpcs = ec2.describe_vpcs()[\"Vpcs\"]\n", "\n", "print(len(all_vpcs))\n", "\n", - "pprint(all_vpcs)\n" + "pprint(all_vpcs)" ] }, { @@ -789,7 +774,7 @@ "metadata": {}, "outputs": [], "source": [ - "vpc_id = ec2.describe_vpcs()['Vpcs'][-1]['VpcId']\n", + "vpc_id = ec2.describe_vpcs()[\"Vpcs\"][-1][\"VpcId\"]\n", "print(vpc_id)" ] }, @@ -808,43 +793,44 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - "# role=secure_iam_role_name,\n", - " role=role, \n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': 
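Annotation on the cleanup hunks above: the two `delete_role_policy` cells matter because IAM rejects `DeleteRole` while inline policies remain attached. A fuller teardown sketch, assuming the `iam` client and `secure_iam_role_name` from the earlier cells:

```python
# Remove every inline policy first; only then can the role itself be deleted.
inline = iam.list_role_policies(RoleName=secure_iam_role_name)["PolicyNames"]
for policy_name in inline:
    iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_name)

iam.delete_role(RoleName=secure_iam_role_name)
```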
train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " subnets=[\n", - " subnet_id\n", - " ],\n", - " security_group_ids=security_group_ids\n", - "# max_run=7200, # number of seconds\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " # role=secure_iam_role_name,\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " subnets=[subnet_id],\n", + " security_group_ids=security_group_ids\n", + " # max_run=7200, # number of seconds\n", + ")" ] }, { @@ -871,11 +857,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -885,7 +870,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -896,7 +881,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -907,7 +898,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, 
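Annotation on the VPC-attached estimator above: once the job launches, the subnets and security groups actually applied can be read back from the job description. A sketch:

```python
desc = sm.describe_training_job(TrainingJobName=training_job_name)

# VpcConfig only appears in the response when the job ran inside a VPC.
vpc_config = desc.get("VpcConfig", {})
print("Subnets:         ", vpc_config.get("Subnets"))
print("SecurityGroupIds:", vpc_config.get("SecurityGroupIds"))
```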
training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -918,7 +915,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb b/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb index 023bbda8..53283aff 100644 --- a/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb +++ b/12_security/08b_Secure_Train_IAMPolicy_VPC_ConditionKey.ipynb @@ -11,14 +11,14 @@ "import pandas as pd\n", "\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)\n", - "ec2 = boto3.Session().client(service_name='ec2', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)\n", + "ec2 = boto3.Session().client(service_name=\"ec2\", region_name=region)" ] }, { @@ -39,9 +39,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -71,9 +71,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -103,9 +103,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -195,12 +195,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - 
"s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -229,29 +226,29 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "#input_mode='Pipe'\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "# input_mode='Pipe'\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -261,10 +258,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -283,17 +280,11 @@ "outputs": [], "source": [ "assume_role_policy_doc = {\n", - " \"Version\": \"2012-10-17\",\n", - " \"Statement\": [\n", - " {\n", - " \"Effect\": \"Allow\",\n", - " \"Principal\": {\n", - " \"Service\": \"sagemaker.amazonaws.com\"\n", - " },\n", - " \"Action\": \"sts:AssumeRole\"\n", - " }\n", - " ]\n", - "} " + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\"Effect\": \"Allow\", \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"}, \"Action\": \"sts:AssumeRole\"}\n", + " ],\n", + "}" ] }, { @@ -313,7 +304,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)" + "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)" ] }, { @@ -331,12 +322,12 @@ " secure_iam_role = iam.create_role(\n", " RoleName=secure_iam_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Secure Role'\n", + " Description=\"DSOAWS Secure Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role = iam.get_role(RoleName=secure_iam_role_name)\n", - "# print(\"Role already exists\")\n", + " # print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)\n", "\n", @@ -352,18 +343,9 @@ "outputs": [], "source": [ "iam_policy_allow_s3 = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Allow',\n", - " 'Action': [\n", - " 's3:*'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n", + "}" ] }, { @@ -372,7 +354,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)" + "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)" ] }, { @@ -384,9 +366,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name,\n", - " PolicyDocument=json.dumps(iam_policy_allow_s3)\n", + " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n", ")\n", "\n", "print(response)\n", @@ -400,8 +380,8 @@ "metadata": {}, "outputs": [], "source": [ - "different_subnet_id='blah'\n", - "different_security_group_ids=['blah']" + "different_subnet_id = \"blah\"\n", + "different_security_group_ids = [\"blah\"]" ] }, { @@ -419,20 +399,15 @@ " \"Action\": [\n", " \"sagemaker:CreateTrainingJob\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ],\n", - " \"Condition\": {\n", + " \"Resource\": [\"*\"],\n", + " \"Condition\": {\n", " \"StringNotEquals\": {\n", " \"sagemaker:VpcSecurityGroupIds\": different_security_group_ids,\n", - " \"sagemaker:VpcSubnets\": [\n", - " different_subnet_id\n", - " ]\n", + " \"sagemaker:VpcSubnets\": [different_subnet_id],\n", " }\n", - " }\n", - "\n", + " },\n", " }\n", - " ]\n", + " ],\n", "}" ] }, @@ -442,7 +417,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)" + "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)" ] }, { @@ -456,7 +431,7 @@ "response = iam.put_role_policy(\n", " RoleName=secure_iam_role_name,\n", " PolicyName=policy_deny_create_training_job_name,\n", - " PolicyDocument=json.dumps(policy_deny_create_training_job)\n", + " PolicyDocument=json.dumps(policy_deny_create_training_job),\n", ")\n", "\n", "print(response)\n", @@ -472,37 +447,40 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=secure_iam_role_name,\n", - " instance_count=train_instance_count,\n", - " 
instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " subnets=None,\n", - " security_group_ids=None,\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=secure_iam_role_name,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " subnets=None,\n", + " security_group_ids=None,\n", + ")" ] }, { @@ -521,11 +499,9 @@ "outputs": [], "source": [ "estimator.fit(\n", - " inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -535,7 +511,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -546,7 +522,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -557,7 +539,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, 
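Annotation on this condition-key variant: the first `fit()` is expected to be rejected, because the role carries an explicit Deny whose `StringNotEquals` clause can never match the placeholder "blah" subnet and security-group values. Catching that rejection makes the intent explicit; a sketch reusing the notebook's names (the exact error code is an assumption):

```python
from botocore.exceptions import ClientError

try:
    estimator.fit(
        inputs={"train": s3_input_train_data, "validation": s3_input_validation_data, "test": s3_input_test_data},
        wait=False,
    )
except ClientError as e:
    # Expected here: the explicit Deny on sagemaker:CreateTrainingJob fires.
    if e.response["Error"]["Code"] == "AccessDeniedException":
        print("CreateTrainingJob denied by the VPC condition-key policy, as intended.")
    else:
        raise
```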
training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -568,7 +556,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { @@ -595,10 +589,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_deny_create_training_job_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)\n", "print(response)\n", "\n", "time.sleep(30)" @@ -610,10 +601,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = iam.delete_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name\n", - ")\n", + "response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)\n", "print(response)\n", "\n", "time.sleep(30)" @@ -651,19 +639,20 @@ "outputs": [], "source": [ "import json\n", + "\n", "notebook_instance_name = None\n", "\n", "try:\n", - " with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:\n", + " with open(\"/opt/ml/metadata/resource-metadata.json\") as notebook_info:\n", " data = json.load(notebook_info)\n", - " resource_arn = data['ResourceArn']\n", - " region = resource_arn.split(':')[3]\n", - " notebook_instance_name = data['ResourceName']\n", - " print('Notebook Instance Name: {}'.format(notebook_instance_name))\n", + " resource_arn = data[\"ResourceArn\"]\n", + " region = resource_arn.split(\":\")[3]\n", + " notebook_instance_name = data[\"ResourceName\"]\n", + " print(\"Notebook Instance Name: {}\".format(notebook_instance_name))\n", "except:\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.')\n", - " print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.\")\n", + " print(\"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -674,9 +663,7 @@ }, "outputs": [], "source": [ - "response = sm.describe_notebook_instance(\n", - " NotebookInstanceName=notebook_instance_name\n", - ")\n", + "response = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)\n", "\n", "print(response)" ] @@ -687,11 +674,11 @@ "metadata": {}, "outputs": [], "source": [ - "print('SubnetId: {}'.format(response['SubnetId']))\n", - "print('SecurityGroups: {}'.format(response['SecurityGroups']))\n", - "print('IAM Role: {}'.format(response['RoleArn']))\n", - "print('NetworkInterfaceId: {}'.format(response['NetworkInterfaceId']))\n", - "print('DirectInternetAccess: {}'.format(response['DirectInternetAccess']))" + "print(\"SubnetId: {}\".format(response[\"SubnetId\"]))\n", + "print(\"SecurityGroups: {}\".format(response[\"SecurityGroups\"]))\n", + "print(\"IAM Role: {}\".format(response[\"RoleArn\"]))\n", + "print(\"NetworkInterfaceId: {}\".format(response[\"NetworkInterfaceId\"]))\n", + "print(\"DirectInternetAccess: 
{}\".format(response[\"DirectInternetAccess\"]))" ] }, { @@ -700,7 +687,7 @@ "metadata": {}, "outputs": [], "source": [ - "subnet_id=response['SubnetId']\n", + "subnet_id = response[\"SubnetId\"]\n", "print(subnet_id)" ] }, @@ -710,7 +697,7 @@ "metadata": {}, "outputs": [], "source": [ - "security_group_ids=response['SecurityGroups']\n", + "security_group_ids = response[\"SecurityGroups\"]\n", "print(security_group_ids)" ] }, @@ -731,7 +718,7 @@ "metadata": {}, "outputs": [], "source": [ - "secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)" + "secure_iam_role_name = \"DSOAWS_Secure_Train_VPC_{}\".format(timestamp)" ] }, { @@ -749,12 +736,12 @@ " secure_iam_role = iam.create_role(\n", " RoleName=secure_iam_role_name,\n", " AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n", - " Description='DSOAWS Secure Role'\n", + " Description=\"DSOAWS Secure Role\",\n", " )\n", "except ClientError as e:\n", - " if e.response['Error']['Code'] == 'EntityAlreadyExists':\n", + " if e.response[\"Error\"][\"Code\"] == \"EntityAlreadyExists\":\n", " iam_role = iam.get_role(RoleName=secure_iam_role_name)\n", - "# print(\"Role already exists\")\n", + " # print(\"Role already exists\")\n", " else:\n", " print(\"Unexpected error: %s\" % e)\n", "\n", @@ -770,18 +757,9 @@ "outputs": [], "source": [ "iam_policy_allow_s3 = {\n", - " 'Version': '2012-10-17',\n", - " 'Statement': [{\n", - " 'Sid': '',\n", - " 'Effect': 'Allow',\n", - " 'Action': [\n", - " 's3:*'\n", - " ],\n", - " 'Resource': [\n", - " 'arn:aws:s3:::{}'.format(bucket)\n", - " ]\n", - " }]\n", - " }" + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [{\"Sid\": \"\", \"Effect\": \"Allow\", \"Action\": [\"s3:*\"], \"Resource\": [\"arn:aws:s3:::{}\".format(bucket)]}],\n", + "}" ] }, { @@ -790,7 +768,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)" + "policy_allow_s3_name = \"DSOAWS_Secure_Train_Allow_S3_{}\".format(timestamp)" ] }, { @@ -802,9 +780,7 @@ "import time\n", "\n", "response = iam.put_role_policy(\n", - " RoleName=secure_iam_role_name,\n", - " PolicyName=policy_allow_s3_name,\n", - " PolicyDocument=json.dumps(iam_policy_allow_s3)\n", + " RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)\n", ")\n", "\n", "print(response)\n", @@ -827,20 +803,15 @@ " \"Action\": [\n", " \"sagemaker:CreateTrainingJob\",\n", " ],\n", - " \"Resource\": [\n", - " \"*\"\n", - " ],\n", - " \"Condition\": {\n", + " \"Resource\": [\"*\"],\n", + " \"Condition\": {\n", " \"StringNotEquals\": {\n", " \"sagemaker:VpcSecurityGroupIds\": security_group_ids,\n", - " \"sagemaker:VpcSubnets\": [\n", - " subnet_id\n", - " ]\n", + " \"sagemaker:VpcSubnets\": [subnet_id],\n", " }\n", - " }\n", - "\n", + " },\n", " }\n", - " ]\n", + " ],\n", "}" ] }, @@ -850,7 +821,7 @@ "metadata": {}, "outputs": [], "source": [ - "policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)" + "policy_deny_create_training_job_name = \"DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}\".format(timestamp)" ] }, { @@ -864,7 +835,7 @@ "response = iam.put_role_policy(\n", " RoleName=secure_iam_role_name,\n", " PolicyName=policy_deny_create_training_job_name,\n", - " PolicyDocument=json.dumps(policy_deny_create_training_job)\n", + " PolicyDocument=json.dumps(policy_deny_create_training_job),\n", ")\n", "\n", "print(response)\n", @@ -887,39 +858,40 @@ "source": [ "from sagemaker.tensorflow 
import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=secure_iam_role_name,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " subnets=[\n", - " subnet_id\n", - " ],\n", - " security_group_ids=security_group_ids\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=secure_iam_role_name,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " subnets=[subnet_id],\n", + " security_group_ids=security_group_ids,\n", + ")" ] }, { @@ -946,11 +918,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -960,7 +931,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -971,7 +942,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job 
After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -982,7 +959,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -993,7 +976,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb b/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb index 159b8976..9e3537b9 100644 --- a/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb +++ b/12_security/09_Secure_Train_EncryptionAtRest_KMS.ipynb @@ -10,13 +10,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "kms = boto3.Session().client(service_name='kms', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "kms = boto3.Session().client(service_name=\"kms\", region_name=region)" ] }, { @@ -37,9 +37,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -77,9 +77,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -117,9 +117,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the 
PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -268,12 +268,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -310,28 +307,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=10\n", - "validation_steps=10\n", - "test_steps=10\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 10\n", + "validation_steps = 10\n", + "test_steps = 10\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -341,10 +338,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -363,8 +360,8 @@ "source": [ "create_ebs_key_response = kms.create_key()\n", "\n", - "ebs_kms_key_id=create_ebs_key_response['KeyMetadata']['KeyId']\n", - "ebs_kms_key_arn=create_ebs_key_response['KeyMetadata']['Arn']" + "ebs_kms_key_id = create_ebs_key_response[\"KeyMetadata\"][\"KeyId\"]\n", + "ebs_kms_key_arn = create_ebs_key_response[\"KeyMetadata\"][\"Arn\"]" ] }, { @@ -375,8 +372,8 @@ "source": [ "create_s3_key_response = kms.create_key()\n", 
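Annotation on the two bare `kms.create_key()` calls above: they work, but keys created without metadata are hard to identify later in the KMS console. A sketch adding a description and an alias (the alias name is a placeholder):

```python
# Create a customer-managed key with a human-readable description...
response = kms.create_key(Description="DSOAWS training EBS volume encryption")
key_id = response["KeyMetadata"]["KeyId"]

# ...and attach an alias so the key is recognizable later.
kms.create_alias(AliasName="alias/dsoaws-train-ebs", TargetKeyId=key_id)
```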
"\n", - "s3_kms_key_id=create_s3_key_response['KeyMetadata']['KeyId']\n", - "s3_kms_key_arn=create_s3_key_response['KeyMetadata']['Arn']" + "s3_kms_key_id = create_s3_key_response[\"KeyMetadata\"][\"KeyId\"]\n", + "s3_kms_key_arn = create_s3_key_response[\"KeyMetadata\"][\"Arn\"]" ] }, { @@ -395,37 +392,40 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " volume_kms_key=ebs_kms_key_id,\n", - " output_kms_key=s3_kms_key_id,\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " volume_kms_key=ebs_kms_key_id,\n", + " output_kms_key=s3_kms_key_id,\n", + ")" ] }, { @@ -441,11 +441,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -463,7 +462,7 @@ ], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job 
Name: {}\".format(training_job_name))" ] }, { @@ -487,7 +486,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -511,7 +516,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -535,7 +546,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/10_Secure_Train_EncryptionInTransit.ipynb b/12_security/10_Secure_Train_EncryptionInTransit.ipynb index 693d3132..4504a149 100644 --- a/12_security/10_Secure_Train_EncryptionInTransit.ipynb +++ b/12_security/10_Secure_Train_EncryptionInTransit.ipynb @@ -10,13 +10,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -37,9 +37,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -77,9 +77,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -117,9 +117,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the 
PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -268,12 +268,9 @@ "source": [ "from sagemaker.inputs import TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -310,28 +307,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=10\n", - "validation_steps=10\n", - "test_steps=10\n", - "train_instance_count=2\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='File'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 10\n", + "validation_steps = 10\n", + "test_steps = 10\n", + "train_instance_count = 2\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"File\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -341,10 +338,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -364,36 +361,39 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " 
source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " encrypt_inter_container_traffic=True,\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " encrypt_inter_container_traffic=True,\n", + ")" ] }, { @@ -409,11 +409,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -423,7 +422,7 @@ "outputs": [], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -434,7 +433,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -445,7 +450,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - 
"display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -456,7 +467,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/11_Secure_Train_NetworkIsolation.ipynb b/12_security/11_Secure_Train_NetworkIsolation.ipynb index a8bb4bc5..a33d64f0 100644 --- a/12_security/11_Secure_Train_NetworkIsolation.ipynb +++ b/12_security/11_Secure_Train_NetworkIsolation.ipynb @@ -10,13 +10,13 @@ "import sagemaker\n", "import pandas as pd\n", "\n", - "sess = sagemaker.Session()\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", - "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n", - "iam = boto3.Session().client(service_name='iam', region_name=region)" + "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n", + "iam = boto3.Session().client(service_name=\"iam\", region_name=region)" ] }, { @@ -37,9 +37,9 @@ "try:\n", " processed_train_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -77,9 +77,9 @@ "try:\n", " processed_validation_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -117,9 +117,9 @@ "try:\n", " processed_test_data_s3_uri\n", "except NameError:\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')\n", - " print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')\n", - " print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')" + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n", + " print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n", + " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")" ] }, { @@ -268,12 +268,9 @@ "source": [ "from sagemaker.inputs import 
TrainingInput\n", "\n", - "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, \n", - " distribution='ShardedByS3Key') \n", - "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", - "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, \n", - " distribution='ShardedByS3Key')\n", + "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n", + "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n", "\n", "print(s3_input_train_data.config)\n", "print(s3_input_validation_data.config)\n", @@ -310,28 +307,28 @@ "metadata": {}, "outputs": [], "source": [ - "epochs=1\n", - "learning_rate=0.00001\n", - "epsilon=0.00000001\n", - "train_batch_size=128\n", - "validation_batch_size=128\n", - "test_batch_size=128\n", - "train_steps_per_epoch=100\n", - "validation_steps=100\n", - "test_steps=100\n", - "train_instance_count=1\n", - "train_instance_type='ml.c5.9xlarge'\n", - "train_volume_size=1024\n", - "use_xla=True\n", - "use_amp=True\n", - "freeze_bert_layer=False\n", - "enable_sagemaker_debugger=True\n", - "enable_checkpointing=False\n", - "enable_tensorboard=False\n", - "input_mode='Pipe'\n", - "run_validation=True\n", - "run_test=True\n", - "run_sample_predictions=True" + "epochs = 1\n", + "learning_rate = 0.00001\n", + "epsilon = 0.00000001\n", + "train_batch_size = 128\n", + "validation_batch_size = 128\n", + "test_batch_size = 128\n", + "train_steps_per_epoch = 100\n", + "validation_steps = 100\n", + "test_steps = 100\n", + "train_instance_count = 1\n", + "train_instance_type = \"ml.c5.9xlarge\"\n", + "train_volume_size = 1024\n", + "use_xla = True\n", + "use_amp = True\n", + "freeze_bert_layer = False\n", + "enable_sagemaker_debugger = True\n", + "enable_checkpointing = False\n", + "enable_tensorboard = False\n", + "input_mode = \"Pipe\"\n", + "run_validation = True\n", + "run_test = True\n", + "run_sample_predictions = True" ] }, { @@ -341,10 +338,10 @@ "outputs": [], "source": [ "metrics_definitions = [\n", - " {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\\\.]+)'},\n", - " {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\\\.]+)'},\n", + " {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n", + " {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n", "]" ] }, @@ -364,38 +361,41 @@ "source": [ "from sagemaker.tensorflow import TensorFlow\n", "\n", - "estimator = TensorFlow(entry_point='tf_bert_reviews.py',\n", - " source_dir='src',\n", - " role=role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " volume_size=train_volume_size,\n", - "# use_spot_instances=True,\n", - "# max_wait=7200, # Seconds to wait for spot instances to become available\n", - " py_version='py3',\n", - " framework_version='2.1.0',\n", - " hyperparameters={'epochs': epochs,\n", - " 'learning_rate': learning_rate,\n", - " 'epsilon': epsilon,\n", - " 'train_batch_size': 
train_batch_size,\n", - " 'validation_batch_size': validation_batch_size,\n", - " 'test_batch_size': test_batch_size, \n", - " 'train_steps_per_epoch': train_steps_per_epoch,\n", - " 'validation_steps': validation_steps,\n", - " 'test_steps': test_steps,\n", - " 'use_xla': use_xla,\n", - " 'use_amp': use_amp, \n", - " 'max_seq_length': max_seq_length,\n", - " 'freeze_bert_layer': freeze_bert_layer,\n", - " 'enable_sagemaker_debugger': enable_sagemaker_debugger,\n", - " 'enable_checkpointing': enable_checkpointing,\n", - " 'enable_tensorboard': enable_tensorboard, \n", - " 'run_validation': run_validation,\n", - " 'run_test': run_test,\n", - " 'run_sample_predictions': run_sample_predictions},\n", - " input_mode=input_mode,\n", - " enable_network_isolation=True\n", - " )" + "estimator = TensorFlow(\n", + " entry_point=\"tf_bert_reviews.py\",\n", + " source_dir=\"src\",\n", + " role=role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " volume_size=train_volume_size,\n", + " # use_spot_instances=True,\n", + " # max_wait=7200, # Seconds to wait for spot instances to become available\n", + " py_version=\"py3\",\n", + " framework_version=\"2.1.0\",\n", + " hyperparameters={\n", + " \"epochs\": epochs,\n", + " \"learning_rate\": learning_rate,\n", + " \"epsilon\": epsilon,\n", + " \"train_batch_size\": train_batch_size,\n", + " \"validation_batch_size\": validation_batch_size,\n", + " \"test_batch_size\": test_batch_size,\n", + " \"train_steps_per_epoch\": train_steps_per_epoch,\n", + " \"validation_steps\": validation_steps,\n", + " \"test_steps\": test_steps,\n", + " \"use_xla\": use_xla,\n", + " \"use_amp\": use_amp,\n", + " \"max_seq_length\": max_seq_length,\n", + " \"freeze_bert_layer\": freeze_bert_layer,\n", + " \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n", + " \"enable_checkpointing\": enable_checkpointing,\n", + " \"enable_tensorboard\": enable_tensorboard,\n", + " \"run_validation\": run_validation,\n", + " \"run_test\": run_test,\n", + " \"run_sample_predictions\": run_sample_predictions,\n", + " },\n", + " input_mode=input_mode,\n", + " enable_network_isolation=True,\n", + ")" ] }, { @@ -415,11 +415,10 @@ "metadata": {}, "outputs": [], "source": [ - "estimator.fit(inputs={'train': s3_input_train_data, \n", - " 'validation': s3_input_validation_data,\n", - " 'test': s3_input_test_data\n", - " }, \n", - " wait=False)" + "estimator.fit(\n", + " inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n", + " wait=False,\n", + ")" ] }, { @@ -437,7 +436,7 @@ ], "source": [ "training_job_name = estimator.latest_training_job.name\n", - "print('Training Job Name: {}'.format(training_job_name))" + "print(\"Training Job Name: {}\".format(training_job_name))" ] }, { @@ -461,7 +460,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review Training Job After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review Training Job After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -485,7 +490,13 @@ "source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review CloudWatch Logs After About 5 Minutes'.format(region, training_job_name)))\n" + "display(\n", + " HTML(\n", + " 'Review CloudWatch Logs After About 5 Minutes'.format(\n", + " region, training_job_name\n", + " )\n", + " )\n", + ")" ] }, { @@ -509,7 +520,13 @@ 
"source": [ "from IPython.core.display import display, HTML\n", "\n", - "display(HTML('Review S3 Output Data After The Training Job Has Completed'.format(bucket, training_job_name, region)))\n" + "display(\n", + " HTML(\n", + " 'Review S3 Output Data After The Training Job Has Completed'.format(\n", + " bucket, training_job_name, region\n", + " )\n", + " )\n", + ")" ] }, { diff --git a/12_security/src/inference.py b/12_security/src/inference.py index 2975dc2d..53196737 100644 --- a/12_security/src/inference.py +++ b/12_security/src/inference.py @@ -1,102 +1,97 @@ import json import subprocess import sys -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.3.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==4.1.1']) + +subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow==2.3.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.1.1"]) # Workaround for https://github.com/huggingface/tokenizers/issues/120 and # https://github.com/kaushaltrivedi/fast-bert/issues/174 -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'tokenizers']) import tensorflow as tf from transformers import DistilBertTokenizer -classes=[1, 2, 3, 4, 5] +classes = [1, 2, 3, 4, 5] + +max_seq_length = 64 -max_seq_length=64 +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') def input_handler(data, context): - data_str = data.read().decode('utf-8') - print('data_str: {}'.format(data_str)) - print('type data_str: {}'.format(type(data_str))) - + data_str = data.read().decode("utf-8") + print("data_str: {}".format(data_str)) + print("type data_str: {}".format(type(data_str))) + jsonlines = data_str.split("\n") - print('jsonlines: {}'.format(jsonlines)) - print('type jsonlines: {}'.format(type(jsonlines))) - + print("jsonlines: {}".format(jsonlines)) + print("type jsonlines: {}".format(type(jsonlines))) + transformed_instances = [] - + for jsonline in jsonlines: - print('jsonline: {}'.format(jsonline)) - print('type jsonline: {}'.format(type(jsonline))) + print("jsonline: {}".format(jsonline)) + print("type jsonline: {}".format(type(jsonline))) # features[0] is review_body # features[1..n] are others (ie. 
1: product_category, etc) review_body = json.loads(jsonline)["features"][0] print("""review_body: {}""".format(review_body)) - - encode_plus_tokens = tokenizer.encode_plus(review_body, - pad_to_max_length=True, - max_length=max_seq_length, - truncation=True) + + encode_plus_tokens = tokenizer.encode_plus( + review_body, pad_to_max_length=True, max_length=max_seq_length, truncation=True + ) # Convert the text-based tokens to ids from the pre-trained BERT vocabulary - input_ids = encode_plus_tokens['input_ids'] - + input_ids = encode_plus_tokens["input_ids"] + # Specifies which tokens BERT should pay attention to (0 or 1) - input_mask = encode_plus_tokens['attention_mask'] - - transformed_instance = { - "input_ids": input_ids, - "input_mask": input_mask - } - + input_mask = encode_plus_tokens["attention_mask"] + + transformed_instance = {"input_ids": input_ids, "input_mask": input_mask} + transformed_instances.append(transformed_instance) - - transformed_data = { - "signature_name":"serving_default", - "instances": transformed_instances - } + + transformed_data = {"signature_name": "serving_default", "instances": transformed_instances} transformed_data_json = json.dumps(transformed_data) - print('transformed_data_json: {}'.format(transformed_data_json)) - + print("transformed_data_json: {}".format(transformed_data_json)) + return transformed_data_json def output_handler(response, context): - print('response: {}'.format(response)) + print("response: {}".format(response)) response_json = response.json() - print('response_json: {}'.format(response_json)) - + print("response_json: {}".format(response_json)) + log_probabilities = response_json["predictions"] - print('log_probabilities: {}'.format(log_probabilities)) - + print("log_probabilities: {}".format(log_probabilities)) + predicted_classes = [] for log_probability in log_probabilities: - print('log_probability in loop: {}'.format(log_probability)) - print('type(log_probability) in loop: {}'.format(type(log_probability))) - - softmax = tf.nn.softmax(log_probability) - - predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) + print("log_probability in loop: {}".format(log_probability)) + print("type(log_probability) in loop: {}".format(type(log_probability))) + + softmax = tf.nn.softmax(log_probability) + + predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32) predicted_class = classes[predicted_class_idx] - print('predicted_class: {}'.format(predicted_class)) + print("predicted_class: {}".format(predicted_class)) prediction_dict = {} - prediction_dict['predicted_label'] = predicted_class - + prediction_dict["predicted_label"] = predicted_class + jsonline = json.dumps(prediction_dict) - print('jsonline: {}'.format(jsonline)) - + print("jsonline: {}".format(jsonline)) + predicted_classes.append(jsonline) - print('predicted_classes in the loop: {}'.format(predicted_classes)) - - predicted_classes_jsonlines = '\n'.join(predicted_classes) - print('predicted_classes_jsonlines: {}'.format(predicted_classes_jsonlines)) + print("predicted_classes in the loop: {}".format(predicted_classes)) + + predicted_classes_jsonlines = "\n".join(predicted_classes) + print("predicted_classes_jsonlines: {}".format(predicted_classes_jsonlines)) response_content_type = context.accept_header - - return predicted_classes_jsonlines, response_content_type \ No newline at end of file + + return predicted_classes_jsonlines, response_content_type diff --git a/12_security/src/tf_bert_reviews.py b/12_security/src/tf_bert_reviews.py index 
8c46a2ef..30ad69de 100644 --- a/12_security/src/tf_bert_reviews.py +++ b/12_security/src/tf_bert_reviews.py @@ -9,91 +9,92 @@ import sys import os import tensorflow as tf -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) -#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1']) -subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1']) + +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==2.8.0"]) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0']) +# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3']) +subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==0.23.1"]) +subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib==3.2.1"]) from transformers import DistilBertTokenizer from transformers import TFDistilBertForSequenceClassification from transformers import TextClassificationPipeline from transformers.configuration_distilbert import DistilBertConfig from tensorflow.keras.callbacks import ModelCheckpoint from tensorflow.keras.models import load_model -#from tensorflow.keras.mixed_precision import experimental as mixed_precision + +# from tensorflow.keras.mixed_precision import experimental as mixed_precision CLASSES = [1, 2, 3, 4, 5] def select_data_and_label_from_record(record): - x = { - 'input_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'segment_ids': record['segment_ids'] - } + x = {"input_ids": record["input_ids"], "input_mask": record["input_mask"], "segment_ids": record["segment_ids"]} - y = record['label_ids'] + y = record["label_ids"] return (x, y) -def file_based_input_dataset_builder(channel, - input_filenames, - pipe_mode, - is_training, - drop_remainder, - batch_size, - epochs, - steps_per_epoch, - max_seq_length): +def file_based_input_dataset_builder( + channel, + input_filenames, + pipe_mode, + is_training, + drop_remainder, + batch_size, + epochs, + steps_per_epoch, + max_seq_length, +): # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. 
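+    # In Pipe mode, SageMaker streams the TFRecords for this channel straight
+    # from S3 through the sagemaker_tensorflow PipeModeDataset; in File mode,
+    # the channel has already been downloaded to local disk and is read below
+    # with tf.data.TFRecordDataset.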
if pipe_mode: - print('***** Using pipe_mode with channel {}'.format(channel)) + print("***** Using pipe_mode with channel {}".format(channel)) from sagemaker_tensorflow import PipeModeDataset - dataset = PipeModeDataset(channel=channel, - record_format='TFRecord') + + dataset = PipeModeDataset(channel=channel, record_format="TFRecord") else: - print('***** Using input_filenames {}'.format(input_filenames)) + print("***** Using input_filenames {}".format(input_filenames)) dataset = tf.data.TFRecordDataset(input_filenames) dataset = dataset.repeat(epochs * steps_per_epoch * 100) -# dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) name_to_features = { - "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "label_ids": tf.io.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" record = tf.io.parse_single_example(record, name_to_features) # TODO: wip/bert/bert_attention_head_view/train.py - # Convert input_ids into input_tokens with DistilBert vocabulary + # Convert input_ids into input_tokens with DistilBert vocabulary # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]): # hook._write_raw_tensor_simple("input_tokens", input_tokens) return record - + dataset = dataset.apply( tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder, - num_parallel_calls=tf.data.experimental.AUTOTUNE)) + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder, + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + ) -# dataset.cache() + # dataset.cache() - dataset = dataset.shuffle(buffer_size=1000, - reshuffle_each_iteration=True) + dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True) row_count = 0 - print('**************** {} *****************'.format(channel)) + print("**************** {} *****************".format(channel)) for row in dataset.as_numpy_iterator(): print(row) if row_count == 5: @@ -106,236 +107,178 @@ def _decode_record(record, name_to_features): def load_checkpoint_model(checkpoint_path): import glob import os - - glob_pattern = os.path.join(checkpoint_path, '*.h5') - print('glob pattern {}'.format(glob_pattern)) + + glob_pattern = os.path.join(checkpoint_path, "*.h5") + print("glob pattern {}".format(glob_pattern)) list_of_checkpoint_files = glob.glob(glob_pattern) - print('List of checkpoint files {}'.format(list_of_checkpoint_files)) - + print("List of checkpoint files {}".format(list_of_checkpoint_files)) + latest_checkpoint_file = max(list_of_checkpoint_files) - print('Latest checkpoint file {}'.format(latest_checkpoint_file)) + print("Latest checkpoint file {}".format(latest_checkpoint_file)) - initial_epoch_number_str = latest_checkpoint_file.rsplit('_', 1)[-1].split('.h5')[0] + initial_epoch_number_str = latest_checkpoint_file.rsplit("_", 1)[-1].split(".h5")[0] initial_epoch_number = int(initial_epoch_number_str) - 
loaded_model = TFDistilBertForSequenceClassification.from_pretrained( - latest_checkpoint_file, - config=config) + loaded_model = TFDistilBertForSequenceClassification.from_pretrained(latest_checkpoint_file, config=config) + + print("loaded_model {}".format(loaded_model)) + print("initial_epoch_number {}".format(initial_epoch_number)) - print('loaded_model {}'.format(loaded_model)) - print('initial_epoch_number {}'.format(initial_epoch_number)) - return loaded_model, initial_epoch_number -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--train_data', - type=str, - default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation_data', - type=str, - default=os.environ['SM_CHANNEL_VALIDATION']) - parser.add_argument('--test_data', - type=str, - default=os.environ['SM_CHANNEL_TEST']) - parser.add_argument('--output_dir', - type=str, - default=os.environ['SM_OUTPUT_DIR']) - parser.add_argument('--hosts', - type=list, - default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current_host', - type=str, - default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--num_gpus', - type=int, - default=os.environ['SM_NUM_GPUS']) - parser.add_argument('--checkpoint_base_path', - type=str, - default='/opt/ml/checkpoints') - parser.add_argument('--use_xla', - type=eval, - default=False) - parser.add_argument('--use_amp', - type=eval, - default=False) - parser.add_argument('--max_seq_length', - type=int, - default=64) - parser.add_argument('--train_batch_size', - type=int, - default=128) - parser.add_argument('--validation_batch_size', - type=int, - default=256) - parser.add_argument('--test_batch_size', - type=int, - default=256) - parser.add_argument('--epochs', - type=int, - default=2) - parser.add_argument('--learning_rate', - type=float, - default=0.00003) - parser.add_argument('--epsilon', - type=float, - default=0.00000001) - parser.add_argument('--train_steps_per_epoch', - type=int, - default=None) - parser.add_argument('--validation_steps', - type=int, - default=None) - parser.add_argument('--test_steps', - type=int, - default=None) - parser.add_argument('--freeze_bert_layer', - type=eval, - default=False) - parser.add_argument('--enable_sagemaker_debugger', - type=eval, - default=False) - parser.add_argument('--run_validation', - type=eval, - default=False) - parser.add_argument('--run_test', - type=eval, - default=False) - parser.add_argument('--run_sample_predictions', - type=eval, - default=False) - parser.add_argument('--enable_tensorboard', - type=eval, - default=False) - parser.add_argument('--enable_checkpointing', - type=eval, - default=False) - parser.add_argument('--output_data_dir', # This is unused - type=str, - default=os.environ['SM_OUTPUT_DATA_DIR']) - + parser.add_argument("--train_data", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--validation_data", type=str, default=os.environ["SM_CHANNEL_VALIDATION"]) + parser.add_argument("--test_data", type=str, default=os.environ["SM_CHANNEL_TEST"]) + parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current_host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--num_gpus", type=int, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--checkpoint_base_path", type=str, default="/opt/ml/checkpoints") + parser.add_argument("--use_xla", type=eval, 
default=False) + parser.add_argument("--use_amp", type=eval, default=False) + parser.add_argument("--max_seq_length", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=128) + parser.add_argument("--validation_batch_size", type=int, default=256) + parser.add_argument("--test_batch_size", type=int, default=256) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--learning_rate", type=float, default=0.00003) + parser.add_argument("--epsilon", type=float, default=0.00000001) + parser.add_argument("--train_steps_per_epoch", type=int, default=None) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--test_steps", type=int, default=None) + parser.add_argument("--freeze_bert_layer", type=eval, default=False) + parser.add_argument("--enable_sagemaker_debugger", type=eval, default=False) + parser.add_argument("--run_validation", type=eval, default=False) + parser.add_argument("--run_test", type=eval, default=False) + parser.add_argument("--run_sample_predictions", type=eval, default=False) + parser.add_argument("--enable_tensorboard", type=eval, default=False) + parser.add_argument("--enable_checkpointing", type=eval, default=False) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) # This is unused + # This points to the S3 location - this should not be used by our code # We should use /opt/ml/model/ instead - # parser.add_argument('--model_dir', - # type=str, + # parser.add_argument('--model_dir', + # type=str, # default=os.environ['SM_MODEL_DIR']) - + args, _ = parser.parse_known_args() - print("Args:") + print("Args:") print(args) - - env_var = os.environ - print("Environment Variables:") - pprint.pprint(dict(env_var), width = 1) - - print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV'])) - sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV']) - is_master = sm_training_env_json['is_master'] - print('is_master {}'.format(is_master)) - + + env_var = os.environ + print("Environment Variables:") + pprint.pprint(dict(env_var), width=1) + + print("SM_TRAINING_ENV {}".format(env_var["SM_TRAINING_ENV"])) + sm_training_env_json = json.loads(env_var["SM_TRAINING_ENV"]) + is_master = sm_training_env_json["is_master"] + print("is_master {}".format(is_master)) + train_data = args.train_data - print('train_data {}'.format(train_data)) + print("train_data {}".format(train_data)) validation_data = args.validation_data - print('validation_data {}'.format(validation_data)) + print("validation_data {}".format(validation_data)) test_data = args.test_data - print('test_data {}'.format(test_data)) - local_model_dir = os.environ['SM_MODEL_DIR'] + print("test_data {}".format(test_data)) + local_model_dir = os.environ["SM_MODEL_DIR"] output_dir = args.output_dir - print('output_dir {}'.format(output_dir)) + print("output_dir {}".format(output_dir)) hosts = args.hosts - print('hosts {}'.format(hosts)) + print("hosts {}".format(hosts)) current_host = args.current_host - print('current_host {}'.format(current_host)) + print("current_host {}".format(current_host)) num_gpus = args.num_gpus - print('num_gpus {}'.format(num_gpus)) - job_name = os.environ['SAGEMAKER_JOB_NAME'] - print('job_name {}'.format(job_name)) + print("num_gpus {}".format(num_gpus)) + job_name = os.environ["SAGEMAKER_JOB_NAME"] + print("job_name {}".format(job_name)) use_xla = args.use_xla - print('use_xla {}'.format(use_xla)) + print("use_xla {}".format(use_xla)) use_amp = args.use_amp - 
print('use_amp {}'.format(use_amp)) + print("use_amp {}".format(use_amp)) max_seq_length = args.max_seq_length - print('max_seq_length {}'.format(max_seq_length)) + print("max_seq_length {}".format(max_seq_length)) train_batch_size = args.train_batch_size - print('train_batch_size {}'.format(train_batch_size)) + print("train_batch_size {}".format(train_batch_size)) validation_batch_size = args.validation_batch_size - print('validation_batch_size {}'.format(validation_batch_size)) + print("validation_batch_size {}".format(validation_batch_size)) test_batch_size = args.test_batch_size - print('test_batch_size {}'.format(test_batch_size)) + print("test_batch_size {}".format(test_batch_size)) epochs = args.epochs - print('epochs {}'.format(epochs)) + print("epochs {}".format(epochs)) learning_rate = args.learning_rate - print('learning_rate {}'.format(learning_rate)) + print("learning_rate {}".format(learning_rate)) epsilon = args.epsilon - print('epsilon {}'.format(epsilon)) + print("epsilon {}".format(epsilon)) train_steps_per_epoch = args.train_steps_per_epoch - print('train_steps_per_epoch {}'.format(train_steps_per_epoch)) + print("train_steps_per_epoch {}".format(train_steps_per_epoch)) validation_steps = args.validation_steps - print('validation_steps {}'.format(validation_steps)) + print("validation_steps {}".format(validation_steps)) test_steps = args.test_steps - print('test_steps {}'.format(test_steps)) + print("test_steps {}".format(test_steps)) freeze_bert_layer = args.freeze_bert_layer - print('freeze_bert_layer {}'.format(freeze_bert_layer)) + print("freeze_bert_layer {}".format(freeze_bert_layer)) enable_sagemaker_debugger = args.enable_sagemaker_debugger - print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger)) + print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger)) run_validation = args.run_validation - print('run_validation {}'.format(run_validation)) + print("run_validation {}".format(run_validation)) run_test = args.run_test - print('run_test {}'.format(run_test)) + print("run_test {}".format(run_test)) run_sample_predictions = args.run_sample_predictions - print('run_sample_predictions {}'.format(run_sample_predictions)) + print("run_sample_predictions {}".format(run_sample_predictions)) enable_tensorboard = args.enable_tensorboard - print('enable_tensorboard {}'.format(enable_tensorboard)) + print("enable_tensorboard {}".format(enable_tensorboard)) enable_checkpointing = args.enable_checkpointing - print('enable_checkpointing {}'.format(enable_checkpointing)) + print("enable_checkpointing {}".format(enable_checkpointing)) checkpoint_base_path = args.checkpoint_base_path - print('checkpoint_base_path {}'.format(checkpoint_base_path)) + print("checkpoint_base_path {}".format(checkpoint_base_path)) if is_master: checkpoint_path = checkpoint_base_path else: - checkpoint_path = '/tmp/checkpoints' - print('checkpoint_path {}'.format(checkpoint_path)) - - # Determine if PipeMode is enabled - pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '') - pipe_mode = (pipe_mode_str.find('Pipe') >= 0) - print('Using pipe_mode: {}'.format(pipe_mode)) - - # Model Output - transformer_fine_tuned_model_path = os.path.join(local_model_dir, 'transformers/fine-tuned/') + checkpoint_path = "/tmp/checkpoints" + print("checkpoint_path {}".format(checkpoint_path)) + + # Determine if PipeMode is enabled + pipe_mode_str = os.environ.get("SM_INPUT_DATA_CONFIG", "") + pipe_mode = pipe_mode_str.find("Pipe") >= 0 + print("Using pipe_mode: {}".format(pipe_mode)) + + 
# Model Output + transformer_fine_tuned_model_path = os.path.join(local_model_dir, "transformers/fine-tuned/") os.makedirs(transformer_fine_tuned_model_path, exist_ok=True) # SavedModel Output - tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0') + tensorflow_saved_model_path = os.path.join(local_model_dir, "tensorflow/saved_model/0") os.makedirs(tensorflow_saved_model_path, exist_ok=True) - # Tensorboard Logs - tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/') + # Tensorboard Logs + tensorboard_logs_path = os.path.join(local_model_dir, "tensorboard/") os.makedirs(tensorboard_logs_path, exist_ok=True) # Commented out due to incompatibility with transformers library (possibly) - # Set the global precision mixed_precision policy to "mixed_float16" -# mixed_precision_policy = 'mixed_float16' -# print('Mixed precision policy {}'.format(mixed_precision_policy)) -# policy = mixed_precision.Policy(mixed_precision_policy) -# mixed_precision.set_policy(policy) - + # Set the global precision mixed_precision policy to "mixed_float16" + # mixed_precision_policy = 'mixed_float16' + # print('Mixed precision policy {}'.format(mixed_precision_policy)) + # policy = mixed_precision.Policy(mixed_precision_policy) + # mixed_precision.set_policy(policy) + distributed_strategy = tf.distribute.MirroredStrategy() # Comment out when using smdebug as smdebug does not support MultiWorkerMirroredStrategy() as of smdebug 0.8.0 - #distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + # distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with distributed_strategy.scope(): tf.config.optimizer.set_jit(use_xla) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp}) - train_data_filenames = glob(os.path.join(train_data, '*.tfrecord')) - print('train_data_filenames {}'.format(train_data_filenames)) + train_data_filenames = glob(os.path.join(train_data, "*.tfrecord")) + print("train_data_filenames {}".format(train_data_filenames)) train_dataset = file_based_input_dataset_builder( - channel='train', + channel="train", input_filenames=train_data_filenames, pipe_mode=pipe_mode, is_training=True, @@ -343,7 +286,8 @@ def load_checkpoint_model(checkpoint_path): batch_size=train_batch_size, epochs=epochs, steps_per_epoch=train_steps_per_epoch, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) tokenizer = None config = None @@ -352,83 +296,82 @@ def load_checkpoint_model(checkpoint_path): # This is required when launching many instances at once... 
the urllib request seems to get denied periodically
     successful_download = False
     retries = 0
-    while (retries < 5 and not successful_download):
+    while retries < 5 and not successful_download:
         try:
-            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-            config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
-                                                      num_labels=len(CLASSES))
-            model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
-                                                                          config=config)
+            tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+            config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=len(CLASSES))
+            model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
             successful_download = True
-            print('Sucessfully downloaded after {} retries.'.format(retries))
+            print("Successfully downloaded after {} retries.".format(retries))
         except:
             retries = retries + 1
             random_sleep = random.randint(1, 30)
-            print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
+            print("Retry #{}. Sleeping for {} seconds".format(retries, random_sleep))
             time.sleep(random_sleep)

     callbacks = []

-    initial_epoch_number = 0
+    initial_epoch_number = 0

     if enable_checkpointing:
-        print('***** Checkpoint enabled *****')
-
-        os.makedirs(checkpoint_path, exist_ok=True)
+        print("***** Checkpoint enabled *****")
+
+        os.makedirs(checkpoint_path, exist_ok=True)
         if os.listdir(checkpoint_path):
-            print('***** Found checkpoint *****')
+            print("***** Found checkpoint *****")
             print(checkpoint_path)
             model, initial_epoch_number = load_checkpoint_model(checkpoint_path)
-            print('***** Using checkpoint model {} *****'.format(model))
-
+            print("***** Using checkpoint model {} *****".format(model))
+
         checkpoint_callback = ModelCheckpoint(
-            filepath=os.path.join(checkpoint_path, 'tf_model_{epoch:05d}.h5'),
-            save_weights_only=False,
-            verbose=1,
-            monitor='val_accuracy')
-        print('*** CHECKPOINT CALLBACK {} ***'.format(checkpoint_callback))
+            filepath=os.path.join(checkpoint_path, "tf_model_{epoch:05d}.h5"),
+            save_weights_only=False,
+            verbose=1,
+            monitor="val_accuracy",
+        )
+        print("*** CHECKPOINT CALLBACK {} ***".format(checkpoint_callback))
         callbacks.append(checkpoint_callback)

     if not tokenizer or not model or not config:
-        print('Not properly initialized...')
+        print("Not properly initialized...")

     optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
-    print('** use_amp {}'.format(use_amp))
+    print("** use_amp {}".format(use_amp))
     if use_amp:
         # loss scaling is currently required when using mixed precision
-        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

-    print('enable_sagemaker_debugger {}'.format(enable_sagemaker_debugger))
+    print("enable_sagemaker_debugger {}".format(enable_sagemaker_debugger))
     if enable_sagemaker_debugger:
-        print('*** DEBUGGING ***')
+        print("*** DEBUGGING ***")
         import smdebug.tensorflow as smd
+
         # This assumes that we specified debugger_hook_config
         debugger_callback = smd.KerasHook.create_from_json_file()
-        print('*** DEBUGGER CALLBACK {} ***'.format(debugger_callback))
+        print("*** DEBUGGER CALLBACK {} ***".format(debugger_callback))
         callbacks.append(debugger_callback)
         optimizer = debugger_callback.wrap_optimizer(optimizer)

-    if enable_tensorboard:
-        tensorboard_callback = tf.keras.callbacks.TensorBoard(
-            log_dir=tensorboard_logs_path)
-        print('*** 
TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback)) + if enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path) + print("*** TENSORBOARD CALLBACK {} ***".format(tensorboard_callback)) callbacks.append(tensorboard_callback) - - print('*** OPTIMIZER {} ***'.format(optimizer)) - + + print("*** OPTIMIZER {} ***".format(optimizer)) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - print('Compiled model {}'.format(model)) + print("Compiled model {}".format(model)) model.layers[0].trainable = not freeze_bert_layer print(model.summary()) if run_validation: - validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord')) - print('validation_data_filenames {}'.format(validation_data_filenames)) + validation_data_filenames = glob(os.path.join(validation_data, "*.tfrecord")) + print("validation_data_filenames {}".format(validation_data_filenames)) validation_dataset = file_based_input_dataset_builder( - channel='validation', + channel="validation", input_filenames=validation_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -436,34 +379,39 @@ def load_checkpoint_model(checkpoint_path): batch_size=validation_batch_size, epochs=epochs, steps_per_epoch=validation_steps, - max_seq_length=max_seq_length).map(select_data_and_label_from_record) - - print('Starting Training and Validation...') + max_seq_length=max_seq_length, + ).map(select_data_and_label_from_record) + + print("Starting Training and Validation...") validation_dataset = validation_dataset.take(validation_steps) - train_and_validation_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - validation_data=validation_dataset, - validation_steps=validation_steps, - callbacks=callbacks) + train_and_validation_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + validation_data=validation_dataset, + validation_steps=validation_steps, + callbacks=callbacks, + ) print(train_and_validation_history) - else: # Not running validation - print('Starting Training (Without Validation)...') - train_history = model.fit(train_dataset, - shuffle=True, - epochs=epochs, - initial_epoch=initial_epoch_number, - steps_per_epoch=train_steps_per_epoch, - callbacks=callbacks) + else: # Not running validation + print("Starting Training (Without Validation)...") + train_history = model.fit( + train_dataset, + shuffle=True, + epochs=epochs, + initial_epoch=initial_epoch_number, + steps_per_epoch=train_steps_per_epoch, + callbacks=callbacks, + ) print(train_history) if run_test: - test_data_filenames = glob(os.path.join(test_data, '*.tfrecord')) - print('test_data_filenames {}'.format(test_data_filenames)) + test_data_filenames = glob(os.path.join(test_data, "*.tfrecord")) + print("test_data_filenames {}".format(test_data_filenames)) test_dataset = file_based_input_dataset_builder( - channel='test', + channel="test", input_filenames=test_data_filenames, pipe_mode=pipe_mode, is_training=False, @@ -471,138 +419,139 @@ def load_checkpoint_model(checkpoint_path): batch_size=test_batch_size, epochs=epochs, steps_per_epoch=test_steps, - 
max_seq_length=max_seq_length).map(select_data_and_label_from_record)
-
-        print('Starting test...')
-        test_history = model.evaluate(test_dataset,
-                                      steps=test_steps,
-                                      callbacks=callbacks)
-
-        print('Test history {}'.format(test_history))
-
+            max_seq_length=max_seq_length,
+        ).map(select_data_and_label_from_record)
+
+        print("Starting test...")
+        test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
+
+        print("Test history {}".format(test_history))
+
-    # Save the Fine-Yuned Transformers Model as a New "Pre-Trained" Model
+    # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
-    print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
+    print("transformer_fine_tuned_model_path {}".format(transformer_fine_tuned_model_path))
     model.save_pretrained(transformer_fine_tuned_model_path)

     # Save the TensorFlow SavedModel for Serving Predictions
-    print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
-    model.save(tensorflow_saved_model_path, save_format='tf')
-
+    print("tensorflow_saved_model_path {}".format(tensorflow_saved_model_path))
+    model.save(tensorflow_saved_model_path, save_format="tf")
+
     # Copy inference.py and requirements.txt to the code/ directory
     # Note: This is required for the SageMaker Endpoint to pick them up.
     #       This appears to be hard-coded and must be called code/
-    inference_path = os.path.join(local_model_dir, 'code/')
-    print('Copying inference source files to {}'.format(inference_path))
-    os.makedirs(inference_path, exist_ok=True)
-    os.system('cp inference.py {}'.format(inference_path))
-    print(glob(inference_path))
-#    os.system('cp requirements.txt {}/code'.format(inference_path))
-
+    inference_path = os.path.join(local_model_dir, "code/")
+    print("Copying inference source files to {}".format(inference_path))
+    os.makedirs(inference_path, exist_ok=True)
+    os.system("cp inference.py {}".format(inference_path))
+    print(glob(inference_path))
+    # os.system('cp requirements.txt {}/code'.format(inference_path))
+
     if run_sample_predictions:
-        loaded_model = TFDistilBertForSequenceClassification.from_pretrained(transformer_fine_tuned_model_path,
-                                                                             id2label={
-                                                                               0: 1,
-                                                                               1: 2,
-                                                                               2: 3,
-                                                                               3: 4,
-                                                                               4: 5
-                                                                             },
-                                                                             label2id={
-                                                                               1: 0,
-                                                                               2: 1,
-                                                                               3: 2,
-                                                                               4: 3,
-                                                                               5: 4
-                                                                             })
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
+            transformer_fine_tuned_model_path,
+            id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
+            label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
+        )
+
+        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

         if num_gpus >= 1:
-            inference_device = 0 # GPU 0
+            inference_device = 0  # GPU 0
         else:
-            inference_device = -1 # CPU
-        print('inference_device {}'.format(inference_device))
+            inference_device = -1  # CPU
+        print("inference_device {}".format(inference_device))

-        inference_pipeline = TextClassificationPipeline(model=loaded_model,
-                                                        tokenizer=tokenizer,
-                                                        framework='tf',
-                                                        device=inference_device)
+        inference_pipeline = TextClassificationPipeline(
+            model=loaded_model, tokenizer=tokenizer, framework="tf", device=inference_device
+        )

-        print("""I loved it! I will recommend this to everyone.""", inference_pipeline("""I loved it! I will recommend this to everyone."""))
+        print(
+            """I loved it! I will recommend this to everyone.""",
+            inference_pipeline("""I loved it! I will recommend this to everyone."""),
+        )

         print("""It's OK.""", inference_pipeline("""It's OK."""))

-        print("""Really bad. I hope they don't make this anymore.""", inference_pipeline("""Really bad. I hope they don't make this anymore."""))
+        print(
+            """Really bad. I hope they don't make this anymore.""",
+            inference_pipeline("""Really bad. I hope they don't make this anymore."""),
+        )

         import csv

-        df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
-                                      delimiter='\t',
-                                      quoting=csv.QUOTE_NONE,
-                                      compression='gzip')[['review_body', 'star_rating']]
+        df_test_reviews = pd.read_csv(
+            "./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
+            delimiter="\t",
+            quoting=csv.QUOTE_NONE,
+            compression="gzip",
+        )[["review_body", "star_rating"]]

         df_test_reviews = df_test_reviews.sample(n=100)
         df_test_reviews.shape
         df_test_reviews.head()
-
+
         import pandas as pd

         def predict(review_body):
             prediction_map = inference_pipeline(review_body)
-            return prediction_map[0]['label']
+            return prediction_map[0]["label"]

-        y_test = df_test_reviews['review_body'].map(predict)
+        y_test = df_test_reviews["review_body"].map(predict)
         y_test
-
-        y_actual = df_test_reviews['star_rating']
+
+        y_actual = df_test_reviews["star_rating"]
         y_actual

         from sklearn.metrics import classification_report
+
-        print(classification_report(y_true=y_test, y_pred=y_actual))
+        print(classification_report(y_true=y_actual, y_pred=y_test))
-
+
         from sklearn.metrics import accuracy_score
-        print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=y_actual))
-
+
+        print("Accuracy: ", accuracy_score(y_true=y_actual, y_pred=y_test))
+
         import matplotlib.pyplot as plt
         import pandas as pd

-        def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
+        def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
             print(cm)
-            plt.imshow(cm, interpolation='nearest', cmap=cmap)
+            plt.imshow(cm, interpolation="nearest", cmap=cmap)
             plt.title(title)
             plt.colorbar()
             tick_marks = np.arange(len(classes))
             plt.xticks(tick_marks, classes, rotation=45)
             plt.yticks(tick_marks, classes)
-            fmt = 'd'
-            thresh = cm.max() / 2.
+            fmt = "d"
+            thresh = cm.max() / 2.0
             for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
-                plt.text(j, i, format(cm[i, j], fmt),
-                         horizontalalignment="center",
-                         color="black" if cm[i, j] > thresh else "black")
+                plt.text(
+                    j,
+                    i,
+                    format(cm[i, j], fmt),
+                    horizontalalignment="center",
+                    color="white" if cm[i, j] > thresh else "black",
+                )
             plt.tight_layout()
-            plt.ylabel('True label')
-            plt.xlabel('Predicted label')
-
+            plt.ylabel("True label")
+            plt.xlabel("Predicted label")
+
         import itertools
         import numpy as np
         from sklearn.metrics import confusion_matrix
         import matplotlib.pyplot as plt
+
         #%matplotlib inline
         #%config InlineBackend.figure_format='retina'

-        cm = confusion_matrix(y_true=y_test, y_pred=y_actual)
+        cm = confusion_matrix(y_true=y_actual, y_pred=y_test)

         plt.figure()
-        fig, ax = plt.subplots(figsize=(10,5))
-        plot_conf_mat(cm,
-                      classes=['1', '2', '3', '4', '5'],
-                      title='Confusion Matrix')
+        fig, ax = plt.subplots(figsize=(10, 5))
+        plot_conf_mat(cm, classes=["1", "2", "3", "4", "5"], title="Confusion Matrix")

-        # Save the confusion matrix
+        # Save the confusion matrix
         plt.show()
-
-        # Model Output
-        metrics_path = os.path.join(local_model_dir, 'metrics/')
+
+        # Model Output
+        metrics_path = os.path.join(local_model_dir, "metrics/")
         os.makedirs(metrics_path, exist_ok=True)
-        plt.savefig('{}/confusion_matrix.png'.format(metrics_path))
+        plt.savefig("{}/confusion_matrix.png".format(metrics_path))
diff --git a/99_cleanup/01_Cleanup.ipynb b/99_cleanup/01_Cleanup.ipynb
index 75d00ab1..8573ab45 100644
--- a/99_cleanup/01_Cleanup.ipynb
+++ b/99_cleanup/01_Cleanup.ipynb
@@ -14,8 +14,8 @@
     "bucket = sagemaker_session.default_bucket()\n",
     "region = boto3.Session().region_name\n",
     "\n",
-    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)\n",
-    "comprehend = boto3.Session().client(service_name='comprehend', region_name=region)"
+    "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
+    "comprehend = boto3.Session().client(service_name=\"comprehend\", region_name=region)"
    ]
   },
   {
@@ -70,9 +70,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    comprehend.delete_endpoint(\n",
-    "        EndpointArn=comprehend_endpoint_arn\n",
-    "    )\n",
+    "    comprehend.delete_endpoint(EndpointArn=comprehend_endpoint_arn)\n",
     "except:\n",
     "    pass"
    ]
   },
   {
@@ -84,9 +82,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sm.delete_endpoint(\n",
-    "        EndpointName=autopilot_endpoint_name\n",
-    "    )\n",
+    "    sm.delete_endpoint(EndpointName=autopilot_endpoint_name)\n",
     "except:\n",
     "    pass"
    ]
   },
   {
@@ -98,9 +94,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sm.delete_endpoint(\n",
-    "        EndpointName=tensorflow_endpoint_name\n",
-    "    )\n",
+    "    sm.delete_endpoint(EndpointName=tensorflow_endpoint_name)\n",
     "except:\n",
     "    pass"
    ]
   },
   {
@@ -112,9 +106,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sm.delete_endpoint(\n",
-    "        EndpointName=pytorch_endpoint_name\n",
-    "    )\n",
+    "    sm.delete_endpoint(EndpointName=pytorch_endpoint_name)\n",
     "except:\n",
     "    pass"
    ]
   },
   {
@@ -126,9 +118,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sm.delete_endpoint(\n",
-    "        EndpointName=bandit_experiment_name\n",
-    "    )\n",
+    "    sm.delete_endpoint(EndpointName=bandit_experiment_name)\n",
     "except:\n",
     "    pass"
    ]
   },
   {
@@ -140,9 +130,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sm.delete_endpoint(\n",
-    "        EndpointName=pipeline_endpoint_name\n",
-    "    )\n",
+    "    sm.delete_endpoint(EndpointName=pipeline_endpoint_name)\n",
     "except:\n",
     "    pass"
    ]
   },
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..fa2ea559
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.black] +line-length = 119 +exclude = ''' +( + /( + \.eggs + | \.git + | \.mypy_cache + | build + | dist + | spark + | jars + | \.jar + | wip + ) +) +'''
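+
+# Assumed usage for this formatting pass (not enforced by this file):
+#   black .    # formats the *.py sources with the settings above
+# Notebook code cells were normalized to the same 119-character line length,
+# e.g. with a notebook-aware wrapper such as black-nb.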