From 776cd0a44fd9304ac30dc624ea2a2239dbb089be Mon Sep 17 00:00:00 2001 From: Davide Bisso Date: Fri, 21 Jun 2024 11:53:36 +0200 Subject: [PATCH 1/4] Adding NotebookVersion parameter as specified in the official AWS docs (https://docs.aws.amazon.com/athena/latest/APIReference/API_StartSession.html#athena-StartSession-request-NotebookVersion). This parameter is necessary to create a session using Spark --- awswrangler/athena/_spark.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/awswrangler/athena/_spark.py b/awswrangler/athena/_spark.py index 88c83d750..8f4bd555f 100644 --- a/awswrangler/athena/_spark.py +++ b/awswrangler/athena/_spark.py @@ -94,6 +94,7 @@ def create_spark_session( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, + notebook_version: str = 'Athena notebook version 1', idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> str: @@ -116,6 +117,9 @@ def create_spark_session( spark_properties: Dict[str, Any], optional Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. + notebook_version: str + The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. + The only valid notebook version is Athena notebook version 1. If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional The idle timeout in minutes for the session. The default is 15.
boto3_session : boto3.Session(), optional @@ -146,6 +150,7 @@ def create_spark_session( WorkGroup=workgroup, EngineConfiguration=engine_configuration, SessionIdleTimeoutInMinutes=idle_timeout, + NotebookVersion=notebook_version, ) _logger.info("Session info:\n%s", response) session_id: str = response["SessionId"] @@ -166,6 +171,7 @@ def run_spark_calculation( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, + notebook_version: str = 'Athena notebook version 1', idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> dict[str, Any]: @@ -192,6 +198,9 @@ def run_spark_calculation( spark_properties: Dict[str, Any], optional Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. + notebook_version: str + The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. + The only valid notebook version is Athena notebook version 1. If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional The idle timeout in minutes for the session. The default is 15. 
boto3_session : boto3.Session(), optional @@ -221,6 +230,7 @@ def run_spark_calculation( default_executor_dpu_size=default_executor_dpu_size, additional_configs=additional_configs, spark_properties=spark_properties, + notebook_version=notebook_version, idle_timeout=idle_timeout, boto3_session=boto3_session, ) From deb4880929ab88607c8d4244d603222753b9e787 Mon Sep 17 00:00:00 2001 From: Davide Bisso Date: Fri, 21 Jun 2024 12:25:03 +0200 Subject: [PATCH 2/4] Reformatted using ruff --- awswrangler/athena/_spark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/awswrangler/athena/_spark.py b/awswrangler/athena/_spark.py index 8f4bd555f..6f15c4b6a 100644 --- a/awswrangler/athena/_spark.py +++ b/awswrangler/athena/_spark.py @@ -94,7 +94,7 @@ def create_spark_session( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, - notebook_version: str = 'Athena notebook version 1', + notebook_version: str = "Athena notebook version 1", idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> str: @@ -118,7 +118,7 @@ def create_spark_session( Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. notebook_version: str - The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. + The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. The only valid notebook version is Athena notebook version 1. If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional The idle timeout in minutes for the session. The default is 15. 
@@ -171,7 +171,7 @@ def run_spark_calculation( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, - notebook_version: str = 'Athena notebook version 1', + notebook_version: str = "Athena notebook version 1", idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> dict[str, Any]: @@ -199,7 +199,7 @@ def run_spark_calculation( Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. notebook_version: str - The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. + The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. The only valid notebook version is Athena notebook version 1. If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional The idle timeout in minutes for the session. The default is 15. 
From 352339867c064230982df37a37af2c5161971493 Mon Sep 17 00:00:00 2001 From: Abdel Jaidi Date: Mon, 24 Jun 2024 10:45:22 +0100 Subject: [PATCH 3/4] fix: NotebookVersion must be optional --- awswrangler/athena/_spark.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/awswrangler/athena/_spark.py b/awswrangler/athena/_spark.py index 6f15c4b6a..68adad502 100644 --- a/awswrangler/athena/_spark.py +++ b/awswrangler/athena/_spark.py @@ -94,7 +94,7 @@ def create_spark_session( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, - notebook_version: str = "Athena notebook version 1", + notebook_version: str | None = None, idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> str: @@ -117,11 +117,11 @@ def create_spark_session( spark_properties: Dict[str, Any], optional Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. - notebook_version: str + notebook_version: str, optional The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. The only valid notebook version is Athena notebook version 1. If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional - The idle timeout in minutes for the session. The default is 15. + The idle timeout in minutes for the session. The default is 15. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
@@ -146,11 +146,13 @@ def create_spark_session( engine_configuration["AdditionalConfigs"] = additional_configs if spark_properties: engine_configuration["SparkProperties"] = spark_properties + kwargs: Any = {"SessionIdleTimeoutInMinutes": idle_timeout} + if notebook_version: + kwargs["NotebookVersion"] = notebook_version response = client_athena.start_session( WorkGroup=workgroup, EngineConfiguration=engine_configuration, - SessionIdleTimeoutInMinutes=idle_timeout, - NotebookVersion=notebook_version, + **kwargs, ) _logger.info("Session info:\n%s", response) session_id: str = response["SessionId"] From 47e602de9ef418f216bff2200f6cbfb068525f89 Mon Sep 17 00:00:00 2001 From: Abdel Jaidi Date: Mon, 24 Jun 2024 10:48:27 +0100 Subject: [PATCH 4/4] fix: apply change to run_spark_calculation --- awswrangler/athena/_spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/awswrangler/athena/_spark.py b/awswrangler/athena/_spark.py index 68adad502..e60a3befd 100644 --- a/awswrangler/athena/_spark.py +++ b/awswrangler/athena/_spark.py @@ -173,7 +173,7 @@ def run_spark_calculation( default_executor_dpu_size: int = 1, additional_configs: dict[str, Any] | None = None, spark_properties: dict[str, Any] | None = None, - notebook_version: str = "Athena notebook version 1", + notebook_version: str | None = None, idle_timeout: int = 15, boto3_session: boto3.Session | None = None, ) -> dict[str, Any]: @@ -200,7 +200,7 @@ def run_spark_calculation( spark_properties: Dict[str, Any], optional Contains SparkProperties in the form of key-value pairs.Specifies custom jar files and Spark properties for use cases like cluster encryption, table formats, and general Spark tuning. - notebook_version: str + notebook_version: str, optional The notebook version. This value is supplied automatically for notebook sessions in the Athena console and is not required for programmatic session access. The only valid notebook version is Athena notebook version 1. 
If you specify a value for NotebookVersion, you must also specify a value for NotebookId idle_timeout : int, optional