diff --git a/awsconfigs/apps/jupyter-web-app/configs/spawner_ui_config.yaml b/awsconfigs/apps/jupyter-web-app/configs/spawner_ui_config.yaml
index bf4365c167..267cbc2042 100644
--- a/awsconfigs/apps/jupyter-web-app/configs/spawner_ui_config.yaml
+++ b/awsconfigs/apps/jupyter-web-app/configs/spawner_ui_config.yaml
@@ -17,14 +17,14 @@ spawnerFormDefaults:
   image:
     # The container Image for the user's Jupyter Notebook
-    value: public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1
+    value: public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0
     # The list of available standard container Images
     options:
     - kubeflownotebookswg/jupyter-scipy:v1.7.0
-    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-gpu-py39-cu112-ubuntu20.04-ec2-v1.1
-    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1
-    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-gpu-py38-cu116-ubuntu20.04-ec2-v1.2
-    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-cpu-py38-ubuntu20.04-ec2-v1.2
+    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0
+    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0
+    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0
+    - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0
   imageGroupOne:
     # The container Image for the user's Group One Server
     # The annotation `notebooks.kubeflow.org/http-rewrite-uri: /`
diff --git a/awsconfigs/common/ack-sagemaker-controller/base/controller/deployment.yaml b/awsconfigs/common/ack-sagemaker-controller/base/controller/deployment.yaml
index 1ae9e285dd..d88b7b041a 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/controller/deployment.yaml
+++ b/awsconfigs/common/ack-sagemaker-controller/base/controller/deployment.yaml
@@ -1,8 +1,6 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  labels:
-    control-plane: controller
   name: ack-system
 ---
 apiVersion: apps/v1
@@ -11,16 +9,17 @@ metadata:
   name: ack-sagemaker-controller
   namespace: ack-system
   labels:
-    control-plane: controller
+    app.kubernetes.io/name: ack-sagemaker-controller
+    app.kubernetes.io/part-of: ack-system
 spec:
   selector:
     matchLabels:
-      control-plane: controller
+      app.kubernetes.io/name: ack-sagemaker-controller
   replicas: 1
   template:
     metadata:
       labels:
-        control-plane: controller
+        app.kubernetes.io/name: ack-sagemaker-controller
     spec:
       containers:
       - command:
diff --git a/awsconfigs/common/ack-sagemaker-controller/base/controller/kustomization.yaml b/awsconfigs/common/ack-sagemaker-controller/base/controller/kustomization.yaml
index 718c84b757..87142a6be6 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/controller/kustomization.yaml
+++ b/awsconfigs/common/ack-sagemaker-controller/base/controller/kustomization.yaml
@@ -6,4 +6,4 @@ kind: Kustomization
 images:
 - name: controller
   newName: public.ecr.aws/aws-controllers-k8s/sagemaker-controller
-  newTag: v0.4.5
+  newTag: v1.2.1
diff --git a/awsconfigs/common/ack-sagemaker-controller/base/controller/service.yaml b/awsconfigs/common/ack-sagemaker-controller/base/controller/service.yaml
index a73d8e659a..c3746e66bc 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/controller/service.yaml
+++
b/awsconfigs/common/ack-sagemaker-controller/base/controller/service.yaml @@ -5,7 +5,7 @@ metadata: namespace: ack-system spec: selector: - control-plane: controller + app.kubernetes.io/name: ack-sagemaker-controller ports: - name: metricsport port: 8080 diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_apps.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_apps.yaml index 06e370d887..e2f43e01c4 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_apps.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_apps.yaml @@ -43,8 +43,7 @@ spec: description: The name of the app. type: string appType: - description: The type of app. Supported apps are JupyterServer and - KernelGateway. TensorBoard is not supported. + description: The type of app. type: string domainID: description: The domain ID. @@ -91,13 +90,13 @@ spec: type: object type: array userProfileName: - description: The user profile name. + description: The user profile name. If this value is not set, then + SpaceName must be set. type: string required: - appName - appType - domainID - - userProfileName type: object status: description: AppStatus defines the observed state of App diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpointconfigs.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpointconfigs.yaml index 31dbed79da..ca95a9bc3c 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpointconfigs.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpointconfigs.yaml @@ -133,13 +133,14 @@ spec: see SSD Instance Store Volumes (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html)." type: string productionVariants: - description: An list of ProductionVariant objects, one for each model + description: An array of ProductionVariant objects, one for each model that you want to host at this endpoint. items: description: Identifies a model that you want to host and the resources chosen to deploy for hosting it. If you are deploying multiple models, tell SageMaker how to distribute traffic among the models - by specifying variant weights. + by specifying variant weights. For more information on production + variants, check Production variants (https://docs.aws.amazon.com/sagemaker/latest/dg/model-ab-testing.html). properties: acceleratorType: type: string @@ -155,6 +156,8 @@ spec: kmsKeyID: type: string type: object + enableSSMAccess: + type: boolean initialInstanceCount: format: int64 type: integer @@ -167,6 +170,17 @@ spec: type: integer modelName: type: string + serverlessConfig: + description: Specifies the serverless configuration for an endpoint + variant. 
+ properties: + maxConcurrency: + format: int64 + type: integer + memorySizeInMB: + format: int64 + type: integer + type: object variantName: type: string volumeSizeInGB: diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpoints.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpoints.yaml index b7399ca590..00f554b4f3 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpoints.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_endpoints.yaml @@ -44,6 +44,73 @@ spec: description: "EndpointSpec defines the desired state of Endpoint. \n A hosted endpoint for real-time inference." properties: + deploymentConfig: + description: The deployment configuration for an endpoint, which contains + the desired deployment strategy and rollback configurations. + properties: + autoRollbackConfiguration: + description: Automatic rollback configuration for handling endpoint + deployment failures and recovery. + properties: + alarms: + items: + description: An Amazon CloudWatch alarm configured to monitor + metrics on an endpoint. + properties: + alarmName: + type: string + type: object + type: array + type: object + blueGreenUpdatePolicy: + description: Update policy for a blue/green deployment. If this + update policy is specified, SageMaker creates a new fleet during + the deployment while maintaining the old fleet. SageMaker flips + traffic to the new fleet according to the specified traffic + routing configuration. Only one update policy should be used + in the deployment configuration. If no update policy is specified, + SageMaker uses a blue/green deployment strategy with all at + once traffic shifting by default. + properties: + maximumExecutionTimeoutInSeconds: + format: int64 + type: integer + terminationWaitInSeconds: + format: int64 + type: integer + trafficRoutingConfiguration: + description: Defines the traffic routing strategy during an + endpoint deployment to shift traffic from the old fleet + to the new fleet. + properties: + canarySize: + description: Specifies the endpoint capacity to activate + for production. + properties: + type_: + type: string + value: + format: int64 + type: integer + type: object + linearStepSize: + description: Specifies the endpoint capacity to activate + for production. + properties: + type_: + type: string + value: + format: int64 + type: integer + type: object + type_: + type: string + waitIntervalInSeconds: + format: int64 + type: integer + type: object + type: object + type: object endpointConfigName: description: The name of an endpoint configuration. For more information, see CreateEndpointConfig. @@ -181,6 +248,100 @@ spec: description: A timestamp that shows when the endpoint was last modified. format: date-time type: string + pendingDeploymentSummary: + description: Returns the summary of an in-progress deployment. This + field is only returned when the endpoint is creating or updating + with a new endpoint configuration. + properties: + endpointConfigName: + type: string + productionVariants: + items: + description: The production variant summary for a deployment + when an endpoint is creating or updating with the CreateEndpoint + or UpdateEndpoint operations. Describes the VariantStatus + , weight and capacity for a production variant associated + with an endpoint. 
+ properties: + acceleratorType: + type: string + currentInstanceCount: + format: int64 + type: integer + currentServerlessConfig: + description: Specifies the serverless configuration for + an endpoint variant. + properties: + maxConcurrency: + format: int64 + type: integer + memorySizeInMB: + format: int64 + type: integer + type: object + currentWeight: + type: number + deployedImages: + items: + description: "Gets the Amazon EC2 Container Registry path + of the docker image of the model that is hosted in this + ProductionVariant. \n If you used the registry/repository[:tag] + form to specify the image path of the primary container + when you created the model hosted in this ProductionVariant, + the path resolves to a path of the form registry/repository[@digest]. + A digest is a hash value that identifies a specific + version of an image. For information about Amazon ECR + paths, see Pulling an Image (https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-pull-ecr-image.html) + in the Amazon ECR User Guide." + properties: + resolutionTime: + format: date-time + type: string + resolvedImage: + type: string + specifiedImage: + type: string + type: object + type: array + desiredInstanceCount: + format: int64 + type: integer + desiredServerlessConfig: + description: Specifies the serverless configuration for + an endpoint variant. + properties: + maxConcurrency: + format: int64 + type: integer + memorySizeInMB: + format: int64 + type: integer + type: object + desiredWeight: + type: number + instanceType: + type: string + variantName: + type: string + variantStatus: + items: + description: Describes the status of the production variant. + properties: + startTime: + format: date-time + type: string + status: + type: string + statusMessage: + type: string + type: object + type: array + type: object + type: array + startTime: + format: date-time + type: string + type: object productionVariants: description: An array of ProductionVariantSummary objects, one for each model hosted behind this endpoint. @@ -193,6 +354,17 @@ spec: currentInstanceCount: format: int64 type: integer + currentServerlessConfig: + description: Specifies the serverless configuration for an endpoint + variant. + properties: + maxConcurrency: + format: int64 + type: integer + memorySizeInMB: + format: int64 + type: integer + type: object currentWeight: type: number deployedImages: @@ -220,6 +392,17 @@ spec: desiredInstanceCount: format: int64 type: integer + desiredServerlessConfig: + description: Specifies the serverless configuration for an endpoint + variant. + properties: + maxConcurrency: + format: int64 + type: integer + memorySizeInMB: + format: int64 + type: integer + type: object desiredWeight: type: number variantName: diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_featuregroups.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_featuregroups.yaml index c8b132fc1f..106cd9e4bb 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_featuregroups.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_featuregroups.yaml @@ -99,7 +99,9 @@ spec: we encrypt all data at rest using Amazon Web Services KMS key. By defining your bucket-level key (https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-key.html) for SSE, you can reduce Amazon Web Services KMS requests costs by - up to 99 percent. 
\n To learn more about this parameter, see OfflineStoreConfig." + up to 99 percent. \n * Format for the offline store table. Supported + formats are Glue (Default) and Apache Iceberg (https://iceberg.apache.org/). + \n To learn more about this parameter, see OfflineStoreConfig." properties: dataCatalogConfig: description: The meta data of the Glue table which serves as data diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml index 44c89e58ab..590ced73b7 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml @@ -133,8 +133,7 @@ spec: type: object strategy: description: The strategy hyperparameter tuning uses to find the - best combination of hyperparameters for your model. Currently, - the only supported value is Bayesian. + best combination of hyperparameters for your model. type: string trainingJobEarlyStoppingType: type: string @@ -346,6 +345,10 @@ spec: items: type: string type: array + instanceGroupNames: + items: + type: string + type: array s3DataDistributionType: type: string s3DataType: @@ -423,20 +426,51 @@ spec: type: string type: object resourceConfig: - description: Describes the resources, including ML compute instances - and ML storage volumes, to use for model training. + description: Describes the resources, including machine learning + (ML) compute instances and ML storage volumes, to use for model + training. properties: instanceCount: format: int64 type: integer + instanceGroups: + items: + description: Defines an instance group for heterogeneous + cluster training. When requesting a training job using + the CreateTrainingJob (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html) + API, you can configure multiple instance groups . + properties: + instanceCount: + format: int64 + type: integer + instanceGroupName: + type: string + instanceType: + type: string + type: object + type: array instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: format: int64 type: integer type: object + retryStrategy: + description: The retry strategy to use when a training job fails + due to an InternalServerError. RetryStrategy is specified as + part of the CreateTrainingJob and CreateHyperParameterTuningJob + requests. You can add the StoppingCondition parameter to the + request to limit the training time for the complete job. + properties: + maximumRetryAttempts: + format: int64 + type: integer + type: object roleARN: type: string staticHyperParameters: @@ -673,6 +707,10 @@ spec: items: type: string type: array + instanceGroupNames: + items: + type: string + type: array s3DataDistributionType: type: string s3DataType: @@ -752,20 +790,51 @@ spec: type: string type: object resourceConfig: - description: Describes the resources, including ML compute instances - and ML storage volumes, to use for model training. + description: Describes the resources, including machine learning + (ML) compute instances and ML storage volumes, to use for + model training. 
properties: instanceCount: format: int64 type: integer + instanceGroups: + items: + description: Defines an instance group for heterogeneous + cluster training. When requesting a training job using + the CreateTrainingJob (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html) + API, you can configure multiple instance groups . + properties: + instanceCount: + format: int64 + type: integer + instanceGroupName: + type: string + instanceType: + type: string + type: object + type: array instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: format: int64 type: integer type: object + retryStrategy: + description: The retry strategy to use when a training job fails + due to an InternalServerError. RetryStrategy is specified + as part of the CreateTrainingJob and CreateHyperParameterTuningJob + requests. You can add the StoppingCondition parameter to the + request to limit the training time for the complete job. + properties: + maximumRetryAttempts: + format: int64 + type: integer + type: object roleARN: type: string staticHyperParameters: @@ -909,9 +978,9 @@ spec: failureReason: type: string finalHyperParameterTuningJobObjectiveMetric: - description: Shows the final value for the objective metric for - a training job that was launched by a hyperparameter tuning - job. You define the objective metric in the HyperParameterTuningJobObjective + description: Shows the latest objective metric emitted by a training + job that was launched by a hyperparameter tuning job. You define + the objective metric in the HyperParameterTuningJobObjective parameter of HyperParameterTuningJobConfig. properties: metricName: @@ -997,9 +1066,9 @@ spec: failureReason: type: string finalHyperParameterTuningJobObjectiveMetric: - description: Shows the final value for the objective metric for - a training job that was launched by a hyperparameter tuning - job. You define the objective metric in the HyperParameterTuningJobObjective + description: Shows the latest objective metric emitted by a training + job that was launched by a hyperparameter tuning job. You define + the objective metric in the HyperParameterTuningJobObjective parameter of HyperParameterTuningJobConfig. properties: metricName: diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_modelpackages.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_modelpackages.yaml index b728669482..e7ed5b81c3 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_modelpackages.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_modelpackages.yaml @@ -443,8 +443,12 @@ spec: type: string samplePayloadURL: description: The Amazon Simple Storage Service (Amazon S3) path where - the sample payload are stored. This path must point to a single - gzip compressed tar archive (.tar.gz suffix). + the sample payload is stored. This path must point to a single gzip + compressed tar archive (.tar.gz suffix). This archive can hold multiple + files that are all equally used in the load test. Each file in the + archive must satisfy the size constraints of the InvokeEndpoint + (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html#API_runtime_InvokeEndpoint_RequestSyntax) + call. 
type: string sourceAlgorithmSpecification: description: Details about the algorithm that was used to create the diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_models.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_models.yaml index 40d1a5a145..aa98f65309 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_models.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_models.yaml @@ -33,7 +33,8 @@ spec: metadata: type: object spec: - description: ModelSpec defines the desired state of Model. + description: "ModelSpec defines the desired state of Model. \n The properties + of a model as returned by the Search API." properties: containers: description: Specifies the containers in the inference pipeline. diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelineexecutions.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelineexecutions.yaml new file mode 100644 index 0000000000..0f78545e74 --- /dev/null +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelineexecutions.yaml @@ -0,0 +1,162 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: pipelineexecutions.sagemaker.services.k8s.aws +spec: + group: sagemaker.services.k8s.aws + names: + kind: PipelineExecution + listKind: PipelineExecutionList + plural: pipelineexecutions + singular: pipelineexecution + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.failureReason + name: FAILURE-REASON + priority: 1 + type: string + - jsonPath: .status.pipelineExecutionStatus + name: STATUS + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: PipelineExecution is the Schema for the PipelineExecutions API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: "PipelineExecutionSpec defines the desired state of PipelineExecution. + \n An execution of a pipeline." + properties: + parallelismConfiguration: + description: This configuration, if specified, overrides the parallelism + configuration of the parent pipeline for this specific run. + properties: + maxParallelExecutionSteps: + format: int64 + type: integer + type: object + pipelineExecutionDescription: + description: The description of the pipeline execution. + type: string + pipelineExecutionDisplayName: + description: The display name of the pipeline execution. + type: string + pipelineName: + description: The name of the pipeline. + type: string + pipelineParameters: + description: Contains a list of pipeline parameters. 
This list can + be empty. + items: + description: Assigns a value to a named Pipeline parameter. + properties: + name: + type: string + value: + type: string + type: object + type: array + required: + - pipelineName + type: object + status: + description: PipelineExecutionStatus defines the observed state of PipelineExecution + properties: + ackResourceMetadata: + description: All CRs managed by ACK have a common `Status.ACKResourceMetadata` + member that is used to contain resource sync state, account ownership, + constructed ARN for the resource + properties: + arn: + description: 'ARN is the Amazon Resource Name for the resource. + This is a globally-unique identifier and is set only by the + ACK service controller once the controller has orchestrated + the creation of the resource OR when it has verified that an + "adopted" resource (a resource where the ARN annotation was + set by the Kubernetes user on the CR) exists and matches the + supplied CR''s Spec field values. TODO(vijat@): Find a better + strategy for resources that do not have ARN in CreateOutputResponse + https://github.com/aws/aws-controllers-k8s/issues/270' + type: string + ownerAccountID: + description: OwnerAccountID is the AWS Account ID of the account + that owns the backend AWS service API resource. + type: string + region: + description: Region is the AWS region in which the resource exists + or will exist. + type: string + required: + - ownerAccountID + - region + type: object + conditions: + description: All CRS managed by ACK have a common `Status.Conditions` + member that contains a collection of `ackv1alpha1.Condition` objects + that describe the various terminal states of the CR and its backend + AWS service API resource + items: + description: Condition is the common struct used by all CRDs managed + by ACK service controllers to indicate terminal states of the + CR and its backend AWS service API resource + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type is the type of the Condition + type: string + required: + - status + - type + type: object + type: array + creationTime: + description: The time when the pipeline execution was created. + format: date-time + type: string + failureReason: + description: If the execution failed, a message describing why. + type: string + lastModifiedTime: + description: The time when the pipeline execution was modified last. + format: date-time + type: string + pipelineExecutionStatus: + description: The status of the pipeline execution. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelines.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelines.yaml new file mode 100644 index 0000000000..9e09641a35 --- /dev/null +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_pipelines.yaml @@ -0,0 +1,174 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: pipelines.sagemaker.services.k8s.aws +spec: + group: sagemaker.services.k8s.aws + names: + kind: Pipeline + listKind: PipelineList + plural: pipelines + singular: pipeline + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.pipelineStatus + name: STATUS + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: Pipeline is the Schema for the Pipelines API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: "PipelineSpec defines the desired state of Pipeline. \n A + SageMaker Model Building Pipeline instance." + properties: + parallelismConfiguration: + description: This is the configuration that controls the parallelism + of the pipeline. If specified, it applies to all runs of this pipeline + by default. + properties: + maxParallelExecutionSteps: + format: int64 + type: integer + type: object + pipelineDefinition: + description: The JSON pipeline definition of the pipeline. + type: string + pipelineDescription: + description: A description of the pipeline. + type: string + pipelineDisplayName: + description: The display name of the pipeline. + type: string + pipelineName: + description: The name of the pipeline. + type: string + roleARN: + description: The Amazon Resource Name (ARN) of the role used by the + pipeline to access and create resources. + type: string + tags: + description: A list of tags to apply to the created pipeline. + items: + description: "A tag object that consists of a key and an optional + value, used to manage metadata for SageMaker Amazon Web Services + resources. \n You can add tags to notebook instances, training + jobs, hyperparameter tuning jobs, batch transform jobs, models, + labeling jobs, work teams, endpoint configurations, and endpoints. + For more information on adding tags to SageMaker resources, see + AddTags. \n For more information on adding metadata to your Amazon + Web Services resources with tagging, see Tagging Amazon Web Services + resources (https://docs.aws.amazon.com/general/latest/gr/aws_tagging.html). 
+ For advice on best practices for managing Amazon Web Services + resources with tagging, see Tagging Best Practices: Implement + an Effective Amazon Web Services Resource Tagging Strategy (https://d1.awsstatic.com/whitepapers/aws-tagging-best-practices.pdf)." + properties: + key: + type: string + value: + type: string + type: object + type: array + required: + - pipelineName + - roleARN + type: object + status: + description: PipelineStatus defines the observed state of Pipeline + properties: + ackResourceMetadata: + description: All CRs managed by ACK have a common `Status.ACKResourceMetadata` + member that is used to contain resource sync state, account ownership, + constructed ARN for the resource + properties: + arn: + description: 'ARN is the Amazon Resource Name for the resource. + This is a globally-unique identifier and is set only by the + ACK service controller once the controller has orchestrated + the creation of the resource OR when it has verified that an + "adopted" resource (a resource where the ARN annotation was + set by the Kubernetes user on the CR) exists and matches the + supplied CR''s Spec field values. TODO(vijat@): Find a better + strategy for resources that do not have ARN in CreateOutputResponse + https://github.com/aws/aws-controllers-k8s/issues/270' + type: string + ownerAccountID: + description: OwnerAccountID is the AWS Account ID of the account + that owns the backend AWS service API resource. + type: string + region: + description: Region is the AWS region in which the resource exists + or will exist. + type: string + required: + - ownerAccountID + - region + type: object + conditions: + description: All CRS managed by ACK have a common `Status.Conditions` + member that contains a collection of `ackv1alpha1.Condition` objects + that describe the various terminal states of the CR and its backend + AWS service API resource + items: + description: Condition is the common struct used by all CRDs managed + by ACK service controllers to indicate terminal states of the + CR and its backend AWS service API resource + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type is the type of the Condition + type: string + required: + - status + - type + type: object + type: array + creationTime: + description: The time when the pipeline was created. + format: date-time + type: string + lastModifiedTime: + description: The time when the pipeline was last modified. + format: date-time + type: string + pipelineStatus: + description: The status of the pipeline execution. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 93bafd0a51..e1e376cf01 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -115,16 +115,16 @@ spec: type: string type: object debugHookConfig: - description: Configuration information for the Debugger hook parameters, - metric and tensor collections, and storage paths. To learn more - about how to configure the DebugHookConfig parameter, see Use the - SageMaker and Debugger Configuration API Operations to Create, Update, - and Debug Your Training Job (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html). + description: Configuration information for the Amazon SageMaker Debugger + hook parameters, metric and tensor collections, and storage paths. + To learn more about how to configure the DebugHookConfig parameter, + see Use the SageMaker and Debugger Configuration API Operations + to Create, Update, and Debug Your Training Job (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html). properties: collectionConfigurations: items: - description: Configuration information for the Debugger output - tensor collections. + description: Configuration information for the Amazon SageMaker + Debugger output tensor collections. properties: collectionName: type: string @@ -144,8 +144,8 @@ spec: type: string type: object debugRuleConfigurations: - description: Configuration information for Debugger rules for debugging - output tensors. + description: Configuration information for Amazon SageMaker Debugger + rules for debugging output tensors. items: description: Configuration information for SageMaker Debugger rules for debugging. To learn more about how to configure the DebugRuleConfiguration @@ -227,7 +227,11 @@ spec: provided by SageMaker, see Algorithms (https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html). \n You can specify a maximum of 100 hyperparameters. Each hyperparameter is a key-value pair. Each key and value is limited to 256 characters, - as specified by the Length Constraint." + as specified by the Length Constraint. \n Do not include any security-sensitive + information including account access IDs, secrets or tokens in any + hyperparameter field. If the use of security-sensitive credentials + are detected, SageMaker will reject your training job request and + return an exception error." type: object inputDataConfig: description: "An array of Channel objects. Each channel is a named @@ -276,6 +280,10 @@ spec: items: type: string type: array + instanceGroupNames: + items: + type: string + type: array s3DataDistributionType: type: string s3DataType: @@ -349,8 +357,8 @@ spec: type: string type: object profilerConfig: - description: Configuration information for Debugger system monitoring, - framework profiling, and storage paths. + description: Configuration information for Amazon SageMaker Debugger + system monitoring, framework profiling, and storage paths. 
properties: profilingIntervalInMilliseconds: format: int64 @@ -363,8 +371,8 @@ spec: type: string type: object profilerRuleConfigurations: - description: Configuration information for Debugger rules for profiling - system and framework metrics. + description: Configuration information for Amazon SageMaker Debugger + rules for profiling system and framework metrics. items: description: Configuration information for profiling rules. properties: @@ -400,14 +408,41 @@ spec: instanceCount: format: int64 type: integer + instanceGroups: + items: + description: Defines an instance group for heterogeneous cluster + training. When requesting a training job using the CreateTrainingJob + (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html) + API, you can configure multiple instance groups . + properties: + instanceCount: + format: int64 + type: integer + instanceGroupName: + type: string + instanceType: + type: string + type: object + type: array instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: format: int64 type: integer type: object + retryStrategy: + description: The number of times to retry the job when the job fails + due to an InternalServerError. + properties: + maximumRetryAttempts: + format: int64 + type: integer + type: object roleARN: description: "The Amazon Resource Name (ARN) of an IAM role that SageMaker can assume to perform tasks on your behalf. \n During model training, @@ -463,8 +498,8 @@ spec: type: object type: array tensorBoardOutputConfig: - description: Configuration of storage locations for the Debugger TensorBoard - output data. + description: Configuration of storage locations for the Amazon SageMaker + Debugger TensorBoard output data. properties: localPath: type: string @@ -563,9 +598,14 @@ spec: - type type: object type: array + creationTime: + description: A timestamp that indicates when the training job was + created. + format: date-time + type: string debugRuleEvaluationStatuses: - description: Evaluation status of Debugger rules for debugging on - a training job. + description: Evaluation status of Amazon SageMaker Debugger rules + for debugging on a training job. items: description: Information about the status of the rule evaluation. properties: @@ -585,6 +625,11 @@ spec: failureReason: description: If the training job failed, the reason it failed. type: string + lastModifiedTime: + description: A timestamp that indicates when the status of the training + job was last modified. + format: date-time + type: string modelArtifacts: description: Information about the Amazon S3 location that is configured for storing model artifacts. @@ -593,8 +638,8 @@ spec: type: string type: object profilerRuleEvaluationStatuses: - description: Evaluation status of Debugger rules for profiling on - a training job. + description: Evaluation status of Amazon SageMaker Debugger rules + for profiling on a training job. items: description: Information about the status of the rule evaluation. properties: @@ -611,6 +656,9 @@ spec: type: string type: object type: array + profilingStatus: + description: Profiling status of a training job. + type: string secondaryStatus: description: "Provides detailed information about the state of the training job. For detailed information on the secondary status of @@ -645,6 +693,18 @@ spec: - The training job has stopped. \n For more detailed information, see SecondaryStatus." 
type: string + warmPoolStatus: + description: The status of the warm pool associated with the training + job. + properties: + resourceRetainedBillableTimeInSeconds: + format: int64 + type: integer + reusedByJob: + type: string + status: + type: string + type: object type: object type: object served: true diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml index 4d7860537e..17c7fd47db 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml @@ -45,16 +45,16 @@ spec: singleSignOnUserIdentifier: description: A specifier for the type of value specified in SingleSignOnUserValue. Currently, the only supported value is "UserName". If the Domain's - AuthMode is Amazon Web Services SSO, this field is required. If - the Domain's AuthMode is not Amazon Web Services SSO, this field - cannot be specified. + AuthMode is IAM Identity Center, this field is required. If the + Domain's AuthMode is not IAM Identity Center, this field cannot + be specified. type: string singleSignOnUserValue: description: The username of the associated Amazon Web Services Single - Sign-On User for this UserProfile. If the Domain's AuthMode is Amazon - Web Services SSO, this field is required, and must match a valid + Sign-On User for this UserProfile. If the Domain's AuthMode is IAM + Identity Center, this field is required, and must match a valid username of a user in your directory. If the Domain's AuthMode is - not Amazon Web Services SSO, this field cannot be specified. + not IAM Identity Center, this field cannot be specified. type: string tags: description: "Each tag consists of a key and an optional value. 
Tag diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_adoptedresources.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_adoptedresources.yaml index f764dbbc29..7dca541dfc 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_adoptedresources.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_adoptedresources.yaml @@ -1,10 +1,9 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.7.0 + controller-gen.kubebuilder.io/version: v0.9.2 creationTimestamp: null name: adoptedresources.services.k8s.aws spec: @@ -170,6 +169,7 @@ spec: - name - uid type: object + x-kubernetes-map-type: atomic type: array type: object required: @@ -224,9 +224,3 @@ spec: storage: true subresources: status: {} -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_fieldexports.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_fieldexports.yaml index a435de8447..4a7ab61b31 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_fieldexports.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/common/bases/services.k8s.aws_fieldexports.yaml @@ -1,10 +1,9 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.7.0 + controller-gen.kubebuilder.io/version: v0.9.2 creationTimestamp: null name: fieldexports.services.k8s.aws spec: @@ -133,9 +132,3 @@ spec: storage: true subresources: status: {} -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] diff --git a/awsconfigs/common/ack-sagemaker-controller/base/crd/kustomization.yaml b/awsconfigs/common/ack-sagemaker-controller/base/crd/kustomization.yaml index a556d2d13d..113f72e7bc 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/crd/kustomization.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/crd/kustomization.yaml @@ -1,8 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: - - common resources: + - common - bases/sagemaker.services.k8s.aws_apps.yaml - bases/sagemaker.services.k8s.aws_dataqualityjobdefinitions.yaml - bases/sagemaker.services.k8s.aws_domains.yaml @@ -19,6 +18,8 @@ resources: - bases/sagemaker.services.k8s.aws_monitoringschedules.yaml - bases/sagemaker.services.k8s.aws_notebookinstances.yaml - bases/sagemaker.services.k8s.aws_notebookinstancelifecycleconfigs.yaml + - bases/sagemaker.services.k8s.aws_pipelines.yaml + - bases/sagemaker.services.k8s.aws_pipelineexecutions.yaml - bases/sagemaker.services.k8s.aws_processingjobs.yaml - bases/sagemaker.services.k8s.aws_trainingjobs.yaml - bases/sagemaker.services.k8s.aws_transformjobs.yaml diff --git a/awsconfigs/common/ack-sagemaker-controller/base/default/kustomization.yaml b/awsconfigs/common/ack-sagemaker-controller/base/default/kustomization.yaml index b4521337f5..c89f8ed4b1 100644 --- a/awsconfigs/common/ack-sagemaker-controller/base/default/kustomization.yaml +++ b/awsconfigs/common/ack-sagemaker-controller/base/default/kustomization.yaml @@ -12,7 +12,7 @@ #commonLabels: # someName: someValue -bases: +resources: - ../crd - ../rbac - ../controller diff --git 
a/awsconfigs/common/ack-sagemaker-controller/base/rbac/cluster-role-controller.yaml b/awsconfigs/common/ack-sagemaker-controller/base/rbac/cluster-role-controller.yaml
index 175dfcb376..05f7e69608 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/rbac/cluster-role-controller.yaml
+++ b/awsconfigs/common/ack-sagemaker-controller/base/rbac/cluster-role-controller.yaml
@@ -351,6 +351,46 @@ rules:
   - get
   - patch
   - update
+- apiGroups:
+  - sagemaker.services.k8s.aws
+  resources:
+  - pipelineexecutions
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - sagemaker.services.k8s.aws
+  resources:
+  - pipelineexecutions/status
+  verbs:
+  - get
+  - patch
+  - update
+- apiGroups:
+  - sagemaker.services.k8s.aws
+  resources:
+  - pipelines
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - sagemaker.services.k8s.aws
+  resources:
+  - pipelines/status
+  verbs:
+  - get
+  - patch
+  - update
 - apiGroups:
   - sagemaker.services.k8s.aws
   resources:
diff --git a/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-reader.yaml b/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-reader.yaml
index eeb85fa7ec..2ffb60a931 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-reader.yaml
+++ b/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-reader.yaml
@@ -25,6 +25,8 @@ rules:
   - monitoringschedules
   - notebookinstances
   - notebookinstancelifecycleconfigs
+  - pipelines
+  - pipelineexecutions
   - processingjobs
   - trainingjobs
   - transformjobs
diff --git a/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-writer.yaml b/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-writer.yaml
index 581263e03b..747d6b0dd1 100644
--- a/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-writer.yaml
+++ b/awsconfigs/common/ack-sagemaker-controller/base/rbac/role-writer.yaml
@@ -25,6 +25,8 @@ rules:
   - monitoringschedules
   - notebookinstances
   - notebookinstancelifecycleconfigs
+  - pipelines
+  - pipelineexecutions
   - processingjobs
   - trainingjobs
   - transformjobs
@@ -56,6 +58,8 @@ rules:
   - monitoringschedules
   - notebookinstances
   - notebookinstancelifecycleconfigs
+  - pipelines
+  - pipelineexecutions
   - processingjobs
   - trainingjobs
   - transformjobs
diff --git a/awsconfigs/common/aws-alb-ingress-controller/base/kustomization.yaml b/awsconfigs/common/aws-alb-ingress-controller/base/kustomization.yaml
index 2179aec2ab..2875e52bfa 100644
--- a/awsconfigs/common/aws-alb-ingress-controller/base/kustomization.yaml
+++ b/awsconfigs/common/aws-alb-ingress-controller/base/kustomization.yaml
@@ -3,7 +3,7 @@ kind: Kustomization
 namespace: kube-system
 resources:
 # Manifests downloaded from:
-# https://github.com/kubernetes-sigs/aws-load-balancer-controller/releases/download/v2.4.1/v2_4_1_full.yaml
+# https://github.com/kubernetes-sigs/aws-load-balancer-controller/releases/download/v2.4.7/v2_4_7_full.yaml
 # change from original manifest is in Deployment named aws-load-balancer-controller for controller container args: --cluster-name=$(CLUSTER_NAME)
 - load_balancer_controller.yaml
 commonLabels:
diff --git a/awsconfigs/common/aws-alb-ingress-controller/base/load_balancer_controller.yaml b/awsconfigs/common/aws-alb-ingress-controller/base/load_balancer_controller.yaml
index 8976f30a61..592e15dd1d 100644
--- a/awsconfigs/common/aws-alb-ingress-controller/base/load_balancer_controller.yaml
+++ b/awsconfigs/common/aws-alb-ingress-controller/base/load_balancer_controller.yaml
@@ -2,7 +2,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.5.0
+    controller-gen.kubebuilder.io/version: v0.11.1
   creationTimestamp: null
   labels:
     app.kubernetes.io/name: aws-load-balancer-controller
@@ -133,6 +133,7 @@ spec:
                   are ANDed.
                 type: object
             type: object
+            x-kubernetes-map-type: atomic
           scheme:
             description: Scheme defines the scheme for all Ingresses that belong
               to IngressClass with this IngressClassParams.
@@ -162,18 +163,12 @@ spec:
     served: true
     storage: true
     subresources: {}
-status:
-  acceptedNames:
-    kind: ""
-    plural: ""
-  conditions: []
-  storedVersions: []
 ---
 apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.5.0
+    controller-gen.kubebuilder.io/version: v0.11.1
   creationTimestamp: null
   labels:
     app.kubernetes.io/name: aws-load-balancer-controller
@@ -516,6 +511,7 @@ spec:
                   are ANDed.
                 type: object
             type: object
+            x-kubernetes-map-type: atomic
           serviceRef:
             description: serviceRef is a reference to a Kubernetes Service and
               ServicePort.
@@ -562,12 +558,6 @@ spec:
     storage: true
     subresources:
       status: {}
-status:
-  acceptedNames:
-    kind: ""
-    plural: ""
-  conditions: []
-  storedVersions: []
 ---
 apiVersion: v1
 kind: ServiceAccount
@@ -820,10 +810,9 @@ spec:
     spec:
       containers:
       - args:
-        # change from original manifest
         - --cluster-name=$(CLUSTER_NAME)
         - --ingress-class=alb
-        image: amazon/aws-alb-ingress-controller:v2.4.1
+        image: public.ecr.aws/eks/aws-load-balancer-controller:v2.4.7
         livenessProbe:
           failureThreshold: 2
           httpGet:
@@ -1000,23 +989,3 @@ webhooks:
     resources:
     - ingresses
   sideEffects: None
----
-apiVersion: elbv2.k8s.aws/v1beta1
-kind: IngressClassParams
-metadata:
-  labels:
-    app.kubernetes.io/name: aws-load-balancer-controller
-  name: alb
----
-apiVersion: networking.k8s.io/v1
-kind: IngressClass
-metadata:
-  labels:
-    app.kubernetes.io/name: aws-load-balancer-controller
-  name: alb
-spec:
-  controller: ingress.k8s.aws/alb
-  parameters:
-    apiGroup: elbv2.k8s.aws
-    kind: IngressClassParams
-    name: alb
diff --git a/awsconfigs/common/aws-authservice/base/kustomization.yaml b/awsconfigs/common/aws-authservice/base/kustomization.yaml
index 82d0bbb755..96ca2518f2 100644
--- a/awsconfigs/common/aws-authservice/base/kustomization.yaml
+++ b/awsconfigs/common/aws-authservice/base/kustomization.yaml
@@ -9,7 +9,7 @@ resources:
 images:
 - name: public.ecr.aws/c9e4w0g3/cognito/aws-authservice
   newName: public.ecr.aws/c9e4w0g3/cognito/aws-authservice
-  newTag: v1.0.0
+  newTag: v2.0.0
 configMapGenerator:
 - name: authservice-config
   env: params.env
\ No newline at end of file
diff --git a/awsconfigs/common/aws-authservice/base/virtual-service.yaml b/awsconfigs/common/aws-authservice/base/virtual-service.yaml
index 8666e5146b..fc775bb2b2 100644
--- a/awsconfigs/common/aws-authservice/base/virtual-service.yaml
+++ b/awsconfigs/common/aws-authservice/base/virtual-service.yaml
@@ -10,7 +10,7 @@ spec:
   http:
   - match:
     - uri:
-        prefix: /logout
+        prefix: /authservice/logout
     route:
     - destination:
         host: aws-authservice.istio-system.svc.cluster.local
diff --git a/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-dmh59b856d-kubeflow-ConfigMap.yaml b/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-dmh59b856d-kubeflow-ConfigMap.yaml
deleted file mode 100644
index e86ed4dbb3..0000000000
--- a/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-dmh59b856d-kubeflow-ConfigMap.yaml
+++ /dev/null
@@ -1,193 +0,0 @@
-apiVersion: v1 -data: - spawner_ui_config.yaml: | - # Configuration file for the Jupyter UI. - # - # Each Jupyter UI option is configured by two keys: 'value' and 'readOnly' - # - The 'value' key contains the default value - # - The 'readOnly' key determines if the option will be available to users - # - # If the 'readOnly' key is present and set to 'true', the respective option - # will be disabled for users and only set by the admin. Also when a - # Notebook is POSTED to the API if a necessary field is not present then - # the value from the config will be used. - # - # If the 'readOnly' key is missing (defaults to 'false'), the respective option - # will be available for users to edit. - # - # Note that some values can be templated. Such values are the names of the - # Volumes as well as their StorageClass - spawnerFormDefaults: - image: - # The container Image for the user's Jupyter Notebook - value: public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1 - # The list of available standard container Images - options: - - kubeflownotebookswg/jupyter-scipy:v1.7.0 - - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-gpu-py39-cu112-ubuntu20.04-ec2-v1.1 - - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1 - - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-gpu-py38-cu116-ubuntu20.04-ec2-v1.2 - - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-cpu-py38-ubuntu20.04-ec2-v1.2 - imageGroupOne: - # The container Image for the user's Group One Server - # The annotation `notebooks.kubeflow.org/http-rewrite-uri: /` - # is applied to notebook in this group, configuring - # the Istio rewrite for containers that host their web UI at `/` - value: kubeflownotebookswg/codeserver-python:v1.7.0 - # The list of available standard container Images - options: - - kubeflownotebookswg/codeserver-python:v1.7.0 - imageGroupTwo: - # The container Image for the user's Group Two Server - # The annotation `notebooks.kubeflow.org/http-rewrite-uri: /` - # is applied to notebook in this group, configuring - # the Istio rewrite for containers that host their web UI at `/` - # The annotation `notebooks.kubeflow.org/http-headers-request-set` - # is applied to notebook in this group, configuring Istio - # to add the `X-RStudio-Root-Path` header to requests - value: kubeflownotebookswg/rstudio-tidyverse:v1.7.0 - # The list of available standard container Images - options: - - kubeflownotebookswg/rstudio-tidyverse:v1.7.0 - # If true, hide registry and/or tag name in the image selection dropdown - hideRegistry: true - hideTag: false - allowCustomImage: true - # If true, users can input custom images - # If false, users can only select from the images in this config - imagePullPolicy: - # Supported values: Always, IfNotPresent, Never - value: IfNotPresent - readOnly: false - cpu: - # CPU for user's Notebook - value: '0.5' - # Factor by with to multiply request to calculate limit - # if no limit is set, to disable set "none" - limitFactor: "1.2" - readOnly: false - memory: - # Memory for user's Notebook - value: 1.0Gi - # Factor by with to multiply request to calculate limit - # if no limit is set, to disable set "none" - limitFactor: "1.2" - readOnly: false - environment: - value: {} - readOnly: false - workspaceVolume: - # Workspace Volume to be attached to user's Notebook - # If you don't want a workspace volume then delete the 'value' key - value: - 
mount: /home/jovyan - newPvc: - metadata: - name: '{notebook-name}-workspace' - spec: - resources: - requests: - storage: 10Gi - accessModes: - - ReadWriteOnce - readOnly: false - dataVolumes: - # List of additional Data Volumes to be attached to the user's Notebook - value: [] - # For example, a list with 2 Data Volumes: - # value: - # - mount: /home/jovyan/datavol-1 - # newPvc: - # metadata: - # name: '{notebook-name}-datavol-1' - # spec: - # resources: - # requests: - # storage: 5Gi - # accessModes: - # - ReadWriteOnce - # - mount: /home/jovyan/datavol-1 - # existingSource: - # persistentVolumeClaim: - # claimName: test-pvc - readOnly: false - gpus: - # Number of GPUs to be assigned to the Notebook Container - value: - # values: "none", "1", "2", "4", "8" - num: "none" - # Determines what the UI will show and send to the backend - vendors: - - limitsKey: "nvidia.com/gpu" - uiName: "NVIDIA" - - limitsKey: "amd.com/gpu" - uiName: "AMD" - # Values: "" or a `limits-key` from the vendors list - vendor: "" - readOnly: false - affinityConfig: - # If readonly, the default value will be the only option - # value is a list of `configKey`s that we want to be selected by default - value: "" - # The list of available affinity configs - options: [] - #options: - # - configKey: "exclusive__n1-standard-2" - # displayName: "Exclusive: n1-standard-2" - # affinity: - # # (Require) Node having label: `node_pool=notebook-n1-standard-2` - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: "node_pool" - # operator: "In" - # values: - # - "notebook-n1-standard-2" - # # (Require) Node WITHOUT existing Pod having label: `notebook-name` - # podAntiAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # - labelSelector: - # matchExpressions: - # - key: "notebook-name" - # operator: "Exists" - # namespaces: [] - # topologyKey: "kubernetes.io/hostname" - #readOnly: false - tolerationGroup: - # The default `groupKey` from the options list - # If readonly, the default value will be the only option - value: "" - # The list of available tolerationGroup configs - options: [] - #options: - # - groupKey: "group_1" - # displayName: "Group 1: description" - # tolerations: - # - key: "key1" - # operator: "Equal" - # value: "value1" - # effect: "NoSchedule" - # - key: "key2" - # operator: "Equal" - # value: "value2" - # effect: "NoSchedule" - readOnly: false - shm: - value: true - readOnly: false - configurations: - # List of labels to be selected, these are the labels from PodDefaults - # value: - # - add-aws-secret - # - default-editor - value: [] - readOnly: false -kind: ConfigMap -metadata: - annotations: {} - labels: - app: jupyter-web-app - kustomize.component: jupyter-web-app - name: jupyter-web-app-config-dmh59b856d - namespace: kubeflow \ No newline at end of file diff --git a/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-mgf762gt24-kubeflow-ConfigMap.yaml b/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-mgf762gt24-kubeflow-ConfigMap.yaml new file mode 100644 index 0000000000..0d6d6cc779 --- /dev/null +++ b/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-config-mgf762gt24-kubeflow-ConfigMap.yaml @@ -0,0 +1,95 @@ +apiVersion: v1 +data: + spawner_ui_config.yaml: "# Configuration file for the Jupyter UI.\n#\n# Each Jupyter\ + \ UI option is configured by two keys: 'value' and 'readOnly'\n# - The 'value'\ + \ key contains the default value\n# - The 'readOnly' key determines 
if the option\ + \ will be available to users\n#\n# If the 'readOnly' key is present and set to\ + \ 'true', the respective option\n# will be disabled for users and only set by\ + \ the admin. Also when a\n# Notebook is POSTED to the API if a necessary field\ + \ is not present then\n# the value from the config will be used.\n#\n# If the\ + \ 'readOnly' key is missing (defaults to 'false'), the respective option\n# will\ + \ be available for users to edit.\n#\n# Note that some values can be templated.\ + \ Such values are the names of the\n# Volumes as well as their StorageClass\n\ + spawnerFormDefaults:\n image:\n # The container Image for the user's Jupyter\ + \ Notebook\n value: public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0\n\ + \ # The list of available standard container Images\n options:\n - kubeflownotebookswg/jupyter-scipy:v1.7.0\n\ + \ - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0\n\ + \ - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0\n\ + \ - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0\n\ + \ - public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0\n\ + \ imageGroupOne:\n # The container Image for the user's Group One Server\n\ + \ # The annotation `notebooks.kubeflow.org/http-rewrite-uri: /`\n # is applied\ + \ to notebook in this group, configuring\n # the Istio rewrite for containers\ + \ that host their web UI at `/`\n value: kubeflownotebookswg/codeserver-python:v1.7.0\n\ + \ # The list of available standard container Images\n options:\n - kubeflownotebookswg/codeserver-python:v1.7.0\n\ + \ imageGroupTwo:\n # The container Image for the user's Group Two Server\n\ + \ # The annotation `notebooks.kubeflow.org/http-rewrite-uri: /`\n # is applied\ + \ to notebook in this group, configuring\n # the Istio rewrite for containers\ + \ that host their web UI at `/`\n # The annotation `notebooks.kubeflow.org/http-headers-request-set`\n\ + \ # is applied to notebook in this group, configuring Istio\n # to add the\ + \ `X-RStudio-Root-Path` header to requests\n value: kubeflownotebookswg/rstudio-tidyverse:v1.7.0\n\ + \ # The list of available standard container Images\n options:\n - kubeflownotebookswg/rstudio-tidyverse:v1.7.0\n\ + \ # If true, hide registry and/or tag name in the image selection dropdown\n\ + \ hideRegistry: true\n hideTag: false\n allowCustomImage: true\n # If true,\ + \ users can input custom images\n # If false, users can only select from the\ + \ images in this config\n imagePullPolicy:\n # Supported values: Always, IfNotPresent,\ + \ Never\n value: IfNotPresent\n readOnly: false\n cpu:\n # CPU for user's\ + \ Notebook\n value: '0.5'\n # Factor by with to multiply request to calculate\ + \ limit\n # if no limit is set, to disable set \"none\"\n limitFactor: \"\ + 1.2\"\n readOnly: false\n memory:\n # Memory for user's Notebook\n value:\ + \ 1.0Gi\n # Factor by with to multiply request to calculate limit\n # if\ + \ no limit is set, to disable set \"none\"\n limitFactor: \"1.2\"\n readOnly:\ + \ false\n environment:\n value: {}\n readOnly: false\n workspaceVolume:\n\ + \ # Workspace Volume to be attached to user's Notebook\n # If you don't\ + \ want a workspace volume then delete the 'value' key\n value:\n mount:\ + \ /home/jovyan\n newPvc:\n metadata:\n name: 
'{notebook-name}-workspace'\n\ + \ spec:\n resources:\n requests:\n storage:\ + \ 10Gi\n accessModes:\n - ReadWriteOnce\n readOnly: false\n\ + \ dataVolumes:\n # List of additional Data Volumes to be attached to the user's\ + \ Notebook\n value: []\n # For example, a list with 2 Data Volumes:\n \ + \ # value:\n # - mount: /home/jovyan/datavol-1\n # newPvc:\n #\ + \ metadata:\n # name: '{notebook-name}-datavol-1'\n # \ + \ spec:\n # resources:\n # requests:\n # \ + \ storage: 5Gi\n # accessModes:\n # - ReadWriteOnce\n\ + \ # - mount: /home/jovyan/datavol-1\n # existingSource:\n # \ + \ persistentVolumeClaim:\n # claimName: test-pvc\n readOnly:\ + \ false\n gpus:\n # Number of GPUs to be assigned to the Notebook Container\n\ + \ value:\n # values: \"none\", \"1\", \"2\", \"4\", \"8\"\n num:\ + \ \"none\"\n # Determines what the UI will show and send to the backend\n\ + \ vendors:\n - limitsKey: \"nvidia.com/gpu\"\n uiName: \"NVIDIA\"\ + \n - limitsKey: \"amd.com/gpu\"\n uiName: \"AMD\"\n # Values:\ + \ \"\" or a `limits-key` from the vendors list\n vendor: \"\"\n readOnly:\ + \ false\n affinityConfig:\n # If readonly, the default value will be the only\ + \ option\n # value is a list of `configKey`s that we want to be selected by\ + \ default\n value: \"\"\n # The list of available affinity configs\n \ + \ options: []\n #options:\n # - configKey: \"exclusive__n1-standard-2\"\ + \n # displayName: \"Exclusive: n1-standard-2\"\n # affinity:\n \ + \ # # (Require) Node having label: `node_pool=notebook-n1-standard-2`\n \ + \ # nodeAffinity:\n # requiredDuringSchedulingIgnoredDuringExecution:\n\ + \ # nodeSelectorTerms:\n # - matchExpressions:\n \ + \ # - key: \"node_pool\"\n # operator: \"In\"\ + \n # values:\n # - \"notebook-n1-standard-2\"\ + \n # # (Require) Node WITHOUT existing Pod having label: `notebook-name`\n\ + \ # podAntiAffinity:\n # requiredDuringSchedulingIgnoredDuringExecution:\n\ + \ # - labelSelector:\n # matchExpressions:\n #\ + \ - key: \"notebook-name\"\n # operator: \"\ + Exists\"\n # namespaces: []\n # topologyKey: \"kubernetes.io/hostname\"\ + \n #readOnly: false\n tolerationGroup:\n # The default `groupKey` from\ + \ the options list\n # If readonly, the default value will be the only option\n\ + \ value: \"\"\n # The list of available tolerationGroup configs\n options:\ + \ []\n #options:\n # - groupKey: \"group_1\"\n # displayName: \"\ + Group 1: description\"\n # tolerations:\n # - key: \"key1\"\n \ + \ # operator: \"Equal\"\n # value: \"value1\"\n # \ + \ effect: \"NoSchedule\"\n # - key: \"key2\"\n # operator: \"\ + Equal\"\n # value: \"value2\"\n # effect: \"NoSchedule\"\n\ + \ readOnly: false\n shm:\n value: true\n readOnly: false\n configurations:\n\ + \ # List of labels to be selected, these are the labels from PodDefaults\n\ + \ # value:\n # - add-aws-secret\n # - default-editor\n value:\ + \ []\n readOnly: false\n" +kind: ConfigMap +metadata: + labels: + app: jupyter-web-app + kustomize.component: jupyter-web-app + name: jupyter-web-app-config-mgf762gt24 + namespace: kubeflow diff --git a/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-dhcbt5dtdf-kubeflow-ConfigMap.yaml b/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-42k97gcbmb-kubeflow-ConfigMap.yaml similarity index 87% rename from charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-dhcbt5dtdf-kubeflow-ConfigMap.yaml rename to 
charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-42k97gcbmb-kubeflow-ConfigMap.yaml index 09ffe28948..b6064620e1 100644 --- a/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-dhcbt5dtdf-kubeflow-ConfigMap.yaml +++ b/charts/apps/jupyter-web-app/templates/ConfigMap/jupyter-web-app-parameters-42k97gcbmb-kubeflow-ConfigMap.yaml @@ -11,5 +11,5 @@ metadata: labels: app: jupyter-web-app kustomize.component: jupyter-web-app - name: jupyter-web-app-parameters-dhcbt5dtdf + name: jupyter-web-app-parameters-42k97gcbmb namespace: kubeflow diff --git a/charts/apps/jupyter-web-app/templates/Deployment/jupyter-web-app-deployment-kubeflow-Deployment.yaml b/charts/apps/jupyter-web-app/templates/Deployment/jupyter-web-app-deployment-kubeflow-Deployment.yaml index c9b62b71b3..4d580caa8e 100644 --- a/charts/apps/jupyter-web-app/templates/Deployment/jupyter-web-app-deployment-kubeflow-Deployment.yaml +++ b/charts/apps/jupyter-web-app/templates/Deployment/jupyter-web-app-deployment-kubeflow-Deployment.yaml @@ -42,7 +42,7 @@ spec: serviceAccountName: jupyter-web-app-service-account volumes: - configMap: - name: jupyter-web-app-config-dmh59b856d + name: jupyter-web-app-config-mgf762gt24 name: config-volume - configMap: name: jupyter-web-app-logos diff --git a/charts/common/aws-authservice/Chart.yaml b/charts/common/aws-authservice/Chart.yaml index 496f63ee0f..0078b75ce3 100644 --- a/charts/common/aws-authservice/Chart.yaml +++ b/charts/common/aws-authservice/Chart.yaml @@ -1,24 +1,6 @@ apiVersion: v2 -name: aws-authservice +appVersion: v2.0.0 description: A Helm chart for Kubernetes - -# A chart can be either an 'application' or a 'library' chart. -# -# Application charts are a collection of templates that can be packaged into versioned archives -# to be deployed. -# -# Library charts provide useful utilities or functions for the chart developer. They're included as -# a dependency of application charts to inject those utilities and functions into the rendering -# pipeline. Library charts do not define any templates and therefore cannot be deployed. +name: aws-authservice type: application - -# This is the chart version. This version number should be incremented each time you make changes -# to the chart and its templates, including the app version. -# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 - -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. 
-appVersion: "v0.1.0" +version: 0.2.0 diff --git a/charts/common/aws-authservice/templates/ConfigMap/authservice-config-ck6577dfkd-istio-system-ConfigMap.yaml b/charts/common/aws-authservice/templates/ConfigMap/authservice-config-c9f7d7t7db-istio-system-ConfigMap.yaml similarity index 74% rename from charts/common/aws-authservice/templates/ConfigMap/authservice-config-ck6577dfkd-istio-system-ConfigMap.yaml rename to charts/common/aws-authservice/templates/ConfigMap/authservice-config-c9f7d7t7db-istio-system-ConfigMap.yaml index 8adc4490f1..992493cce5 100644 --- a/charts/common/aws-authservice/templates/ConfigMap/authservice-config-ck6577dfkd-istio-system-ConfigMap.yaml +++ b/charts/common/aws-authservice/templates/ConfigMap/authservice-config-c9f7d7t7db-istio-system-ConfigMap.yaml @@ -3,5 +3,5 @@ data: LOGOUT_URL: {{ .Values.LOGOUT_URL }} kind: ConfigMap metadata: - name: authservice-config-ck6577dfkd + name: authservice-config-c9f7d7t7db namespace: istio-system diff --git a/charts/common/aws-authservice/templates/Deployment/aws-authservice-istio-system-Deployment.yaml b/charts/common/aws-authservice/templates/Deployment/aws-authservice-istio-system-Deployment.yaml index f71c419ae1..4843cb7d2d 100644 --- a/charts/common/aws-authservice/templates/Deployment/aws-authservice-istio-system-Deployment.yaml +++ b/charts/common/aws-authservice/templates/Deployment/aws-authservice-istio-system-Deployment.yaml @@ -23,8 +23,8 @@ spec: valueFrom: configMapKeyRef: key: LOGOUT_URL - name: authservice-config-ck6577dfkd - image: public.ecr.aws/c9e4w0g3/cognito/aws-authservice:v1.0.0 + name: authservice-config-c9f7d7t7db + image: public.ecr.aws/c9e4w0g3/cognito/aws-authservice:v2.0.0 imagePullPolicy: IfNotPresent name: aws-authservice ports: diff --git a/charts/common/aws-authservice/templates/VirtualService/authservice-web-cognito-istio-system-VirtualService.yaml b/charts/common/aws-authservice/templates/VirtualService/authservice-web-cognito-istio-system-VirtualService.yaml index c2ce72c982..75b3a0476d 100644 --- a/charts/common/aws-authservice/templates/VirtualService/authservice-web-cognito-istio-system-VirtualService.yaml +++ b/charts/common/aws-authservice/templates/VirtualService/authservice-web-cognito-istio-system-VirtualService.yaml @@ -11,7 +11,7 @@ spec: http: - match: - uri: - prefix: /logout + prefix: /authservice/logout route: - destination: host: aws-authservice.istio-system.svc.cluster.local diff --git a/components/aws-authservice/main.go b/components/aws-authservice/main.go index 119bd9368b..c59d8dab6d 100644 --- a/components/aws-authservice/main.go +++ b/components/aws-authservice/main.go @@ -14,6 +14,7 @@ package main import ( + "encoding/json" "fmt" "log" "net/http" @@ -36,7 +37,6 @@ func init() { // LogoutHandler expires ALB Cookies and redirects to Cognito Logout Endpoint func LogoutHandler(w http.ResponseWriter, r *http.Request) { log.Println("Traffic reached LogoutHandler") - // There are 4 possible AWSELBAuthSessionCookies // https://docs.aws.amazon.com/elasticloadbalancing/latest/application/listener-authenticate-users.html#authentication-logout for cookieIndex := 0; cookieIndex < 4; cookieIndex++ { @@ -44,15 +44,29 @@ func LogoutHandler(w http.ResponseWriter, r *http.Request) { expireALBCookie := &http.Cookie{Value: "Expired", Name: name, MaxAge: -1, Path: "/"} http.SetCookie(w, expireALBCookie) } - http.Redirect(w, r, redirectURL, http.StatusSeeOther) + + // Central Dashboard expects to redirect to event.detail.response['afterLogoutURL']) after logout + // 
https://github.com/kubeflow/kubeflow/blob/master/components/centraldashboard/public/components/logout-button.js#L49 + resp := struct { + AfterLogoutURL string `json:"afterLogoutURL"` + }{ + AfterLogoutURL: redirectURL, + } + jsonBytes, err := json.Marshal(resp) + if err != nil { + // log.Printf is needed here: log.Println does not interpret the %v format verb + log.Printf("Failed to marshal struct to json: %v", err) + } + + // Set the Location header and status code before writing the body; + // headers written after the first w.Write have no effect + http.Redirect(w, r, redirectURL, http.StatusCreated) + w.Write(jsonBytes) } func main() { router := mux.NewRouter() - router.HandleFunc("/logout", LogoutHandler).Methods(http.MethodGet) - + router.HandleFunc("/authservice/logout", LogoutHandler).Methods(http.MethodPost) var listenPort = ":" + port log.Println("Starting web server at", listenPort) - log.Fatal(http.ListenAndServe(listenPort, handlers.CORS()(router))) + log.Println(http.ListenAndServe(listenPort, handlers.CORS()(router))) } diff --git a/components/notebook-dockerfiles/README.md b/components/notebook-dockerfiles/README.md index e430007822..8149e56811 100644 --- a/components/notebook-dockerfiles/README.md +++ b/components/notebook-dockerfiles/README.md @@ -7,10 +7,10 @@ This directory contains the source code for these jupyter images which is based The following AWS Optimized container images are available from the [Amazon Elastic Container Registry](https://gallery.ecr.aws/kubeflow-on-aws/) (Amazon ECR). ``` -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-gpu-py39-cu112-ubuntu20.04-ec2-v1.1 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-gpu-py38-cu116-ubuntu20.04-ec2-v1.2 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-cpu-py38-ubuntu20.04-ec2-v1.2 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0 ``` These images are based on AWS Deep Learning Containers which provide optimized environments with popular machine learning frameworks such as TensorFlow and PyTorch, and are available in the Amazon ECR. For more information on AWS Deep Learning Container options, see [Deep Learning Container Docs](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/what-is-dlc.html).
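For a quick local smoke test of one of these images — a sketch only, assuming Docker is available and that the image exposes a `python` interpreter (both assumptions, not guarantees) — pull a listed tag and print the framework version it ships:

```bash
# Pull the CPU TensorFlow notebook image listed above (several GB).
docker pull public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0

# Override the notebook entrypoint and print the bundled TensorFlow version;
# for this tag it should report 2.12.0.
docker run --rm --entrypoint python \
  public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0 \
  -c "import tensorflow as tf; print(tf.__version__)"
```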
@@ -18,8 +18,6 @@ Along with specific machine learning frameworks, these container images have add ``` kfp kfserving -h5py -pandas awscli boto3 ``` diff --git a/components/notebook-dockerfiles/pytorch/cpu.Dockerfile b/components/notebook-dockerfiles/pytorch/cpu.Dockerfile index 70a5feaab5..15f02ccfbe 100644 --- a/components/notebook-dockerfiles/pytorch/cpu.Dockerfile +++ b/components/notebook-dockerfiles/pytorch/cpu.Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.1-cpu-py38-ubuntu20.04-ec2-v1.2 +ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0 FROM $BASE_IMAGE ARG NB_USER=jovyan @@ -49,7 +49,7 @@ RUN apt-get update \ # install -- node.js RUN export DEBIAN_FRONTEND=noninteractive \ && curl -sL "https://deb.nodesource.com/gpgkey/nodesource.gpg.key" | apt-key add - \ - && echo "deb https://deb.nodesource.com/node_14.x focal main" > /etc/apt/sources.list.d/nodesource.list \ + && echo "deb http://deb.nodesource.com/node_14.x focal main" > /etc/apt/sources.list.d/nodesource.list \ && apt-get -yq update \ && apt-get -yq install --no-install-recommends \ nodejs \ @@ -57,8 +57,8 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && rm -rf /var/lib/apt/lists/* # Install kubectl client -RUN echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \ - && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \ +RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \ + && echo "deb http://packages.cloud.google.com/apt/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \ && apt-get update \ && apt-get install -y kubectl diff --git a/components/notebook-dockerfiles/pytorch/cuda.Dockerfile b/components/notebook-dockerfiles/pytorch/cuda.Dockerfile index 6a3f4ab855..46cce6475e 100644 --- a/components/notebook-dockerfiles/pytorch/cuda.Dockerfile +++ b/components/notebook-dockerfiles/pytorch/cuda.Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.1-gpu-py38-cu116-ubuntu20.04-ec2-v1.2 +ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 FROM $BASE_IMAGE @@ -46,7 +46,7 @@ RUN apt-get update \ # install -- node.js RUN export DEBIAN_FRONTEND=noninteractive \ && curl -sL "https://deb.nodesource.com/gpgkey/nodesource.gpg.key" | apt-key add - \ - && echo "deb https://deb.nodesource.com/node_14.x focal main" > /etc/apt/sources.list.d/nodesource.list \ + && echo "deb http://deb.nodesource.com/node_14.x focal main" > /etc/apt/sources.list.d/nodesource.list \ && apt-get -yq update \ && apt-get -yq install --no-install-recommends \ nodejs \ @@ -54,8 +54,8 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && rm -rf /var/lib/apt/lists/* # Install kubectl client -RUN echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \ - && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \ +RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \ + && echo "deb http://packages.cloud.google.com/apt/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \ && apt-get update \ && apt-get install -y kubectl diff --git a/components/notebook-dockerfiles/pytorch/requirements.txt b/components/notebook-dockerfiles/pytorch/requirements.txt 
index cb3ecdecf9..0f61e9a1c5 100644 --- a/components/notebook-dockerfiles/pytorch/requirements.txt +++ b/components/notebook-dockerfiles/pytorch/requirements.txt @@ -12,23 +12,7 @@ widgetsnbextension==3.6.1 ipywidgets==7.7.1 # Kubeflow Related Packages -kfp==1.8.13 -kfp-server-api==1.8.3 -kserve==0.9.0 -kubeflow-training==1.4.0 -kubeflow-katib==0.14.0 - -# Common data science packages -h5py==3.7.0 -pandas==1.4.3 -matplotlib==3.5.2 -xgboost==1.6.1 -ipympl==0.9.1 - -# AWS related packages -awscli==1.22.101 -boto3==1.21.0 - -# Pytorch packages -# a version mismatch for fastai can cause a different version of torch to get installed, be careful. -fastai==2.7.9 \ No newline at end of file +kfp==1.8.20 +kserve==0.10.1 +kubeflow-training==1.6.0 +git+https://github.com/kubeflow/katib.git@1d3ab5726f2f2181f174b5324b600cbfdd5f0cec#subdirectory=sdk/python/v1beta1 diff --git a/components/notebook-dockerfiles/tensorflow/cpu.Dockerfile b/components/notebook-dockerfiles/tensorflow/cpu.Dockerfile index 77358677f9..4f6c060ffd 100644 --- a/components/notebook-dockerfiles/tensorflow/cpu.Dockerfile +++ b/components/notebook-dockerfiles/tensorflow/cpu.Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1 +ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0 FROM $BASE_IMAGE diff --git a/components/notebook-dockerfiles/tensorflow/cuda.Dockerfile b/components/notebook-dockerfiles/tensorflow/cuda.Dockerfile index 318263dc96..6a482c3499 100644 --- a/components/notebook-dockerfiles/tensorflow/cuda.Dockerfile +++ b/components/notebook-dockerfiles/tensorflow/cuda.Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.10.0-gpu-py39-cu112-ubuntu20.04-ec2-v1.1 +ARG BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 FROM $BASE_IMAGE diff --git a/components/notebook-dockerfiles/tensorflow/requirements.txt b/components/notebook-dockerfiles/tensorflow/requirements.txt index 251a062e2d..33b7fe8705 100644 --- a/components/notebook-dockerfiles/tensorflow/requirements.txt +++ b/components/notebook-dockerfiles/tensorflow/requirements.txt @@ -12,23 +12,7 @@ widgetsnbextension==3.6.1 ipywidgets==7.7.1 # Kubeflow Related Packages -kfp==1.8.13 -kfp-server-api==1.8.3 -kserve==0.9.0 -kubeflow-training==1.4.0 -kubeflow-katib==0.14.0 - -# Common data science packages -h5py==3.7.0 -pandas==1.4.3 -matplotlib==3.5.2 -xgboost==1.6.1 -ipympl==0.9.1 - -# AWS related packages -awscli==1.22.101 -boto3==1.21.0 - -# TF Packages -keras==2.9.0 -protobuf==3.19.5 \ No newline at end of file +kfp==1.8.20 +kubeflow-training==1.6.0 +kserve==0.10.1 +git+https://github.com/kubeflow/katib.git@1d3ab5726f2f2181f174b5324b600cbfdd5f0cec#subdirectory=sdk/python/v1beta1 diff --git a/deployments/cognito-rds-s3/terraform/main.tf b/deployments/cognito-rds-s3/terraform/main.tf index 6565650d01..6aaa6d514c 100644 --- a/deployments/cognito-rds-s3/terraform/main.tf +++ b/deployments/cognito-rds-s3/terraform/main.tf @@ -152,6 +152,12 @@ module "eks_blueprints_kubernetes_addons" { } enable_aws_efs_csi_driver = true + + aws_fsx_csi_driver_helm_config = { + namespace = "kube-system" + version = "1.5.1" + } + enable_aws_fsx_csi_driver = true enable_nvidia_device_plugin = local.using_gpu diff --git a/deployments/cognito/terraform/main.tf b/deployments/cognito/terraform/main.tf index 
ac820250b7..f4d9988c33 100644 --- a/deployments/cognito/terraform/main.tf +++ b/deployments/cognito/terraform/main.tf @@ -153,6 +153,12 @@ module "eks_blueprints_kubernetes_addons" { } enable_aws_efs_csi_driver = true + + aws_fsx_csi_driver_helm_config = { + namespace = "kube-system" + version = "1.5.1" + } + enable_aws_fsx_csi_driver = true enable_nvidia_device_plugin = local.using_gpu diff --git a/deployments/rds-s3/terraform/main.tf b/deployments/rds-s3/terraform/main.tf index 9fd01127bb..68cefe7999 100644 --- a/deployments/rds-s3/terraform/main.tf +++ b/deployments/rds-s3/terraform/main.tf @@ -145,6 +145,12 @@ module "eks_blueprints_kubernetes_addons" { } enable_aws_efs_csi_driver = true + + aws_fsx_csi_driver_helm_config = { + namespace = "kube-system" + version = "1.5.1" + } + enable_aws_fsx_csi_driver = true enable_nvidia_device_plugin = local.using_gpu diff --git a/deployments/vanilla/terraform/main.tf b/deployments/vanilla/terraform/main.tf index b7630cc351..8145cab509 100644 --- a/deployments/vanilla/terraform/main.tf +++ b/deployments/vanilla/terraform/main.tf @@ -144,6 +144,12 @@ module "eks_blueprints_kubernetes_addons" { } enable_aws_efs_csi_driver = true + + aws_fsx_csi_driver_helm_config = { + namespace = "kube-system" + version = "1.5.1" + } + enable_aws_fsx_csi_driver = true enable_nvidia_device_plugin = local.using_gpu diff --git a/iaac/terraform/common/ack-sagemaker-controller/locals.tf b/iaac/terraform/common/ack-sagemaker-controller/locals.tf index 5f9b865580..ba87d27519 100644 --- a/iaac/terraform/common/ack-sagemaker-controller/locals.tf +++ b/iaac/terraform/common/ack-sagemaker-controller/locals.tf @@ -9,7 +9,7 @@ locals { name = local.name chart = "${local.service}-chart" repository = "oci://public.ecr.aws/aws-controllers-k8s" - version = "v0.4.5" + version = "v1.2.1" namespace = local.namespace description = "SageMaker Operator for Kubernetes (ACK)" values = [] diff --git a/tests/e2e/fixtures/kserve_dependencies.py b/tests/e2e/fixtures/kserve_dependencies.py index 49483b9575..5354d2e1fe 100644 --- a/tests/e2e/fixtures/kserve_dependencies.py +++ b/tests/e2e/fixtures/kserve_dependencies.py @@ -80,8 +80,8 @@ def on_delete(): @pytest.fixture(scope="class") -def s3_bucket_with_data(metadata, kserve_secret, request): - metadata_key = "s3-bucket" +def s3_bucket_with_data_kserve(metadata, kserve_secret, request): + metadata_key = "s3-bucket-kserve" bucket_name = "s3-" + RANDOM_PREFIX bucket = S3BucketWithTrainingData( name=bucket_name, @@ -138,7 +138,7 @@ def on_delete(): def kserve_inference_service( metadata, kserve_iam_service_account, - s3_bucket_with_data, + s3_bucket_with_data_kserve, kserve_secret, cluster, region, @@ -156,7 +156,7 @@ def on_create(): print("creating allow-predictor-transformer AuthorizationPolicy...") kubectl_apply(AUTHORIZATION_POLICY_CONFIG_FILE) print("creating inference service...") - bucket_name = metadata.get("s3-bucket") + bucket_name = metadata.get("s3-bucket-kserve") inference_config = load_yaml_file(INFERENCE_CONFIG_FILE) inference_config["spec"]["predictor"]["model"][ diff --git a/tests/e2e/resources/kserve/kserve-secret.yaml b/tests/e2e/resources/kserve/kserve-secret.yaml index 7ab7d9398e..79c526e092 100644 --- a/tests/e2e/resources/kserve/kserve-secret.yaml +++ b/tests/e2e/resources/kserve/kserve-secret.yaml @@ -1,7 +1,4 @@ apiVersion: v1 -data: - AWS_ACCESS_KEY_ID: '' - AWS_SECRET_ACCESS_KEY: '' kind: Secret metadata: annotations: diff --git a/tests/e2e/resources/notebooks/verify_ack_integration.ipynb 
b/tests/e2e/resources/notebooks/verify_ack_integration.ipynb index 9cacf3750a..4a11e9d80f 100644 --- a/tests/e2e/resources/notebooks/verify_ack_integration.ipynb +++ b/tests/e2e/resources/notebooks/verify_ack_integration.ipynb @@ -11,8 +11,14 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "name": "python", + "version": "3.10.6" }, "orig_nbformat": 4 }, diff --git a/tests/e2e/resources/notebooks/verify_pytorch_installation.ipynb b/tests/e2e/resources/notebooks/verify_pytorch_installation.ipynb index 4cbdf8d48f..d0ce5d2038 100644 --- a/tests/e2e/resources/notebooks/verify_pytorch_installation.ipynb +++ b/tests/e2e/resources/notebooks/verify_pytorch_installation.ipynb @@ -27,7 +27,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/tests/e2e/resources/notebooks/verify_tensorflow_installation.ipynb b/tests/e2e/resources/notebooks/verify_tensorflow_installation.ipynb index 42204ca530..f90ac984fa 100644 --- a/tests/e2e/resources/notebooks/verify_tensorflow_installation.ipynb +++ b/tests/e2e/resources/notebooks/verify_tensorflow_installation.ipynb @@ -27,7 +27,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/tests/e2e/tests/test_notebook_images.py b/tests/e2e/tests/test_notebook_images.py index 4932e7fd8d..3fe48d624b 100644 --- a/tests/e2e/tests/test_notebook_images.py +++ b/tests/e2e/tests/test_notebook_images.py @@ -29,27 +29,27 @@ NOTEBOOK_IMAGES = [ "kubeflownotebookswg/jupyter-scipy:v1.7.0", - "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-gpu-py39-cu112-ubuntu20.04-ec2-v1.1", - "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.10.0-cpu-py39-ubuntu20.04-ec2-v1.1", - "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-gpu-py38-cu116-ubuntu20.04-ec2-v1.2", - "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.1-cpu-py38-ubuntu20.04-ec2-v1.2", + "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0", + "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0", + "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0", + "public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0", ] testdata = [ ("scipy", NOTEBOOK_IMAGES[0], "sanity_check.ipynb", "Hello World!"), - ("tf-gpu", NOTEBOOK_IMAGES[1], "verify_tensorflow_installation.ipynb", "2.10.0"), - ("tf-cpu", NOTEBOOK_IMAGES[2], "verify_tensorflow_installation.ipynb", "2.10.0"), + ("tf-gpu", NOTEBOOK_IMAGES[1], "verify_tensorflow_installation.ipynb", "2.12.0"), + ("tf-cpu", NOTEBOOK_IMAGES[2], "verify_tensorflow_installation.ipynb", "2.12.0"), ( "pytorch-gpu", NOTEBOOK_IMAGES[3], "verify_pytorch_installation.ipynb", - "1.12.1+cu116", + "2.0.0", ), ( "pytorch-cpu", NOTEBOOK_IMAGES[4], "verify_pytorch_installation.ipynb", - "1.12.1+cpu", + "2.0.0", ), ] diff --git a/tests/e2e/tests/test_sanity.py b/tests/e2e/tests/test_sanity.py index 8d68a9d4d6..86df622981 100644 --- a/tests/e2e/tests/test_sanity.py +++ b/tests/e2e/tests/test_sanity.py @@ -65,7 +65,7 @@ kserve_iam_service_account, kserve_secret, clone_tensorflow_serving, - 
s3_bucket_with_data, + s3_bucket_with_data_kserve, kserve_inference_service, ) @@ -235,10 +235,10 @@ def port_forward(installation): pass @pytest.fixture(scope="class") -def s3_bucket_with_data(region): +def s3_bucket_with_data_sagemaker(region): bucket_name = "s3-" + RANDOM_PREFIX bucket = S3BucketWithTrainingData(name=bucket_name, cmd=f"python utils/s3_for_training/sync.py {bucket_name} {region}", - time_to_sleep=120) + time_to_sleep=180) bucket.create() yield @@ -355,7 +355,7 @@ def test_kserve_with_irsa( clone_tensorflow_serving, kserve_iam_service_account, kserve_secret, - s3_bucket_with_data, + s3_bucket_with_data_kserve, kserve_inference_service, ): # Edit the ConfigMap to change the default domain as per your deployment @@ -401,7 +401,7 @@ def test_kserve_with_irsa( assert retcode == 0 def test_run_kfp_sagemaker_pipeline( - self, region, metadata, s3_bucket_with_data, sagemaker_execution_role, kfp_client, clean_up_training_jobs_in_user_ns + self, region, metadata, s3_bucket_with_data_sagemaker, sagemaker_execution_role, kfp_client, clean_up_training_jobs_in_user_ns ): experiment_name = "experiment-" + RANDOM_PREFIX diff --git a/tests/e2e/utils/auto-fsx-setup.py b/tests/e2e/utils/auto-fsx-setup.py index 452723b72f..5a70a26003 100755 --- a/tests/e2e/utils/auto-fsx-setup.py +++ b/tests/e2e/utils/auto-fsx-setup.py @@ -188,7 +188,7 @@ def setup_fsx_driver(): def install_fsx_driver(): print("Installing FSx driver...") - FSx_DRIVER_VERSION = "v0.7.1" + FSx_DRIVER_VERSION = "v0.9.0" FSx_CSI_DRIVER = f"github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=tags/{FSx_DRIVER_VERSION}" kubectl_apply_kustomize(FSx_CSI_DRIVER) diff --git a/tests/e2e/utils/kubeflow_installation.py b/tests/e2e/utils/kubeflow_installation.py index f8fe546ab6..cc1cccf73d 100644 --- a/tests/e2e/utils/kubeflow_installation.py +++ b/tests/e2e/utils/kubeflow_installation.py @@ -236,13 +236,13 @@ def install_alb_controller(cluster_name): --set clusterName={cluster_name} \ --set serviceAccount.create=false \ --set serviceAccount.name=aws-load-balancer-controller \ - --version v1.4.3" + --version v1.4.8" ) def install_ack_controller(): SERVICE = "sagemaker" - RELEASE_VERSION = "v0.4.5" + RELEASE_VERSION = "v1.2.1" CHART_EXPORT_PATH = "../../charts/common/ack-controller" CHART_REF = f"{SERVICE}-chart" CHART_REPO = f"public.ecr.aws/aws-controllers-k8s/{CHART_REF}" diff --git a/tests/e2e/utils/rds-s3/auto-rds-s3-setup.py b/tests/e2e/utils/rds-s3/auto-rds-s3-setup.py index 79ef8b477d..68b90eb174 100644 --- a/tests/e2e/utils/rds-s3/auto-rds-s3-setup.py +++ b/tests/e2e/utils/rds-s3/auto-rds-s3-setup.py @@ -24,12 +24,7 @@ from shutil import which -INSTALLATION_PATH_FILE_RDS_S3 = "./resources/installation_config/rds-s3-static.yaml" -INSTALLATION_PATH_FILE_RDS_ONLY = "./resources/installation_config/rds-only.yaml" -INSTALLATION_PATH_FILE_S3_ONLY = "./resources/installation_config/s3-only-static.yaml" -path_dic_rds_s3 = load_yaml_file(INSTALLATION_PATH_FILE_RDS_S3) -path_dic_rds_only = load_yaml_file(INSTALLATION_PATH_FILE_RDS_ONLY) -path_dic_s3_only = load_yaml_file(INSTALLATION_PATH_FILE_S3_ONLY) + def main(): @@ -446,10 +441,20 @@ def install_secrets_store_csi_driver(): # TO DO: decouple kustomize params.env and helm values.yaml write up in future def setup_kubeflow_pipeline(): print("Setting up Kubeflow Pipeline...") - print("Retrieving DB instance info...") db_instance_info = get_db_instance_info() + if CREDENTIALS_OPTION == "irsa": + INSTALLATION_PATH_FILE_RDS_S3 = 
"./resources/installation_config/rds-s3.yaml" + INSTALLATION_PATH_FILE_S3_ONLY = "./resources/installation_config/s3-only.yaml" + else: + INSTALLATION_PATH_FILE_RDS_S3 = "./resources/installation_config/rds-s3-static.yaml" + INSTALLATION_PATH_FILE_S3_ONLY = "./resources/installation_config/s3-only-static.yaml" + INSTALLATION_PATH_FILE_RDS_ONLY = "./resources/installation_config/rds-only.yaml" + path_dic_rds_s3 = load_yaml_file(INSTALLATION_PATH_FILE_RDS_S3) + path_dic_rds_only = load_yaml_file(INSTALLATION_PATH_FILE_RDS_ONLY) + path_dic_s3_only = load_yaml_file(INSTALLATION_PATH_FILE_S3_ONLY) + # helm # pipelines helm path pipeline_rds_s3_helm_path = path_dic_rds_s3["kubeflow-pipelines"][ diff --git a/tools/helmify/src/config.yaml b/tools/helmify/src/config.yaml index 916b827bf1..b73d06274a 100644 --- a/tools/helmify/src/config.yaml +++ b/tools/helmify/src/config.yaml @@ -336,8 +336,8 @@ aws-authservice: kustomization_paths: - awsconfigs/common/aws-authservice/base output_helm_chart_path: charts/common/aws-authservice - version: 0.1.0 - app_version: v0.1.0 + version: 0.2.0 + app_version: v2.0.0 params: template_paths: - tools/helmify/template/aws-authservice/params.env diff --git a/website/content/en/docs/add-ons/storage/fsx-for-lustre/guide.md b/website/content/en/docs/add-ons/storage/fsx-for-lustre/guide.md index 6cf1ef13c0..5ddcf4d853 100644 --- a/website/content/en/docs/add-ons/storage/fsx-for-lustre/guide.md +++ b/website/content/en/docs/add-ons/storage/fsx-for-lustre/guide.md @@ -75,10 +75,10 @@ The script applies some default values for the file system name, performance mod If you prefer to manually setup each component then you can follow this manual guide. #### 1. Install the FSx CSI Driver -We recommend installing the FSx CSI Driver v0.7.1 directly from the [the aws-fsx-csi-driver GitHub repository](https://github.com/kubernetes-sigs/aws-fsx-csi-driver) as follows: +We recommend installing the FSx CSI Driver v0.9.0 directly from the [the aws-fsx-csi-driver GitHub repository](https://github.com/kubernetes-sigs/aws-fsx-csi-driver) as follows: ```bash -kubectl apply -k "github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=tags/v0.7.1" +kubectl apply -k "github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=tags/v0.9.0" ``` You can confirm that FSx CSI Driver was installed using the following command: diff --git a/website/content/en/docs/component-guides/kserve/access-aws-services-from-kserve.md b/website/content/en/docs/component-guides/kserve/access-aws-services-from-kserve.md index 0d72219520..f9c1a4bcf8 100644 --- a/website/content/en/docs/component-guides/kserve/access-aws-services-from-kserve.md +++ b/website/content/en/docs/component-guides/kserve/access-aws-services-from-kserve.md @@ -25,7 +25,7 @@ weight = 10 > NOTE: You can use ECR (`AmazonEC2ContainerRegistryReadOnly`) and S3 (`AmazonS3ReadOnlyAccess`) ReadOnly managed policies. We recommend creating fine grained policy for production usecase. ### Deploy models from S3 Bucket -1. Create Secret with empty AWS Credential: +1. 
Create Secret: ```sh cat <<EOF > secret.yaml apiVersion: v1 @@ -38,14 +38,10 @@ weight = 10 serving.kserve.io/s3-usehttps: "1" serving.kserve.io/s3-region: ${CLUSTER_REGION} type: Opaque - data: - AWS_ACCESS_KEY_ID: "" - AWS_SECRET_ACCESS_KEY: "" EOF kubectl apply -f secret.yaml ``` - > NOTE: The **empty** keys for `AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY` force it to add the env vars to the init containers but don't override the actual credentials from the IAM role (which happens if you add dummy values). These **empty** keys are needed for IRSA to work in current version and will not be needed in future release. 1. Attach secret to IRSA in your profile namespace: ``` diff --git a/website/content/en/docs/component-guides/notebooks.md b/website/content/en/docs/component-guides/notebooks.md index 45e09f71b0..a75a9d533a 100644 --- a/website/content/en/docs/component-guides/notebooks.md +++ b/website/content/en/docs/component-guides/notebooks.md @@ -20,10 +20,10 @@ Use AWS-optimized Kubeflow Notebook server images to quickly get started with a These container images are available on the [Amazon Elastic Container Registry (Amazon ECR)](https://gallery.ecr.aws/kubeflow-on-aws/). The following images are available as part of this release, however you can always find the latest updated images in the linked ECR repository. ``` -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.9.1-gpu-py39-cu112-ubuntu20.04-e3-v1.2-2022-09-20 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.9.1-cpu-py39-ubuntu20.04-e3-v1.2-2022-09-20 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.0-gpu-py38-cu116-ubuntu20.04-ec2-2022-09-20 -public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:1.12.0-cpu-py38-ubuntu20.04-ec2-2022-09-20 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-tensorflow:2.12.0-cpu-py310-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0 +public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-cpu-py310-ubuntu20.04-ec2-v1.0 ``` AWS Deep Learning Containers provide optimized environments with popular machine learning frameworks such as TensorFlow and PyTorch, and are available in the Amazon ECR. For more information on AWS Deep Learning Container options, see [Available Deep Learning Containers Images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md). @@ -31,8 +31,6 @@ AWS Deep Learning Containers provide optimized environments with popular machine Along with specific machine learning frameworks, these container images have additional pre-installed packages: - `kfp` - `kfserving` -- `h5py` -- `pandas` - `awscli` - `boto3` diff --git a/website/content/en/docs/deployment/cognito-rds-s3/guide-terraform.md b/website/content/en/docs/deployment/cognito-rds-s3/guide-terraform.md index 66254cd2a7..21f89941e0 100644 --- a/website/content/en/docs/deployment/cognito-rds-s3/guide-terraform.md +++ b/website/content/en/docs/deployment/cognito-rds-s3/guide-terraform.md @@ -48,21 +48,13 @@ pwd export TF_VAR_create_subdomain="false" ``` -1.
Create an IAM user to use with the Minio Client - - [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY of the IAM user that you created to use in the following step, which will be referenced as `TF_VAR_minio_aws_access_key_id` and `TF_VAR_minio_aws_secret_access_key` respectively. - 1. Define the following environment variables: - ```sh + ```bash # Region to create the cluster in export CLUSTER_REGION= # Name of the cluster to create export CLUSTER_NAME= - # AWS access key id of the static credentials used to authenticate the Minio Client - export TF_VAR_minio_aws_access_key_id= - # AWS secret access key of the static credentials used to authenticate the Minio Client - export TF_VAR_minio_aws_secret_access_key= # Name of an existing Route53 root domain (e.g. example.com) export ROOT_DOMAIN= # Name of the subdomain to create (e.g. platform.example.com) @@ -78,9 +70,34 @@ pwd # Load Balancer Scheme export LOAD_BALANCER_SCHEME=internet-facing ``` - > NOTE: Configure Load Balancer Scheme (e.g. `internet-facing` or `internal`). Default is set to `internet-facing`. Use `internal` as the load balancer scheme if you want the load balancer to be accessible only within your VPC. See [Load balancer scheme](https://docs.aws.amazon.com/elasticloadbalancing/latest/userguide/how-elastic-load-balancing-works.html#load-balancer-scheme) in the AWS documentation + +As of Kubeflow 1.7, there are two options to configure Amazon S3 as an artifact store for pipelines. Choose one of the following options: + > Note: IRSA is only supported in KFPv1, if you plan to use KFPv2, choose the IAM User option. IRSA support for KFPv2 will be added in the next release. + - Option 1 - IRSA (Recommended): IAM Role for Service Account (IRSA) which allows the use of AWS IAM permission boundaries at the Kubernetes pod level. A Kubernetes service account (SA) is associated with an IAM role with a role policy that scopes the IAM permissions (e.g. S3 read/write access, etc.). When a pod in the SA namespace is annotated with the SA name, EKS injects the IAM role ARN and a token is used to get the credentials so that the pod can make requests to AWS services within the scope of the role policy associated with the IRSA. + For more information, see [Amazon EKS IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). + + - Option 2 - IAM User (Deprecated): + [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY of the IAM user that you created to use in the following step, which will be referenced as `minio_aws_access_key_id` and `minio_aws_secret_access_key` respectively. + +1. 
Export your desired PIPELINE_S3_CREDENTIAL_OPTION specific values +{{< tabpane persistLang=false >}} +{{< tab header="IRSA" lang="toml" >}} +# Pipeline S3 Credential Option to configure +export PIPELINE_S3_CREDENTIAL_OPTION="irsa" +{{< /tab >}} +{{< tab header="IAM User" lang="toml" >}} +# Pipeline S3 Credential Option to configure +export PIPELINE_S3_CREDENTIAL_OPTION="static" +# AWS access key id of the static credentials used to authenticate the Minio Client +export TF_VAR_minio_aws_access_key_id= +# AWS secret access key of the static credentials used to authenticate the Minio Client +export TF_VAR_minio_aws_secret_access_key= +{{< /tab >}} + {{< /tabpane >}} + + 1. Save the variables to a `.tfvars` file: ```sh cat <<EOF > sample.auto.tfvars @@ -92,6 +109,7 @@ pwd cognito_user_pool_name="${USER_POOL_NAME}" use_rds="${USE_RDS}" use_s3="${USE_S3}" + pipeline_s3_credential_option="${PIPELINE_S3_CREDENTIAL_OPTION}" use_cognito="${USE_COGNITO}" load_balancer_scheme="${LOAD_BALANCER_SCHEME}" diff --git a/website/content/en/docs/deployment/cognito-rds-s3/guide.md b/website/content/en/docs/deployment/cognito-rds-s3/guide.md index ed8eb9d6d8..5fa977b08b 100644 --- a/website/content/en/docs/deployment/cognito-rds-s3/guide.md +++ b/website/content/en/docs/deployment/cognito-rds-s3/guide.md @@ -15,9 +15,10 @@ Refer to the [general prerequisites guide]({{< ref "/docs/deployment/prerequisit 3. Create an EKS cluster 4. Create an S3 Bucket 5. Create an RDS Instance -6. Configure AWS Secrets for RDS and S3 -7. Install AWS Secrets and Kubernetes Secrets Store CSI driver -8. Configure an RDS endpoint and an S3 bucket name for Kubeflow Pipelines +6. Configure AWS Secrets or IAM Role for S3 +7. Configure AWS Secrets for RDS +8. Install AWS Secrets and Kubernetes Secrets Store CSI driver +9. Configure an RDS endpoint and an S3 bucket name for Kubeflow Pipelines ## Configure Custom Domain and Cognito @@ -31,13 +32,20 @@ Refer to the [general prerequisites guide]({{< ref "/docs/deployment/prerequisit Enable culling for notebooks by following the [instructions]({{< ref "/docs/deployment/configure-notebook-culling.md#" >}}) in configure culling for notebooks guide. 2. Deploy Kubeflow. + + 1. Export your pipeline-s3-credential-option + ```bash + export PIPELINE_S3_CREDENTIAL_OPTION= + ``` + 1. Install Kubeflow using the following command: + {{< tabpane persistLang=false >}} {{< tab header="Kustomize" lang="toml" >}} -make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=cognito-rds-s3 +make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=cognito-rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION {{< /tab >}} {{< tab header="Helm" lang="yaml" >}} -make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=cognito-rds-s3 +make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=cognito-rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION {{< /tab >}} {{< /tabpane >}} @@ -58,9 +66,9 @@ make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=cognito-rds-s3 1.
Delete the kubeflow deployment: {{< tabpane persistLang=false >}} - {{< tab header="Kustomize" lang="toml" >}}make delete-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=cognito-rds-s3 + {{< tab header="Kustomize" lang="toml" >}}make delete-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=cognito-rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION {{< /tab >}} - {{< tab header="Helm" lang="yaml" >}}make delete-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=cognito-rds-s3 + {{< tab header="Helm" lang="yaml" >}}make delete-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=cognito-rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION {{< /tab >}} {{< /tabpane >}} diff --git a/website/content/en/docs/deployment/configure-notebook-culling.md b/website/content/en/docs/deployment/configure-notebook-culling.md index 8f6aa88f59..5f2e15adf5 100644 --- a/website/content/en/docs/deployment/configure-notebook-culling.md +++ b/website/content/en/docs/deployment/configure-notebook-culling.md @@ -22,9 +22,9 @@ weight = 80 {{< tabpane persistLang=false >}} {{< tab header="Kustomize" lang="sh" >}} printf ' -enableCulling='$ENABLE_CULLING' -cullIdleTime='$CULL_IDLE_TIMEOUT' -idlenessCheckPeriod='$IDLENESS_CHECK_PERIOD' +ENABLE_CULLING='$ENABLE_CULLING' +CULL_IDLE_TIME='$CULL_IDLE_TIMEOUT' +IDLENESS_CHECK_PERIOD='$IDLENESS_CHECK_PERIOD' ' > awsconfigs/apps/notebook-controller/params.env {{< /tab >}} {{< tab header="Helm" lang="sh" >}} diff --git a/website/content/en/docs/deployment/rds-s3/guide-terraform.md b/website/content/en/docs/deployment/rds-s3/guide-terraform.md index a8255fe3fd..795d601b87 100644 --- a/website/content/en/docs/deployment/rds-s3/guide-terraform.md +++ b/website/content/en/docs/deployment/rds-s3/guide-terraform.md @@ -39,27 +39,43 @@ pwd ### Configure -1. Create an IAM user to use with the Minio Client - - [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY of the IAM user that you created to use in the following step, which will be referenced as `TF_VAR_minio_aws_access_key_id` and `TF_VAR_minio_aws_secret_access_key` respectively. - 1. Define the following environment variables: - ```sh + ```bash # Region to create the cluster in export CLUSTER_REGION= # Name of the cluster to create export CLUSTER_NAME= - # AWS access key id of the static credentials used to authenticate the Minio Client - export TF_VAR_minio_aws_access_key_id= - # AWS secret access key of the static credentials used to authenticate the Minio Client - export TF_VAR_minio_aws_secret_access_key= # true/false flag to configure and deploy with RDS export USE_RDS="true" # true/false flag to configure and deploy with S3 export USE_S3="true" ``` +As of Kubeflow 1.7, there are two options to configure Amazon S3 as an artifact store for pipelines. Choose one of the following options: + > Note: IRSA is only supported in KFPv1, if you plan to use KFPv2, choose the IAM User option. IRSA support for KFPv2 will be added in the next release. + - Option 1 - IRSA (Recommended): IAM Role for Service Account (IRSA) which allows the use of AWS IAM permission boundaries at the Kubernetes pod level. 
A Kubernetes service account (SA) is associated with an IAM role with a role policy that scopes the IAM permissions (e.g. S3 read/write access, etc.). When a pod in the SA namespace is annotated with the SA name, EKS injects the IAM role ARN and a token is used to get the credentials so that the pod can make requests to AWS services within the scope of the role policy associated with the IRSA. + For more information, see [Amazon EKS IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). + + - Option 2 - IAM User (Deprecated): + [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY of the IAM user that you created to use in the following step, which will be referenced as `minio_aws_access_key_id` and `minio_aws_secret_access_key` respectively. + +1. Export your desired PIPELINE_S3_CREDENTIAL_OPTION specific values +{{< tabpane persistLang=false >}} +{{< tab header="IRSA" lang="toml" >}} +# Pipeline S3 Credential Option to configure +export PIPELINE_S3_CREDENTIAL_OPTION="irsa" +{{< /tab >}} +{{< tab header="IAM User" lang="toml" >}} +# Pipeline S3 Credential Option to configure +export PIPELINE_S3_CREDENTIAL_OPTION="static" +# AWS access key id of the static credentials used to authenticate the Minio Client +export TF_VAR_minio_aws_access_key_id= +# AWS secret access key of the static credentials used to authenticate the Minio Client +export TF_VAR_minio_aws_secret_access_key= +{{< /tab >}} + {{< /tabpane >}} + 1. Save the variables to a `.tfvars` file: ```sh @@ -69,6 +85,8 @@ pwd generate_db_password="true" use_rds="${USE_RDS}" use_s3="${USE_S3}" + pipeline_s3_credential_option="${PIPELINE_S3_CREDENTIAL_OPTION}" + # The below values are set to make cleanup easier but are not recommended for production deletion_protection="false" diff --git a/website/content/en/docs/deployment/rds-s3/guide.md b/website/content/en/docs/deployment/rds-s3/guide.md index c0bb75dbb0..000c6ff6e3 100644 --- a/website/content/en/docs/deployment/rds-s3/guide.md +++ b/website/content/en/docs/deployment/rds-s3/guide.md @@ -61,11 +61,31 @@ export REPO_ROOT=$(pwd) There are two ways to create RDS and S3 resources before you deploy the Kubeflow manifests. Either use the [automated setup](#21-option-1-automated-setup) Python script that is mentioned in the following step, or follow the [manual setup instructions](#22-option-2-manual-setup). +As of Kubeflow 1.7, there are two options to configure Amazon S3 as an artifact store for pipelines. Choose one of the following options: + > Note: IRSA is only supported in KFPv1, if you plan to use KFPv2, choose the IAM User option. IRSA support for KFPv2 will be added in the next release. + - Option 1 - IRSA (Recommended): IAM Role for Service Account (IRSA) which allows the use of AWS IAM permission boundaries at the Kubernetes pod level. A Kubernetes service account (SA) is associated with an IAM role with a role policy that scopes the IAM permissions (e.g. S3 read/write access, etc.). When a pod in the SA namespace is annotated with the SA name, EKS injects the IAM role ARN and a token is used to get the credentials so that the pod can make requests to AWS services within the scope of the role policy associated with the IRSA. 
+ For more information, see [Amazon EKS IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). + + - Option 2 - IAM User (Deprecated): + [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY of the IAM user that you created to use in the following step, which will be referenced as `minio_aws_access_key_id` and `minio_aws_secret_access_key` respectively. + +1. Export your desired PIPELINE_S3_CREDENTIAL_OPTION: + {{< tabpane >}} + {{< tab header="IRSA" lang="toml" >}} +export PIPELINE_S3_CREDENTIAL_OPTION=irsa +{{< /tab >}} + {{< tab header="IAM User" lang="toml" >}} +export PIPELINE_S3_CREDENTIAL_OPTION=static +{{< /tab >}} + {{< /tabpane >}} + + ### 2.1 **Option 1: Automated Setup** +> Note: Automated Setup is only supported for deployments that use both RDS and S3. For RDS-only or S3-only deployments, use the manual steps. This setup performs all the manual steps in an automated fashion. -The script takes care of creating the S3 bucket, creating the S3 Secrets using the Secrets manager, setting up the RDS database, and creating the RDS Secret using the Secrets manager. The script also edits the required configuration files for Kubeflow Pipelines to be properly configured for the RDS database during Kubeflow installation. The script also handles cases where the resources already exist. In this case, the script will simply skip the step. +The script takes care of creating the S3 bucket, setting up IRSA to access S3 or creating the S3 Secrets if using static credentials, setting up the RDS database, and creating the RDS Secret using the Secrets manager. The script also edits the required configuration files for Kubeflow Pipelines to be properly configured for the RDS database during Kubeflow installation. The script also handles cases where the resources already exist. In this case, the script will simply skip the step. > Note: The script will **not** delete any resource. Therefore, if a resource already exists (eg: Secret, database with the same name, or S3 bucket), **it will skip the creation of those resources and use the existing resources instead**. This is by design in order to prevent unwanted results, such as accidental deletion. For example, if a database with the same name already exists, the script will skip the database creation setup. If you forgot to change the database name used for creation, then this gives you the chance to retry the script with the proper value. See `python auto-rds-s3-setup.py --help` for the list of parameters, as well as their default values. @@ -73,23 +93,39 @@ The script takes care of creating the S3 bucket, creating the S3 Secrets using t ```bash cd tests/e2e ``` -1. [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` of the IAM user that you created to use in the following step, which will be referenced as `MINIO_AWS_ACCESS_KEY_ID` and `MINIO_AWS_SECRET_ACCESS_KEY` respectively. -1.
Export values for `CLUSTER_REGION`, `CLUSTER_NAME`, `S3_BUCKET`, `MINIO_AWS_ACCESS_KEY_ID`, and `MINIO_AWS_SECRET_ACCESS_KEY`. + +1. Export values for `CLUSTER_REGION`, `CLUSTER_NAME`, `S3_BUCKET`. ```bash export CLUSTER_REGION=<> export CLUSTER_NAME=<> export S3_BUCKET=<> export DB_INSTANCE_NAME=<> export DB_SUBNET_GROUP_NAME=<> - export MINIO_AWS_ACCESS_KEY_ID=<> - export MINIO_AWS_SECRET_ACCESS_KEY=<> export RDS_SECRET_NAME=<> - export S3_SECRET_NAME=<> ``` + +1. Export your desired PIPELINE_S3_CREDENTIAL_OPTION specific values + {{< tabpane >}} + {{< tab header="IRSA" lang="toml" >}} +export PIPELINE_S3_CREDENTIAL_OPTION=irsa +{{< /tab >}} + {{< tab header="IAM User" lang="toml" >}} +export S3_SECRET_NAME=<> +export MINIO_AWS_ACCESS_KEY_ID=<> +export MINIO_AWS_SECRET_ACCESS_KEY=<> +export PIPELINE_S3_CREDENTIAL_OPTION=static +{{< /tab >}} + {{< /tabpane >}} + 1. Run the `auto-rds-s3-setup.py` script - ``` - PYTHONPATH=.. python utils/rds-s3/auto-rds-s3-setup.py --region $CLUSTER_REGION --cluster $CLUSTER_NAME --bucket $S3_BUCKET --s3_aws_access_key_id $MINIO_AWS_ACCESS_KEY_ID --s3_aws_secret_access_key $MINIO_AWS_SECRET_ACCESS_KEY --db_instance_name $DB_INSTANCE_NAME --s3_secret_name $S3_SECRET_NAME --rds_secret_name $RDS_SECRET_NAME --db_subnet_group_name $DB_SUBNET_GROUP_NAME - ``` + {{< tabpane >}} + {{< tab header="IRSA" lang="toml" >}} +PYTHONPATH=.. python utils/rds-s3/auto-rds-s3-setup.py --region $CLUSTER_REGION --cluster $CLUSTER_NAME --bucket $S3_BUCKET --db_instance_name $DB_INSTANCE_NAME --rds_secret_name $RDS_SECRET_NAME --db_subnet_group_name $DB_SUBNET_GROUP_NAME --pipeline_s3_credential_option $PIPELINE_S3_CREDENTIAL_OPTION +{{< /tab >}} + {{< tab header="IAM User" lang="toml" >}} +PYTHONPATH=.. python utils/rds-s3/auto-rds-s3-setup.py --region $CLUSTER_REGION --cluster $CLUSTER_NAME --bucket $S3_BUCKET --s3_aws_access_key_id $MINIO_AWS_ACCESS_KEY_ID --s3_aws_secret_access_key $MINIO_AWS_SECRET_ACCESS_KEY --db_instance_name $DB_INSTANCE_NAME --s3_secret_name $S3_SECRET_NAME --rds_secret_name $RDS_SECRET_NAME --db_subnet_group_name $DB_SUBNET_GROUP_NAME --pipeline_s3_credential_option $PIPELINE_S3_CREDENTIAL_OPTION +{{< /tab >}} + {{< /tabpane >}} ### Advanced customization @@ -123,22 +159,14 @@ Follow this step if you prefer to manually set up each component. - `RDS database endpoint URL` - `RDS database port` -3. [S3] Create IAM User With Permissions To S3 Bucket - - [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` of the IAM user that you created to use in the following step, which will be referenced as `MINIO_AWS_ACCESS_KEY_ID` and `MINIO_AWS_SECRET_ACCESS_KEY` respectively. +### 2.2.1 RDS Setup 1. Export values: - ```bash - export RDS_SECRET="" - export S3_SECRET="" - export DB_HOST="" - export MLMD_DB=metadata_db - export S3_BUCKET="" - export MINIO_SERVICE_HOST=s3.amazonaws.com - export MINIO_AWS_ACCESS_KEY_ID="" - export MINIO_AWS_SECRET_ACCESS_KEY="" - ``` - + ```bash + export RDS_SECRET="" + export DB_HOST="" + export MLMD_DB=metadata_db + ``` 3. Create Secrets in AWS Secrets Manager 1.
3. Create Secrets in AWS Secrets Manager
   1. [RDS] Create the RDS Secret and configure the Secret provider:
@@ -160,16 +188,146 @@ yq e '.rds.secretName = env(RDS_SECRET)' -i charts/common/aws-secrets-manager/rd
{{< /tab >}}
{{< /tabpane >}}

+### 2.2.2 S3 Setup
+
+1. Export values:
+   ```bash
+   export S3_BUCKET=""
+   export MINIO_SERVICE_HOST=s3.amazonaws.com
+   ```
+
+As of Kubeflow 1.7, there are two options to configure Amazon S3 as an artifact store for pipelines. Choose one of the following options:
+   > Note: IRSA is only supported in KFPv1; if you plan to use KFPv2, choose the IAM User option. IRSA support for KFPv2 will be added in the next release.
+   - Option 1 - IRSA (Recommended): Follow [Configure using IRSA](#2221-configure-using-irsa)
+   - Option 2 - IAM User (Deprecated): Follow [Configure using IAM User](#2222-configure-using-iam-user)
+
+### 2.2.2.1 Configure using IRSA
+
+IAM Roles for Service Accounts (IRSA) allows the use of AWS IAM permission boundaries at the Kubernetes pod level. A Kubernetes service account (SA) is associated with an IAM role whose role policy scopes the IAM permissions (e.g. S3 read/write access). When a pod in the SA namespace is annotated with the SA name, EKS injects the IAM role ARN and a token that is used to obtain credentials, so that the pod can make requests to AWS services within the scope of the role policy associated with the IRSA. For more information, see [Amazon EKS IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html).
+
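In practice, the association is just an annotation on the service account. Once Kubeflow is deployed, the wiring performed in step 7 below can be confirmed with a check like this sketch (the annotation key is the one the manifests set; the command itself is illustrative):

```bash
# Illustrative check: print the IAM role ARN that IRSA injects into pods
# running under the pipeline backend's service account.
kubectl -n kubeflow get serviceaccount ml-pipeline \
  -o jsonpath="{.metadata.annotations['eks\.amazonaws\.com/role-arn']}"
```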
+1. Create and Configure IAM Roles:
+
+   1. An OIDC provider must exist for your cluster to use IRSA. Create an OIDC provider and associate it with your EKS cluster by running the following command if your cluster doesn't already have one:
+      ```bash
+      eksctl utils associate-iam-oidc-provider --cluster ${CLUSTER_NAME} \
+        --region ${CLUSTER_REGION} --approve
+      ```
+
+   2. Get the identity issuer URL by running the following commands:
+      ```bash
+      export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
+      export OIDC_PROVIDER_URL=$(aws eks describe-cluster --name $CLUSTER_NAME --region $CLUSTER_REGION \
+        --query "cluster.identity.oidc.issuer" --output text | cut -c9-)
+      ```
+
+   3. [Create an IAM policy](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies_create.html) with access to the S3 bucket where pipeline artifacts will be stored. The following policy grants full access to the S3 bucket; you can scope it down to read, write, and GetBucketLocation permissions (see the sketch after this list).
+      ```bash
+cat <<EOF > ./s3_policy.json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": "s3:*",
+      "Resource": [
+        "arn:aws:s3:::${S3_BUCKET}",
+        "arn:aws:s3:::${S3_BUCKET}/*"
+      ]
+    }
+  ]
+}
+EOF
+      ```
+
+   4. Create Pipeline Backend Role
+      ```bash
+cat <<EOF > backend-trust.json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_PROVIDER_URL}"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "${OIDC_PROVIDER_URL}:aud": "sts.amazonaws.com",
+          "${OIDC_PROVIDER_URL}:sub": "system:serviceaccount:kubeflow:ml-pipeline"
+        }
+      }
+    }
+  ]
+}
+EOF
-   1. [S3] Create the S3 Secret and configure the Secret provider:
-      1. Configure a Secret (e.g. `s3-secret`) with your AWS credentials. These need to be long-term credentials from an IAM user and not temporary.
+
+export PIPELINE_BACKEND_ROLE_NAME=kf-pipeline-backend-role-$CLUSTER_NAME
+aws --region $CLUSTER_REGION iam create-role --role-name $PIPELINE_BACKEND_ROLE_NAME --assume-role-policy-document file://backend-trust.json
+export BACKEND_ROLE_ARN=$(aws --region $CLUSTER_REGION iam get-role --role-name $PIPELINE_BACKEND_ROLE_NAME --output text --query 'Role.Arn')
+      ```
+
+   5. Create Profile Role
+      ```bash
+cat <<EOF > profile-trust.json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_PROVIDER_URL}"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "${OIDC_PROVIDER_URL}:aud": "sts.amazonaws.com",
+          "${OIDC_PROVIDER_URL}:sub": "system:serviceaccount:kubeflow-user-example-com:default-editor"
+        }
+      }
+    }
+  ]
+}
+EOF
+
+export PROFILE_ROLE_NAME=kf-pipeline-profile-role-$CLUSTER_NAME
+aws --region $CLUSTER_REGION iam create-role --role-name $PROFILE_ROLE_NAME --assume-role-policy-document file://profile-trust.json
+export PROFILE_ROLE_ARN=$(aws --region $CLUSTER_REGION iam get-role --role-name $PROFILE_ROLE_NAME --output text --query 'Role.Arn')
+      ```
+
+   6. Attach S3 Policy to Roles
+      ```bash
+aws --region $CLUSTER_REGION iam put-role-policy --role-name $PIPELINE_BACKEND_ROLE_NAME --policy-name kf-pipeline-s3 --policy-document file://s3_policy.json
+aws --region $CLUSTER_REGION iam put-role-policy --role-name $PROFILE_ROLE_NAME --policy-name kf-pipeline-s3 --policy-document file://s3_policy.json
+      ```
+
+   7. Configure the manifests with the role ARNs.
+      - Select the package manager of your choice.
+      {{< tabpane persistLang=false >}}
+      {{< tab header="Kustomize" lang="toml" >}}
+yq e '.metadata.annotations."eks.amazonaws.com/role-arn"=env(BACKEND_ROLE_ARN)' -i awsconfigs/apps/pipeline/s3/service-account.yaml
+yq e '.spec.plugins[0].spec."awsIamRole"=env(PROFILE_ROLE_ARN)' -i awsconfigs/common/user-namespace/overlay/profile.yaml
+{{< /tab >}}
+      {{< tab header="Helm" lang="yaml" >}}
+yq e '.s3.roleArn = env(BACKEND_ROLE_ARN)' -i charts/apps/kubeflow-pipelines/rds-s3/values.yaml
+yq e '.s3.roleArn = env(BACKEND_ROLE_ARN)' -i charts/apps/kubeflow-pipelines/s3-only/values.yaml
+yq e '.awsIamForServiceAccount.awsIamRole = env(PROFILE_ROLE_ARN)' -i charts/common/user-namespace/values.yaml
+{{< /tab >}}
+      {{< /tabpane >}}
+
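As mentioned in step 3 of the list above, the wildcard `s3:*` policy can be narrowed. A scoped-down sketch (illustrative; the exact action list is an assumption based on the permissions the guide names, not the guide's canonical policy):

```bash
# Least-privilege alternative to s3_policy.json (a sketch): bucket
# location/listing plus object read/write/delete on the artifact bucket.
cat <<EOF > ./s3_policy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": ["s3:GetBucketLocation", "s3:ListBucket"],
      "Resource": "arn:aws:s3:::${S3_BUCKET}"
    },
    {
      "Effect": "Allow",
      "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
      "Resource": "arn:aws:s3:::${S3_BUCKET}/*"
    }
  ]
}
EOF
```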
+### 2.2.2.2 Configure using IAM User
+
+1. [Create an IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_cliwpsapi) with permissions to get bucket locations and allow read and write access to objects in an S3 bucket where you want to store the Kubeflow artifacts. Take note of the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` of the IAM user that you created to use in the following step, where they will be referenced as `MINIO_AWS_ACCESS_KEY_ID` and `MINIO_AWS_SECRET_ACCESS_KEY` respectively.
+
+1. Create and configure S3 Secrets:
+   1. Export values:
+      ```bash
+      export S3_SECRET=""
+      export MINIO_AWS_ACCESS_KEY_ID=""
+      export MINIO_AWS_SECRET_ACCESS_KEY=""
+      ```
+   2. Configure a Secret (e.g. `s3-secret`) with your AWS credentials. These need to be long-term credentials from an IAM user and not temporary.
      - For more details about configuring or finding your AWS credentials, see [AWS security credentials](https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
      ```bash
      aws secretsmanager create-secret --name $S3_SECRET --secret-string '{"accesskey":"'$MINIO_AWS_ACCESS_KEY_ID'","secretkey":"'$MINIO_AWS_SECRET_ACCESS_KEY'"}' --region $CLUSTER_REGION
      ```
-   1. Rename the `parameters.objects.objectName` field in [the S3 Secret provider configuration](https://github.com/awslabs/kubeflow-manifests/blob/main/awsconfigs/common/aws-secrets-manager/s3/secret-provider.yaml) to the name of the Secret.
-      - Rename the field with the following command:
-        Select the package manager of your choice.
+   3. Rename the `parameters.objects.objectName` field in [the S3 Secret provider configuration](https://github.com/awslabs/kubeflow-manifests/blob/main/awsconfigs/common/aws-secrets-manager/s3/secret-provider.yaml) to the name of the Secret.
+      - Select the package manager of your choice.
      {{< tabpane persistLang=false >}}
      {{< tab header="Kustomize" lang="toml" >}}
yq e -i '.spec.parameters.objects |= sub("s3-secret",env(S3_SECRET))' awsconfigs/common/aws-secrets-manager/s3/secret-provider.yaml
@@ -180,7 +338,7 @@ yq e '.s3.secretName = env(S3_SECRET)' -i charts/common/aws-secrets-manager/rds-
      {{< /tab >}}
      {{< /tabpane >}}
-
+### Install CSI Driver and update KFP configurations

4. Install AWS Secrets & Configuration Provider with Kubernetes Secrets Store CSI driver
   1. Run the following commands to enable OIDC and create an `iamserviceaccount` with permissions to retrieve the Secrets created with AWS Secrets Manager.

@@ -205,8 +363,7 @@ yq e '.s3.secretName = env(S3_SECRET)' -i charts/common/aws-secrets-manager/rds-
5. Update the KFP configurations.
   1. [RDS] Configure the *RDS endpoint URL* and *the metadata DB name*:
-      - Rename the field with the following command
-        Select the package manager of your choice.
+      - Select the package manager of your choice.
      {{< tabpane persistLang=false >}}
      {{< tab header="Kustomize" lang="toml" >}}
printf '
@@ -224,8 +381,7 @@ yq e '.rds.mlmdDb = env(MLMD_DB)' -i charts/apps/kubeflow-pipelines/rds-only/val
   2. [S3] Configure the *S3 bucket name* and *S3 bucket region*:
-      - Select the package manager of your choice.
+      - Select the package manager of your choice.
{{< tabpane persistLang=false >}}
{{< tab header="Kustomize" lang="toml" >}}
printf '
@@ -266,10 +422,10 @@ cd $REPO_ROOT

Use the following command to deploy the Kubeflow manifests for both RDS and S3:
{{< tabpane persistLang=false >}}
{{< tab header="Kustomize" lang="toml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-s3
+make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< tab header="Helm" lang="yaml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-s3
+make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< /tabpane >}}

@@ -278,10 +434,10 @@ make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-s3

Use the following command to deploy the Kubeflow manifests for RDS only:
{{< tabpane persistLang=false >}}
{{< tab header="Kustomize" lang="toml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-only
+make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-only PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< tab header="Helm" lang="yaml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-only
+make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-only PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< /tabpane >}}

@@ -291,10 +447,10 @@ make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-only

Use the following command to deploy the Kubeflow manifests for S3 only:
{{< tabpane persistLang=false >}}
{{< tab header="Kustomize" lang="toml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=s3-only
+make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=s3-only PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< tab header="Helm" lang="yaml" >}}
-make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=s3-only
+make deploy-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=s3-only PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< /tabpane >}}

@@ -394,15 +550,15 @@ mysql> select * from observation_logs;

Run the following command to uninstall your Kubeflow deployment:

-> Note: Make sure you have the correct INSTALLATION_OPTION and DEPLOYMENT_OPTION environment variables set for your chosen installation
+> Note: Make sure you have the correct INSTALLATION_OPTION, DEPLOYMENT_OPTION, and PIPELINE_S3_CREDENTIAL_OPTION environment variables set for your chosen installation.

{{< tabpane persistLang=false >}}
{{< tab header="Kustomize" lang="toml" >}}
-make delete-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-s3
+make delete-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< tab header="Helm" lang="yaml" >}}
-make delete-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-s3
+make delete-kubeflow INSTALLATION_OPTION=helm DEPLOYMENT_OPTION=rds-s3 PIPELINE_S3_CREDENTIAL_OPTION=$PIPELINE_S3_CREDENTIAL_OPTION
{{< /tab >}}
{{< /tabpane >}}
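Since the make targets read these variables from the environment, a quick check before uninstalling (a sketch using standard shell parameter expansion, not a repository script) avoids tearing down with the wrong options:

```bash
# Fail with a message if any of the expected variables is unset or empty.
echo "INSTALLATION_OPTION=${INSTALLATION_OPTION:?is not set}"
echo "DEPLOYMENT_OPTION=${DEPLOYMENT_OPTION:?is not set}"
echo "PIPELINE_S3_CREDENTIAL_OPTION=${PIPELINE_S3_CREDENTIAL_OPTION:?is not set}"
```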
diff --git a/website/content/en/docs/deployment/terraform-s3-backend.md b/website/content/en/docs/deployment/terraform-s3-backend.md
new file mode 100644
index 0000000000..f5be7ad34b
--- /dev/null
+++ b/website/content/en/docs/deployment/terraform-s3-backend.md
@@ -0,0 +1,87 @@
++++
+title = "Using AWS S3 as a backend for Terraform"
+description = "Back up Terraform state to AWS S3"
+weight = 70
++++
+
+## Local vs. Remote state
+
+While Terraform manages the state of the resources it has created through a `terraform.tfstate` file, by default this file only exists locally.
+This means that you will need to manually copy the original `terraform.tfstate` file over when managing previously created resources from a different host.
+This can become difficult to manage and keep in sync when multiple copies exist across different hosts and users.
+
+By using a remote backend, such as AWS S3, state is consolidated in one shared remote location and can be reused by multiple hosts. Additionally, the state is not lost if the local `terraform.tfstate` file is accidentally deleted.
+
+For additional details on using AWS S3 as a Terraform backend, refer to the Terraform [documentation](https://developer.hashicorp.com/terraform/language/settings/backends/s3#s3).
+
+## Permissions
+
+The permissions required by the Terraform user to use AWS S3 as a Terraform backend can be found [here](https://developer.hashicorp.com/terraform/language/settings/backends/s3#s3-bucket-permissions).
+
+## Creating an initial backup of Terraform state
+
+1. Decide on a name and region for the bucket to create, as well as a path in the bucket for where to store the `tfstate` file.
+
+1. Define the following environment variables:
+   ```sh
+   export BUCKET_NAME=
+   export PATH_TO_BACKUP=
+   export BUCKET_REGION=
+   ```
+
+1. Create the S3 bucket:
+   ```sh
+   aws s3api create-bucket --bucket ${BUCKET_NAME} --region ${BUCKET_REGION}
+   ```
+   > Note: For regions other than `us-east-1`, `create-bucket` also requires `--create-bucket-configuration LocationConstraint=${BUCKET_REGION}`.
+
+1. Go to the respective Terraform deployment folder. For example, if Vanilla Kubeflow is being deployed:
+   ```sh
+   cd deployments/vanilla/terraform
+   ```
+
+1. Create the following file:
+   ```sh
+cat <<EOF > backend.tf
+terraform {
+  backend "s3" {
+    bucket = "${BUCKET_NAME}"
+    key    = "${PATH_TO_BACKUP}"
+    region = "${BUCKET_REGION}"
+  }
+}
+EOF
+   ```
+
+1. The above configuration will be used the next time Terraform is initialized with `terraform init`.
+
+## Restoring from a Terraform state backup
+
+1. Find the name and region of the bucket that was created, as well as the path in the bucket where the `tfstate` file is stored.
+
+1. Define the following environment variables:
+   ```sh
+   export BUCKET_NAME=
+   export PATH_TO_BACKUP=
+   export BUCKET_REGION=
+   ```
+
+1. Go to the respective Terraform deployment folder. For example, if Vanilla Kubeflow is being deployed:
+   ```sh
+   cd deployments/vanilla/terraform
+   ```
+
+1. Create the following file:
+   ```sh
+cat <<EOF > backend.tf
+terraform {
+  backend "s3" {
+    bucket = "${BUCKET_NAME}"
+    key    = "${PATH_TO_BACKUP}"
+    region = "${BUCKET_REGION}"
+  }
+}
+EOF
+   ```
+
+1. The above configuration will be used the next time Terraform is initialized with `terraform init`.
\ No newline at end of file
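Once `backend.tf` is in place, Terraform reads it at initialization time. A sketch of the typical flow (on recent Terraform versions, `-migrate-state` applies when switching an existing local state to the new S3 backend):

```sh
# Initialize (or re-initialize) the working directory against the S3 backend.
# If a local terraform.tfstate already exists, Terraform offers to migrate it;
# -migrate-state makes that explicit.
terraform init -migrate-state

# Subsequent plans and applies now read and write state in the S3 bucket.
terraform plan
```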