From f8a5e36d610dad0370f34ae05e0a8099ab042d1b Mon Sep 17 00:00:00 2001 From: "Akihiko (Aki) Kuroda" <16141898+akihikokuroda@users.noreply.github.com> Date: Tue, 20 Jun 2023 16:18:28 -0400 Subject: [PATCH] use cert-manager for certification management (#687) * use cert-manager for certification management Signed-off-by: Akihiko Kuroda --- gateway/api/ray.py | 47 ++++++++++ gateway/tests/api/test_ray.py | 7 +- .../helm/quantumserverless/README.md | 2 + .../templates/certmanager-issuers.yaml | 32 +++++++ .../charts/gateway/templates/deployment.yaml | 3 +- .../charts/gateway/templates/raycertgen.yaml | 51 +++++++++++ .../gateway/templates/rayclustersa.yaml | 51 +++++++++++ .../gateway/templates/rayclustertemplate.yaml | 91 ++++++++++++++++++- .../charts/gateway/templates/role.yaml | 20 ++++ .../charts/gateway/values.yaml | 1 + .../helm/quantumserverless/values.yaml | 1 + 11 files changed, 299 insertions(+), 7 deletions(-) create mode 100644 infrastructure/helm/quantumserverless/charts/gateway/templates/certmanager-issuers.yaml create mode 100644 infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustersa.yaml diff --git a/gateway/api/ray.py b/gateway/api/ray.py index e5132e441..4c0333b59 100644 --- a/gateway/api/ray.py +++ b/gateway/api/ray.py @@ -11,6 +11,7 @@ import yaml from kubernetes import client, config from kubernetes.dynamic.client import DynamicClient +from kubernetes.dynamic.exceptions import ResourceNotFoundError import requests from ray.dashboard.modules.job.sdk import JobSubmissionClient @@ -195,4 +196,50 @@ def kill_ray_cluster(cluster_name: str) -> bool: "Something went wrong during ray cluster deletion request: %s", delete_response.text, ) + try: + cert_client = dyn_client.resources.get(api_version="v1", kind="Certificate") + except ResourceNotFoundError: + return success + + delete_response = cert_client.delete(name=cluster_name, namespace=namespace) + if delete_response.status == "Success": + success = True + else: + logging.error( + 
"Something went wrong during ray certification deletion request: %s", + delete_response.text, + ) + + delete_response = cert_client.delete( + name=f"{cluster_name}-worker", namespace=namespace + ) + if delete_response.status == "Success": + success = True + else: + logging.error( + "Something went wrong during ray certification deletion request: %s", + delete_response.text, + ) + + corev1 = client.CoreV1Api() + delete_response = corev1.delete_namespaced_secret( + name=cluster_name, namespace=namespace + ) + if delete_response.status == "Success": + success = True + else: + logging.error( + "Something went wrong during certification secret deletion request: %s", + delete_response.text, + ) + delete_response = corev1.delete_namespaced_secret( + name=f"{cluster_name}-worker", namespace=namespace + ) + if delete_response.status == "Success": + success = True + else: + logging.error( + "Something went wrong during certification secret deletion request: %s", + delete_response.text, + ) return success diff --git a/gateway/tests/api/test_ray.py b/gateway/tests/api/test_ray.py index 263cfd329..229fc546c 100644 --- a/gateway/tests/api/test_ray.py +++ b/gateway/tests/api/test_ray.py @@ -71,9 +71,14 @@ def test_kill_cluster(self): DynamicClient.resources = MagicMock() mock = mock_delete() DynamicClient.resources.get = MagicMock(return_value=mock) + client.CoreV1Api = MagicMock() success = kill_ray_cluster("some_cluster") self.assertTrue(success) - DynamicClient.resources.get.assert_called_once_with( + DynamicClient.resources.get.assert_any_call( api_version="v1alpha1", kind="RayCluster" ) + DynamicClient.resources.get.assert_any_call( + api_version="v1", kind="Certificate" + ) + client.CoreV1Api.assert_called() diff --git a/infrastructure/helm/quantumserverless/README.md b/infrastructure/helm/quantumserverless/README.md index 7edce93a3..5d3934804 100644 --- a/infrastructure/helm/quantumserverless/README.md +++ b/infrastructure/helm/quantumserverless/README.md @@ -68,6 +68,8 @@ 
For our Ray Charts dependencies we are using the configuration created by the Ra - For Ray Api Server you can read their [values.yaml](https://github.com/ray-project/kuberay-helm/blob/main/helm-chart/kuberay-apiserver/values.yaml). +TLS is enabled for the gRPC communication among Ray components. It uses a self-signed certificate by default. It can optionally use certificates signed by the cert manager in the environment that has the cert manager installed. The option is `gateway.useCertManager: true` + **Keycloak** - The initial user ID and password for both keycload console(adminUser/adminPassword) and Ray dashboard(keycloakUserID/keycloakPassword) can be changed in the values.yaml file. It is good to change them before apply the helm. diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/certmanager-issuers.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/certmanager-issuers.yaml new file mode 100644 index 000000000..b2bf8cc07 --- /dev/null +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/certmanager-issuers.yaml @@ -0,0 +1,32 @@ +{{- if .Values.useCertManager }} +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: ray-selfsigned-ca +spec: + isCA: true + commonName: ray-selfsigned-ca + secretName: ray-root-secret + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: selfsigned-issuer + kind: ClusterIssuer + group: cert-manager.io +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: ray-ca-issuer +spec: + ca: + secretName: ray-root-secret +{{- end }} \ No newline at end of file diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/deployment.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/deployment.yaml index e222fdec9..633b3be4c 100644 --- 
a/infrastructure/helm/quantumserverless/charts/gateway/templates/deployment.yaml +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/deployment.yaml @@ -159,9 +159,8 @@ spec: volumeMounts: - mountPath: "/usr/src/app/media/" name: gateway-pv-storage - - mountPath: "/tmp/templates/rayclustertemplate.yaml" + - mountPath: "/tmp/templates/" name: ray-cluster-template - subPath: rayclustertemplate.yaml env: - name: DEBUG value: {{ .Values.application.debug | quote }} diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/raycertgen.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/raycertgen.yaml index 6d8294637..e9cfb4385 100644 --- a/infrastructure/helm/quantumserverless/charts/gateway/templates/raycertgen.yaml +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/raycertgen.yaml @@ -182,3 +182,54 @@ data: -CAcreateserial -out /etc/ray/tls/tls.crt \ -days 365 \ -sha256 -extfile /etc/ray/tls/cert.conf +{{- if .Values.useCertManager }} + gencert_cert_head.sh: | + #!/bin/sh + BASE_DIR=$1 + CLUSTER_NAME=$2 + IP_ADDRESS=$3 + NAMESPACE=$4 + + kubectl apply -f - < $BASE_DIR/ca.crt + kubectl get secret $CLUSTER_NAME -o=jsonpath='{.data.tls\.crt}' | base64 -d > $BASE_DIR/tls.crt + kubectl get secret $CLUSTER_NAME -o=jsonpath='{.data.tls\.key}' | base64 -d > $BASE_DIR/tls.key +{{- end }} \ No newline at end of file diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustersa.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustersa.yaml new file mode 100644 index 000000000..839a04455 --- /dev/null +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustersa.yaml @@ -0,0 +1,51 @@ +{{- if .Values.useCertManager }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ray-cluster-sa +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ray-cluster-role +rules: +- apiGroups: + - ray.io + resources: + - 
rayclusters + verbs: + - create + - delete + - get + - list +- apiGroups: + - cert-manager.io + resources: + - certificates + verbs: + - create + - delete + - get + - list +- apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ray-cluster-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ray-cluster-role +subjects: +- kind: ServiceAccount + name: ray-cluster-sa +{{- end }} \ No newline at end of file diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustertemplate.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustertemplate.yaml index 533c865f6..062ce19f1 100644 --- a/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustertemplate.yaml +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/rayclustertemplate.yaml @@ -1,4 +1,7 @@ apiVersion: v1 +kind: ConfigMap +metadata: + name: rayclustertemplate data: rayclustertemplate.yaml: | apiVersion: ray.io/v1alpha1 @@ -15,6 +18,21 @@ data: spec: initContainers: # Generate head's private key and certificate before `ray start`. 
+{{- if .Values.useCertManager }} + - name: ray-head-cert + image: rayproject/ray:2.4.0 + command: ["/bin/sh", "-c", "/etc/gen/tls/gencert_cert_head.sh /tmp/tls {{`{{ cluster_name }}`}} $POD_IP {{ .Release.Namespace }}"] + volumeMounts: + - mountPath: /tmp/tls + name: cert-tls + - mountPath: /etc/gen/tls + name: gen-tls-script + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP +{{- else }} - name: ray-head-tls image: rayproject/ray:2.4.0 command: ["/bin/sh", "-c", "cp -R /etc/ca/tls /etc/ray && /etc/gen/tls/gencert_head.sh"] @@ -31,6 +49,7 @@ data: valueFrom: fieldRef: fieldPath: status.podIP +{{- end }} affinity: containers: - image: {{ .Values.application.ray.nodeImage | quote }} @@ -62,6 +81,10 @@ data: readOnly: true - mountPath: /etc/ray/tls name: ray-tls +{{- if .Values.useCertManager }} + - mountPath: /tmp/tls + name: cert-tls +{{- end }} - mountPath: /data name: user-storage subPath: {{`{{ user_id }}`}} @@ -70,12 +93,21 @@ data: # See https://docs.ray.io/en/latest/ray-core/configure.html#tls-authentication for more details. - name: RAY_USE_TLS value: "1" +{{- if .Values.useCertManager }} + - name: RAY_TLS_SERVER_CERT + value: "/tmp/tls/tls.crt" + - name: RAY_TLS_SERVER_KEY + value: "/tmp/tls/tls.key" + - name: RAY_TLS_CA_CERT + value: "/tmp/tls/ca.crt" +{{- else }} - name: RAY_TLS_SERVER_CERT value: "/etc/ray/tls/tls.crt" - name: RAY_TLS_SERVER_KEY value: "/etc/ray/tls/tls.key" - name: RAY_TLS_CA_CERT - value: "/etc/ca/tls/ca.crt" + value: "/etc/ca/tls/ca.crt" +{{- end }} - image: fluent/fluent-bit:1.9.10 name: ray-head-logs resources: @@ -92,6 +124,10 @@ data: name: fluentbit-config subPath: fluent-bit.conf imagePullSecrets: [] +{{- if .Values.useCertManager }} + serviceAccountName: ray-cluster-sa + serviceAccount: ray-cluster-sa +{{- end }} nodeSelector: tolerations: [] volumes: @@ -107,6 +143,10 @@ data: - name: ray-tls emptyDir: {} # `gencert_head.sh` is a script to generate head Pod's private key and head's certificate. 
+{{- if .Values.useCertManager }} + - name: cert-tls + emptyDir: {} +{{- end }} - name: gen-tls-script configMap: name: tls @@ -114,6 +154,10 @@ data: items: - key: gencert_head.sh path: gencert_head.sh +{{- if .Values.useCertManager }} + - key: gencert_cert_head.sh + path: gencert_cert_head.sh +{{- end }} - name: user-storage persistentVolumeClaim: claimName: gateway-claim @@ -128,6 +172,22 @@ data: spec: initContainers: # Generate worker's private key and certificate before `ray start`. +{{- if .Values.useCertManager }} + initContainers: + - name: ray-worker-cert + image: rayproject/ray:2.4.0 + command: ["/bin/sh", "-c", "/etc/gen/tls/gencert_cert_head.sh /tmp/tls {{`{{ cluster_name }}`}}-worker $POD_IP {{ .Release.Namespace }}"] + volumeMounts: + - mountPath: /tmp/tls + name: cert-tls + - mountPath: /etc/gen/tls + name: gen-tls-script + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP +{{- else }} - name: ray-worker-tls image: rayproject/ray:2.4.0 command: ["/bin/sh", "-c", "cp -R /etc/ca/tls /etc/ray && /etc/gen/tls/gencert_worker.sh"] @@ -144,6 +204,7 @@ data: valueFrom: fieldRef: fieldPath: status.podIP +{{- end }} affinity: containers: - resources: @@ -162,17 +223,30 @@ data: readOnly: true - mountPath: /etc/ray/tls name: ray-tls +{{- if .Values.useCertManager }} + - mountPath: /tmp/tls + name: cert-tls +{{- end }} env: # Environment variables for Ray TLS authentication. # See https://docs.ray.io/en/latest/ray-core/configure.html#tls-authentication for more details. 
- name: RAY_USE_TLS value: "1" +{{- if .Values.useCertManager }} + - name: RAY_TLS_SERVER_CERT + value: "/tmp/tls/tls.crt" + - name: RAY_TLS_SERVER_KEY + value: "/tmp/tls/tls.key" + - name: RAY_TLS_CA_CERT + value: "/tmp/tls/ca.crt" +{{- else }} - name: RAY_TLS_SERVER_CERT value: "/etc/ray/tls/tls.crt" - name: RAY_TLS_SERVER_KEY value: "/etc/ray/tls/tls.key" - name: RAY_TLS_CA_CERT value: "/etc/ca/tls/ca.crt" +{{- end }} image: {{ .Values.application.ray.nodeImage | quote}} imagePullPolicy: IfNotPresent name: ray-worker @@ -191,6 +265,10 @@ data: name: user-storage subPath: {{`{{ user_id }}`}} imagePullSecrets: [] +{{- if .Values.useCertManager }} + serviceAccountName: ray-cluster-sa + serviceAccount: ray-cluster-sa +{{- end }} nodeSelector: tolerations: [] volumes: @@ -201,6 +279,10 @@ data: secretName: ca-tls - name: ray-tls emptyDir: {} +{{- if .Values.useCertManager }} + - name: cert-tls + emptyDir: {} +{{- end }} # `gencert_worker.sh` is a script to generate worker Pod's private key and worker's certificate. 
- name: gen-tls-script configMap: @@ -210,9 +292,10 @@ data: items: - key: gencert_worker.sh path: gencert_worker.sh +{{- if .Values.useCertManager }} + - key: gencert_cert_head.sh + path: gencert_cert_head.sh +{{- end }} - name: user-storage persistentVolumeClaim: claimName: gateway-claim -kind: ConfigMap -metadata: - name: rayclustertemplate diff --git a/infrastructure/helm/quantumserverless/charts/gateway/templates/role.yaml b/infrastructure/helm/quantumserverless/charts/gateway/templates/role.yaml index 832c6c832..30c279dc8 100644 --- a/infrastructure/helm/quantumserverless/charts/gateway/templates/role.yaml +++ b/infrastructure/helm/quantumserverless/charts/gateway/templates/role.yaml @@ -12,5 +12,25 @@ rules: - delete - get - list +{{- if .Values.useCertManager }} +- apiGroups: + - cert-manager.io + resources: + - certificates + verbs: + - create + - delete + - get + - list +- apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list +{{- end }} diff --git a/infrastructure/helm/quantumserverless/charts/gateway/values.yaml b/infrastructure/helm/quantumserverless/charts/gateway/values.yaml index 83bdb51ef..c5226de38 100644 --- a/infrastructure/helm/quantumserverless/charts/gateway/values.yaml +++ b/infrastructure/helm/quantumserverless/charts/gateway/values.yaml @@ -3,6 +3,7 @@ # Declare variables to be passed into your templates. replicaCount: 1 +useCertManager: false application: # command: [ "gunicorn", "gateway.wsgi:application", "--bind", "0.0.0.0:8000", "--workers=3" ] diff --git a/infrastructure/helm/quantumserverless/values.yaml b/infrastructure/helm/quantumserverless/values.yaml index 4b9481ee2..e428127a9 100644 --- a/infrastructure/helm/quantumserverless/values.yaml +++ b/infrastructure/helm/quantumserverless/values.yaml @@ -35,6 +35,7 @@ gatewayEnable: true gateway: nameOverride: "gateway" fullnameOverride: "gateway" + useCertManager: false image: repository: "icr.io/quantum-public/quantum-serverless-gateway"