# The resource requests and limits in this config are too small for production!
# For examples with more realistic resource configuration, see
# ray-cluster.complete.large.yaml and
# ray-cluster.autoscaler.large.yaml.
#
# This manifest defines two resources:
#   1. A RayCluster (one head pod plus one worker group) in which every
#      container, including the init container, runs as non-root UID 1000,
#      forbids privilege escalation, drops all Linux capabilities and uses the
#      runtime's default seccomp profile.
#   2. A ConfigMap holding an XGBoost training script, which the head pod
#      mounts at /home/ray/samples.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-pod-security
spec:
  rayVersion: '2.0.0'
  ######################headGroupSpec#################################
  # head group template and specs, (perhaps 'group' is not needed in the name)
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and
    # 'LoadBalancer'
    serviceType: ClusterIP
    # for the head group, replicas should always be 1.
    # headGroupSpec.replicas is deprecated in KubeRay >= 0.3.0.
    replicas: 1
    # the following params are used to complete the ray start:
    # ray start --head --block --dashboard-host: '0.0.0.0' ...
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    # pod template
    template:
      metadata:
        labels:
          # custom labels. NOTE: do not define custom labels starting with
          # `raycluster.`; they may be used by the controller.
          # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
          # Kept equal to metadata.name of this RayCluster.
          rayCluster: raycluster-pod-security  # will be injected if missing
          groupName: headgroup  # will be injected if missing
        # annotations for pod
        annotations:
          key: value
      spec:
        containers:
          - name: ray-head
            image: mtr.devops.telekom.de/juliusvonkohout/ray-ml:2.0.0
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
              # Exposes xgboost_example.py from the ConfigMap defined at the
              # end of this file.
              - mountPath: /home/ray/samples
                name: xgboost-example-configmap
            resources:
              limits:
                cpu: 1
                memory: 2Gi
              requests:
                cpu: 1
                memory: 2Gi
            securityContext:
              allowPrivilegeEscalation: false
              capabilities:
                drop: ["ALL"]
              runAsNonRoot: true
              runAsUser: 1000
              # The "restricted" Pod Security Standard requires an explicit
              # seccomp profile (RuntimeDefault or Localhost).
              seccompProfile:
                type: RuntimeDefault
        volumes:
          - name: ray-logs
            emptyDir: {}
          - name: xgboost-example-configmap
            configMap:
              name: xgboost-example
              # An array of keys from the ConfigMap to create as files
              items:
                - key: xgboost_example.py
                  path: xgboost_example.py
  workerGroupSpecs:
    # the pod replicas in this group typed worker
    - replicas: 1
      minReplicas: 1
      maxReplicas: 10
      # logical group name, for this called large-group, also can be functional
      groupName: large-group
      # if worker pods need to be added, we can simply increment the replicas
      # if worker pods need to be removed, we decrement the replicas, and
      # populate the workersToDelete list
      # the operator will remove pods from the list until the number of
      # replicas is satisfied
      # when a pod is confirmed to be deleted, its name will be removed from
      # the list below
      # scaleStrategy:
      #   workersToDelete:
      #     - raycluster-complete-worker-large-group-bdtwh
      #     - raycluster-complete-worker-large-group-hv457
      #     - raycluster-complete-worker-large-group-k8tj7
      # the following params are used to complete the ray start:
      # ray start --block
      rayStartParams:
        block: 'true'
      # pod template
      template:
        metadata:
          labels:
            # Kept equal to metadata.name of this RayCluster.
            rayCluster: raycluster-pod-security  # will be injected if missing
            # Kept equal to this worker group's groupName above.
            groupName: large-group  # will be injected if missing
        spec:
          containers:
            # The name must consist of lower case alphanumeric characters or
            # '-', and must start and end with an alphanumeric character
            # (e.g. 'my-name', or '123-abc').
            - name: machine-learning
              image: mtr.devops.telekom.de/juliusvonkohout/ray-ml:2.0.0
              # Environment variables to set in the container are optional
              # (none are defined here).
              # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              # Using volumeMounts is optional.
              # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: 4
                  memory: 2Gi
                requests:
                  cpu: 1
                  memory: 2Gi
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                runAsNonRoot: true
                runAsUser: 1000
                # The "restricted" Pod Security Standard requires an explicit
                # seccomp profile (RuntimeDefault or Localhost).
                seccompProfile:
                  type: RuntimeDefault
          initContainers:
            # the env var $RAY_IP is set by the operator if missing, with the
            # value of the head service name
            - name: init-myservice
              image: busybox:1.28
              # Blocks worker start-up until the head service name resolves in
              # DNS. Change the cluster postfix (.svc.cluster.local) if you
              # don't have a default setting.
              # The folded scalar below joins its lines with single spaces, so
              # the shell receives one command line.
              command:
                - sh
                - '-c'
                - >-
                  until nslookup
                  $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
                  do echo waiting for myservice; sleep 2; done
              securityContext:
                runAsUser: 1000
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                runAsNonRoot: true
                # The "restricted" Pod Security Standard requires an explicit
                # seccomp profile (RuntimeDefault or Localhost).
                seccompProfile:
                  type: RuntimeDefault
          # use volumes
          # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
          volumes:
            - name: ray-logs
              emptyDir: {}
  ######################status#################################
---
# Training script referenced by the head pod's xgboost-example-configmap
# volume; the key below becomes the file name inside the mount.
apiVersion: v1
kind: ConfigMap
metadata:
  name: xgboost-example
data:
  xgboost_example.py: |
    import ray
    from ray.train.xgboost import XGBoostTrainer
    from ray.air.config import ScalingConfig

    # Load data.
    dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

    # Split data into train and validation.
    train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(
            # Number of workers to use for data parallelism.
            num_workers=1,
            # Whether to use GPU acceleration.
            use_gpu=False,
        ),
        label_column="target",
        num_boost_round=20,
        params={
            # XGBoost specific params
            "objective": "binary:logistic",
            # "tree_method": "gpu_hist", # uncomment this to use GPU for training
            "eval_metric": ["logloss", "error"],
        },
        datasets={"train": train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    print(result.metrics)