# The resource requests and limits in this config are too small for production!
# For examples with more realistic resource configuration, see
# ray-cluster.complete.large.yaml and
# ray-cluster.autoscaler.large.yaml.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
    # A unique identifier for the head node and workers of this cluster.
  name: raycluster-pod-security
spec:
  rayVersion: '2.0.0'
  ######################headGroupSpec#################################
  # head group template and specs, (perhaps 'group' is not needed in the name)
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
    serviceType: ClusterIP
    # for the head group, replicas should always be 1.
    # headGroupSpec.replicas is deprecated in KubeRay >= 0.3.0.
    replicas: 1
    # the following params are used to complete the ray start: ray start --head --block --dashboard-host: '0.0.0.0' ...
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    #pod template
    template:
      metadata:
        labels:
          # custom labels. NOTE: do not define custom labels start with `raycluster.`, they may be used in controller.
          # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
          rayCluster: raycluster-sample # will be injected if missing
          groupName: headgroup # will be injected if missing
        # annotations for pod
        annotations:
          key: value
      spec:
        containers:
        - name: ray-head
          image: mtr.devops.telekom.de/juliusvonkohout/ray-ml:2.0.0
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          volumeMounts:
            - mountPath: /tmp/ray
              name: ray-logs
            - mountPath: /home/ray/samples
              name: xgboost-example-configmap
          resources:
            limits:
              cpu: 1
              memory: 2Gi
            requests:
              cpu: 1
              memory: 2Gi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            runAsNonRoot: true
            runAsUser: 1000
        volumes:
          - name: ray-logs
            emptyDir: {}
          - name: xgboost-example-configmap
            configMap:
              name: xgboost-example
              # An array of keys from the ConfigMap to create as files
              items:
                - key: xgboost_example.py
                  path: xgboost_example.py
  workerGroupSpecs:
  # the pod replicas in this group typed worker
  - replicas: 1
    minReplicas: 1
    maxReplicas: 10
    # logical group name, for this called large-group, also can be functional
    groupName: large-group
    # if worker pods need to be added, we can simply increment the replicas
    # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
    # the operator will remove pods from the list until the number of replicas is satisfied
    # when a pod is confirmed to be deleted, its name will be removed from the list below
    #scaleStrategy:
    #  workersToDelete:
    #  - raycluster-complete-worker-large-group-bdtwh
    #  - raycluster-complete-worker-large-group-hv457
    #  - raycluster-complete-worker-large-group-k8tj7 
    # the following params are used to complete the ray start: ray start --block
    rayStartParams:
      block: 'true'
    #pod template
    template:
      metadata:
        labels:
          rayCluster: raycluster-pod-security # will be injected if missing
          groupName: small-group # will be injected if missing
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name',  or '123-abc'
          image: mtr.devops.telekom.de/juliusvonkohout/ray-ml:2.0.0
          # environment variables to set in the container.Optional.
          # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          # use volumeMounts.Optional.
          # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
          volumeMounts:
            - mountPath: /tmp/ray
              name: ray-logs
          resources:
            limits:
              cpu: 4
              memory: 2Gi
            requests:
              cpu: 1
              memory: 2Gi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            runAsNonRoot: true
            runAsUser: 1000
        initContainers:
        # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
        - name: init-myservice
          image: busybox:1.28
          # Change the cluster postfix if you don't have a default setting
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
          securityContext:
            runAsUser: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            runAsNonRoot: true
        # use volumes
        # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
        volumes:
          - name: ray-logs
            emptyDir: {}
######################status#################################
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: xgboost-example
data:
  xgboost_example.py: |
    import ray
    from ray.train.xgboost import XGBoostTrainer
    from ray.air.config import ScalingConfig

    # Load data.
    dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

    # Split data into train and validation.
    train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(
            # Number of workers to use for data parallelism.
            num_workers=1,
            # Whether to use GPU acceleration.
            use_gpu=False,
        ),
        label_column="target",
        num_boost_round=20,
        params={
            # XGBoost specific params
            "objective": "binary:logistic",
            # "tree_method": "gpu_hist",  # uncomment this to use GPU for training
            "eval_metric": ["logloss", "error"],
        },
        datasets={"train": train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    print(result.metrics)