Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CARRY: Add RHOAI manifests #3

Merged
merged 1 commit into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions manifests/rhoai/kubeflow-training-roles.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# This file has been copied from ../overlays/kubeflow.
# The original labels have been commented out for documentation purposes.
#
# ClusterRole granting read/write access to the kubeflow.org training job
# resources. The aggregate-to-edit / aggregate-to-admin labels are the
# Kubernetes role-aggregation labels for the default "edit" and "admin"
# ClusterRoles.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: training-edit
  labels:
    # rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true"
    # rbac.authorization.kubeflow.org/aggregate-to-kubeflow-training-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
rules:
  # Create/read/update/delete access to the job objects themselves.
  - apiGroups:
      - kubeflow.org
    resources:
      - mpijobs
      - tfjobs
      - pytorchjobs
      - mxjobs
      - xgboostjobs
      - paddlejobs
    verbs:
      - create
      - delete
      - get
      - list
      - patch
      - update
      - watch
  # Only `get` is granted on the status subresources.
  - apiGroups:
      - kubeflow.org
    resources:
      - mpijobs/status
      - tfjobs/status
      - pytorchjobs/status
      - mxjobs/status
      - xgboostjobs/status
      - paddlejobs/status
    verbs:
      - get
---
# ClusterRole granting read-only access to the kubeflow.org training job
# resources. The aggregate-to-view label is the Kubernetes role-aggregation
# label for the default "view" ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: training-view
  labels:
    # rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
rules:
  # Read-only access to the job objects.
  - apiGroups:
      - kubeflow.org
    resources:
      - mpijobs
      - tfjobs
      - pytorchjobs
      - mxjobs
      - xgboostjobs
      - paddlejobs
    verbs:
      - get
      - list
      - watch
  # Only `get` is granted on the status subresources.
  - apiGroups:
      - kubeflow.org
    resources:
      - mpijobs/status
      - tfjobs/status
      - pytorchjobs/status
      - mxjobs/status
      - xgboostjobs/status
      - paddlejobs/status
    verbs:
      - get
45 changes: 45 additions & 0 deletions manifests/rhoai/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# RHOAI configuration for Kubeflow Training Operator (KFTO)

# Adds namespace to all resources.
namespace: opendatahub

# Value of this field is prepended to the
# names of all resources, e.g. a deployment named
# "wordpress" becomes "alices-wordpress".
# NOTE(review): the scaffold text also asks for this prefix to match the text
# before '-' in the namespace field above; with namespace "opendatahub" it
# does not. Confirm that the mismatch is intended.
namePrefix: kubeflow-

# Generates the rhoai-config ConfigMap from the key=value pairs in params.env.
configMapGenerator:
  - name: rhoai-config
    envs:
      - params.env

# params.yaml lists the fields in which $(var) references are expanded.
configurations:
  - params.yaml

# $(image) resolves to the odh-training-operator-controller-image entry of
# the rhoai-config ConfigMap.
vars:
  - name: image
    objref:
      kind: ConfigMap
      name: rhoai-config
      apiVersion: v1
    fieldref:
      fieldpath: data.odh-training-operator-controller-image

# Labels to add to all resources and selectors.
commonLabels:
  app.kubernetes.io/name: training-operator
  app.kubernetes.io/component: controller

resources:
  - ../base
  - kubeflow-training-roles.yaml
  - monitor.yaml

patches:
  # Sets the controller image to $(image) and the zap log level argument.
  - path: manager_config_patch.yaml
  # Declares the container port named "metrics" (8080).
  - path: manager_metrics_patch.yaml
  # Deletes the training-operator Service created in base.
  - path: manager_delete_metrics_service_patch.yaml
12 changes: 12 additions & 0 deletions manifests/rhoai/manager_config_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Patch for the training-operator Deployment: sets the controller image and
# the zap log level argument.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: training-operator
spec:
  template:
    spec:
      containers:
        - name: training-operator
          # $(image) is the Kustomize var declared in kustomization.yaml.
          image: $(image)
          args:
            - "--zap-log-level=2"
6 changes: 6 additions & 0 deletions manifests/rhoai/manager_delete_metrics_service_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Delete the Service created in base.
# `$patch: delete` is the strategic-merge-patch directive that removes the
# matching object instead of merging fields into it.
$patch: delete
apiVersion: v1
kind: Service
metadata:
  name: training-operator
12 changes: 12 additions & 0 deletions manifests/rhoai/manager_metrics_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Patch for the training-operator Deployment: declares the metrics port on
# the controller container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: training-operator
spec:
  template:
    spec:
      containers:
        - name: training-operator
          ports:
            # The port name is what monitor.yaml refers to (`port: metrics`).
            - containerPort: 8080
              name: metrics
12 changes: 12 additions & 0 deletions manifests/rhoai/monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Prometheus Pod Monitor (Metrics)
# Scrapes the container port named "metrics" on pods carrying the labels
# below; the same two labels are set as commonLabels in kustomization.yaml.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: training-operator-metrics-monitor
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: training-operator
      app.kubernetes.io/component: controller
  podMetricsEndpoints:
    - port: metrics
1 change: 1 addition & 0 deletions manifests/rhoai/params.env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
odh-training-operator-controller-image=docker.io/kubeflow/training-operator:v1-855e096
3 changes: 3 additions & 0 deletions manifests/rhoai/params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Kustomize transformer configuration: expand $(var) references found in the
# image field of the containers of Deployment objects.
varReference:
  - path: spec/template/spec/containers[]/image
    kind: Deployment