Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: fix webhook bug, create tfconn in pod controller #7

Merged
merged 2 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ jobs:
permissions:
# to create release tags (cycjimmy/semantic-release-action)
contents: write
issues: write
pull-requests: write

runs-on: ubuntu-latest
outputs:
Expand Down Expand Up @@ -43,6 +45,7 @@ jobs:
with:
images: tensorfusion/tensor-fusion-operator
tags: type=semver,pattern={{version}},value=${{needs.release.outputs.version}}

- name: Login to DockerHub
uses: docker/login-action@v2
with:
Expand Down
17 changes: 17 additions & 0 deletions .mirrord/mirrord.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"feature": {
"network": {
"incoming": "steal",
"outgoing": true
},
"fs": "read",
"env": true
},
"target": {
"namespace": "tensor-fusion",
"path": {
"deployment": "tensor-fusion-operator-controller-manager",
"container": "manager"
}
}
}
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ endif
# Be aware that the target commands are only tested with Docker which is
# scaffolded by default. However, you might want to replace it to use other
# tools. (i.e. podman)
CONTAINER_TOOL ?= docker
CONTAINER_TOOL ?= $(shell command -v docker >/dev/null 2>&1 && echo docker || echo nerdctl)

# Setting SHELL to bash allows bash commands to be executed by recipes.
# Options are set to exit when a recipe line exits non-zero or a piped command fails.
Expand Down
3 changes: 2 additions & 1 deletion PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ resources:
kind: GPU
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
version: v1
- core: true
- controller: true
core: true
group: core
kind: Pod
path: k8s.io/api/core/v1
Expand Down
18 changes: 17 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ func main() {
var secureMetrics bool
var enableHTTP2 bool
var tlsOpts []func(*tls.Config)
var configFile string
flag.StringVar(&configFile, "config", "/etc/tensor-fusion/config.yaml", "Config file of tensor-fusion-operator")
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
Expand Down Expand Up @@ -152,7 +154,13 @@ func main() {
}

ctx := context.Background()
config := config.NewDefaultConfig()
config, err := config.LoadConfig(configFile)
if os.IsNotExist(err) {
setupLog.Info("config file is not exists, use default config", "configFile", configFile)
} else if err != nil {
setupLog.Error(err, "unable to load config", "configFile", configFile, "err", err)
os.Exit(1)
}
scheduler := scheduler.NewNaiveScheduler()
if err = (&controller.TensorFusionConnectionReconciler{
Client: mgr.GetClient(),
Expand Down Expand Up @@ -183,6 +191,7 @@ func main() {
}
}


if err = (&controller.TensorFusionClusterReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Expand Down Expand Up @@ -211,6 +220,13 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "GPUNodeClass")
os.Exit(1)
}
if err = (&controller.PodReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Pod")
os.Exit(1)
}
// +kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
Expand Down
2 changes: 1 addition & 1 deletion config/default/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Adds namespace to all resources.
namespace: tensor-fusion-operator-system
namespace: tensor-fusion

# Value of this field is prepended to the
# names of all resources, e.g. a deployment named
Expand Down
1 change: 0 additions & 1 deletion config/default/manager_webhook_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: controller-manager
namespace: system
labels:
app.kubernetes.io/name: tensor-fusion-operator
app.kubernetes.io/managed-by: kustomize
Expand Down
6 changes: 6 additions & 0 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,9 @@ namespace: tensor-fusion

resources:
- manager.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: controller
newName: tensorfusion/tensor-fusion-operator
newTag: latest
46 changes: 0 additions & 46 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,7 @@
apiVersion: v1
kind: Namespace
metadata:
labels:
control-plane: controller-manager
app.kubernetes.io/name: tensor-fusion-operator
app.kubernetes.io/managed-by: kustomize
name: system
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: controller-manager
namespace: system
labels:
control-plane: controller-manager
app.kubernetes.io/name: tensor-fusion-operator
Expand All @@ -28,35 +18,6 @@ spec:
labels:
control-plane: controller-manager
spec:
# TODO(user): Uncomment the following code to configure the nodeAffinity expression
# according to the platforms which are supported by your solution.
# It is considered best practice to support multiple architectures. You can
# build your manager image using the makefile target docker-buildx.
# affinity:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/arch
# operator: In
# values:
# - amd64
# - arm64
# - ppc64le
# - s390x
# - key: kubernetes.io/os
# operator: In
# values:
# - linux
securityContext:
runAsNonRoot: true
# TODO(user): For common cases that do not require escalating privileges
# it is recommended to ensure that all your Pods/Containers are restrictive.
# More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
# Please uncomment the following code if your project does NOT have to work on old Kubernetes
# versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
# seccompProfile:
# type: RuntimeDefault
containers:
- command:
- /manager
Expand All @@ -65,11 +26,6 @@ spec:
- --health-probe-bind-address=:8081
image: controller:latest
name: manager
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- "ALL"
livenessProbe:
httpGet:
path: /healthz
Expand All @@ -82,8 +38,6 @@ spec:
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
# TODO(user): Configure the resources accordingly based on the project requirements.
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
resources:
limits:
cpu: 500m
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ kind: ClusterRole
metadata:
name: manager-role
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- tensor-fusion.ai
resources:
Expand Down
8 changes: 7 additions & 1 deletion config/webhook/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ webhooks:
service:
name: webhook-service
namespace: system
path: /mutate--v1-pod
path: /mutate-v1-pod
failurePolicy: Fail
name: mpod-v1.kb.io
rules:
Expand All @@ -24,3 +24,9 @@ webhooks:
resources:
- pods
sideEffects: None
objectSelector:
matchExpressions:
- key: tensor-fusion.ai/enabled
operator: In
values:
- "true"
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ require (
github.com/onsi/ginkgo/v2 v2.19.0
github.com/onsi/gomega v1.33.1
gomodules.xyz/jsonpatch/v2 v2.4.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.31.0
k8s.io/apimachinery v0.31.0
k8s.io/client-go v0.31.0
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/controller-runtime v0.19.1
)

Expand Down Expand Up @@ -104,14 +106,12 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.31.0 // indirect
k8s.io/apiserver v0.31.0 // indirect
k8s.io/component-base v0.31.0 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
Expand Down
50 changes: 47 additions & 3 deletions internal/config/config.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,48 @@
package config

import (
"os"

"gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
"k8s.io/utils/ptr"
)

// Pod is a reduced pod shape that carries only a Spec field.
type Pod struct {
// Spec holds the subset of pod-spec fields declared by PodSpec.
Spec PodSpec `json:"spec,omitempty"`
}

// PodSpec declares three pod-spec fields: the init containers, the regular
// containers, and an optional runtime class name. Each field carries json and
// protobuf struct tags; the two container lists additionally carry
// patchStrategy/patchMergeKey tags keyed on "name".
// NOTE(review): the tags look copied from corev1.PodSpec — confirm they are
// meant to stay in sync with it.
type PodSpec struct {
// InitContainers is the list of init containers.
InitContainers []corev1.Container `json:"initContainers,omitempty" patchStrategy:"merge" patchMergeKey:"name" protobuf:"bytes,20,rep,name=initContainers"`
// Containers is the list of regular containers.
Containers []corev1.Container `json:"containers,omitempty" patchStrategy:"merge" patchMergeKey:"name" protobuf:"bytes,2,rep,name=containers"`
// RuntimeClassName is optional; a nil pointer means it is not set.
RuntimeClassName *string `json:"runtimeClassName,omitempty" protobuf:"bytes,29,opt,name=runtimeClassName"`
}

// Config is the configuration value produced by NewDefaultConfig and
// LoadConfig in this package.
type Config struct {
// WorkerTemplate is a pod template.
// NOTE(review): presumably the template for worker pods — confirm against
// the code that consumes it.
WorkerTemplate corev1.PodTemplate `json:"workerTemplate"`
// PodMutator groups the patch body and environment variables declared by
// the PodMutator type.
PodMutator PodMutator `json:"podMutator"`
}

type PodMutator struct {
PatchStrategicMerge corev1.Pod `json:"patchStrategicMerge"`
PatchStrategicMerge Pod `json:"patchStrategicMerge"`
PatchEnvVars []corev1.EnvVar `json:"envVars"`
}

func NewDefaultConfig() Config {
return Config{
// LoadConfig reads the YAML file at filename and decodes it on top of the
// values produced by NewDefaultConfig.
//
// If the file cannot be read, the untouched defaults are returned together
// with the raw read error, so the caller can inspect the error (for example
// with os.IsNotExist) and keep running on the defaults. If the file is read
// but cannot be decoded, nil is returned with the decode error.
//
// NOTE(review): gopkg.in/yaml.v2 matches keys via `yaml` struct tags, while
// Config declares only `json` tags — confirm that keys such as
// "workerTemplate" actually decode as intended.
func LoadConfig(filename string) (*Config, error) {
	conf := NewDefaultConfig()

	raw, readErr := os.ReadFile(filename)
	if readErr != nil {
		// Hand back the defaults alongside the error; the error is left
		// unwrapped on purpose.
		return conf, readErr
	}

	if decodeErr := yaml.Unmarshal(raw, conf); decodeErr != nil {
		return nil, decodeErr
	}
	return conf, nil
}

func NewDefaultConfig() *Config {
return &Config{
WorkerTemplate: corev1.PodTemplate{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Expand All @@ -31,5 +57,23 @@ func NewDefaultConfig() Config {
},
},
},
PodMutator: PodMutator{
PatchStrategicMerge: Pod{
Spec: PodSpec{
InitContainers: []corev1.Container{
{
Name: "inject-lib",
Image: "busybox:stable-glibc",
},
},
},
},
PatchEnvVars: []corev1.EnvVar{
{
Name: "LD_PRELOAD",
Value: "tensorfusion.so",
},
},
},
}
}
Loading
Loading