Skip to content

Commit 96c2df3

Browse files
Code2Life and 0x5457 authored
fix: pooling and cloud vendor provisioning mode bugs (#42)
* fix: reconcile bug
* fix: node discovery controller ref bug
* fix: patch gpu node status issue
* fix: gpu node report bug
* fix: remove reporter
* fix: Set the owner of gpu to gpunode
* fix: node selecting mode issues, remove in mem gpu pool state
* fix: node discovery issue
* chore: fix naming convention
* fix: gpu node status reconcile
* fix: provisioner mode bugs
* fix: lint issue

---------

Co-authored-by: 0x5457 <0x5457@protonmail.com>
1 parent 40f1dc4 commit 96c2df3

37 files changed

+472
-431
lines changed

.vscode/launch.json

+11
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,17 @@
1414
},
1515
"program": "${workspaceFolder}/cmd/operator/main.go",
1616
},
17+
{
18+
"name": "Debug Discovery",
19+
"type": "go",
20+
"request": "launch",
21+
"mode": "auto",
22+
"env": {
23+
"HOSTNAME": "mocknode",
24+
"KUBECONFIG": "~/.kube/config",
25+
},
26+
"program": "${workspaceFolder}/cmd/nodediscovery/main.go",
27+
},
1728
{
1829
"name": "Debug Dev Env Operator",
1930
"type": "go",

api/v1/gpunode_funcs.go

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package v1
2+
3+
import (
4+
"time"
5+
6+
"k8s.io/apimachinery/pkg/api/resource"
7+
)
8+
9+
// InitializeStatus overwrites node.Status with a freshly built GPUNodeStatus.
//
// The new status has Phase set to TensorFusionGPUNodePhasePending, takes
// TotalTFlops, TotalVRAM and TotalGPUs from initTFlops, initVRAM and initGPUs,
// and copies ObservedGeneration from node.Generation. AllocationDetails and
// LoadedModels are set to non-nil pointers to empty slices, and
// ManagedGPUDeviceIDs to an empty (non-nil) slice. Every status field not
// listed in the literal is reset to its zero value, because the whole struct
// is replaced rather than patched.
func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, initGPUs int32) {
10+
node.Status = GPUNodeStatus{
11+
Phase: TensorFusionGPUNodePhasePending,
12+
TotalTFlops: initTFlops,
13+
TotalVRAM: initVRAM,
14+
TotalGPUs: initGPUs,
15+
AllocationDetails: &[]GPUNodeAllocationDetails{},
16+
LoadedModels: &[]string{},
17+
ManagedGPUDeviceIDs: []string{},
18+
ObservedGeneration: node.Generation,
19+
}
20+
}
21+
22+
// SetAnnotationToTriggerNodeSync stamps the
// "tensor-fusion.ai/refresh-node-state" annotation on the node with the
// current time, allocating the Annotations map first when it is nil.
//
// Only the in-memory object is modified; persisting the change is left to the
// caller. Presumably a controller reacts to the changed annotation value to
// re-sync the node; the consumer is not visible in this file, so verify
// against the reconciler.
//
// NOTE(review): time.Now().String() yields the debugging format, which
// includes a monotonic-clock suffix ("m=+..."); the value is only suitable as
// a change marker, not as a timestamp to be parsed back.
func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
23+
if node.Annotations == nil {
24+
node.Annotations = make(map[string]string)
25+
}
26+
node.Annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().String()
27+
}

api/v1/gpunode_types.go

+11-4
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ type GPUNodeStatus struct {
5757
// +kubebuilder:default=Pending
5858
Phase TensorFusionGPUNodePhase `json:"phase"`
5959

60+
// +optional
6061
Conditions []metav1.Condition `json:"conditions,omitempty"`
6162

6263
TotalTFlops resource.Quantity `json:"totalTFlops"`
@@ -68,20 +69,26 @@ type GPUNodeStatus struct {
6869
AvailableTFlops resource.Quantity `json:"availableTFlops"`
6970
AvailableVRAM resource.Quantity `json:"availableVRAM"`
7071

72+
// +optional
7173
HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
7274

75+
// +optional
7376
NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`
7477

75-
LoadedModels []string `json:"loadedModels"`
78+
// +optional
79+
LoadedModels *[]string `json:"loadedModels,omitempty"`
7680

77-
TotalGPUs int32 `json:"totalGPUs"`
78-
ManagedGPUs int32 `json:"managedGPUs"`
81+
TotalGPUs int32 `json:"totalGPUs"`
82+
ManagedGPUs int32 `json:"managedGPUs"`
83+
84+
// +optional
7985
ManagedGPUDeviceIDs []string `json:"managedGPUDeviceIDs,omitempty"`
8086

8187
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
8288

8389
// Allocation details is for node compaction, and calculate used apps
84-
AllocationDetails []GPUNodeAllocationDetails `json:"allocationDetails"`
90+
// +optional
91+
AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
8592
}
8693

8794
type GPUNodeAllocationDetails struct {

api/v1/tensorfusioncluster_types.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ const (
126126
AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
127127
)
128128

129-
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod;mock
129+
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;mock
130130
type ComputingVendorName string
131131

132132
const (
@@ -139,7 +139,7 @@ const (
139139
ComputingVendorVultr ComputingVendorName = "vultr"
140140
ComputingVendorTogetherAI ComputingVendorName = "together-ai"
141141
ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
142-
ComputingVendorAliyun ComputingVendorName = "aliyun"
142+
ComputingVendorAlibaba ComputingVendorName = "alibaba"
143143
ComputingVendorNvidia ComputingVendorName = "nvidia"
144144
ComputingVendorTencent ComputingVendorName = "tencent"
145145
ComputingVendorRunPod ComputingVendorName = "runpod"

api/v1/zz_generated.deepcopy.go

+13-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.1.4
18+
version: 1.2.1
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml

-2
Original file line numberDiff line numberDiff line change
@@ -318,11 +318,9 @@ spec:
318318
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
319319
x-kubernetes-int-or-string: true
320320
required:
321-
- allocationDetails
322321
- availableTFlops
323322
- availableVRAM
324323
- kubernetesNodeName
325-
- loadedModels
326324
- managedGPUs
327325
- phase
328326
- totalGPUs

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ spec:
115115
- openshift
116116
- vultr
117117
- together-ai
118-
- aliyun
118+
- alibaba
119119
- nvidia
120120
- tencent
121121
- runpod

charts/tensor-fusion/templates/controller-deployment.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ spec:
5454
- name: cert
5555
readOnly: true
5656
mountPath: /tmp/k8s-webhook-server/serving-certs
57+
- name: cloud-vendor-credentials
58+
mountPath: /tmp/secret
59+
readOnly: true
5760
{{- if .Values.agent.agentId }}
5861
- name: cluster-agent
5962
image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -98,6 +101,10 @@ spec:
98101
configMap:
99102
name: {{ include "tensor-fusion.fullname" . }}-vector-config
100103
defaultMode: 420
104+
- name: cloud-vendor-credentials
105+
secret:
106+
secretName: tf-cloud-vendor-credentials
107+
defaultMode: 420
101108
- name: logs
102109
emptyDir: {}
103110
{{- with .Values.controller.affinity }}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{{- /*
Renders the tf-cloud-vendor-credentials Secret in the release namespace only
when `lookup` finds no Secret of that name there. The `ak` and `sk` data keys
are the base64-encoded .Values.cloudVendorCredentials.accessKey and
.secretKey. The helm.sh/resource-policy: keep annotation is set on the Secret.
NOTE(review): `lookup` returns an empty result when Helm cannot reach the
cluster (e.g. `helm template`), so the Secret is presumably always rendered in
that case; confirm this is acceptable.
*/ -}}
{{- if not (lookup "v1" "Secret" .Release.Namespace "tf-cloud-vendor-credentials") }}
2+
apiVersion: v1
3+
kind: Secret
4+
metadata:
5+
name: tf-cloud-vendor-credentials
6+
namespace: {{ .Release.Namespace }}
7+
annotations:
8+
helm.sh/resource-policy: keep
9+
data:
10+
ak: "{{ .Values.cloudVendorCredentials.accessKey | base64encode }}"
11+
sk: "{{ .Values.cloudVendorCredentials.secretKey | base64encode }}"
12+
{{- end }}

charts/tensor-fusion/values.yaml

+6-1
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,9 @@ agent:
7474
memory: 64Mi
7575
limits:
7676
cpu: 1000m
77-
memory: 512Mi
77+
memory: 512Mi
78+
79+
# Only needed if your pool runs in Provisioned mode and the cloud vendor supports neither IRSA nor any other serviceAccount-based, zero-credential auth approach
80+
cloudVendorCredentials:
81+
accessKey: "dummy"
82+
secretKey: "dummy"

0 commit comments

Comments
 (0)