NexusGPU
diff --git a/‎.vscode/launch.json
+11 b/‎.vscode/launch.json
+11
diff --git a/‎api/v1/gpunode_funcs.go
+27 b/‎api/v1/gpunode_funcs.go
+27
diff --git a/‎api/v1/gpunode_types.go
+11-4 b/‎api/v1/gpunode_types.go
+11-4
diff --git a/‎api/v1/tensorfusioncluster_types.go
+2-2 b/‎api/v1/tensorfusioncluster_types.go
+2-2
diff --git a/‎api/v1/zz_generated.deepcopy.go
+13-5 b/‎api/v1/zz_generated.deepcopy.go
+13-5
diff --git a/‎charts/tensor-fusion/Chart.yaml
+1-1 b/‎charts/tensor-fusion/Chart.yaml
+1-1
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
-2 b/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
-2
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml
+1-1 b/‎charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml
+1-1
diff --git a/‎charts/tensor-fusion/templates/controller-deployment.yaml
+7 b/‎charts/tensor-fusion/templates/controller-deployment.yaml
+7
diff --git a/‎charts/tensor-fusion/templates/vector.yaml ‎charts/tensor-fusion/templates/vector-config.yaml b/‎charts/tensor-fusion/templates/vector.yaml ‎charts/tensor-fusion/templates/vector-config.yaml
diff --git a/‎charts/tensor-fusion/templates/vendor-credentials.yaml
+12 b/‎charts/tensor-fusion/templates/vendor-credentials.yaml
+12
diff --git a/‎charts/tensor-fusion/values.yaml
+6-1 b/‎charts/tensor-fusion/values.yaml
+6-1
@@ -14,6 +14,17 @@
             },
             "program": "${workspaceFolder}/cmd/operator/main.go",
         },
+        {
+            "name": "Debug Discovery",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "env": {
+                "HOSTNAME": "mocknode",
+                // "~" is not expanded in env values (no shell is involved), so resolve the home directory via a VS Code variable.
+                "KUBECONFIG": "${userHome}/.kube/config",
+            },
+            "program": "${workspaceFolder}/cmd/nodediscovery/main.go",
+        },
         {
             "name": "Debug Dev Env Operator",
             "type": "go",
 
@@ -0,0 +1,27 @@
+package v1
+
+import (
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, initGPUs int32) {
+	node.Status = GPUNodeStatus{
+		Phase:               TensorFusionGPUNodePhasePending,
+		TotalTFlops:         initTFlops,
+		TotalVRAM:           initVRAM,
+		TotalGPUs:           initGPUs,
+		AllocationDetails:   &[]GPUNodeAllocationDetails{},
+		LoadedModels:        &[]string{},
+		ManagedGPUDeviceIDs: []string{},
+		ObservedGeneration:  node.Generation,
+	}
+}
+
+// SetAnnotationToTriggerNodeSync stamps the "tensor-fusion.ai/refresh-node-state"
+// annotation with the current time, creating the annotation map first when it
+// is nil. Only the in-memory object is modified; nothing is sent to the API
+// server here, so the caller is responsible for persisting the change.
+func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
+	if node.Annotations == nil {
+		node.Annotations = make(map[string]string)
+	}
+	// time.Time.String is documented as debug output and appends the
+	// process-local monotonic clock reading ("m=+..."); store a stable
+	// RFC 3339 timestamp instead.
+	node.Annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().Format(time.RFC3339Nano)
+}
@@ -57,6 +57,7 @@ type GPUNodeStatus struct {
 	// +kubebuilder:default=Pending
 	Phase TensorFusionGPUNodePhase `json:"phase"`
 
+	// +optional
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
 	TotalTFlops resource.Quantity `json:"totalTFlops"`
@@ -68,20 +69,26 @@ type GPUNodeStatus struct {
 	AvailableTFlops resource.Quantity `json:"availableTFlops"`
 	AvailableVRAM   resource.Quantity `json:"availableVRAM"`
 
+	// +optional
 	HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
 
+	// +optional
 	NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`
 
-	LoadedModels []string `json:"loadedModels"`
+	// +optional
+	LoadedModels *[]string `json:"loadedModels,omitempty"`
 
-	TotalGPUs           int32    `json:"totalGPUs"`
-	ManagedGPUs         int32    `json:"managedGPUs"`
+	TotalGPUs   int32 `json:"totalGPUs"`
+	ManagedGPUs int32 `json:"managedGPUs"`
+
+	// +optional
 	ManagedGPUDeviceIDs []string `json:"managedGPUDeviceIDs,omitempty"`
 
 	ObservedGeneration int64 `json:"observedGeneration,omitempty"`
 
 	// Allocation details is for node compaction, and calculate used apps
-	AllocationDetails []GPUNodeAllocationDetails `json:"allocationDetails"`
+	// +optional
+	AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
 }
 
 type GPUNodeAllocationDetails struct {
 
@@ -126,7 +126,7 @@ const (
 	AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
 )
 
-// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod;mock
+// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;mock
 type ComputingVendorName string
 
 const (
@@ -139,7 +139,7 @@ const (
 	ComputingVendorVultr      ComputingVendorName = "vultr"
 	ComputingVendorTogetherAI ComputingVendorName = "together-ai"
 	ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
-	ComputingVendorAliyun     ComputingVendorName = "aliyun"
+	ComputingVendorAlibaba    ComputingVendorName = "alibaba"
 	ComputingVendorNvidia     ComputingVendorName = "nvidia"
 	ComputingVendorTencent    ComputingVendorName = "tencent"
 	ComputingVendorRunPod     ComputingVendorName = "runpod"
 
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.1.4
+version: 1.2.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 
@@ -318,11 +318,9 @@ spec:
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
             required:
-            - allocationDetails
             - availableTFlops
             - availableVRAM
             - kubernetesNodeName
-            - loadedModels
             - managedGPUs
             - phase
             - totalGPUs
 
@@ -115,7 +115,7 @@ spec:
                     - openshift
                     - vultr
                     - together-ai
-                    - aliyun
+                    - alibaba
                     - nvidia
                     - tencent
                     - runpod
 
@@ -54,6 +54,9 @@ spec:
             - name: cert
               readOnly: true
               mountPath: /tmp/k8s-webhook-server/serving-certs
+            - name: cloud-vendor-credentials
+              mountPath: /tmp/secret
+              readOnly: true
         {{- if .Values.agent.agentId }}
         - name: cluster-agent
           image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -98,6 +101,10 @@ spec:
           configMap:
             name: {{ include "tensor-fusion.fullname" . }}-vector-config
             defaultMode: 420
+        - name: cloud-vendor-credentials
+          secret:
+            secretName: tf-cloud-vendor-credentials
+            defaultMode: 420
         - name: logs
           emptyDir: {}
       {{- with .Values.controller.affinity }}
 
@@ -0,0 +1,12 @@
+{{- /* Create the cloud vendor credentials Secret only when the namespace does not already contain one, so existing credentials are not overwritten. */}}
+{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace "tf-cloud-vendor-credentials" }}
+{{- if not $existingSecret }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: tf-cloud-vendor-credentials
+  namespace: {{ .Release.Namespace }}
+  annotations:
+    # Ask Helm to leave this Secret in place instead of deleting it.
+    helm.sh/resource-policy: keep
+data:
+  # Values come from .Values.cloudVendorCredentials, base64-encoded for the Secret's data field.
+  ak: {{ .Values.cloudVendorCredentials.accessKey | base64encode | quote }}
+  sk: {{ .Values.cloudVendorCredentials.secretKey | base64encode | quote }}
+{{- end }}
@@ -74,4 +74,9 @@ agent:
       memory: 64Mi
     limits:
       cpu: 1000m
-      memory: 512Mi
+      memory: 512Mi
+
+# Only needed when your pool runs in Provisioned mode and the cloud vendor does not support IRSA or a similar ServiceAccount-based, zero-credential auth approach
+cloudVendorCredentials:
+  accessKey: "dummy"
+  secretKey: "dummy"