NexusGPU
diff --git a/‎.github/workflows/release.yml
Lines changed: 51 additions & 6 deletions b/‎.github/workflows/release.yml
Lines changed: 51 additions & 6 deletions
diff --git a/‎.gitignore
Lines changed: 7 additions & 1 deletion b/‎.gitignore
Lines changed: 7 additions & 1 deletion
diff --git a/‎.vscode/launch.json
Lines changed: 42 additions & 0 deletions b/‎.vscode/launch.json
Lines changed: 42 additions & 0 deletions
diff --git a/‎.vscode/settings.json
Lines changed: 72 additions & 0 deletions b/‎.vscode/settings.json
Lines changed: 72 additions & 0 deletions
diff --git a/‎Makefile
Lines changed: 4 additions & 3 deletions b/‎Makefile
Lines changed: 4 additions & 3 deletions
diff --git a/‎api/v1/gpu_types.go
Lines changed: 32 additions & 3 deletions b/‎api/v1/gpu_types.go
Lines changed: 32 additions & 3 deletions
@@ -9,19 +9,20 @@ on:
 jobs:
   release:
     permissions:
-      #  to create release tags (cycjimmy/semantic-release-action)
+      # to create release tags (cycjimmy/semantic-release-action)
       contents: write
       issues: write
       pull-requests: write
-      
     runs-on: ubuntu-latest
     outputs:
       published: ${{ steps.semantic.outputs.new_release_published }}
       version: ${{ steps.semantic.outputs.new_release_version }}
     steps:
       - uses: actions/checkout@v3
+        if: github.event_name == 'push'
 
       - name: Semantic Release
+        if: github.event_name == 'push'
         id: semantic
         uses: cycjimmy/semantic-release-action@v4
         with:
@@ -31,32 +32,76 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  publish_image:
+  # Builds and pushes the operator image. Runs after a published semantic
+  # release, or on a manual workflow_dispatch run, in which case the image is
+  # tagged dev-<short commit sha> instead of the release version.
+  publish_operator_image:
     needs:
       - release
-    if: needs.release.outputs.published == 'true'
+    if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     outputs:
       image_digest: ${{ steps.build.outputs.digest }}
     steps:
       - uses: actions/checkout@v3
+
+      - name: Set Tag
+        if: github.event_name == 'workflow_dispatch'
+        id: set_tag
+        run: echo "tag=dev-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+
       - id: meta
         uses: docker/metadata-action@v4
         with:
           images: tensorfusion/tensor-fusion-operator
-          tags: type=semver,pattern={{version}},value=${{needs.release.outputs.version}}
+          # An expression cannot be nested inside a string literal of another
+          # expression (it would be emitted verbatim), so the semver rule is
+          # assembled with format(); the literal {{version}} pattern is passed
+          # as an argument so its braces are not treated as placeholders.
+          tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || format('type=semver,pattern={0},value={1}', '{{version}}', needs.release.outputs.version) }}
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build and push operator
+        # The id is required by the job-level image_digest output above.
+        id: build
+        uses: docker/build-push-action@v3
+        with:
+          context: .
+          push: true
+          file: dockerfile/operator.Dockerfile
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          no-cache: true
+
+  # Builds and pushes the node-discovery image. Runs after a published semantic
+  # release, or on a manual workflow_dispatch run, in which case the image is
+  # tagged dev-<short commit sha> instead of the release version.
+  publish_node_discovery_image:
+    needs:
+      - release
+    if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    outputs:
+      image_digest: ${{ steps.build.outputs.digest }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set Tag
+        if: github.event_name == 'workflow_dispatch'
+        id: set_tag
+        run: echo "tag=dev-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+
+      - id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: tensorfusion/tensor-fusion-node-discovery
+          # An expression cannot be nested inside a string literal of another
+          # expression (it would be emitted verbatim), so the semver rule is
+          # assembled with format(); the literal {{version}} pattern is passed
+          # as an argument so its braces are not treated as placeholders.
+          tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || format('type=semver,pattern={0},value={1}', '{{version}}', needs.release.outputs.version) }}
 
       - name: Login to DockerHub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
 
-      - name: Build and push
+      - name: Build and push node discovery
+        # The id is required by the job-level image_digest output above.
+        id: build
         uses: docker/build-push-action@v3
         with:
           context: .
           push: true
+          # NOTE(review): "disvoery" looks like a typo of "discovery"; confirm
+          # the Dockerfile's actual name on disk before renaming either side.
+          file: dockerfile/node-disvoery.Dockerfile
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           no-cache: true
@@ -21,9 +21,15 @@ go.work
 
 # editor and IDE paraphernalia
 .idea
-.vscode
 *.swp
 *.swo
 *~
 
 .DS_Store
+
+cmd/__debug*
+cmd/*/__debug*
+
+prompts/*
+
+tmp*
@@ -0,0 +1,42 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            // Launches the operator with no KUBECONFIG override and webhooks disabled.
+            "name": "Debug Main Operator",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "env": {
+                "ENABLE_WEBHOOKS": "false"
+            },
+            "program": "${workspaceFolder}/cmd/operator/main.go"
+        },
+        {
+            // Launches the operator against the dev kubeconfig.
+            // ${userHome} is used instead of "~" because "~" is only expanded
+            // by some shells and is otherwise passed through literally.
+            "name": "Debug Dev Env Operator",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "console": "integratedTerminal",
+            "env": {
+                "KUBECONFIG": "${userHome}/.kube/config-tf-dev",
+                "ENABLE_WEBHOOKS": "false"
+            },
+            "program": "${workspaceFolder}/cmd/operator/main.go"
+        },
+        {
+            // Launches the operator against the demo kubeconfig.
+            // ${userHome} is used instead of "~" because "~" is only expanded
+            // by some shells and is otherwise passed through literally.
+            "name": "Debug Demo Env Operator",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "console": "integratedTerminal",
+            "env": {
+                "KUBECONFIG": "${userHome}/.kube/dev_us-east-1_demo",
+                "ENABLE_WEBHOOKS": "false"
+            },
+            "program": "${workspaceFolder}/cmd/operator/main.go"
+        }
+    ]
+}
@@ -0,0 +1,72 @@
+{
+    "cSpell.words": [
+        "alicloud",
+        "Aliyun",
+        "AMDCDNA",
+        "AMDRDNA",
+        "apimachinery",
+        "AWSGPU",
+        "batchv",
+        "CDNA",
+        "certificaterequests",
+        "certmanager",
+        "clientgoscheme",
+        "cloudnative",
+        "cloudprovider",
+        "clusterissuers",
+        "controllerutil",
+        "corev",
+        "crds",
+        "CUDA",
+        "cycjimmy",
+        "dylib",
+        "essd",
+        "Eventf",
+        "finalizer",
+        "Finalizers",
+        "goconst",
+        "golint",
+        "gosec",
+        "gpunode",
+        "gpunodeclasses",
+        "gpunodes",
+        "gpupool",
+        "gpupools",
+        "greptimedb",
+        "healthz",
+        "karpenter",
+        "kubebuilder",
+        "KUBECONFIG",
+        "Kubelet",
+        "kustomization",
+        "metav",
+        "metricsserver",
+        "nindent",
+        "nolint",
+        "NVML",
+        "omitempty",
+        "onsi",
+        "printcolumn",
+        "prometheusagents",
+        "prometheuses",
+        "prometheusrules",
+        "RDNA",
+        "readyz",
+        "runpod",
+        "schedulingconfigtemplate",
+        "schedulingconfigtemplates",
+        "schedulingcorev",
+        "subresource",
+        "tensorfusion",
+        "tensorfusionaiv",
+        "tensorfusioncluster",
+        "tensorfusionclusters",
+        "Tera",
+        "tflops",
+        "Tmpl",
+        "Tolerations",
+        "utilruntime",
+        "webhookcorev",
+        "Xlarge"
+    ]
+}
@@ -45,7 +45,8 @@ help: ## Display this help.
 
 .PHONY: manifests
 manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
-	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
+	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases && \
+	cp -r config/crd/bases/* ./charts/tensor-fusion/crds/
 
 .PHONY: generate
 generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
@@ -92,11 +93,11 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
 
 .PHONY: build
 build: manifests generate fmt vet ## Build manager binary.
-	go build -o bin/manager cmd/main.go
+	go build -o bin/manager cmd/operator/main.go
 
 .PHONY: run
 run: manifests generate fmt vet ## Run a controller from your host.
-	go run ./cmd/main.go
+	go run ./cmd/operator/main.go
 
 # If you wish to build the manager image targeting other platforms you can use the --platform flag.
 # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
 
@@ -17,20 +17,49 @@ limitations under the License.
 package v1
 
 import (
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 // GPUStatus defines the observed state of GPU.
 type GPUStatus struct {
-	UUID         string            `json:"uuid"`
+	// Phase is the current phase of the GPU; the CRD default is Pending.
+	// +kubebuilder:default=Pending
+	Phase TensorFusionGPUPhase `json:"phase"`
+
+	// Capacity is the total resource amount of the GPU (shown as the
+	// "Total TFlops" / "Total VRAM" print columns).
+	Capacity  *Resource `json:"capacity"`
+	// Available is the resource amount still available on the GPU (shown as
+	// the "Available TFlops" / "Available VRAM" print columns).
+	Available *Resource `json:"available"`
+
+	// UUID is the device UUID of the GPU.
+	UUID string `json:"uuid"`
+
+	// The host match selector to schedule worker pods
 	NodeSelector map[string]string `json:"nodeSelector"`
-	Capacity     Resource          `json:"capacity"`
-	Available    Resource          `json:"available"`
+	// GPUModel is the model of the GPU.
+	GPUModel     string            `json:"gpuModel"`
+
+	// Message carries free-form status text.
+	// NOTE(review): presumably a human-readable explanation of Phase; confirm with the controller.
+	Message string `json:"message"`
 }
 
+// TensorFusionGPUPhase is the phase value stored in GPUStatus.Phase.
+//
+// The Enum marker must list every value a controller may write, otherwise the
+// API server rejects the status update. "Updating" is included to match
+// TensorFusionGPUPhaseUpdating (assumes constants.PhaseUpdating is "Updating";
+// confirm against the constants package). "Provisioning" is kept so existing
+// objects carrying that value remain valid.
+// +kubebuilder:validation:Enum=Pending;Provisioning;Updating;Running;Unknown;Destroying;Migrating
+type TensorFusionGPUPhase string
+
+// Phase values for a GPU, aliased from the shared constants package.
+const (
+	TensorFusionGPUPhasePending    TensorFusionGPUPhase = constants.PhasePending
+	TensorFusionGPUPhaseUpdating   TensorFusionGPUPhase = constants.PhaseUpdating
+	TensorFusionGPUPhaseRunning    TensorFusionGPUPhase = constants.PhaseRunning
+	TensorFusionGPUPhaseUnknown    TensorFusionGPUPhase = constants.PhaseUnknown
+	TensorFusionGPUPhaseDestroying TensorFusionGPUPhase = constants.PhaseDestroying
+	TensorFusionGPUPhaseMigrating  TensorFusionGPUPhase = constants.PhaseMigrating
+)
+
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:scope=Cluster
+// +kubebuilder:printcolumn:name="GPU Model",type="string",JSONPath=".spec.gpuModel"
+// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
+// +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.capacity.flops"
+// +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.capacity.vram"
+// +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.available.flops"
+// +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.available.vram"
+// +kubebuilder:printcolumn:name="Device UUID",type="string",JSONPath=".status.uuid"
 // GPU is the Schema for the gpus API.
 type GPU struct {
 	metav1.TypeMeta   `json:",inline"`