Skip to content

Commit dde95b6

Browse files
Code2Life and 0x5457 authored
feat: cluster reconcile (#37)
* fix: update * fix: typo * chore: typo * fix: onboard resource reconcile * fix: unit test bug * fix: optional fields and enum fields * fix: lint * chore: add print columns * feat: add node discovery binary * fix: set controller ref issue, fix node selector schema * fix: add node state in mem, add some todos * feat: start the node discovery job periodically * fix: update CR status, add aggregated values * feat: fix multiple pool gpu node label issue, add simple compaction * wip update controller * feat: add CMD argument to Dockerfile and update release workflow * feat: node provisioner mode, basic compaction, basic aliyun/aws cloud vendor implementation * feat: add metadata steps for node discovery in release workflow * feat: query GpuNodes and then get corev1.node based on Gpunode * fix: type issue * fix: update schema for node class/node requirement * fix: rename GPUNodePoolIdentifierLabelKey to GPUNodePoolIdentifierLabelFormat * feat: get tflops from config file * fix: update release workflow to handle workflow_dispatch event * chore: update workflow * fix: update release workflow to set tag format for workflow_dispatch event * fix: bump chart version, update CRDs * feat: add test case for gpupool_controller * fix: node discovery bug * fix: strings.CutPrefix pool label * fix: helm crd * fix: bump version * feat: add ownership of batch jobs in gpupool controller * fix: update CMD in release workflow to nodediscovery * feat: refactor Dockerfiles for operator and nodediscovery, removing legacy Dockerfile * enable cgo for nodediscovery * remove libnvidia-ml.so * feat: update base image for nodediscovery and operator Dockerfiles to Ubuntu 22.04 * fix: normalize UUID to lowercase in nodediscovery * fix: vector image * fix: config map key * split into 2 build jobs * fix: update status * feat: report gpunode * fix: node provisioning mode * fix: remove dead code * fix: provisioning mode node discovery issue, cloud provider status etc * feat: move the creation of 
nodediscovery job from poolcontroller to gpunodecontroller * fix lint * fix: gpu node creation --------- Co-authored-by: 0x5457 <0x5457@protonmail.com>
1 parent e97402a commit dde95b6

File tree

72 files changed

+7044
-1211
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

72 files changed

+7044
-1211
lines changed

.github/workflows/release.yml

+51-6
Original file line number | Diff line number | Diff line change
@@ -9,19 +9,20 @@ on:
99
jobs:
1010
release:
1111
permissions:
12-
# to create release tags (cycjimmy/semantic-release-action)
12+
# to create release tags (cycjimmy/semantic-release-action)
1313
contents: write
1414
issues: write
1515
pull-requests: write
16-
1716
runs-on: ubuntu-latest
1817
outputs:
1918
published: ${{ steps.semantic.outputs.new_release_published }}
2019
version: ${{ steps.semantic.outputs.new_release_version }}
2120
steps:
2221
- uses: actions/checkout@v3
22+
if: github.event_name == 'push'
2323

2424
- name: Semantic Release
25+
if: github.event_name == 'push'
2526
id: semantic
2627
uses: cycjimmy/semantic-release-action@v4
2728
with:
@@ -31,32 +32,76 @@ jobs:
3132
env:
3233
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3334

34-
publish_image:
35+
publish_operator_image:
3536
needs:
3637
- release
37-
if: needs.release.outputs.published == 'true'
38+
if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
3839
runs-on: ubuntu-latest
3940
outputs:
4041
image_digest: ${{ steps.build.outputs.digest }}
4142
steps:
4243
- uses: actions/checkout@v3
44+
45+
- name: Set Tag
46+
if: github.event_name == 'workflow_dispatch'
47+
id: set_tag
48+
run: echo "tag=dev-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
49+
4350
- id: meta
4451
uses: docker/metadata-action@v4
4552
with:
4653
images: tensorfusion/tensor-fusion-operator
47-
tags: type=semver,pattern={{version}},value=${{needs.release.outputs.version}}
54+
tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || 'type=semver,pattern={{version}},value=${{needs.release.outputs.version}}' }}
55+
56+
- name: Login to DockerHub
57+
uses: docker/login-action@v2
58+
with:
59+
username: ${{ secrets.DOCKER_USERNAME }}
60+
password: ${{ secrets.DOCKER_PASSWORD }}
61+
62+
- name: Build and push operator
63+
uses: docker/build-push-action@v3
64+
with:
65+
context: .
66+
push: true
67+
file: dockerfile/operator.Dockerfile
68+
tags: ${{ steps.meta.outputs.tags }}
69+
labels: ${{ steps.meta.outputs.labels }}
70+
no-cache: true
71+
72+
publish_node_discovery_image:
73+
needs:
74+
- release
75+
if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
76+
runs-on: ubuntu-latest
77+
outputs:
78+
image_digest: ${{ steps.build.outputs.digest }}
79+
steps:
80+
- uses: actions/checkout@v3
81+
82+
- name: Set Tag
83+
if: github.event_name == 'workflow_dispatch'
84+
id: set_tag
85+
run: echo "tag=dev-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
86+
87+
- id: meta
88+
uses: docker/metadata-action@v4
89+
with:
90+
images: tensorfusion/tensor-fusion-node-discovery
91+
tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || 'type=semver,pattern={{version}},value=${{needs.release.outputs.version}}' }}
4892

4993
- name: Login to DockerHub
5094
uses: docker/login-action@v2
5195
with:
5296
username: ${{ secrets.DOCKER_USERNAME }}
5397
password: ${{ secrets.DOCKER_PASSWORD }}
5498

55-
- name: Build and push
99+
- name: Build and push node discovery
56100
uses: docker/build-push-action@v3
57101
with:
58102
context: .
59103
push: true
104+
file: dockerfile/node-disvoery.Dockerfile
60105
tags: ${{ steps.meta.outputs.tags }}
61106
labels: ${{ steps.meta.outputs.labels }}
62107
no-cache: true

.gitignore

+7-1
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,15 @@ go.work
2121

2222
# editor and IDE paraphernalia
2323
.idea
24-
.vscode
2524
*.swp
2625
*.swo
2726
*~
2827

2928
.DS_Store
29+
30+
cmd/__debug*
31+
cmd/*/__debug*
32+
33+
prompts/*
34+
35+
tmp*

.vscode/launch.json

+42
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "Debug Main Operator",
9+
"type": "go",
10+
"request": "launch",
11+
"mode": "auto",
12+
"env": {
13+
"ENABLE_WEBHOOKS": "false"
14+
},
15+
"program": "${workspaceFolder}/cmd/operator/main.go",
16+
},
17+
{
18+
"name": "Debug Dev Env Operator",
19+
"type": "go",
20+
"request": "launch",
21+
"mode": "auto",
22+
"console": "integratedTerminal",
23+
"env": {
24+
"KUBECONFIG": "~/.kube/config-tf-dev",
25+
"ENABLE_WEBHOOKS": "false"
26+
},
27+
"program": "${workspaceFolder}/cmd/operator/main.go",
28+
},
29+
{
30+
"name": "Debug Demo Env Operator",
31+
"type": "go",
32+
"request": "launch",
33+
"mode": "auto",
34+
"console": "integratedTerminal",
35+
"env": {
36+
"KUBECONFIG": "~/.kube/dev_us-east-1_demo",
37+
"ENABLE_WEBHOOKS": "false"
38+
},
39+
"program": "${workspaceFolder}/cmd/operator/main.go",
40+
}
41+
]
42+
}

.vscode/settings.json

+72
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
{
2+
"cSpell.words": [
3+
"alicloud",
4+
"Aliyun",
5+
"AMDCDNA",
6+
"AMDRDNA",
7+
"apimachinery",
8+
"AWSGPU",
9+
"batchv",
10+
"CDNA",
11+
"certificaterequests",
12+
"certmanager",
13+
"clientgoscheme",
14+
"cloudnative",
15+
"cloudprovider",
16+
"clusterissuers",
17+
"controllerutil",
18+
"corev",
19+
"crds",
20+
"CUDA",
21+
"cycjimmy",
22+
"dylib",
23+
"essd",
24+
"Eventf",
25+
"finalizer",
26+
"Finalizers",
27+
"goconst",
28+
"golint",
29+
"gosec",
30+
"gpunode",
31+
"gpunodeclasses",
32+
"gpunodes",
33+
"gpupool",
34+
"gpupools",
35+
"greptimedb",
36+
"healthz",
37+
"karpenter",
38+
"kubebuilder",
39+
"KUBECONFIG",
40+
"Kubelet",
41+
"kustomization",
42+
"metav",
43+
"metricsserver",
44+
"nindent",
45+
"nolint",
46+
"NVML",
47+
"omitempty",
48+
"onsi",
49+
"printcolumn",
50+
"prometheusagents",
51+
"prometheuses",
52+
"prometheusrules",
53+
"RDNA",
54+
"readyz",
55+
"runpod",
56+
"schedulingconfigtemplate",
57+
"schedulingconfigtemplates",
58+
"schedulingcorev",
59+
"subresource",
60+
"tensorfusion",
61+
"tensorfusionaiv",
62+
"tensorfusioncluster",
63+
"tensorfusionclusters",
64+
"Tera",
65+
"tflops",
66+
"Tmpl",
67+
"Tolerations",
68+
"utilruntime",
69+
"webhookcorev",
70+
"Xlarge"
71+
]
72+
}

Makefile

+4-3
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,8 @@ help: ## Display this help.
4545

4646
.PHONY: manifests
4747
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
48-
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
48+
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases && \
49+
cp -r config/crd/bases/* ./charts/tensor-fusion/crds/
4950

5051
.PHONY: generate
5152
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
@@ -92,11 +93,11 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
9293

9394
.PHONY: build
9495
build: manifests generate fmt vet ## Build manager binary.
95-
go build -o bin/manager cmd/main.go
96+
go build -o bin/manager cmd/operator/main.go
9697

9798
.PHONY: run
9899
run: manifests generate fmt vet ## Run a controller from your host.
99-
go run ./cmd/main.go
100+
go run ./cmd/operator/main.go
100101

101102
# If you wish to build the manager image targeting other platforms you can use the --platform flag.
102103
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.

api/v1/gpu_types.go

+32-3
Original file line number | Diff line number | Diff line change
@@ -17,20 +17,49 @@ limitations under the License.
1717
package v1
1818

1919
import (
20+
"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
2021
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2122
)
2223

2324
// GPUStatus defines the observed state of GPU.
2425
type GPUStatus struct {
25-
UUID string `json:"uuid"`
26+
// +kubebuilder:default=Pending
27+
Phase TensorFusionGPUPhase `json:"phase"`
28+
29+
Capacity *Resource `json:"capacity"`
30+
Available *Resource `json:"available"`
31+
32+
UUID string `json:"uuid"`
33+
34+
// The host match selector to schedule worker pods
2635
NodeSelector map[string]string `json:"nodeSelector"`
27-
Capacity Resource `json:"capacity"`
28-
Available Resource `json:"available"`
36+
GPUModel string `json:"gpuModel"`
37+
38+
Message string `json:"message"`
2939
}
3040

41+
// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
42+
type TensorFusionGPUPhase string
43+
44+
const (
45+
TensorFusionGPUPhasePending TensorFusionGPUPhase = constants.PhasePending
46+
TensorFusionGPUPhaseUpdating TensorFusionGPUPhase = constants.PhaseUpdating
47+
TensorFusionGPUPhaseRunning TensorFusionGPUPhase = constants.PhaseRunning
48+
TensorFusionGPUPhaseUnknown TensorFusionGPUPhase = constants.PhaseUnknown
49+
TensorFusionGPUPhaseDestroying TensorFusionGPUPhase = constants.PhaseDestroying
50+
TensorFusionGPUPhaseMigrating TensorFusionGPUPhase = constants.PhaseMigrating
51+
)
52+
3153
// +kubebuilder:object:root=true
3254
// +kubebuilder:subresource:status
3355
// +kubebuilder:resource:scope=Cluster
56+
// +kubebuilder:printcolumn:name="GPU Model",type="string",JSONPath=".spec.gpuModel"
57+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
58+
// +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.capacity.flops"
59+
// +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.capacity.vram"
60+
// +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.available.flops"
61+
// +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.available.vram"
62+
// +kubebuilder:printcolumn:name="Device UUID",type="string",JSONPath=".status.uuid"
3463
// GPU is the Schema for the gpus API.
3564
type GPU struct {
3665
metav1.TypeMeta `json:",inline"`

0 commit comments

Comments
 (0)