Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Cleanup helm chart #33

Merged
merged 1 commit into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ jobs:

- name: Run e2e test
run: |
OUTPUT_TYPE=type=registry make e2etests
make e2etests

- name: Cleanup e2e resources
if: ${{ always() }}
Expand Down
342 changes: 163 additions & 179 deletions Makefile

Large diffs are not rendered by default.

328 changes: 0 additions & 328 deletions Makefile-az.mk

This file was deleted.

118 changes: 51 additions & 67 deletions charts/gpu-provisioner/README.md

Large diffs are not rendered by default.

34 changes: 1 addition & 33 deletions charts/gpu-provisioner/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,10 @@ tolerations:
operator: Exists
# -- Additional volumes for the pod.
extraVolumes: []
# - name: aws-iam-token
# projected:
# defaultMode: 420
# sources:
# - serviceAccountToken:
# audience: sts.amazonaws.com
# expirationSeconds: 86400
# path: token
controller:
image:
# -- Repository path to the controller image.
repository: aimodelsregistry.azurecr.io/gpu-provisioner
repository: ghcr.io/azure/gpu-provisioner
# -- Tag of the controller image.
tag: 0.2.0
# -- SHA256 digest of the controller image.
Expand Down Expand Up @@ -156,10 +148,6 @@ controller:
logEncoding: ""
# -- Additional volumeMounts for the controller pod.
extraVolumeMounts: []
# - name: aws-iam-token
# mountPath: /var/run/secrets/eks.amazonaws.com/serviceaccount
# readOnly: true
# -- Additional sidecarContainer config
sidecarContainer: []
# -- Additional volumeMounts for the sidecar - this will be added to the volume mounts on top of extraVolumeMounts
sidecarVolumeMounts: []
Expand All @@ -175,33 +163,13 @@ logLevel: debug
logEncoding: console
# -- Global Settings to configure gpu-provisioner
settings:
# -- The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one
# time which usually results in fewer but larger nodes.
batchMaxDuration: 10s
# -- The maximum amount of time with no new pending pods that, if exceeded, ends the current batching window. If pods arrive
# faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods
# will be batched separately.
batchIdleDuration: 1s
# -- Azure-specific configuration values
azure:
# -- Cluster name.
clusterName: new_demo
# -- Cluster endpoint.
clusterEndpoint: https://newdemo-llm-test-ff05f5-vs68qyru.hcp.eastus.azmk8s.io:443
# -- The VM memory overhead as a percent that will be subtracted from the total memory for all instance types
# TODO: not used yet ...
vmMemoryOverheadPercent: 0.075
# TODO: autogenerate
sshPublicKey:
networkPlugin: "kubenet" # TODO: get this from the cluster
networkPolicy: ""
# -- The global tags to use on all Azure infrastructure resources (VMs, etc.)
# TODO: not propagated yet ...
tags:
# -- Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates
# in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features
featureGates:
# -- driftEnabled is in ALPHA and is disabled by default.
# Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes
# and the desired state of nodes set in provisioners and node templates
driftEnabled: false
93 changes: 0 additions & 93 deletions hack/azure/aks-savm.bicep

This file was deleted.

54 changes: 0 additions & 54 deletions hack/azure/perftest.sh

This file was deleted.

77 changes: 4 additions & 73 deletions pkg/apis/settings/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ import (
"context"
"encoding/json"
"fmt"
"hash/fnv"
"math/rand"
"net/url"

"github.com/go-playground/validator/v10"
"go.uber.org/multierr"
Expand All @@ -33,32 +30,14 @@ type settingsKeyType struct{}
var ContextKey = settingsKeyType{}

var defaultSettings = Settings{
ClusterName: "",
ClusterEndpoint: "",
VMMemoryOverheadPercent: 0.075,
Tags: map[string]string{},
ClusterID: "",
SSHPublicKey: "",
NetworkPlugin: "",
NetworkPolicy: "",
ClusterName: "",
Tags: map[string]string{},
}

// +k8s:deepcopy-gen=true
type Settings struct {
ClusterName string `validate:"required"`
ClusterEndpoint string `validate:"required"` // => APIServerName in bootstrap, except needs to be w/o https/port
VMMemoryOverheadPercent float64 `validate:"min=0"`
Tags map[string]string

// Cluster-level settings required for nodebootstrap (category "x")
// (Candidates for exposure/accessibility via API)
// TODO: consider making these AKS-specific (e.g. subkey?)

ClusterID string

SSHPublicKey string // ssh.publicKeys.keyData => VM SSH public key // TODO: move to node template?
NetworkPlugin string `validate:"required"` // => NetworkPlugin in bootstrap
NetworkPolicy string // => NetworkPolicy in bootstrap
ClusterName string `validate:"required"`
Tags map[string]string
}

func (*Settings) ConfigMap() string {
Expand All @@ -71,25 +50,14 @@ func (*Settings) Inject(ctx context.Context, cm *v1.ConfigMap) (context.Context,

if err := configmap.Parse(cm.Data,
configmap.AsString("azure.clusterName", &s.ClusterName),
configmap.AsString("azure.clusterEndpoint", &s.ClusterEndpoint),
configmap.AsFloat64("azure.vmMemoryOverheadPercent", &s.VMMemoryOverheadPercent),
AsStringMap("azure.tags", &s.Tags),
configmap.AsString("azure.clusterID", &s.ClusterID),
configmap.AsString("azure.sshPublicKey", &s.SSHPublicKey),
configmap.AsString("azure.networkPlugin", &s.NetworkPlugin),
configmap.AsString("azure.networkPolicy", &s.NetworkPolicy),
); err != nil {
return ctx, fmt.Errorf("parsing settings, %w", err)
}
if err := s.Validate(); err != nil {
return ctx, fmt.Errorf("validating settings, %w", err)
}

// if clusterID is not set, generate it from cluster endpoint
if s.ClusterID == "" {
s.ClusterID = getAKSClusterID(s.GetAPIServerName())
}

return ToContext(ctx, s), nil
}

Expand All @@ -115,26 +83,10 @@ func (s Settings) Data() (map[string]string, error) {
// Validate checks the settings and reports every problem found.
//
// It runs the cluster endpoint URL check and the struct-tag based validation
// (the `validate:"..."` tags on Settings) and merges both results into a
// single error with multierr.Combine.
func (s Settings) Validate() error {
	endpointErr := s.validateEndpoint()
	structErr := validator.New().Struct(s)
	return multierr.Combine(endpointErr, structErr)
}

// validateEndpoint reports an error unless ClusterEndpoint is an absolute URL
// with a non-empty host name.
func (s Settings) validateEndpoint() error {
	parsed, parseErr := url.Parse(s.ClusterEndpoint)
	// url.Parse is lenient and accepts many malformed inputs, so a successful
	// parse alone is not enough: also require a scheme and a host name.
	if parseErr == nil && parsed.IsAbs() && parsed.Hostname() != "" {
		return nil
	}
	return fmt.Errorf("\"%s\" not a valid clusterEndpoint URL", s.ClusterEndpoint)
}

// GetAPIServerName returns the host name of the ClusterEndpoint URL, without
// scheme or port.
//
// It returns an empty string when ClusterEndpoint cannot be parsed, so calling
// it on Settings that have not passed Validate no longer dereferences a nil
// URL.
func (s Settings) GetAPIServerName() string {
	endpoint, err := url.Parse(s.ClusterEndpoint)
	if err != nil {
		// url.Parse returns a nil *URL on failure; there is no host to report.
		return ""
	}
	return endpoint.Hostname()
}

// ToContext returns a copy of ctx that carries s under ContextKey.
func ToContext(ctx context.Context, s *Settings) context.Context {
	withSettings := context.WithValue(ctx, ContextKey, s)
	return withSettings
}
Expand All @@ -148,16 +100,6 @@ func FromContext(ctx context.Context) *Settings {
return data.(*Settings)
}

// AsTypedString returns a parse function that copies the value stored under
// key into target, converted to the string-based type T. When key is absent
// from the data, target is left untouched. The returned function never fails.
func AsTypedString[T ~string](key string, target *T) configmap.ParseFunc {
	return func(data map[string]string) error {
		value, present := data[key]
		if !present {
			return nil
		}
		*target = T(value)
		return nil
	}
}
// AsStringMap parses a value as a JSON map of map[string]string.
func AsStringMap(key string, target *map[string]string) configmap.ParseFunc {
return func(data map[string]string) error {
Expand All @@ -171,14 +113,3 @@ func AsStringMap(key string, target *map[string]string) configmap.ParseFunc {
return nil
}
}

// getAKSClusterID returns cluster ID based on the DNS prefix of the cluster.
// The logic comes from AgentBaker and other places, originally from aks-engine
// with the additional assumption of DNS prefix being the first 33 chars of FQDN.
//
// The ID is the first 8 decimal digits of a pseudo-random number whose seed is
// the FNV-1a hash of the DNS prefix, so the same FQDN always yields the same
// ID. An FQDN shorter than 33 bytes is used in full instead of causing a
// slice-bounds panic.
func getAKSClusterID(apiServerFQDN string) string {
	const dnsPrefixLen = 33

	dnsPrefix := apiServerFQDN
	if len(dnsPrefix) > dnsPrefixLen {
		dnsPrefix = dnsPrefix[:dnsPrefixLen]
	}

	h := fnv.New64a()
	_, _ = h.Write([]byte(dnsPrefix)) // hash.Hash.Write never returns an error

	// Seeding from the hash makes the generated number a pure function of the prefix.
	r := rand.New(rand.NewSource(int64(h.Sum64()))) //nolint:gosec
	return fmt.Sprintf("%08d", r.Uint32())[:8]
}
Loading