Skip to content

Commit

Permalink
Opt in device plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
Issacwww committed Sep 19, 2024
1 parent 05e0da5 commit 00888a0
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 29 deletions.
16 changes: 11 additions & 5 deletions e2e2/test/cases/neuron/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ import (
)

var (
testenv env.Environment
neuronTestImage *string
testenv env.Environment
neuronTestImage *string
installDevicePlugin *bool
)

var (
Expand All @@ -32,6 +33,7 @@ var (

func TestMain(m *testing.M) {
neuronTestImage = flag.String("neuronTestImage", "", "image for neuron single node test")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
cfg, err := envconf.NewFromFlags()
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
Expand All @@ -41,9 +43,10 @@ func TestMain(m *testing.M) {
defer cancel()
testenv = testenv.WithContext(ctx)

manifests := [][]byte{
neuronDevicePluginManifest,
neuronDevicePlugiRbacManifest,
var manifests [][]byte

if *installDevicePlugin {
manifests = append(manifests, neuronDevicePluginManifest, neuronDevicePlugiRbacManifest)
}

testenv.Setup(
Expand All @@ -55,6 +58,9 @@ func TestMain(m *testing.M) {
return ctx, nil
},
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
if !*installDevicePlugin {
return ctx, nil
}
ds := appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"},
}
Expand Down
23 changes: 15 additions & 8 deletions e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ import (
)

var (
testenv env.Environment
nodeType *string
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
testenv env.Environment
nodeType *string
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
)

var (
Expand All @@ -46,6 +47,7 @@ func TestMain(m *testing.M) {
nodeType = flag.String("nodeType", "", "node type for the tests")
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
cfg, err := envconf.NewFromFlags()
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
Expand All @@ -57,9 +59,11 @@ func TestMain(m *testing.M) {

// all NVIDIA tests require the MPI operator; the device plugin manifest is appended below only when installDevicePlugin is set
manifests := [][]byte{
nvidiaDevicePluginManifest,
mpiOperatorManifest,
}
if *installDevicePlugin {
manifests = append(manifests, nvidiaDevicePluginManifest)
}

testenv.Setup(
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
Expand All @@ -81,6 +85,9 @@ func TestMain(m *testing.M) {
return ctx, nil
},
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
if !*installDevicePlugin {
return ctx, nil
}
ds := appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{Name: "nvidia-device-plugin-daemonset", Namespace: "kube-system"},
}
Expand Down
4 changes: 2 additions & 2 deletions kubetest2/internal/deployers/eksapi/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ func (m *NodegroupManager) createManagedNodegroup(infra *Infrastructure, cluster
func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
stackName := m.getUnmanagedNodegroupStackName()
klog.Infof("creating unmanaged nodegroup stack...")
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, opts.EFA, cluster)
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, cluster)
if err != nil {
return err
}
Expand Down Expand Up @@ -258,7 +258,7 @@ func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, clust
func (m *NodegroupManager) createUnmanagedNodegroupWithEFA(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
stackName := m.getUnmanagedNodegroupStackName()
klog.Infof("creating unmanaged nodegroup with EFA stack...")
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, opts.EFA, cluster)
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, cluster)
if err != nil {
return err
}
Expand Down
1 change: 0 additions & 1 deletion kubetest2/internal/deployers/eksapi/templates/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ type UserDataTemplateData struct {
CertificateAuthority string
CIDR string
APIServerEndpoint string
EFAEnabled bool
}

var (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,3 @@

[settings.host-containers.admin]
"enabled" = true

[settings.efa]
"enabled" = {{.EFAEnabled}}
3 changes: 1 addition & 2 deletions kubetest2/internal/deployers/eksapi/userdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"github.com/aws/aws-k8s-tester/kubetest2/internal/deployers/eksapi/templates"
)

func generateUserData(format string, efaEnabled bool, cluster *Cluster) (string, bool, error) {
func generateUserData(format string, cluster *Cluster) (string, bool, error) {
userDataIsMimePart := true
var t *template.Template
switch format {
Expand All @@ -29,7 +29,6 @@ func generateUserData(format string, efaEnabled bool, cluster *Cluster) (string,
CertificateAuthority: cluster.certificateAuthorityData,
CIDR: cluster.cidr,
Name: cluster.name,
EFAEnabled: efaEnabled,
}); err != nil {
return "", false, err
}
Expand Down
9 changes: 1 addition & 8 deletions kubetest2/internal/deployers/eksapi/userdata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,40 +42,33 @@ const bottlerocketUserData = `[settings.kubernetes]
[settings.host-containers.admin]
"enabled" = true
[settings.efa]
"enabled" = true
`

func Test_generateUserData(t *testing.T) {
cases := []struct {
format string
efa bool
expected string
expectedIsMimePart bool
}{
{
format: "bootstrap.sh",
efa: false,
expected: bootstrapShUserData,
expectedIsMimePart: true,
},
{
format: "nodeadm",
efa: false,
expected: nodeadmUserData,
expectedIsMimePart: true,
},
{
format: "bottlerocket",
efa: true,
expected: bottlerocketUserData,
expectedIsMimePart: false,
},
}
for _, c := range cases {
t.Run(c.format, func(t *testing.T) {
actual, isMimePart, err := generateUserData(c.format, c.efa, &cluster)
actual, isMimePart, err := generateUserData(c.format, &cluster)
if err != nil {
t.Log(err)
t.Error(err)
Expand Down

0 comments on commit 00888a0

Please sign in to comment.