Commit

Merge pull request kubernetes#51660 from jiayingz/deviceplugin-e2e
Automatic merge from submit-queue

Extend nvidia-gpus e2e test to include a device plugin based test

**What this PR does / why we need it**:
This is needed to verify the device plugin feature.
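
Concretely, the change means the test no longer hard-codes the legacy `v1.ResourceNvidiaGPU` resource: when run under the `device-plugin-gpus` framework, pods request the `nvidia.com/gpu` resource advertised by the device plugin instead. A minimal standalone sketch of such a pod spec is below (assuming the current `k8s.io/api` / `k8s.io/apimachinery` import paths; the pod name and container image are placeholders, not the ones the test uses):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// buildGPUPod returns a pod that requests a single GPU under the given
// resource name, e.g. "nvidia.com/gpu" when a device plugin advertises GPUs,
// or the legacy "alpha.kubernetes.io/nvidia-gpu" resource otherwise.
func buildGPUPod(gpuResource v1.ResourceName) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "cuda-vector-add"}, // placeholder name
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition",
					Image: "example.com/cuda-vector-add:v0.1", // placeholder image
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							// GPUs are requested in whole units.
							gpuResource: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
				},
			},
		},
	}
}

func main() {
	pod := buildGPUPod("nvidia.com/gpu")
	q := pod.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"]
	fmt.Printf("pod %q requests %d GPU(s)\n", pod.Name, q.Value())
}
```

The diff below wires the same idea into the e2e test itself, selecting `gpuResourceName`, the daemonset manifest URL, and the pod-creation function based on the framework's `BaseName`.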

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes kubernetes/enhancements#368

**Special notes for your reviewer**:
Related test_infra PR: kubernetes/test-infra#4265

**Release note**:
Add an e2e test for nvidia gpu device plugin
Kubernetes Submit Queue authored Sep 9, 2017
2 parents a5f7660 + 01b49b4 commit 24ad0d2
Showing 1 changed file with 58 additions and 7 deletions.
65 changes: 58 additions & 7 deletions test/e2e/scheduling/nvidia-gpus.go
@@ -42,8 +42,14 @@ const (
cosOSImage = "Container-Optimized OS from Google"
// Nvidia driver installation can take upwards of 5 minutes.
driverInstallTimeout = 10 * time.Minute
// Nvidia COS driver installer daemonset.
cosNvidiaDriverInstallerUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/stable/cos-nvidia-gpu-installer/daemonset.yaml"
)

type podCreationFuncType func() *v1.Pod

var (
gpuResourceName v1.ResourceName
dsYamlUrl string
podCreationFunc podCreationFuncType
)

func makeCudaAdditionTestPod() *v1.Pod {
@@ -60,7 +66,7 @@ func makeCudaAdditionTestPod() *v1.Pod {
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
},
},
VolumeMounts: []v1.VolumeMount{
@@ -86,6 +92,30 @@ func makeCudaAdditionTestPod() *v1.Pod {
return testPod
}

func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
podName := testPodNamePrefix + string(uuid.NewUUID())
testPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: "vector-addition",
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
},
},
},
},
},
}
return testPod
}

func isClusterRunningCOS(f *framework.Framework) bool {
nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err, "getting node list")
@@ -105,7 +135,8 @@ func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
if node.Spec.Unschedulable {
continue
}
if node.Status.Capacity.NvidiaGPU().Value() == 0 {
framework.Logf("gpuResourceName %s", gpuResourceName)
if val, ok := node.Status.Capacity[gpuResourceName]; !ok || val.Value() == 0 {
framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
return false
}
@@ -119,7 +150,9 @@ func getGPUsAvailable(f *framework.Framework) int64 {
framework.ExpectNoError(err, "getting node list")
var gpusAvailable int64
for _, node := range nodeList.Items {
gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
if val, ok := node.Status.Capacity[gpuResourceName]; ok {
gpusAvailable += (&val).Value()
}
}
return gpusAvailable
}
@@ -133,10 +166,21 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
Skip("Nvidia GPU tests are supproted only on Container Optimized OS image currently")
}
framework.Logf("Cluster is running on COS. Proceeding with test")

if f.BaseName == "device-plugin-gpus" {
dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/device-plugin-daemonset.yaml"
gpuResourceName = "nvidia.com/gpu"
podCreationFunc = makeCudaAdditionDevicePluginTestPod
} else {
dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
gpuResourceName = v1.ResourceNvidiaGPU
podCreationFunc = makeCudaAdditionTestPod
}

// GPU drivers might have already been installed.
if !areGPUsAvailableOnAllSchedulableNodes(f) {
// Install Nvidia Drivers.
ds := dsFromManifest(cosNvidiaDriverInstallerUrl)
ds := dsFromManifest(dsYamlUrl)
ds.Namespace = f.Namespace.Name
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
framework.ExpectNoError(err, "failed to create daemonset")
@@ -149,7 +193,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
podList := []*v1.Pod{}
for i := int64(0); i < getGPUsAvailable(f); i++ {
podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod()))
podList = append(podList, f.PodClient().Create(podCreationFunc()))
}
framework.Logf("Wait for all test pods to succeed")
// Wait for all pods to succeed
@@ -192,3 +236,10 @@ var _ = SIGDescribe("[Feature:GPU]", func() {
testNvidiaGPUsOnCOS(f)
})
})

var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
f := framework.NewDefaultFramework("device-plugin-gpus")
It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {
testNvidiaGPUsOnCOS(f)
})
})
