Skip to content

Commit

Permalink
Support resource configuration for imagebuilder (#351)
Browse files Browse the repository at this point in the history
<!--  Thanks for sending a pull request!  Here are some tips for you:

1. Run unit tests and ensure that they are passing
2. If your change introduces any API changes, make sure to update the
e2e tests
3. Make sure documentation is updated for your PR!

-->

**What this PR does / why we need it**:
<!-- Explain here the context and why you're making the change. What is
the problem you're trying to solve. --->

Currently, Merlin image building jobs don't support resource
configuration; only requests.cpu is set, and it is hardcoded. Because
requests.memory is not configured on them, jobs commonly fail with an
OOMKilled error when multiple jobs run concurrently. This PR adds
support for resource configuration so that a reasonable amount of memory
can be requested to mitigate this issue. In addition, with memory
requests set, the GKE autoscaler can spin up another node when multiple
jobs are running concurrently, instead of all of the jobs being placed
onto the same node and fighting for available memory.

**Which issue(s) this PR fixes**:
<!--
*Automatically closes linked issue when PR is merged.
Usage: `Fixes #<issue number>`, or `Fixes (paste link of issue)`.
-->

Fixes #

**Does this PR introduce a user-facing change?**:
<!--
If no, just write "NONE" in the release-note block below.
If yes, a release note is required. Enter your extended release note in
the block below.
If the PR requires additional action from users switching to the new
release, include the string "action required".

For more information about release notes, see kubernetes' guide here:
http://git.k8s.io/community/contributors/guide/release-notes.md
-->

```release-note
NONE
```

**Checklist**

- [x] Added unit test, integration, and/or e2e tests
  • Loading branch information
terryyylim authored Feb 21, 2023
1 parent 14d23c9 commit 101989d
Show file tree
Hide file tree
Showing 12 changed files with 207 additions and 47 deletions.
2 changes: 2 additions & 0 deletions api/cmd/api/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ func initImageBuilder(cfg *config.Config) (webserviceBuilder imagebuilder.ImageB
ContextSubPath: cfg.ImageBuilderConfig.ContextSubPath,
BuildTimeoutDuration: timeout,
KanikoImage: cfg.ImageBuilderConfig.KanikoImage,
Resources: cfg.ImageBuilderConfig.Resources,
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,
Expand All @@ -157,6 +158,7 @@ func initImageBuilder(cfg *config.Config) (webserviceBuilder imagebuilder.ImageB
ContextSubPath: cfg.ImageBuilderConfig.PredictionJobContextSubPath,
BuildTimeoutDuration: timeout,
KanikoImage: cfg.ImageBuilderConfig.KanikoImage,
Resources: cfg.ImageBuilderConfig.Resources,
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,
Expand Down
41 changes: 27 additions & 14 deletions api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,21 +135,34 @@ type DatabaseConfig struct {
MaxOpenConns int `envconfig:"DATABASE_MAX_OPEN_CONNS" default:"0"`
}

// Resource holds a CPU and a memory amount as configuration strings
// (the tests in this change use values such as "500m" and "1Gi").
// It describes one side of a request/limit pair; ResourceRequestsLimits
// combines two of them.
type Resource struct {
// CPU is the CPU quantity string; marked required for validation.
CPU string `validate:"required"`
// Memory is the memory quantity string; marked required for validation.
Memory string `validate:"required"`
}

// ResourceRequestsLimits contains the Kubernetes resource request and limits for kaniko.
// ImageBuilderConfig exposes it through the IMG_BUILDER_RESOURCES environment variable.
type ResourceRequestsLimits struct {
// Requests is the CPU/memory pair used as the container's resource requests.
Requests Resource `validate:"required"`
// Limits is the CPU/memory pair used as the container's resource limits.
Limits Resource `validate:"required"`
}

type ImageBuilderConfig struct {
ClusterName string `envconfig:"IMG_BUILDER_CLUSTER_NAME"`
GcpProject string `envconfig:"IMG_BUILDER_GCP_PROJECT"`
BuildContextURI string `envconfig:"IMG_BUILDER_BUILD_CONTEXT_URI"`
ContextSubPath string `envconfig:"IMG_BUILDER_CONTEXT_SUB_PATH"`
DockerfilePath string `envconfig:"IMG_BUILDER_DOCKERFILE_PATH" default:"./Dockerfile"`
BaseImages BaseImageConfigs `envconfig:"IMG_BUILDER_BASE_IMAGES"`
PredictionJobBuildContextURI string `envconfig:"IMG_BUILDER_PREDICTION_JOB_BUILD_CONTEXT_URI"`
PredictionJobContextSubPath string `envconfig:"IMG_BUILDER_PREDICTION_JOB_CONTEXT_SUB_PATH"`
PredictionJobDockerfilePath string `envconfig:"IMG_BUILDER_PREDICTION_JOB_DOCKERFILE_PATH" default:"./Dockerfile"`
PredictionJobBaseImages BaseImageConfigs `envconfig:"IMG_BUILDER_PREDICTION_JOB_BASE_IMAGES"`
BuildNamespace string `envconfig:"IMG_BUILDER_NAMESPACE" default:"mlp"`
DockerRegistry string `envconfig:"IMG_BUILDER_DOCKER_REGISTRY"`
BuildTimeout string `envconfig:"IMG_BUILDER_TIMEOUT" default:"10m"`
KanikoImage string `envconfig:"IMG_BUILDER_KANIKO_IMAGE" default:"gcr.io/kaniko-project/executor:v1.6.0"`
ClusterName string `envconfig:"IMG_BUILDER_CLUSTER_NAME"`
GcpProject string `envconfig:"IMG_BUILDER_GCP_PROJECT"`
BuildContextURI string `envconfig:"IMG_BUILDER_BUILD_CONTEXT_URI"`
ContextSubPath string `envconfig:"IMG_BUILDER_CONTEXT_SUB_PATH"`
DockerfilePath string `envconfig:"IMG_BUILDER_DOCKERFILE_PATH" default:"./Dockerfile"`
BaseImages BaseImageConfigs `envconfig:"IMG_BUILDER_BASE_IMAGES"`
PredictionJobBuildContextURI string `envconfig:"IMG_BUILDER_PREDICTION_JOB_BUILD_CONTEXT_URI"`
PredictionJobContextSubPath string `envconfig:"IMG_BUILDER_PREDICTION_JOB_CONTEXT_SUB_PATH"`
PredictionJobDockerfilePath string `envconfig:"IMG_BUILDER_PREDICTION_JOB_DOCKERFILE_PATH" default:"./Dockerfile"`
PredictionJobBaseImages BaseImageConfigs `envconfig:"IMG_BUILDER_PREDICTION_JOB_BASE_IMAGES"`
BuildNamespace string `envconfig:"IMG_BUILDER_NAMESPACE" default:"mlp"`
DockerRegistry string `envconfig:"IMG_BUILDER_DOCKER_REGISTRY"`
BuildTimeout string `envconfig:"IMG_BUILDER_TIMEOUT" default:"10m"`
KanikoImage string `envconfig:"IMG_BUILDER_KANIKO_IMAGE" default:"gcr.io/kaniko-project/executor:v1.6.0"`
Resources ResourceRequestsLimits `envconfig:"IMG_BUILDER_RESOURCES"`
// How long to keep the image building job resource in the Kubernetes cluster. Default: 2 days (48 hours).
Retention time.Duration `envconfig:"IMG_BUILDER_RETENTION" default:"48h"`
Tolerations Tolerations `envconfig:"IMG_BUILDER_TOLERATIONS"`
Expand Down
48 changes: 48 additions & 0 deletions api/pkg/imagebuilder/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright 2020 The Merlin Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package imagebuilder

import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

// RequestLimitResources is a Kubernetes resource request and limits pair
// expressed with parsed quantities. Call Build to obtain the corresponding
// corev1.ResourceRequirements.
type RequestLimitResources struct {
// Request holds the CPU and memory quantities emitted as Requests by Build.
Request Resource
// Limit holds the CPU and memory quantities emitted as Limits by Build.
Limit Resource
}

// Build translates the request/limit pair into the corev1.ResourceRequirements
// form used on a container spec: Request becomes Requests and Limit becomes
// Limits, each converted through Resource.Build.
func (r *RequestLimitResources) Build() corev1.ResourceRequirements {
	var requirements corev1.ResourceRequirements
	requirements.Requests = r.Request.Build()
	requirements.Limits = r.Limit.Build()
	return requirements
}

// Resource is a Kubernetes resource: a CPU quantity and a memory quantity.
// Build turns it into a corev1.ResourceList.
type Resource struct {
// CPU is the quantity stored under corev1.ResourceCPU by Build.
CPU resource.Quantity
// Memory is the quantity stored under corev1.ResourceMemory by Build.
Memory resource.Quantity
}

// Build returns a corev1.ResourceList with exactly two entries: the CPU
// quantity under corev1.ResourceCPU and the memory quantity under
// corev1.ResourceMemory. Both entries are always present, whatever their
// values.
func (r *Resource) Build() corev1.ResourceList {
	quantities := make(corev1.ResourceList, 2)
	quantities[corev1.ResourceCPU] = r.CPU
	quantities[corev1.ResourceMemory] = r.Memory
	return quantities
}
57 changes: 57 additions & 0 deletions api/pkg/imagebuilder/common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright 2020 The Merlin Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package imagebuilder

import (
"testing"

"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

// Quantities shared by the fixtures and expectations in this test file.
var (
// cpu is the CPU quantity fixture, parsed from "500m".
cpu = resource.MustParse("500m")
// memory is the memory quantity fixture, parsed from "500Mi".
memory = resource.MustParse("500Mi")
)

// CreateRequestLimitResources returns a RequestLimitResources fixture whose
// request and limit both carry the package-level cpu and memory quantities.
func CreateRequestLimitResources() RequestLimitResources {
	// Requests and limits are identical in this fixture, so build the
	// Resource value once and copy it into both fields.
	shared := Resource{CPU: cpu, Memory: memory}

	var fixture RequestLimitResources
	fixture.Request = shared
	fixture.Limit = shared
	return fixture
}

func TestContainer(t *testing.T) {
expected := corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: cpu,
corev1.ResourceMemory: memory,
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: cpu,
corev1.ResourceMemory: memory,
},
}
c := CreateRequestLimitResources()

assert.Equal(t, expected, c.Build())
}
2 changes: 2 additions & 0 deletions api/pkg/imagebuilder/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ type Config struct {
BuildTimeoutDuration time.Duration
// Kaniko docker image
KanikoImage string
// Kubernetes resource request and limits for kaniko
Resources cfg.ResourceRequestsLimits
// Tolerations for Jobs Specification
Tolerations []v1.Toleration
// Node Selectors for Jobs Specification
Expand Down
27 changes: 20 additions & 7 deletions api/pkg/imagebuilder/imagebuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,6 @@ var (
jobCompletions int32 = 1
)

var defaultResourceRequests = v1.ResourceList{
v1.ResourceCPU: resource.MustParse("1"),
}

func newImageBuilder(kubeClient kubernetes.Interface, config Config, nameGenerator nameGenerator) ImageBuilder {
return &imageBuilder{
kubeClient: kubeClient,
Expand Down Expand Up @@ -365,6 +361,25 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo

activeDeadlineSeconds := int64(c.config.BuildTimeoutDuration / time.Second)

resourceRequirements := RequestLimitResources{
Request: Resource{
CPU: resource.MustParse(
c.config.Resources.Requests.CPU,
),
Memory: resource.MustParse(
c.config.Resources.Requests.Memory,
),
},
Limit: Resource{
CPU: resource.MustParse(
c.config.Resources.Limits.CPU,
),
Memory: resource.MustParse(
c.config.Resources.Limits.Memory,
),
},
}

return &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: kanikoPodName,
Expand Down Expand Up @@ -400,9 +415,7 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
Value: saFilePath,
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: resourceRequirements.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down
57 changes: 33 additions & 24 deletions api/pkg/imagebuilder/imagebuilder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -110,6 +111,16 @@ var (
GcpProject: "test-project",
Environment: "dev",
KanikoImage: "gcr.io/kaniko-project/executor:v1.1.0",
Resources: cfg.ResourceRequestsLimits{
Requests: cfg.Resource{
CPU: "500m",
Memory: "1Gi",
},
Limits: cfg.Resource{
CPU: "500m",
Memory: "1Gi",
},
},
Tolerations: []v1.Toleration{
{
Key: "image-build-job",
Expand All @@ -123,6 +134,17 @@ var (
},
MaximumRetry: jobBackOffLimit,
}

defaultResourceRequests = RequestLimitResources{
Request: Resource{
CPU: resource.MustParse("500m"),
Memory: resource.MustParse("1Gi"),
},
Limit: Resource{
CPU: resource.MustParse("500m"),
Memory: resource.MustParse("1Gi"),
},
}
)

func TestBuildImage(t *testing.T) {
Expand Down Expand Up @@ -210,9 +232,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -312,9 +332,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -369,6 +387,7 @@ func TestBuildImage(t *testing.T) {
GcpProject: "test-project",
Environment: "dev",
KanikoImage: "gcr.io/kaniko-project/executor:v1.1.0",
Resources: config.Resources,
NodeSelectors: map[string]string{
"cloud.google.com/gke-nodepool": "image-building-job-node-pool",
},
Expand Down Expand Up @@ -441,9 +460,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -503,6 +520,7 @@ func TestBuildImage(t *testing.T) {
GcpProject: "test-project",
Environment: "dev",
KanikoImage: "gcr.io/kaniko-project/executor:v1.1.0",
Resources: config.Resources,
Tolerations: []v1.Toleration{
{
Key: "image-build-job",
Expand Down Expand Up @@ -579,9 +597,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -643,6 +659,7 @@ func TestBuildImage(t *testing.T) {
GcpProject: config.GcpProject,
Environment: config.Environment,
KanikoImage: config.KanikoImage,
Resources: config.Resources,
MaximumRetry: config.MaximumRetry,
NodeSelectors: config.NodeSelectors,
Tolerations: config.Tolerations,
Expand Down Expand Up @@ -713,9 +730,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -814,9 +829,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -918,9 +931,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down Expand Up @@ -1010,9 +1021,7 @@ func TestBuildImage(t *testing.T) {
Value: "/secret/kaniko-secret.json",
},
},
Resources: v1.ResourceRequirements{
Requests: defaultResourceRequests,
},
Resources: defaultResourceRequests.Build(),
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
Expand Down
2 changes: 1 addition & 1 deletion charts/merlin/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
apiVersion: v1
description: Kubernetes-friendly ML model management, deployment, and serving.
name: merlin
version: 0.8.1
version: 0.8.2
2 changes: 2 additions & 0 deletions charts/merlin/templates/merlin-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ spec:
value: "{{ .Values.merlin.imageBuilder.timeout }}"
- name: IMG_BUILDER_KANIKO_IMAGE
value: "{{ .Values.merlin.imageBuilder.kanikoImage }}"
- name: IMG_BUILDER_RESOURCES
value: {{ .Values.merlin.imageBuilder.resources | toJson | quote }}
- name: IMG_BUILDER_TOLERATIONS
value: {{ .Values.merlin.imageBuilder.tolerations | toJson | quote }}
- name: IMG_BUILDER_NODE_SELECTORS
Expand Down
Loading

0 comments on commit 101989d

Please sign in to comment.