Skip to content

Commit

Permalink
Improve Image Builder Jobs Label and Annotation (#380)
Browse files Browse the repository at this point in the history
<!--  Thanks for sending a pull request!  Here are some tips for you:

1. Run unit tests and ensure that they are passing
2. If your change introduces any API changes, make sure to update the
e2e tests
3. Make sure documentation is updated for your PR!

-->

**What this PR does / why we need it**:
<!-- Explain here the context and why you're making the change. What is
the problem you're trying to solve. --->

1. The image-building jobs in Merlin are timing out. After some
investigation, we found that one of the root causes is the node pool got
scaled down resulting in the image building pods to be rescheduled. This
PR adds "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" to
avoid the pod get killed and rescheduled.
2. In Refactor deployed model labels #346, we managed to propagate user
labels to models and batch prediction jobs, but not to image builder
jobs (here). This PR includes adding project + model version labels to
image builder job.
3. This PR also improve e2e test by:
a. Removing the duplicate of kserve-controller-manager statefulset as in
kserve 0.9.0 manifest has both of statefulset and deployment
  b. Removing kserve-controller-manager's cpu limits

**Does this PR introduce a user-facing change?**:
<!--
If no, just write "NONE" in the release-note block below.
If yes, a release note is required. Enter your extended release note in
the block below.
If the PR requires additional action from users switching to the new
release, include the string "action required".

For more information about release notes, see kubernetes' guide here:
http://git.k8s.io/community/contributors/guide/release-notes.md
-->

```release-note
NONE
```

**Checklist**

- [x] Added unit test, integration, and/or e2e tests
- [x] Tested locally
- [x] Updated documentation
- [ ] Update Swagger spec if the PR introduce API changes
- [ ] Regenerated Golang and Python client if the PR introduce API
changes
  • Loading branch information
ariefrahmansyah authored Apr 17, 2023
1 parent 2916332 commit f20550d
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 21 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/merlin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
env:
ARTIFACT_RETENTION_DAYS: 7
DOCKER_BUILDKIT: 1
GO_VERSION: '1.20'
GO_VERSION: "1.20"

jobs:
create-version:
Expand Down Expand Up @@ -477,7 +477,7 @@ jobs:
echo "::endgroup::"
echo "::group::KServe log"
kubectl logs statefulset/kserve-controller-manager -n kserve -c manager
kubectl logs deployment/kserve-controller-manager -n kserve -c manager
echo "::endgroup::"
echo "::group::KServe events"
Expand Down
2 changes: 2 additions & 0 deletions api/cmd/api/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ func initImageBuilder(cfg *config.Config) (webserviceBuilder imagebuilder.ImageB
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,
SafeToEvict: cfg.ImageBuilderConfig.SafeToEvict,

ClusterName: cfg.ImageBuilderConfig.ClusterName,
GcpProject: cfg.ImageBuilderConfig.GcpProject,
Expand All @@ -164,6 +165,7 @@ func initImageBuilder(cfg *config.Config) (webserviceBuilder imagebuilder.ImageB
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,
SafeToEvict: cfg.ImageBuilderConfig.SafeToEvict,

ClusterName: cfg.ImageBuilderConfig.ClusterName,
GcpProject: cfg.ImageBuilderConfig.GcpProject,
Expand Down
1 change: 1 addition & 0 deletions api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ type ImageBuilderConfig struct {
NodeSelectors DictEnv `envconfig:"IMG_BUILDER_NODE_SELECTORS"`
MaximumRetry int32 `envconfig:"IMG_BUILDER_MAX_RETRY" default:"3"`
K8sConfig mlpcluster.K8sConfig `envconfig:"IMG_BUILDER_K8S_CONFIG"`
SafeToEvict bool `envconfig:"IMG_BUILDER_SAFE_TO_EVICT" default:"false"`
}

type Tolerations []v1.Toleration
Expand Down
2 changes: 2 additions & 0 deletions api/pkg/imagebuilder/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ type Config struct {
NodeSelectors map[string]string
// Maximum number of retry of image builder job
MaximumRetry int32
// Value for cluster-autoscaler.kubernetes.io/safe-to-evict annotation
SafeToEvict bool

// Cluster Name
ClusterName string
Expand Down
19 changes: 15 additions & 4 deletions api/pkg/imagebuilder/imagebuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,15 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
Component: models.ComponentImageBuilder,
Stream: project.Stream,
Team: project.Team,
Labels: models.MergeProjectVersionLabels(project.Labels, version.Labels),
}

annotations := make(map[string]string)
if !c.config.SafeToEvict {
// The image-building jobs are timing out. We found that one of the root causes is the node pool got scaled down resulting in the image building pods to be rescheduled.
// Adding "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" to avoid the pod get killed and rescheduled.
// https://kubernetes.io/docs/reference/labels-annotations-taints/#cluster-autoscaler-kubernetes-io-safe-to-evict
annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
}

baseImageTag, ok := c.config.BaseImages[version.PythonVersion]
Expand Down Expand Up @@ -411,9 +420,10 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo

job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: kanikoPodName,
Namespace: c.config.BuildNamespace,
Labels: metadata.ToLabel(),
Name: kanikoPodName,
Namespace: c.config.BuildNamespace,
Labels: metadata.ToLabel(),
Annotations: annotations,
},
Spec: batchv1.JobSpec{
Completions: &jobCompletions,
Expand All @@ -422,7 +432,8 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
ActiveDeadlineSeconds: &activeDeadlineSeconds,
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: metadata.ToLabel(),
Labels: metadata.ToLabel(),
Annotations: annotations,
},
Spec: v1.PodSpec{
// https://stackoverflow.com/questions/54091659/kubernetes-pods-disappear-after-failed-jobs
Expand Down
93 changes: 93 additions & 0 deletions api/pkg/imagebuilder/imagebuilder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ var (
ID: models.ID(1),
ArtifactURI: testArtifactURI,
PythonVersion: "3.10.*",
Labels: models.KV{
"test": "true",
},
}

timeout, _ = time.ParseDuration("10s")
Expand Down Expand Up @@ -246,6 +249,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -262,6 +270,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -346,6 +359,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/team": project.Team,
"gojek.com/environment": config.Environment,
"gojek.com/component": "image-builder",
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -362,6 +380,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/team": project.Team,
"gojek.com/environment": config.Environment,
"gojek.com/component": "image-builder",
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -424,6 +447,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -440,6 +468,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -552,6 +585,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -568,6 +606,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -690,6 +733,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -706,6 +754,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -822,6 +875,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -838,6 +896,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -921,6 +984,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -937,6 +1005,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -1023,6 +1096,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -1039,6 +1117,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down Expand Up @@ -1113,6 +1196,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: batchv1.JobSpec{
Expand All @@ -1129,6 +1217,11 @@ func TestBuildImage(t *testing.T) {
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": project.Stream,
"gojek.com/team": project.Team,
"sample": "true",
"test": "true",
},
Annotations: map[string]string{
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
},
},
Spec: v1.PodSpec{
Expand Down
2 changes: 2 additions & 0 deletions charts/merlin/templates/merlin-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,8 @@ spec:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /etc/gcp_service_account/service-account.json
{{- end }}
- name: IMG_BUILDER_SAFE_TO_EVICT
value: "{{ .Values.merlin.imageBuilder.safeToEvict }}"
volumeMounts:
- mountPath: /opt/config
name: config
Expand Down
7 changes: 4 additions & 3 deletions charts/merlin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ merlin:

environment: dev
deploymentLabelPrefix: "gojek.com/"
pyfuncGRPCOptions : "{}"
pyfuncGRPCOptions: "{}"

loggerDestinationURL: "http://yourDestinationLogger"

Expand Down Expand Up @@ -82,6 +82,7 @@ merlin:
nodeSelectors: {}
maxRetry: 3
k8sConfig: ""
safeToEvict: false

gitlab:
baseURL: https://gitlab.com/
Expand Down Expand Up @@ -172,7 +173,7 @@ merlin:
keepAliveTime: 60s
keepAliveTimeout: 5s
kafka:
# brokers:
# brokers:
maxMessageSize: "1048588"

# Google service account used to access GCP's resources.
Expand Down Expand Up @@ -425,7 +426,7 @@ swagger:
externalPort: 8080

mlp:
environmentConfigSecret:
environmentConfigSecret:
name: ""
envKey: environment.yaml
imageBuilderKey: imageBuilderK8sConfig
Loading

0 comments on commit f20550d

Please sign in to comment.