diff --git a/.github/workflows/check-release.yaml b/.github/workflows/check-release.yaml new file mode 100644 index 000000000..8a18e4975 --- /dev/null +++ b/.github/workflows/check-release.yaml @@ -0,0 +1,64 @@ +name: Check Release + +on: + pull_request: + branches: + - release-* + paths: + - VERSION + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + SEMVER_PATTERN: '^v([0-9]+)\.([0-9]+)\.([0-9]+)(-rc\.([0-9]+))?$' + +jobs: + check: + runs-on: ubuntu-latest + + steps: + - name: Checkout source code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Check whether version matches semver pattern + run: | + VERSION=$(cat VERSION) + if [[ ${VERSION} =~ ${{ env.SEMVER_PATTERN }} ]]; then + echo "Version '${VERSION}' matches semver pattern." + else + echo "Version '${VERSION}' does not match semver pattern." + exit 1 + fi + echo "VERSION=${VERSION}" >> $GITHUB_ENV + + - name: Check whether chart version and appVersion matches version + run: | + VERSION=${VERSION#v} + CHART_VERSION=$(cat charts/spark-operator-chart/Chart.yaml | grep version | awk '{print $2}') + CHART_APP_VERSION=$(cat charts/spark-operator-chart/Chart.yaml | grep appVersion | awk '{print $2}') + if [[ ${CHART_VERSION} == ${VERSION} ]]; then + echo "Chart version '${CHART_VERSION}' matches version '${VERSION}'." + else + echo "Chart version '${CHART_VERSION}' does not match version '${VERSION}'." + exit 1 + fi + if [[ ${CHART_APP_VERSION} == ${VERSION} ]]; then + echo "Chart appVersion '${CHART_APP_VERSION}' matches version '${VERSION}'." + else + echo "Chart appVersion '${CHART_APP_VERSION}' does not match version '${VERSION}'." + exit 1 + fi + + - name: Check if tag exists + run: | + git fetch --tags + if git tag -l | grep -q "^${VERSION}$"; then + echo "Tag '${VERSION}' already exists." + exit 1 + else + echo "Tag '${VERSION}' does not exist." + fi diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 9380dfb2d..b37425701 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -91,7 +91,7 @@ jobs: done build-helm-chart: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Determine branch name id: get_branch @@ -163,37 +163,25 @@ jobs: minikube image load docker.io/kubeflow/spark-operator:local ct install - integration-test: - runs-on: ubuntu-22.04 + e2e-test: + runs-on: ubuntu-latest steps: - name: Checkout source code uses: actions/checkout@v4 with: - fetch-depth: "0" + fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v5 with: - go-version-file: "go.mod" + go-version-file: go.mod - - name: setup minikube - uses: manusa/actions-setup-minikube@v2.11.0 - with: - minikube version: v1.33.0 - kubernetes version: v1.30.0 - start args: --memory 6g --cpus=2 --addons ingress - github token: ${{ inputs.github-token }} + - name: Create a Kind cluster + run: make kind-create-cluster - - name: Build local spark-operator docker image for minikube testing + - name: Build and load image to Kind cluster run: | - docker build -t docker.io/kubeflow/spark-operator:local . 
- minikube image load docker.io/kubeflow/spark-operator:local - - # The integration tests are currently broken see: https://github.com/kubeflow/spark-operator/issues/1416 - # - name: Run chart-testing (integration test) - # run: make integration-test + make kind-load-image IMAGE_TAG=local - - name: Setup tmate session - if: failure() - uses: mxschmitt/action-tmate@v3 - timeout-minutes: 15 + - name: Run e2e tests + run: make e2e-test diff --git a/.github/workflows/push-tag.yaml b/.github/workflows/push-tag.yaml deleted file mode 100644 index f9329f080..000000000 --- a/.github/workflows/push-tag.yaml +++ /dev/null @@ -1,44 +0,0 @@ -name: Push Tag on VERSION change - -on: - push: - branches: - - master - - release-* - paths: - - VERSION - -jobs: - push_tag: - runs-on: ubuntu-latest - - steps: - - name: Checkout source code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Read version from VERSION file - run: | - VERSION=$(cat VERSION) - echo "VERSION=$VERSION" >> $GITHUB_ENV - - - name: Check if tag exists - run: | - git fetch --tags - if git tag -l | grep -q "^${VERSION}$"; then - echo "TAG_EXISTS=true" >> $GITHUB_ENV - else - echo "TAG_EXISTS=false" >> $GITHUB_ENV - fi - - - name: Create and push tag - if: env.TAG_EXISTS == 'false' - run: | - git tag -a "$VERSION" -m "Release $VERSION" - git push origin "$VERSION" diff --git a/.github/workflows/release-docker.yaml b/.github/workflows/release-docker.yaml deleted file mode 100644 index 849a0e109..000000000 --- a/.github/workflows/release-docker.yaml +++ /dev/null @@ -1,120 +0,0 @@ -name: Release Docker images - -on: - push: - tags: - - v*.*.* - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - IMAGE_REGISTRY: docker.io - IMAGE_REPOSITORY: kubeflow/spark-operator - -# Ref: https://docs.docker.com/build/ci/github-actions/multi-platform/#distribute-build-across-multiple-runners. 
-jobs: - build: - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - platform: - - linux/amd64 - - linux/arm64 - - steps: - - name: Prepare - run: | - platform=${{ matrix.platform }} - echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV - - - name: Checkout source code - uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }} - tags: | - type=ref,event=branch - type=semver,pattern={{version}} - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to container registry - uses: docker/login-action@v3 - with: - registry: ${{ env.IMAGE_REGISTRY }} - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and push by digest - id: build - uses: docker/build-push-action@v6 - with: - platforms: ${{ matrix.platform }} - labels: ${{ steps.meta.outputs.labels }} - outputs: type=image,name=${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: digests-${{ env.PLATFORM_PAIR }} - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - merge: - runs-on: ubuntu-latest - needs: - - build - steps: - - name: Download digests - uses: actions/download-artifact@v4 - with: - path: /tmp/digests - pattern: digests-* - merge-multiple: true - - - name: Set up Docker buildx - uses: docker/setup-buildx-action@v3 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }} - tags: | - type=ref,event=branch - type=semver,pattern={{version}} - - - name: Login to container registry - uses: docker/login-action@v3 - with: - registry: ${{ env.IMAGE_REGISTRY }} - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Create manifest list and push - working-directory: /tmp/digests - run: | - docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }}@sha256:%s ' *) - - - name: Inspect image - run: | - docker buildx imagetools inspect ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }}:${{ steps.meta.outputs.version }} diff --git a/.github/workflows/release-charts.yaml b/.github/workflows/release-helm-charts.yaml similarity index 55% rename from .github/workflows/release-charts.yaml rename to .github/workflows/release-helm-charts.yaml index 874696f09..f69884887 100644 --- a/.github/workflows/release-charts.yaml +++ b/.github/workflows/release-helm-charts.yaml @@ -2,17 +2,25 @@ name: Release Helm charts on: release: - types: [published] + types: + - published concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + HELM_REGISTRY: ghcr.io + HELM_REPOSITORY: ${{ github.repository_owner }}/helm-charts + jobs: - build: + release_helm_charts: permissions: contents: write + packages: write + runs-on: ubuntu-latest + steps: - name: Checkout source code uses: actions/checkout@v4 @@ -27,10 +35,28 @@ jobs: with: version: v3.14.4 + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.HELM_REGISTRY }} + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Read version from VERSION file + run: | + VERSION=$(cat VERSION) + echo "VERSION=${VERSION}" >> $GITHUB_ENV + - name: Package Helm charts run: | for chart in $(ls charts); do - helm package charts/$chart + helm package charts/${chart} + done + + - name: Upload charts to GHCR + run: | + for pkg in $(ls *.tgz); do + helm push ${pkg} oci://${{ env.HELM_REGISTRY }}/${{ env.HELM_REPOSITORY }} done - name: Save packaged charts to temp directory @@ -44,7 +70,7 @@ jobs: ref: gh-pages fetch-depth: 0 - - name: Copy packages charts + - name: Copy packaged charts run: | cp /tmp/charts/*.tgz . @@ -52,7 +78,7 @@ jobs: env: CHART_URL: https://github.com/${{ github.repository }}/releases/download/${{ github.ref_name }} run: | - helm repo index --merge index.yaml --url $CHART_URL . + helm repo index --merge index.yaml --url ${CHART_URL} . git add index.yaml - git commit -s -m "Update index.yaml" || exit 0 + git commit -s -m "Add index for Spark operator chart ${VERSION}" || exit 0 git push diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index ebd0e62a5..cd9f09a5b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,28 +1,244 @@ -name: Create draft release +name: Release on: push: - tags: - - v*.*.* + branches: + - release-* + paths: + - VERSION concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + SEMVER_PATTERN: '^v([0-9]+)\.([0-9]+)\.([0-9]+)(-rc\.([0-9]+))?$' + IMAGE_REGISTRY: docker.io + IMAGE_REPOSITORY: kubeflow/spark-operator + jobs: - release: + check-release: + runs-on: ubuntu-latest + + steps: + - name: Checkout source code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Check whether version matches semver pattern + run: | + VERSION=$(cat VERSION) + if [[ ${VERSION} =~ ${{ env.SEMVER_PATTERN }} ]]; then + echo "Version '${VERSION}' matches semver pattern." + else + echo "Version '${VERSION}' does not match semver pattern." 
+ exit 1 + fi + echo "VERSION=${VERSION}" >> $GITHUB_ENV + + - name: Check whether chart version and appVersion matches version + run: | + VERSION=${VERSION#v} + CHART_VERSION=$(cat charts/spark-operator-chart/Chart.yaml | grep version | awk '{print $2}') + CHART_APP_VERSION=$(cat charts/spark-operator-chart/Chart.yaml | grep appVersion | awk '{print $2}') + if [[ ${CHART_VERSION} == ${VERSION} ]]; then + echo "Chart version '${CHART_VERSION}' matches version '${VERSION}'." + else + echo "Chart version '${CHART_VERSION}' does not match version '${VERSION}'." + exit 1 + fi + if [[ ${CHART_APP_VERSION} == ${VERSION} ]]; then + echo "Chart appVersion '${CHART_APP_VERSION}' matches version '${VERSION}'." + else + echo "Chart appVersion '${CHART_APP_VERSION}' does not match version '${VERSION}'." + exit 1 + fi + + - name: Check if tag exists + run: | + git fetch --tags + if git tag -l | grep -q "^${VERSION}$"; then + echo "Tag '${VERSION}' already exists." + exit 1 + else + echo "Tag '${VERSION}' does not exist." + fi + + build_images: + needs: + - check-release + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + platform: + - linux/amd64 + - linux/arm64 + + steps: + - name: Prepare + run: | + platform=${{ matrix.platform }} + echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV + + - name: Checkout source code + uses: actions/checkout@v4 + + - name: Read version from VERSION file + run: | + VERSION=$(cat VERSION) + if [[ ! ${VERSION} =~ ${{ env.SEMVER_PATTERN }} ]]; then + echo "Version '${VERSION}' does not match semver pattern." + exit 1 + fi + echo "VERSION=${VERSION}" >> $GITHUB_ENV + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }} + tags: | + type=semver,pattern={{version}},value=${{ env.VERSION }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.IMAGE_REGISTRY }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push by digest + id: build + uses: docker/build-push-action@v6 + with: + platforms: ${{ matrix.platform }} + labels: ${{ steps.meta.outputs.labels }} + outputs: type=image,name=${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }},push-by-digest=true,name-canonical=true,push=true + + - name: Export digest + run: | + mkdir -p /tmp/digests + digest="${{ steps.build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: digests-${{ env.PLATFORM_PAIR }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + + release_images: + needs: + - build_images + + runs-on: ubuntu-latest + + steps: + - name: Checkout source code + uses: actions/checkout@v4 + + - name: Read version from VERSION file + run: | + VERSION=$(cat VERSION) + echo "VERSION=${VERSION}" >> $GITHUB_ENV + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }} + tags: | + type=semver,pattern={{version}},value=${{ env.VERSION }} + + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digests-* + merge-multiple: true + + - name: Set up Docker buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to container registry + uses: 
docker/login-action@v3 + with: + registry: ${{ env.IMAGE_REGISTRY }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Create manifest list and push + working-directory: /tmp/digests + run: | + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }}@sha256:%s ' *) + + - name: Inspect image + run: | + docker buildx imagetools inspect ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_REPOSITORY }}:${{ steps.meta.outputs.version }} + + push_tag: + needs: + - release_images + + runs-on: ubuntu-latest + + steps: + - name: Checkout source code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + + - name: Read version from VERSION file + run: | + VERSION=$(cat VERSION) + echo "VERSION=${VERSION}" >> $GITHUB_ENV + + - name: Create and push tag + run: | + git tag -a "${VERSION}" -m "Spark Operator Official Release ${VERSION}" + git push origin "${VERSION}" + + draft_release: + needs: + - push_tag + permissions: contents: write + runs-on: ubuntu-latest + steps: - name: Checkout uses: actions/checkout@v4 - + - name: Configure Git run: | git config user.name "$GITHUB_ACTOR" git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + - name: Read version from VERSION file + run: | + VERSION=$(cat VERSION) + echo "VERSION=${VERSION}" >> $GITHUB_ENV + - name: Set up Helm uses: azure/setup-helm@v4.2.0 with: @@ -31,17 +247,18 @@ jobs: - name: Package Helm charts run: | for chart in $(ls charts); do - helm package charts/$chart + helm package charts/${chart} done - + - name: Release id: release uses: softprops/action-gh-release@v2 with: token: ${{ secrets.GITHUB_TOKEN }} - draft: true - prerelease: ${{ contains(github.ref, 'rc') }} + name: "Spark Operator ${{ env.VERSION }}" + tag_name: ${{ env.VERSION }} + prerelease: ${{ contains(env.VERSION, 'rc') }} target_commitish: ${{ github.sha }} + draft: true files: | *.tgz - diff --git a/Makefile b/Makefile index 9e05175a1..324ff3d67 100644 --- a/Makefile +++ b/Makefile @@ -246,7 +246,7 @@ endif .PHONY: kind-create-cluster kind-create-cluster: kind ## Create a kind cluster for integration tests. if ! $(KIND) get clusters 2>/dev/null | grep -q "^$(KIND_CLUSTER_NAME)$$"; then \ - kind create cluster --name $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG_FILE) --kubeconfig $(KIND_KUBE_CONFIG); \ + kind create cluster --name $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG_FILE) --kubeconfig $(KIND_KUBE_CONFIG) --wait=1m; \ fi .PHONY: kind-load-image diff --git a/charts/spark-operator-chart/README.md b/charts/spark-operator-chart/README.md index d07b315b3..dbd4969e9 100644 --- a/charts/spark-operator-chart/README.md +++ b/charts/spark-operator-chart/README.md @@ -90,6 +90,8 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum | controller.uiIngress.enable | bool | `false` | Specifies whether to create ingress for Spark web UI. `controller.uiService.enable` must be `true` to enable ingress. | | controller.uiIngress.urlFormat | string | `""` | Ingress URL format. Required if `controller.uiIngress.enable` is true. | | controller.batchScheduler.enable | bool | `false` | Specifies whether to enable batch scheduler for spark jobs scheduling. If enabled, users can specify batch scheduler name in spark application. 
| +| controller.batchScheduler.kubeSchedulerNames | list | `[]` | Specifies a list of kube-scheduler names for scheduling Spark pods. | +| controller.batchScheduler.default | string | `""` | Default batch scheduler to be used if not specified by the user. If specified, this value must be either "volcano" or "yunikorn". Specifying any other value will cause the controller to error on startup. | | controller.serviceAccount.create | bool | `true` | Specifies whether to create a service account for the controller. | | controller.serviceAccount.name | string | `""` | Optional name for the controller service account. | | controller.serviceAccount.annotations | object | `{}` | Extra annotations for the controller service account. | @@ -112,6 +114,7 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum | controller.sidecars | list | `[]` | Sidecar containers for controller pods. | | controller.podDisruptionBudget.enable | bool | `false` | Specifies whether to create pod disruption budget for controller. Ref: [Specifying a Disruption Budget for your Application](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) | | controller.podDisruptionBudget.minAvailable | int | `1` | The number of pods that must be available. Require `controller.replicas` to be greater than 1 | +| webhook.enable | bool | `true` | Specifies whether to enable webhook. | | webhook.replicas | int | `1` | Number of replicas of webhook server. | | webhook.logLevel | string | `"info"` | Configure the verbosity of logging, can be one of `debug`, `info`, `error`. | | webhook.port | int | `9443` | Specifies webhook port. | diff --git a/charts/spark-operator-chart/ci/kind-config.yaml b/charts/spark-operator-chart/ci/kind-config.yaml new file mode 100644 index 000000000..4e8cae8d9 --- /dev/null +++ b/charts/spark-operator-chart/ci/kind-config.yaml @@ -0,0 +1,7 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + image: kindest/node:v1.29.2 + - role: worker + image: kindest/node:v1.29.2 diff --git a/charts/spark-operator-chart/templates/controller/deployment.yaml b/charts/spark-operator-chart/templates/controller/deployment.yaml index 02f9c2c90..74cf30a0f 100644 --- a/charts/spark-operator-chart/templates/controller/deployment.yaml +++ b/charts/spark-operator-chart/templates/controller/deployment.yaml @@ -21,9 +21,7 @@ metadata: labels: {{- include "spark-operator.controller.labels" . | nindent 4 }} spec: - {{- with .Values.controller.replicas }} - replicas: {{ . }} - {{- end }} + replicas: {{ .Values.controller.replicas }} selector: matchLabels: {{- include "spark-operator.controller.selectorLabels" . | nindent 6 }} @@ -59,8 +57,12 @@ spec: - --zap-log-level={{ . }} {{- end }} {{- with .Values.spark.jobNamespaces }} + {{- if has "" . }} + - --namespaces="" + {{- else }} - --namespaces={{ . | join "," }} {{- end }} + {{- end }} - --controller-threads={{ .Values.controller.workers }} {{- with .Values.controller.uiService.enable }} - --enable-ui-service=true @@ -70,8 +72,14 @@ spec: - --ingress-url-format={{ . }} {{- end }} {{- end }} - {{- with .Values.controller.batchScheduler.enable }} + {{- if .Values.controller.batchScheduler.enable }} - --enable-batch-scheduler=true + {{- with .Values.controller.batchScheduler.kubeSchedulerNames }} + - --kube-scheduler-names={{ . | join "," }} + {{- end }} + {{- with .Values.controller.batchScheduler.default }} + - --default-batch-scheduler={{ . 
}} + {{- end }} {{- end }} {{- if .Values.prometheus.metrics.enable }} - --enable-metrics=true diff --git a/charts/spark-operator-chart/templates/controller/rbac.yaml b/charts/spark-operator-chart/templates/controller/rbac.yaml index 472d0fcc7..7f0417eda 100644 --- a/charts/spark-operator-chart/templates/controller/rbac.yaml +++ b/charts/spark-operator-chart/templates/controller/rbac.yaml @@ -129,6 +129,17 @@ rules: - podgroups verbs: - "*" +- apiGroups: + - scheduling.x-k8s.io + resources: + - podgroups + verbs: + - get + - list + - watch + - create + - update + - delete {{- end }} --- diff --git a/charts/spark-operator-chart/templates/spark/rbac.yaml b/charts/spark-operator-chart/templates/spark/rbac.yaml index 692eda48e..9e15d6dbb 100644 --- a/charts/spark-operator-chart/templates/spark/rbac.yaml +++ b/charts/spark-operator-chart/templates/spark/rbac.yaml @@ -16,7 +16,7 @@ limitations under the License. {{- if .Values.spark.rbac.create -}} {{- range $jobNamespace := .Values.spark.jobNamespaces | default list }} -{{- if $jobNamespace }} +{{- if ne $jobNamespace "" }} --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/charts/spark-operator-chart/templates/spark/serviceaccount.yaml b/charts/spark-operator-chart/templates/spark/serviceaccount.yaml index f05d8fae3..de24d801e 100644 --- a/charts/spark-operator-chart/templates/spark/serviceaccount.yaml +++ b/charts/spark-operator-chart/templates/spark/serviceaccount.yaml @@ -15,16 +15,19 @@ limitations under the License. */}} {{- if .Values.spark.serviceAccount.create }} -{{- range $sparkJobNamespace := .Values.spark.jobNamespaces | default list }} +{{- range $jobNamespace := .Values.spark.jobNamespaces | default list }} +{{- if ne $jobNamespace "" }} + --- apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "spark-operator.spark.serviceAccountName" $ }} - namespace: {{ $sparkJobNamespace }} + namespace: {{ $jobNamespace }} labels: {{ include "spark-operator.labels" $ | nindent 4 }} {{- with $.Values.spark.serviceAccount.annotations }} annotations: {{ toYaml . | nindent 4 }} {{- end }} {{- end }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/deployment.yaml b/charts/spark-operator-chart/templates/webhook/deployment.yaml index 89b07e3df..ae5167a6e 100644 --- a/charts/spark-operator-chart/templates/webhook/deployment.yaml +++ b/charts/spark-operator-chart/templates/webhook/deployment.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} apiVersion: apps/v1 kind: Deployment metadata: @@ -21,9 +22,7 @@ metadata: labels: {{- include "spark-operator.webhook.labels" . | nindent 4 }} spec: - {{- with .Values.webhook.replicas }} - replicas: {{ . }} - {{- end }} + replicas: {{ .Values.webhook.replicas }} selector: matchLabels: {{- include "spark-operator.webhook.selectorLabels" . | nindent 6 }} @@ -52,8 +51,12 @@ spec: - --zap-log-level={{ . }} {{- end }} {{- with .Values.spark.jobNamespaces }} + {{- if has "" . }} + - --namespaces="" + {{- else }} - --namespaces={{ . | join "," }} {{- end }} + {{- end }} - --webhook-secret-name={{ include "spark-operator.webhook.secretName" . }} - --webhook-secret-namespace={{ .Release.Namespace }} - --webhook-svc-name={{ include "spark-operator.webhook.serviceName" . }} @@ -153,3 +156,4 @@ spec: - {{ mergeOverwrite . 
$labelSelectorDict | toYaml | nindent 8 | trim }} {{- end }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/mutatingwebhookconfiguration.yaml b/charts/spark-operator-chart/templates/webhook/mutatingwebhookconfiguration.yaml index 951284d19..e7e70d335 100644 --- a/charts/spark-operator-chart/templates/webhook/mutatingwebhookconfiguration.yaml +++ b/charts/spark-operator-chart/templates/webhook/mutatingwebhookconfiguration.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: @@ -33,16 +34,18 @@ webhooks: {{- with .Values.webhook.failurePolicy }} failurePolicy: {{ . }} {{- end }} - {{- if .Values.spark.jobNamespaces }} + {{- with .Values.spark.jobNamespaces }} + {{- if not (has "" .) }} namespaceSelector: matchExpressions: - key: kubernetes.io/metadata.name operator: In values: - {{- range .Values.spark.jobNamespaces }} - - {{ . }} + {{- range $jobNamespace := . }} + - {{ $jobNamespace }} {{- end }} {{- end }} + {{- end }} objectSelector: matchLabels: sparkoperator.k8s.io/launched-by-spark-operator: "true" @@ -67,16 +70,18 @@ webhooks: {{- with .Values.webhook.failurePolicy }} failurePolicy: {{ . }} {{- end }} - {{- if .Values.spark.jobNamespaces }} + {{- with .Values.spark.jobNamespaces }} + {{- if not (has "" .) }} namespaceSelector: matchExpressions: - key: kubernetes.io/metadata.name operator: In values: - {{- range .Values.spark.jobNamespaces }} - - {{ . }} + {{- range $jobNamespace := . }} + - {{ $jobNamespace }} {{- end }} {{- end }} + {{- end }} rules: - apiGroups: ["sparkoperator.k8s.io"] apiVersions: ["v1beta2"] @@ -97,16 +102,18 @@ webhooks: {{- with .Values.webhook.failurePolicy }} failurePolicy: {{ . }} {{- end }} - {{- if .Values.spark.jobNamespaces }} + {{- with .Values.spark.jobNamespaces }} + {{- if not (has "" .) }} namespaceSelector: matchExpressions: - key: kubernetes.io/metadata.name operator: In values: - {{- range .Values.spark.jobNamespaces }} - - {{ . }} + {{- range $jobNamespace := . }} + - {{ $jobNamespace }} {{- end }} {{- end }} + {{- end }} rules: - apiGroups: ["sparkoperator.k8s.io"] apiVersions: ["v1beta2"] @@ -115,3 +122,4 @@ webhooks: {{- with .Values.webhook.timeoutSeconds }} timeoutSeconds: {{ . }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/poddisruptionbudget.yaml b/charts/spark-operator-chart/templates/webhook/poddisruptionbudget.yaml index 6de7e6ef5..5a6d91d8c 100644 --- a/charts/spark-operator-chart/templates/webhook/poddisruptionbudget.yaml +++ b/charts/spark-operator-chart/templates/webhook/poddisruptionbudget.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} {{- if .Values.webhook.podDisruptionBudget.enable }} {{- if le (int .Values.webhook.replicas) 1 }} {{- fail "webhook.replicas must be greater than 1 to enable pod disruption budget for webhook" }} @@ -32,3 +33,4 @@ spec: minAvailable: {{ . 
}} {{- end }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/rbac.yaml b/charts/spark-operator-chart/templates/webhook/rbac.yaml index b1c5d426f..1891a83ec 100644 --- a/charts/spark-operator-chart/templates/webhook/rbac.yaml +++ b/charts/spark-operator-chart/templates/webhook/rbac.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} {{- if .Values.webhook.rbac.create }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -169,3 +170,4 @@ roleRef: kind: Role name: {{ include "spark-operator.webhook.name" . }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/service.yaml b/charts/spark-operator-chart/templates/webhook/service.yaml index 45064a807..51695b8c3 100644 --- a/charts/spark-operator-chart/templates/webhook/service.yaml +++ b/charts/spark-operator-chart/templates/webhook/service.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} apiVersion: v1 kind: Service metadata: @@ -27,3 +28,4 @@ spec: - port: {{ .Values.webhook.port }} targetPort: {{ .Values.webhook.portName | quote }} name: {{ .Values.webhook.portName }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/serviceaccount.yaml b/charts/spark-operator-chart/templates/webhook/serviceaccount.yaml index 77944b83c..a36378e8e 100644 --- a/charts/spark-operator-chart/templates/webhook/serviceaccount.yaml +++ b/charts/spark-operator-chart/templates/webhook/serviceaccount.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} {{- if .Values.webhook.serviceAccount.create -}} apiVersion: v1 kind: ServiceAccount @@ -26,3 +27,4 @@ metadata: {{- toYaml . | nindent 4 }} {{- end }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/templates/webhook/validatingwebhookconfiguration.yaml b/charts/spark-operator-chart/templates/webhook/validatingwebhookconfiguration.yaml index 3fbf55184..8cd3b11f4 100644 --- a/charts/spark-operator-chart/templates/webhook/validatingwebhookconfiguration.yaml +++ b/charts/spark-operator-chart/templates/webhook/validatingwebhookconfiguration.yaml @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */}} +{{- if .Values.webhook.enable }} apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: @@ -33,16 +34,18 @@ webhooks: {{- with .Values.webhook.failurePolicy }} failurePolicy: {{ . }} {{- end }} - {{- if .Values.spark.jobNamespaces }} + {{- with .Values.spark.jobNamespaces }} + {{- if not (has "" .) }} namespaceSelector: matchExpressions: - key: kubernetes.io/metadata.name operator: In values: - {{- range .Values.spark.jobNamespaces }} - - {{ . }} + {{- range $jobNamespace := . }} + - {{ $jobNamespace }} {{- end }} {{- end }} + {{- end }} rules: - apiGroups: ["sparkoperator.k8s.io"] apiVersions: ["v1beta2"] @@ -63,16 +66,18 @@ webhooks: {{- with .Values.webhook.failurePolicy }} failurePolicy: {{ . }} {{- end }} - {{- if .Values.spark.jobNamespaces }} + {{- with .Values.spark.jobNamespaces }} + {{- if not (has "" .) }} namespaceSelector: matchExpressions: - key: kubernetes.io/metadata.name operator: In values: - {{- range .Values.spark.jobNamespaces }} - - {{ . }} + {{- range $jobNamespace := . 
}} + - {{ $jobNamespace }} {{- end }} {{- end }} + {{- end }} rules: - apiGroups: ["sparkoperator.k8s.io"] apiVersions: ["v1beta2"] @@ -81,3 +86,4 @@ webhooks: {{- with .Values.webhook.timeoutSeconds }} timeoutSeconds: {{ . }} {{- end }} +{{- end }} diff --git a/charts/spark-operator-chart/tests/controller/deployment_test.yaml b/charts/spark-operator-chart/tests/controller/deployment_test.yaml index e4b6983a7..3de4aa57d 100644 --- a/charts/spark-operator-chart/tests/controller/deployment_test.yaml +++ b/charts/spark-operator-chart/tests/controller/deployment_test.yaml @@ -53,6 +53,15 @@ tests: path: spec.replicas value: 10 + - it: Should set replicas if `controller.replicas` is set + set: + controller: + replicas: 0 + asserts: + - equal: + path: spec.replicas + value: 0 + - it: Should add pod labels if `controller.labels` is set set: controller: @@ -110,14 +119,26 @@ tests: - it: Should contain `--namespaces` arg if `spark.jobNamespaces` is set set: - spark.jobNamespaces: - - ns1 - - ns2 + spark: + jobNamespaces: + - ns1 + - ns2 asserts: - contains: path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args content: --namespaces=ns1,ns2 + - it: Should set namespaces to all namespaces (`""`) if `spark.jobNamespaces` contains empty string + set: + spark: + jobNamespaces: + - "" + - default + asserts: + - contains: + path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args + content: --namespaces="" + - it: Should contain `--controller-threads` arg if `controller.workers` is set set: controller: @@ -160,6 +181,17 @@ tests: path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args content: --enable-batch-scheduler=true + - it: Should contain `--default-batch-scheduler` arg if `controller.batchScheduler.default` is set + set: + controller: + batchScheduler: + enable: true + default: yunikorn + asserts: + - contains: + path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args + content: --default-batch-scheduler=yunikorn + - it: Should contain `--enable-metrics` arg if `prometheus.metrics.enable` is set to `true` set: prometheus: diff --git a/charts/spark-operator-chart/tests/spark/serviceaccount_test.yaml b/charts/spark-operator-chart/tests/spark/serviceaccount_test.yaml index a1f1898b4..e8b764a6f 100644 --- a/charts/spark-operator-chart/tests/spark/serviceaccount_test.yaml +++ b/charts/spark-operator-chart/tests/spark/serviceaccount_test.yaml @@ -66,59 +66,36 @@ tests: path: metadata.annotations.key2 value: value2 - - it: Should create multiple service accounts if `spark.jobNamespaces` is set + - it: Should create service account for every non-empty spark job namespace if `spark.jobNamespaces` is set with multiple values set: spark: - serviceAccount: - name: spark jobNamespaces: + - "" - ns1 - ns2 - - ns3 documentIndex: 0 asserts: - hasDocuments: - count: 3 + count: 2 - containsDocument: apiVersion: v1 kind: ServiceAccount - name: spark + name: spark-operator-spark namespace: ns1 - - it: Should create multiple service accounts if `spark.jobNamespaces` is set + - it: Should create service account for every non-empty spark job namespace if `spark.jobNamespaces` is set with multiple values set: spark: - serviceAccount: - name: spark jobNamespaces: + - "" - ns1 - ns2 - - ns3 documentIndex: 1 asserts: - hasDocuments: - count: 3 + count: 2 - containsDocument: apiVersion: v1 kind: ServiceAccount - name: spark + name: spark-operator-spark namespace: ns2 - - - it: Should create multiple service accounts if 
`spark.jobNamespaces` is set - set: - spark: - serviceAccount: - name: spark - jobNamespaces: - - ns1 - - ns2 - - ns3 - documentIndex: 2 - asserts: - - hasDocuments: - count: 3 - - containsDocument: - apiVersion: v1 - kind: ServiceAccount - name: spark - namespace: ns3 diff --git a/charts/spark-operator-chart/tests/webhook/deployment_test.yaml b/charts/spark-operator-chart/tests/webhook/deployment_test.yaml index 14c34f7a8..bf6bc03c8 100644 --- a/charts/spark-operator-chart/tests/webhook/deployment_test.yaml +++ b/charts/spark-operator-chart/tests/webhook/deployment_test.yaml @@ -31,6 +31,14 @@ tests: kind: Deployment name: spark-operator-webhook + - it: Should not create webhook deployment if `webhook.enable` is `false` + set: + webhook: + enable: false + asserts: + - hasDocuments: + count: 0 + - it: Should set replicas if `webhook.replicas` is set set: webhook: @@ -40,6 +48,15 @@ tests: path: spec.replicas value: 10 + - it: Should set replicas if `webhook.replicas` is set + set: + webhook: + replicas: 0 + asserts: + - equal: + path: spec.replicas + value: 0 + - it: Should add pod labels if `webhook.labels` is set set: webhook: @@ -107,6 +124,17 @@ tests: path: spec.template.spec.containers[?(@.name=="spark-operator-webhook")].args content: --namespaces=ns1,ns2 + - it: Should set namespaces to all namespaces (`""`) if `spark.jobNamespaces` contains empty string + set: + spark: + jobNamespaces: + - "" + - default + asserts: + - contains: + path: spec.template.spec.containers[?(@.name=="spark-operator-webhook")].args + content: --namespaces="" + - it: Should contain `--enable-metrics` arg if `prometheus.metrics.enable` is set to `true` set: prometheus: diff --git a/charts/spark-operator-chart/tests/webhook/mutatingwebhookconfiguration_test.yaml b/charts/spark-operator-chart/tests/webhook/mutatingwebhookconfiguration_test.yaml index 54273df18..d68a74d09 100644 --- a/charts/spark-operator-chart/tests/webhook/mutatingwebhookconfiguration_test.yaml +++ b/charts/spark-operator-chart/tests/webhook/mutatingwebhookconfiguration_test.yaml @@ -31,6 +31,14 @@ tests: kind: MutatingWebhookConfiguration name: spark-operator-webhook + - it: Should not create the mutating webhook configuration if `webhook.enable` is `false` + set: + webhook: + enable: false + asserts: + - hasDocuments: + count: 0 + - it: Should use the specified webhook port set: webhook: @@ -49,7 +57,7 @@ tests: path: webhooks[*].failurePolicy value: Fail - - it: Should set namespaceSelector if sparkJobNamespaces is not empty + - it: Should set namespaceSelector if `spark.jobNamespaces` is set with non-empty strings set: spark: jobNamespaces: @@ -68,6 +76,19 @@ tests: - ns2 - ns3 + - it: Should not set namespaceSelector if `spark.jobNamespaces` contains empty string + set: + spark: + jobNamespaces: + - "" + - ns1 + - ns2 + - ns3 + asserts: + - notExists: + path: webhooks[*].namespaceSelector + + - it: Should should use the specified timeoutSeconds set: webhook: diff --git a/charts/spark-operator-chart/tests/webhook/poddisruptionbudget_test.yaml b/charts/spark-operator-chart/tests/webhook/poddisruptionbudget_test.yaml index f45350dbb..e7295ff02 100644 --- a/charts/spark-operator-chart/tests/webhook/poddisruptionbudget_test.yaml +++ b/charts/spark-operator-chart/tests/webhook/poddisruptionbudget_test.yaml @@ -24,6 +24,14 @@ release: namespace: spark-operator tests: + - it: Should not render podDisruptionBudget if `webhook.enable` is `false` + set: + webhook: + enable: false + asserts: + - hasDocuments: + count: 0 + - it: Should not render 
podDisruptionBudget if `webhook.podDisruptionBudget.enable` is false set: webhook: @@ -40,7 +48,7 @@ tests: podDisruptionBudget: enable: true asserts: - - failedTemplate: + - failedTemplate: errorMessage: "webhook.replicas must be greater than 1 to enable pod disruption budget for webhook" - it: Should render spark operator podDisruptionBudget if `webhook.podDisruptionBudget.enable` is true diff --git a/charts/spark-operator-chart/tests/webhook/service_test.yaml b/charts/spark-operator-chart/tests/webhook/service_test.yaml index c06631f97..6ef15726b 100644 --- a/charts/spark-operator-chart/tests/webhook/service_test.yaml +++ b/charts/spark-operator-chart/tests/webhook/service_test.yaml @@ -24,6 +24,14 @@ release: namespace: spark-operator tests: + - it: Should not create webhook service if `webhook.enable` is `false` + set: + webhook: + enable: false + asserts: + - hasDocuments: + count: 0 + - it: Should create the webhook service correctly set: webhook: diff --git a/charts/spark-operator-chart/tests/webhook/validatingwebhookconfiguration_test.yaml b/charts/spark-operator-chart/tests/webhook/validatingwebhookconfiguration_test.yaml index 9c7fa4daa..a252d7f4a 100644 --- a/charts/spark-operator-chart/tests/webhook/validatingwebhookconfiguration_test.yaml +++ b/charts/spark-operator-chart/tests/webhook/validatingwebhookconfiguration_test.yaml @@ -31,6 +31,14 @@ tests: kind: ValidatingWebhookConfiguration name: spark-operator-webhook + - it: Should not create the validating webhook configuration if `webhook.enable` is `false` + set: + webhook: + enable: false + asserts: + - hasDocuments: + count: 0 + - it: Should use the specified webhook port set: webhook: @@ -49,7 +57,7 @@ tests: path: webhooks[*].failurePolicy value: Fail - - it: Should set namespaceSelector if `spark.jobNamespaces` is not empty + - it: Should set namespaceSelector if `spark.jobNamespaces` is set with non-empty strings set: spark.jobNamespaces: - ns1 @@ -67,6 +75,18 @@ tests: - ns2 - ns3 + - it: Should not set namespaceSelector if `spark.jobNamespaces` contains empty string + set: + spark: + jobNamespaces: + - "" + - ns1 + - ns2 + - ns3 + asserts: + - notExists: + path: webhooks[*].namespaceSelector + - it: Should should use the specified timeoutSeconds set: webhook: diff --git a/charts/spark-operator-chart/values.yaml b/charts/spark-operator-chart/values.yaml index a5adbe477..7b9b9330c 100644 --- a/charts/spark-operator-chart/values.yaml +++ b/charts/spark-operator-chart/values.yaml @@ -67,6 +67,13 @@ controller: # -- Specifies whether to enable batch scheduler for spark jobs scheduling. # If enabled, users can specify batch scheduler name in spark application. enable: false + # -- Specifies a list of kube-scheduler names for scheduling Spark pods. + kubeSchedulerNames: [] + # - default-scheduler + # -- Default batch scheduler to be used if not specified by the user. + # If specified, this value must be either "volcano" or "yunikorn". Specifying any other + # value will cause the controller to error on startup. + default: "" serviceAccount: # -- Specifies whether to create a service account for the controller. @@ -164,6 +171,9 @@ controller: minAvailable: 1 webhook: + # -- Specifies whether to enable webhook. + enable: true + # -- Number of replicas of webhook server. 
replicas: 1 diff --git a/cmd/operator/controller/start.go b/cmd/operator/controller/start.go index 6f851ed76..e6bf91c64 100644 --- a/cmd/operator/controller/start.go +++ b/cmd/operator/controller/start.go @@ -20,6 +20,7 @@ import ( "crypto/tls" "flag" "os" + "slices" "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -44,6 +45,7 @@ import ( logzap "sigs.k8s.io/controller-runtime/pkg/log/zap" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" ctrlwebhook "sigs.k8s.io/controller-runtime/pkg/webhook" + schedulingv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" sparkoperator "github.com/kubeflow/spark-operator" "github.com/kubeflow/spark-operator/api/v1beta1" @@ -52,6 +54,7 @@ import ( "github.com/kubeflow/spark-operator/internal/controller/sparkapplication" "github.com/kubeflow/spark-operator/internal/metrics" "github.com/kubeflow/spark-operator/internal/scheduler" + "github.com/kubeflow/spark-operator/internal/scheduler/kubescheduler" "github.com/kubeflow/spark-operator/internal/scheduler/volcano" "github.com/kubeflow/spark-operator/internal/scheduler/yunikorn" "github.com/kubeflow/spark-operator/pkg/common" @@ -72,7 +75,9 @@ var ( cacheSyncTimeout time.Duration // Batch scheduler - enableBatchScheduler bool + enableBatchScheduler bool + kubeSchedulerNames []string + defaultBatchScheduler string // Spark web UI service and ingress enableUIService bool @@ -104,6 +109,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(schedulingv1alpha1.AddToScheme(scheme)) utilruntime.Must(v1beta1.AddToScheme(scheme)) utilruntime.Must(v1beta2.AddToScheme(scheme)) @@ -124,10 +130,13 @@ func NewStartCommand() *cobra.Command { } command.Flags().IntVar(&controllerThreads, "controller-threads", 10, "Number of worker threads used by the SparkApplication controller.") - command.Flags().StringSliceVar(&namespaces, "namespaces", []string{""}, "The Kubernetes namespace to manage. Will manage custom resource objects of the managed CRD types for the whole cluster if unset.") + command.Flags().StringSliceVar(&namespaces, "namespaces", []string{}, "The Kubernetes namespace to manage. 
Will manage custom resource objects of the managed CRD types for the whole cluster if unset or contains empty string.") command.Flags().DurationVar(&cacheSyncTimeout, "cache-sync-timeout", 30*time.Second, "Informer cache sync timeout.") command.Flags().BoolVar(&enableBatchScheduler, "enable-batch-scheduler", false, "Enable batch schedulers.") + command.Flags().StringSliceVar(&kubeSchedulerNames, "kube-scheduler-names", []string{}, "The kube-scheduler names for scheduling Spark applications.") + command.Flags().StringVar(&defaultBatchScheduler, "default-batch-scheduler", "", "Default batch scheduler.") + command.Flags().BoolVar(&enableUIService, "enable-ui-service", true, "Enable Spark Web UI service.") command.Flags().StringVar(&ingressClassName, "ingress-class-name", "", "Set ingressClassName for ingress resources created.") command.Flags().StringVar(&ingressURLFormat, "ingress-url-format", "", "Ingress URL format.") @@ -207,8 +216,19 @@ func start() { var registry *scheduler.Registry if enableBatchScheduler { registry = scheduler.GetRegistry() - registry.Register(common.VolcanoSchedulerName, volcano.Factory) - registry.Register(yunikorn.SchedulerName, yunikorn.Factory) + _ = registry.Register(common.VolcanoSchedulerName, volcano.Factory) + _ = registry.Register(yunikorn.SchedulerName, yunikorn.Factory) + + // Register kube-schedulers. + for _, name := range kubeSchedulerNames { + registry.Register(name, kubescheduler.Factory) + } + + schedulerNames := registry.GetRegisteredSchedulerNames() + if defaultBatchScheduler != "" && !slices.Contains(schedulerNames, defaultBatchScheduler) { + logger.Error(nil, "Failed to find default batch scheduler in registered schedulers") + os.Exit(1) + } } // Setup controller for SparkApplication. @@ -348,9 +368,13 @@ func newSparkApplicationReconcilerOptions() sparkapplication.Options { EnableUIService: enableUIService, IngressClassName: ingressClassName, IngressURLFormat: ingressURLFormat, + DefaultBatchScheduler: defaultBatchScheduler, SparkApplicationMetrics: sparkApplicationMetrics, SparkExecutorMetrics: sparkExecutorMetrics, } + if enableBatchScheduler { + options.KubeSchedulerNames = kubeSchedulerNames + } return options } diff --git a/cmd/operator/webhook/start.go b/cmd/operator/webhook/start.go index b9b7d8d9e..b6b08777a 100644 --- a/cmd/operator/webhook/start.go +++ b/cmd/operator/webhook/start.go @@ -130,7 +130,7 @@ func NewStartCommand() *cobra.Command { } command.Flags().IntVar(&controllerThreads, "controller-threads", 10, "Number of worker threads used by the SparkApplication controller.") - command.Flags().StringSliceVar(&namespaces, "namespaces", []string{""}, "The Kubernetes namespace to manage. Will manage custom resource objects of the managed CRD types for the whole cluster if unset.") + command.Flags().StringSliceVar(&namespaces, "namespaces", []string{}, "The Kubernetes namespace to manage. 
Will manage custom resource objects of the managed CRD types for the whole cluster if unset or contains empty string.") command.Flags().StringVar(&labelSelectorFilter, "label-selector-filter", "", "A comma-separated list of key=value, or key labels to filter resources during watch and list based on the specified labels.") command.Flags().DurationVar(&cacheSyncTimeout, "cache-sync-timeout", 30*time.Second, "Informer cache sync timeout.") diff --git a/docs/release.md b/docs/release.md index d3a385b60..35c313ce7 100644 --- a/docs/release.md +++ b/docs/release.md @@ -71,28 +71,42 @@ If you want to push changes to the `release-X.Y` release branch, you have to che ```bash # Get version and remove the leading 'v' VERSION=$(cat VERSION | sed "s/^v//") + + # Change the version and appVersion in Chart.yaml + # On Linux sed -i "s/^version.*/version: ${VERSION}/" charts/spark-operator-chart/Chart.yaml sed -i "s/^appVersion.*/appVersion: ${VERSION}/" charts/spark-operator-chart/Chart.yaml + + # On MacOS + sed -i '' "s/^version.*/version: ${VERSION}/" charts/spark-operator-chart/Chart.yaml + sed -i '' "s/^appVersion.*/appVersion: ${VERSION}/" charts/spark-operator-chart/Chart.yaml + ``` + +3. Update the Helm chart README: + + ```bash + make helm-docs ``` -3. Commit the changes: +4. Commit the changes: ```bash git add VERSION git add charts/spark-operator-chart/Chart.yaml - git commit -s -m "Release $VERSION" - git push + git add charts/spark-operator-chart/README.md + git commit -s -m "Spark Operator Official Release v${VERSION}" + git push origin release-X.Y ``` -4. Submit a PR to the release branch. After the PR is merged, a new tag will be automatically created if the `VERSION` file has changed. +5. Submit a PR to the release branch. ### Release Spark Operator Image -After a pre-release/release tag is pushed, a release workflow will be triggered to build and push Spark operator docker image to Docker Hub. +After `VERSION` file is modified and pushed to the release branch, a release workflow will be triggered to build and push Spark operator docker images to Docker Hub. ### Publish release -After a pre-release/release tag is pushed, a release workflow will be triggered to create a new draft release. +After `VERSION` file is modified and pushed to the release branch, a release workflow will be triggered to create a new draft release with the Spark operator Helm chart packaged as an artifact. After modifying the release notes, then publish the release. ### Release Spark Operator Helm Chart diff --git a/examples/spark-pi-kube-scheduler.yaml b/examples/spark-pi-kube-scheduler.yaml new file mode 100644 index 000000000..010154fe2 --- /dev/null +++ b/examples/spark-pi-kube-scheduler.yaml @@ -0,0 +1,43 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: spark-pi-kube-scheduler + namespace: default +spec: + type: Scala + mode: cluster + image: spark:3.5.0 + imagePullPolicy: IfNotPresent + mainClass: org.apache.spark.examples.SparkPi + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar + sparkVersion: 3.5.0 + driver: + labels: + version: 3.5.0 + cores: 1 + coreLimit: 1200m + memory: 512m + serviceAccount: spark-operator-spark + executor: + labels: + version: 3.5.0 + instances: 2 + cores: 1 + coreLimit: 1200m + memory: 512m + batchScheduler: kube-scheduler diff --git a/go.mod b/go.mod index ddeb94627..9a40cee58 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,7 @@ require ( k8s.io/utils v0.0.0-20240710235135-d4aae2beeffc sigs.k8s.io/controller-runtime v0.17.5 sigs.k8s.io/yaml v1.4.0 + sigs.k8s.io/scheduler-plugins v0.29.7 volcano.sh/apis v1.9.0 ) diff --git a/go.sum b/go.sum index 947142c72..9fd3aeeac 100644 --- a/go.sum +++ b/go.sum @@ -723,6 +723,8 @@ sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g sigs.k8s.io/kustomize/api v0.17.2/go.mod h1:UWTz9Ct+MvoeQsHcJ5e+vziRRkwimm3HytpZgIYqye0= sigs.k8s.io/kustomize/kyaml v0.17.1 h1:TnxYQxFXzbmNG6gOINgGWQt09GghzgTP6mIurOgrLCQ= sigs.k8s.io/kustomize/kyaml v0.17.1/go.mod h1:9V0mCjIEYjlXuCdYsSXvyoy2BTsLESH7TlGV81S282U= +sigs.k8s.io/scheduler-plugins v0.29.7 h1:FSV/uGoU1shHoCoHDiXYHREI1ZLj/VaOkAWyRWXSZzs= +sigs.k8s.io/scheduler-plugins v0.29.7/go.mod h1:fin+Wv9sMnkcDUtXcBRoR9S4vYVCkhyY4ZMi9mJyzLY= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/internal/controller/scheduledsparkapplication/event_filter.go b/internal/controller/scheduledsparkapplication/event_filter.go index e6ea5487b..f0d7568f5 100644 --- a/internal/controller/scheduledsparkapplication/event_filter.go +++ b/internal/controller/scheduledsparkapplication/event_filter.go @@ -35,9 +35,14 @@ var _ predicate.Predicate = &EventFilter{} // NewEventFilter creates a new EventFilter instance. func NewEventFilter(namespaces []string) *EventFilter { nsMap := make(map[string]bool) - for _, ns := range namespaces { - nsMap[ns] = true + if len(namespaces) == 0 { + nsMap[metav1.NamespaceAll] = true + } else { + for _, ns := range namespaces { + nsMap[ns] = true + } } + return &EventFilter{ namespaces: nsMap, } diff --git a/internal/controller/sparkapplication/controller.go b/internal/controller/sparkapplication/controller.go index 1c49d5f24..fb30be85f 100644 --- a/internal/controller/sparkapplication/controller.go +++ b/internal/controller/sparkapplication/controller.go @@ -43,6 +43,7 @@ import ( "github.com/kubeflow/spark-operator/api/v1beta2" "github.com/kubeflow/spark-operator/internal/metrics" "github.com/kubeflow/spark-operator/internal/scheduler" + "github.com/kubeflow/spark-operator/internal/scheduler/kubescheduler" "github.com/kubeflow/spark-operator/internal/scheduler/volcano" "github.com/kubeflow/spark-operator/internal/scheduler/yunikorn" "github.com/kubeflow/spark-operator/pkg/common" @@ -55,10 +56,13 @@ var ( // Options defines the options of the controller. 
type Options struct { - Namespaces []string - EnableUIService bool - IngressClassName string - IngressURLFormat string + Namespaces []string + EnableUIService bool + IngressClassName string + IngressURLFormat string + DefaultBatchScheduler string + + KubeSchedulerNames []string SparkApplicationMetrics *metrics.SparkApplicationMetrics SparkExecutorMetrics *metrics.SparkExecutorMetrics @@ -1186,14 +1190,24 @@ func (r *Reconciler) resetSparkApplicationStatus(app *v1beta2.SparkApplication) } func (r *Reconciler) shouldDoBatchScheduling(app *v1beta2.SparkApplication) (bool, scheduler.Interface) { - if r.registry == nil || app.Spec.BatchScheduler == nil || *app.Spec.BatchScheduler == "" { + // If batch scheduling isn't enabled + if r.registry == nil { + return false, nil + } + + schedulerName := r.options.DefaultBatchScheduler + if app.Spec.BatchScheduler != nil && *app.Spec.BatchScheduler != "" { + schedulerName = *app.Spec.BatchScheduler + } + + // If both the default and app batch scheduler are unspecified or empty + if schedulerName == "" { return false, nil } var err error var scheduler scheduler.Interface - schedulerName := *app.Spec.BatchScheduler switch schedulerName { case common.VolcanoSchedulerName: config := &volcano.Config{ @@ -1204,6 +1218,16 @@ func (r *Reconciler) shouldDoBatchScheduling(app *v1beta2.SparkApplication) (boo scheduler, err = r.registry.GetScheduler(schedulerName, nil) } + for _, name := range r.options.KubeSchedulerNames { + if schedulerName == name { + config := &kubescheduler.Config{ + SchedulerName: name, + Client: r.manager.GetClient(), + } + scheduler, err = r.registry.GetScheduler(name, config) + } + } + if err != nil || scheduler == nil { logger.Error(err, "Failed to get scheduler for SparkApplication", "name", app.Name, "namespace", app.Namespace, "scheduler", schedulerName) return false, nil diff --git a/internal/controller/sparkapplication/event_filter.go b/internal/controller/sparkapplication/event_filter.go index 3fe49ee13..121155f2e 100644 --- a/internal/controller/sparkapplication/event_filter.go +++ b/internal/controller/sparkapplication/event_filter.go @@ -42,8 +42,12 @@ var _ predicate.Predicate = &sparkPodEventFilter{} // newSparkPodEventFilter creates a new SparkPodEventFilter instance. func newSparkPodEventFilter(namespaces []string) *sparkPodEventFilter { nsMap := make(map[string]bool) - for _, ns := range namespaces { - nsMap[ns] = true + if len(namespaces) == 0 { + nsMap[metav1.NamespaceAll] = true + } else { + for _, ns := range namespaces { + nsMap[ns] = true + } } return &sparkPodEventFilter{ @@ -118,8 +122,12 @@ var _ predicate.Predicate = &EventFilter{} func NewSparkApplicationEventFilter(client client.Client, recorder record.EventRecorder, namespaces []string) *EventFilter { nsMap := make(map[string]bool) - for _, ns := range namespaces { - nsMap[ns] = true + if len(namespaces) == 0 { + nsMap[metav1.NamespaceAll] = true + } else { + for _, ns := range namespaces { + nsMap[ns] = true + } } return &EventFilter{ diff --git a/internal/scheduler/kubescheduler/scheduler.go b/internal/scheduler/kubescheduler/scheduler.go new file mode 100644 index 000000000..b7126b137 --- /dev/null +++ b/internal/scheduler/kubescheduler/scheduler.go @@ -0,0 +1,159 @@ +/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubescheduler + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + schedulingv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + + "github.com/kubeflow/spark-operator/api/v1beta2" + "github.com/kubeflow/spark-operator/internal/scheduler" + "github.com/kubeflow/spark-operator/pkg/util" +) + +const ( + Name = "kube-scheduler" +) + +var ( + logger = log.Log.WithName("") +) + +// Scheduler is a scheduler that uses scheduler plugins to schedule Spark pods. +// Ref: https://github.com/kubernetes-sigs/scheduler-plugins. +type Scheduler struct { + name string + client client.Client +} + +// Scheduler implements scheduler.Interface. +var _ scheduler.Interface = &Scheduler{} + +// Config defines the configurations of kube-scheduler. +type Config struct { + SchedulerName string + Client client.Client +} + +// Config implements scheduler.Config. +var _ scheduler.Config = &Config{} + +// Factory creates a new Scheduler instance. +func Factory(config scheduler.Config) (scheduler.Interface, error) { + c, ok := config.(*Config) + if !ok { + return nil, fmt.Errorf("failed to get kube-scheduler config") + } + + scheduler := &Scheduler{ + name: c.SchedulerName, + client: c.Client, + } + return scheduler, nil +} + +// Name implements scheduler.Interface. +func (s *Scheduler) Name() string { + return s.name +} + +// ShouldSchedule implements scheduler.Interface. +func (s *Scheduler) ShouldSchedule(app *v1beta2.SparkApplication) bool { + // There is no additional requirements for scheduling. + return true +} + +// Schedule implements scheduler.Interface. +func (s *Scheduler) Schedule(app *v1beta2.SparkApplication) error { + minResources := util.SumResourceList([]corev1.ResourceList{util.GetDriverRequestResource(app), util.GetExecutorRequestResource(app)}) + podGroup := &schedulingv1alpha1.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: getPodGroupName(app), + Namespace: app.Namespace, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(app, v1beta2.SchemeGroupVersion.WithKind("SparkApplication")), + }, + }, + Spec: schedulingv1alpha1.PodGroupSpec{ + MinMember: 1, + MinResources: minResources, + }, + } + + if err := s.syncPodGroup(podGroup); err != nil { + return fmt.Errorf("failed to sync pod group: %v", err) + } + + // Add a label `scheduling.x-k8s.io/pod-group` to mark the pod belongs to a group + if app.ObjectMeta.Labels == nil { + app.ObjectMeta.Labels = make(map[string]string) + } + app.ObjectMeta.Labels[schedulingv1alpha1.PodGroupLabel] = podGroup.Name + + return nil +} + +// Cleanup implements scheduler.Interface. 
+func (s *Scheduler) Cleanup(app *v1beta2.SparkApplication) error { + podGroup := &schedulingv1alpha1.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: getPodGroupName(app), + Namespace: app.Namespace, + }, + } + if err := s.client.Delete(context.TODO(), podGroup); err != nil { + if errors.IsNotFound(err) { + return nil + } + return err + } + logger.Info("Deleted PodGroup", "Name", podGroup.Name, "Namespace", podGroup.Namespace) + return nil +} + +func (s *Scheduler) syncPodGroup(podGroup *schedulingv1alpha1.PodGroup) error { + key := types.NamespacedName{ + Namespace: podGroup.Namespace, + Name: podGroup.Name, + } + + if err := s.client.Get(context.TODO(), key, &schedulingv1alpha1.PodGroup{}); err != nil { + if !errors.IsNotFound(err) { + return err + } + + if err := s.client.Create(context.TODO(), podGroup); err != nil { + return err + } + logger.Info("Created PodGroup", "Name", podGroup.Name, "Namespace", podGroup.Namespace) + return nil + } + + if err := s.client.Update(context.TODO(), podGroup); err != nil { + return err + } + logger.Info("Updated PodGroup", "Name", podGroup.Name, "Namespace", podGroup.Namespace) + return nil +} diff --git a/internal/scheduler/kubescheduler/util.go b/internal/scheduler/kubescheduler/util.go new file mode 100644 index 000000000..f4996a204 --- /dev/null +++ b/internal/scheduler/kubescheduler/util.go @@ -0,0 +1,27 @@ +/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubescheduler + +import ( + "fmt" + + "github.com/kubeflow/spark-operator/api/v1beta2" +) + +func getPodGroupName(app *v1beta2.SparkApplication) string { + return fmt.Sprintf("%s-pg", app.Name) +} diff --git a/test/e2e/suit_test.go b/test/e2e/suit_test.go index 4c60f9762..5c8a21dd4 100644 --- a/test/e2e/suit_test.go +++ b/test/e2e/suit_test.go @@ -27,9 +27,9 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart/loader" + "helm.sh/helm/v3/pkg/chartutil" "helm.sh/helm/v3/pkg/cli" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -128,7 +128,10 @@ var _ = BeforeSuite(func() { chart, err := loader.Load(chartPath) Expect(err).NotTo(HaveOccurred()) Expect(chart).NotTo(BeNil()) - release, err := installAction.Run(chart, nil) + values, err := chartutil.ReadValuesFile(filepath.Join(chartPath, "ci", "ci-values.yaml")) + Expect(err).NotTo(HaveOccurred()) + Expect(values).NotTo(BeNil()) + release, err := installAction.Run(chart, values) Expect(err).NotTo(HaveOccurred()) Expect(release).NotTo(BeNil()) })
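Note for anyone trying this change locally: the new chart options introduced above compose into a small values override. This is a minimal sketch rather than part of the change itself — the file name "values-batch-scheduling.yaml", the release name "spark-operator", and the scheduler name "kube-scheduler" are illustrative assumptions; the name must refer to a scheduler-plugins scheduler actually running in the cluster, and the controller only registers names listed in controller.batchScheduler.kubeSchedulerNames.

# values-batch-scheduling.yaml (hypothetical file name)
controller:
  batchScheduler:
    enable: true                 # renders --enable-batch-scheduler=true
    kubeSchedulerNames:
      - kube-scheduler           # renders --kube-scheduler-names=kube-scheduler
    # default: kube-scheduler   # optional; must name a registered scheduler,
    #                           # otherwise the controller exits on startup
spark:
  jobNamespaces:
    - ""                         # renders --namespaces="" (watch all namespaces)
webhook:
  enable: true                   # webhook resources are now gated on this flag

With these values, "helm install spark-operator charts/spark-operator-chart -f values-batch-scheduling.yaml" deploys the operator, and the bundled examples/spark-pi-kube-scheduler.yaml manifest (which sets batchScheduler: kube-scheduler) exercises the new PodGroup-based scheduling path end to end.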