From f70b3854482814e7b0c33e2321e7fcb3f2333774 Mon Sep 17 00:00:00 2001
From: David Leifker
Date: Sat, 31 Dec 2022 15:44:18 -0600
Subject: [PATCH 1/3] feat(elasticsearch): adding elasticsearch index job

Add a Helm hook Job (<release>-datahub-build-indices-job) that runs the
datahub-upgrade image with the arguments `-u BuildIndices`. The job is
only rendered when datahubUpgradeBuildIndices.enabled is true (default
false).

Add the global.elasticsearch.index values that configure the job, and set
BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID on the datahub-gms and
datahub-mae-consumer deployments when
global.elasticsearch.index.upgrade.enabled is true.

---
 charts/datahub/Chart.yaml                     |   6 +-
 .../datahub/subcharts/datahub-gms/Chart.yaml  |   2 +-
 .../datahub-gms/templates/deployment.yaml     |   4 +
 .../datahub/subcharts/datahub-gms/values.yaml |   6 +
 .../subcharts/datahub-mae-consumer/Chart.yaml |   2 +-
 .../templates/deployment.yaml                 |   4 +
 .../datahub-mae-consumer/values.yaml          |   6 +
 .../datahub-build-indices-job.yml             | 212 ++++++++++++++++++
 charts/datahub/values.yaml                    |  50 +++++
 9 files changed, 287 insertions(+), 5 deletions(-)
 create mode 100644 charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml

diff --git a/charts/datahub/Chart.yaml b/charts/datahub/Chart.yaml
index 8dd1baf1d..82c65ad7c 100644
--- a/charts/datahub/Chart.yaml
+++ b/charts/datahub/Chart.yaml
@@ -4,13 +4,13 @@ description: A Helm chart for LinkedIn DataHub
 type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.2.126
+version: 0.2.127
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
 appVersion: 0.9.5
 dependencies:
   - name: datahub-gms
-    version: 0.2.121
+    version: 0.2.122
     repository: file://./subcharts/datahub-gms
     condition: datahub-gms.enabled
   - name: datahub-frontend
@@ -18,7 +18,7 @@ dependencies:
     repository: file://./subcharts/datahub-frontend
     condition: datahub-frontend.enabled
   - name: datahub-mae-consumer
-    version: 0.2.121
+    version: 0.2.122
     repository: file://./subcharts/datahub-mae-consumer
     condition: global.datahub_standalone_consumers_enabled
   - name: datahub-mce-consumer
diff --git a/charts/datahub/subcharts/datahub-gms/Chart.yaml b/charts/datahub/subcharts/datahub-gms/Chart.yaml
index 164dc1d78..5c3b61683 100644
--- a/charts/datahub/subcharts/datahub-gms/Chart.yaml
+++ b/charts/datahub/subcharts/datahub-gms/Chart.yaml
@@ -12,7 +12,7 @@ description: A Helm chart for LinkedIn DataHub's datahub-gms component
 type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.2.121
+version: 0.2.122
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
 appVersion: v0.9.3
diff --git a/charts/datahub/subcharts/datahub-gms/templates/deployment.yaml b/charts/datahub/subcharts/datahub-gms/templates/deployment.yaml
index 89f57237c..fbf372d5f 100644
--- a/charts/datahub/subcharts/datahub-gms/templates/deployment.yaml
+++ b/charts/datahub/subcharts/datahub-gms/templates/deployment.yaml
@@ -152,6 +152,10 @@ spec:
             - name: INDEX_PREFIX
               value: {{ . }}
             {{- end }}
+            {{- if .Values.global.elasticsearch.index.upgrade.enabled }}
+            - name: BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID
+              value: {{ printf "%s-%s" .Release.Name "bihe-consumer-job-client-gms" }}
+            {{- end }}
             - name: GRAPH_SERVICE_IMPL
               value: {{ .Values.global.graph_service_impl }}
             {{- if eq .Values.global.graph_service_impl "neo4j" }}
diff --git a/charts/datahub/subcharts/datahub-gms/values.yaml b/charts/datahub/subcharts/datahub-gms/values.yaml
index 8981e8b72..9c0f8c2bc 100644
--- a/charts/datahub/subcharts/datahub-gms/values.yaml
+++ b/charts/datahub/subcharts/datahub-gms/values.yaml
@@ -126,6 +126,12 @@ global:
     port: "9200"
     skipcheck: "false"

+    ## Settings for supporting datahub-upgrade job for index creation/reindex
+    index:
+      ## The following options control settings for datahub-upgrade job when creating or reindexing indices
+      upgrade:
+        enabled: true
+
   kafka:
     bootstrap:
       server: "broker:9092"
diff --git a/charts/datahub/subcharts/datahub-mae-consumer/Chart.yaml b/charts/datahub/subcharts/datahub-mae-consumer/Chart.yaml
index 1920ffcce..5cdf44d4d 100644
--- a/charts/datahub/subcharts/datahub-mae-consumer/Chart.yaml
+++ b/charts/datahub/subcharts/datahub-mae-consumer/Chart.yaml
@@ -12,7 +12,7 @@ description: A Helm chart for Kubernetes
 type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.2.121
+version: 0.2.122
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
 appVersion: v0.9.3
diff --git a/charts/datahub/subcharts/datahub-mae-consumer/templates/deployment.yaml b/charts/datahub/subcharts/datahub-mae-consumer/templates/deployment.yaml
index e807bd1d8..aeefe92be 100644
--- a/charts/datahub/subcharts/datahub-mae-consumer/templates/deployment.yaml
+++ b/charts/datahub/subcharts/datahub-mae-consumer/templates/deployment.yaml
@@ -132,6 +132,10 @@ spec:
             - name: INDEX_PREFIX
               value: {{ . }}
             {{- end }}
+            {{- if .Values.global.elasticsearch.index.upgrade.enabled }}
+            - name: BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID
+              value: {{ printf "%s-%s" .Release.Name "bihe-consumer-job-client-mcl" }}
+            {{- end }}
             - name: GRAPH_SERVICE_IMPL
               value: {{ .Values.global.graph_service_impl }}
             {{- if eq .Values.global.graph_service_impl "neo4j" }}
diff --git a/charts/datahub/subcharts/datahub-mae-consumer/values.yaml b/charts/datahub/subcharts/datahub-mae-consumer/values.yaml
index 5c8b07143..8f5f5e60e 100644
--- a/charts/datahub/subcharts/datahub-mae-consumer/values.yaml
+++ b/charts/datahub/subcharts/datahub-mae-consumer/values.yaml
@@ -161,6 +161,12 @@ global:
     port: "9200"
     skipcheck: "false"

+    ## Settings for supporting datahub-upgrade job for index creation/reindex
+    index:
+      ## The following options control settings for datahub-upgrade job when creating or reindexing indices
+      upgrade:
+        enabled: true
+
   kafka:
     bootstrap:
       server: "broker:9092"
diff --git a/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
new file mode 100644
index 000000000..c6dd52675
--- /dev/null
+++ b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
@@ -0,0 +1,212 @@
+{{- if .Values.datahubUpgradeBuildIndices.enabled -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ .Release.Name }}-datahub-build-indices-job
+  labels:
+    app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
+    app.kubernetes.io/instance: {{ .Release.Name | quote }}
+    app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+    helm.sh/chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
+  annotations:
+    # This is what defines this resource as a hook. Without this line, the
+    # job is considered part of the release.
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "-5"
+    "helm.sh/hook-delete-policy": before-hook-creation
+spec:
+  template:
+  {{- if or .Values.global.podLabels .Values.datahubUpgradeBuildIndices.podAnnotations}}
+    metadata:
+    {{- with .Values.datahubUpgradeBuildIndices.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.global.podLabels }}
+      labels:
+        {{- range $key, $value := . }}
+        {{ $key }}: {{ $value | quote }}
+        {{- end }}
+    {{- end }}
+  {{- end }}
+    spec:
+    {{- with .Values.global.hostAliases }}
+      hostAliases:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.datahubUpgradeBuildIndices.serviceAccount }}
+      serviceAccountName: {{ . }}
+    {{- end }}
+    {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+      volumes:
+      {{- with .Values.global.credentialsAndCertsSecrets }}
+        - name: datahub-certs-dir
+          secret:
+            defaultMode: 0444
+            secretName: {{ .name }}
+      {{- end }}
+      {{- with .Values.datahubUpgradeBuildIndices.extraVolumes }}
+        {{- toYaml . | nindent 8}}
+      {{- end }}
+      restartPolicy: Never
+      securityContext:
+        {{- toYaml .Values.datahubUpgradeBuildIndices.podSecurityContext | nindent 8 }}
+      initContainers:
+      {{- with .Values.datahubUpgradeBuildIndices.extraInitContainers }}
+        {{- toYaml . | nindent 12 }}
+      {{- end }}
+      containers:
+        - name: datahub-build-indices-job
+          image: "{{ .Values.datahubUpgradeBuildIndices.image.repository }}:{{ required "Global or specific tag is required" (.Values.datahubUpgradeBuildIndices.image.tag | default .Values.global.datahub.version) }}"
+          imagePullPolicy: {{ .Values.datahubUpgradeBuildIndices.imagePullPolicy | default "IfNotPresent" }}
+          args:
+            - "-u"
+            - "BuildIndices"
+          env:
+            {{- include "datahub.upgrade.env" . | nindent 12}}
+            - name: DATAHUB_ANALYTICS_ENABLED
+              value: {{ .Values.global.datahub_analytics_enabled | quote }}
+            - name: ENTITY_REGISTRY_CONFIG_PATH
+              value: /datahub/datahub-gms/resources/entity-registry.yml
+            - name: EBEAN_DATASOURCE_USERNAME
+              value: {{ (.Values.sql).datasource.username | default .Values.global.sql.datasource.username | quote }}
+            - name: EBEAN_DATASOURCE_PASSWORD
+            {{- $passwordValue := (.Values.sql).datasource.password.value | default .Values.global.sql.datasource.password.value }}
+            {{- if $passwordValue }}
+              value: {{ $passwordValue | quote }}
+            {{- else }}
+              valueFrom:
+                secretKeyRef:
+                  name: "{{ (.Values.sql).datasource.password.secretRef | default .Values.global.sql.datasource.password.secretRef }}"
+                  key: "{{ (.Values.sql).datasource.password.secretKey | default .Values.global.sql.datasource.password.secretKey }}"
+            {{- end }}
+            - name: EBEAN_DATASOURCE_HOST
+              value: "{{ .Values.global.sql.datasource.host }}"
+            - name: EBEAN_DATASOURCE_URL
+              value: "{{ .Values.global.sql.datasource.url }}"
+            - name: EBEAN_DATASOURCE_DRIVER
+              value: "{{ .Values.global.sql.datasource.driver }}"
+            - name: KAFKA_BOOTSTRAP_SERVER
+              value: "{{ .Values.global.kafka.bootstrap.server }}"
+            {{- with .Values.global.kafka.schemaregistry.url }}
+            - name: KAFKA_SCHEMAREGISTRY_URL
+              value: "{{ . }}"
+            {{- end }}
+            {{- with .Values.global.kafka.schemaregistry.type }}
+            - name: SCHEMA_REGISTRY_TYPE
+              value: "{{ . }}"
+            {{- end }}
+            {{- with .Values.global.kafka.schemaregistry.glue }}
+            - name: AWS_GLUE_SCHEMA_REGISTRY_REGION
+              value: "{{ .region }}"
+            {{- with .registry }}
+            - name: AWS_GLUE_SCHEMA_REGISTRY_NAME
+              value: "{{ . }}"
+            {{- end }}
+            {{- end }}
+            - name: ELASTICSEARCH_HOST
+              value: "{{ .Values.global.elasticsearch.host }}"
+            - name: ELASTICSEARCH_PORT
+              value: "{{ .Values.global.elasticsearch.port }}"
+            - name: SKIP_ELASTICSEARCH_CHECK
+              value: "{{ .Values.global.elasticsearch.skipcheck }}"
+            {{- with .Values.global.elasticsearch.useSSL }}
+            - name: ELASTICSEARCH_USE_SSL
+              value: {{ . | quote }}
+            {{- end }}
+            {{- with .Values.global.elasticsearch.auth }}
+            - name: ELASTICSEARCH_USERNAME
+              value: {{ .username | quote }}
+            - name: ELASTICSEARCH_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: "{{ .password.secretRef }}"
+                  key: "{{ .password.secretKey }}"
+            {{- end }}
+            {{- with .Values.global.elasticsearch.indexPrefix }}
+            - name: INDEX_PREFIX
+              value: {{ . | quote }}
+            {{- end }}
+            - name: ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES
+              value: {{ .Values.global.elasticsearch.index.upgrade.cloneIndices | quote }}
+            {{- with .Values.global.elasticsearch.index.enableMappingsReindex }}
+            - name: ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX
+              value: {{ . | quote }}
+            {{- end }}
+            {{- with .Values.global.elasticsearch.index.enableSettingsReindex }}
+            - name: ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX
+              value: {{ . | quote }}
+            {{- end }}
+            {{- with .Values.global.elasticsearch.index.settingsOverrides }}
+            - name: ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES
+              value: {{ . | quote }}
+            {{- end }}
+            {{- with .Values.global.elasticsearch.index.entitySettingsOverrides }}
+            - name: ELASTICSEARCH_INDEX_BUILDER_ENTITY_SETTINGS_OVERRIDES
+              value: {{ . | quote }}
+            {{- end }}
+            {{- with .Values.global.elasticsearch.index.refreshIntervalSeconds }}
+            - name: ELASTICSEARCH_INDEX_BUILDER_REFRESH_INTERVAL_SECONDS
+              value: {{ . | quote }}
+            {{- end }}
+            - name: GRAPH_SERVICE_IMPL
+              value: {{ .Values.global.graph_service_impl }}
+            {{- if eq .Values.global.graph_service_impl "neo4j" }}
+            - name: NEO4J_HOST
+              value: "{{ .Values.global.neo4j.host }}"
+            - name: NEO4J_URI
+              value: "{{ .Values.global.neo4j.uri }}"
+            - name: NEO4J_USERNAME
+              value: "{{ .Values.global.neo4j.username }}"
+            - name: NEO4J_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: "{{ .Values.global.neo4j.password.secretRef }}"
+                  key: "{{ .Values.global.neo4j.password.secretKey }}"
+            {{- end }}
+            {{- if .Values.global.springKafkaConfigurationOverrides }}
+            {{- range $configName, $configValue := .Values.global.springKafkaConfigurationOverrides }}
+            - name: SPRING_KAFKA_PROPERTIES_{{ $configName | replace "." "_" | upper }}
+              value: {{ $configValue | quote }}
+            {{- end }}
+            {{- end }}
+            {{- if .Values.global.credentialsAndCertsSecrets }}
+            {{- range $envVarName, $envVarValue := .Values.global.credentialsAndCertsSecrets.secureEnv }}
+            - name: SPRING_KAFKA_PROPERTIES_{{ $envVarName | replace "." "_" | upper }}
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $.Values.global.credentialsAndCertsSecrets.name }}
+                  key: {{ $envVarValue }}
+            {{- end }}
+            {{- end }}
+            {{- with .Values.datahubUpgradeBuildIndices.extraEnvs }}
+              {{- toYaml . | nindent 12 }}
+            {{- end }}
+          securityContext:
+            {{- toYaml .Values.datahubUpgradeBuildIndices.securityContext | nindent 12 }}
+          volumeMounts:
+          {{- with .Values.global.credentialsAndCertsSecrets }}
+            - name: datahub-certs-dir
+              mountPath: {{ .path | default "/mnt/certs" }}
+          {{- end }}
+          {{- with .Values.datahubUpgradeBuildIndices.extraVolumeMounts }}
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          resources:
+            {{- toYaml .Values.datahubUpgradeBuildIndices.resources | nindent 12 }}
+      {{- with .Values.datahubUpgradeBuildIndices.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 12 }}
+      {{- end }}
+      {{- with .Values.datahubUpgradeBuildIndices.affinity }}
+      affinity:
+        {{- toYaml . | nindent 12 }}
+      {{- end }}
+      {{- with .Values.datahubUpgradeBuildIndices.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 12 }}
+      {{- end }}
+{{- end -}}
diff --git a/charts/datahub/values.yaml b/charts/datahub/values.yaml
index 309032b28..ee068f650 100644
--- a/charts/datahub/values.yaml
+++ b/charts/datahub/values.yaml
@@ -156,6 +156,27 @@ datahubUpgrade:
       cpu: 300m
       memory: 256Mi

+## Elasticsearch Indices Creation/Reindex
+## ** This feature is currently under development **
+## See global.elasticsearch.index for additional configuration
+datahubUpgradeBuildIndices:
+  enabled: false
+  image:
+    repository: acryldata/datahub-upgrade
+    # tag:
+  podSecurityContext: {}
+    # fsGroup: 1000
+  securityContext: {}
+    # runAsUser: 1000
+  podAnnotations: {}
+  resources:
+    limits:
+      cpu: 500m
+      memory: 512Mi
+    requests:
+      cpu: 300m
+      memory: 256Mi
+
 global:
   graph_service_impl: neo4j
   datahub_analytics_enabled: true
@@ -168,6 +189,35 @@ global:
     insecure: "false"
     useSSL: "false"

+    ## The following section controls when and how reindexing of elasticsearch indices is performed
+    index:
+      ## Enable reindexing when mappings change based on the data model annotations
+      enableMappingsReindex: false
+
+      ## Enable reindexing when static index settings change.
+      ## Dynamic settings which do not require reindexing are not affected
+      ## Primarily this should be enabled when re-sharding is necessary for scaling/performance.
+      enableSettingsReindex: false
+
+      ## Index settings can be overridden for entity indices or other indices on an index by index basis
+      ## Some index settings, such as # of shards, require reindexing while others, e.g. replicas, do not
+      ## Non-Entity indices do not require the prefix
+      # settingsOverrides: '{"graph_service_v1":{"number_of_shards":"5"},"system_metadata_service_v1":{"number_of_shards":"5"}}'
+      ## Entity indices do not require the prefix or suffix
+      # entitySettingsOverrides: '{"dataset":{"number_of_shards":"10"}}'
+
+      ## The amount of delay between indexing a document and having it returned in queries
+      ## Increasing this value can improve performance when ingesting large amounts of data
+      # refreshIntervalSeconds: 1
+
+      ## The following options control settings for datahub-upgrade job when creating or reindexing indices
+      upgrade:
+        enabled: true
+
+        ## When reindexing is required, this option will clone the existing index as a backup
+        ## The clone indices are not currently managed
+        # cloneIndices: true
+
   kafka:
     bootstrap:
       server: "prerequisites-kafka:9092"

From 926dd2adb7d8825510f40d1a8615f5836d3b686e Mon Sep 17 00:00:00 2001
From: David Leifker
Date: Tue, 3 Jan 2023 09:09:03 -0600
Subject: [PATCH 2/3] feat(elasticsearch): allow document count mismatch
 override

Pass ELASTICSEARCH_BUILD_INDICES_ALLOW_DOC_COUNT_MISMATCH to the
build-indices job when
global.elasticsearch.index.upgrade.allowDocCountMismatch is set, and
document the option in values.yaml.

---
 charts/datahub/Chart.yaml                             |  2 +-
 .../datahub-upgrade/datahub-build-indices-job.yml     |  4 ++++
 charts/datahub/values.yaml                            | 11 ++++++++++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/charts/datahub/Chart.yaml b/charts/datahub/Chart.yaml
index 82c65ad7c..cd58fefd8 100644
--- a/charts/datahub/Chart.yaml
+++ b/charts/datahub/Chart.yaml
@@ -4,7 +4,7 @@ description: A Helm chart for LinkedIn DataHub
 type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.2.127
+version: 0.2.128
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
 appVersion: 0.9.5
diff --git a/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
index c6dd52675..c5bc174af 100644
--- a/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
+++ b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
@@ -152,6 +152,10 @@ spec:
             - name: ELASTICSEARCH_INDEX_BUILDER_REFRESH_INTERVAL_SECONDS
               value: {{ . | quote }}
             {{- end }}
+            {{- with .Values.global.elasticsearch.index.upgrade.allowDocCountMismatch }}
+            - name: ELASTICSEARCH_BUILD_INDICES_ALLOW_DOC_COUNT_MISMATCH
+              value: {{ . | quote }}
+            {{- end }}
             - name: GRAPH_SERVICE_IMPL
               value: {{ .Values.global.graph_service_impl }}
             {{- if eq .Values.global.graph_service_impl "neo4j" }}
diff --git a/charts/datahub/values.yaml b/charts/datahub/values.yaml
index ee068f650..51db17d36 100644
--- a/charts/datahub/values.yaml
+++ b/charts/datahub/values.yaml
@@ -215,9 +215,18 @@ global:
         enabled: true

         ## When reindexing is required, this option will clone the existing index as a backup
-        ## The clone indices are not currently managed
+        ## The clone indices are not currently managed.
         # cloneIndices: true

+        ## Typically when reindexing the document counts between the original and destination indices should match.
+        ## In some cases reindexing might not be able to proceed due to incompatibilities between a document in the
+        ## original index and the new index's mappings. This document could be dropped and re-ingested or restored from
+        ## the SQL database.
+        ##
+        ## This setting allows continuing if and only if the cloneIndices setting is also enabled which
+        ## ensures a complete backup of the original index is preserved.
+        # allowDocCountMismatch: false
+
   kafka:
     bootstrap:
       server: "prerequisites-kafka:9092"

From 3163a13e046aa2cb319eca95037cf5c8a299f0ea Mon Sep 17 00:00:00 2001
From: David Leifker
Date: Tue, 3 Jan 2023 18:31:47 -0600
Subject: [PATCH 3/3] Adjusting hook order build-indices job

Change the build-indices job's helm.sh/hook-weight from "-5" to "-4".

---
NOTE(review): this patch also raises the datahub-gms dependency to
0.2.123, while the only change to subcharts/datahub-gms/Chart.yaml in
this series sets it to 0.2.122. Confirm the subchart version on the
target branch matches before merging.

 charts/datahub/Chart.yaml                                   | 2 +-
 .../templates/datahub-upgrade/datahub-build-indices-job.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/charts/datahub/Chart.yaml b/charts/datahub/Chart.yaml
index cd58fefd8..5fd9e94f6 100644
--- a/charts/datahub/Chart.yaml
+++ b/charts/datahub/Chart.yaml
@@ -10,7 +10,7 @@ version: 0.2.128
 appVersion: 0.9.5
 dependencies:
   - name: datahub-gms
-    version: 0.2.122
+    version: 0.2.123
     repository: file://./subcharts/datahub-gms
     condition: datahub-gms.enabled
   - name: datahub-frontend
diff --git a/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
index c5bc174af..e1b4368a5 100644
--- a/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
+++ b/charts/datahub/templates/datahub-upgrade/datahub-build-indices-job.yml
@@ -12,7 +12,7 @@ metadata:
     # This is what defines this resource as a hook. Without this line, the
     # job is considered part of the release.
     "helm.sh/hook": post-install,post-upgrade
-    "helm.sh/hook-weight": "-5"
+    "helm.sh/hook-weight": "-4"
     "helm.sh/hook-delete-policy": before-hook-creation
 spec:
   template: