From ec9cb7acef91cd6381bed05a77b8e5d29437e201 Mon Sep 17 00:00:00 2001 From: Alexander Zielenski <351783+alexzielenski@users.noreply.github.com> Date: Tue, 16 May 2023 13:13:32 -0700 Subject: [PATCH] add crd validation ratcheting KEP --- .../sig-api-machinery/4008.yaml | 3 + .../4008-crd-ratcheting/README.md | 1293 +++++++++++++++++ .../4008-crd-ratcheting/kep.yaml | 48 + 3 files changed, 1344 insertions(+) create mode 100644 keps/prod-readiness/sig-api-machinery/4008.yaml create mode 100644 keps/sig-api-machinery/4008-crd-ratcheting/README.md create mode 100644 keps/sig-api-machinery/4008-crd-ratcheting/kep.yaml diff --git a/keps/prod-readiness/sig-api-machinery/4008.yaml b/keps/prod-readiness/sig-api-machinery/4008.yaml new file mode 100644 index 000000000000..d132f02640bb --- /dev/null +++ b/keps/prod-readiness/sig-api-machinery/4008.yaml @@ -0,0 +1,3 @@ +kep-number: 4008 +alpha: + approver: "@deads2k" diff --git a/keps/sig-api-machinery/4008-crd-ratcheting/README.md b/keps/sig-api-machinery/4008-crd-ratcheting/README.md new file mode 100644 index 000000000000..0bc74e4c028a --- /dev/null +++ b/keps/sig-api-machinery/4008-crd-ratcheting/README.md @@ -0,0 +1,1293 @@ + +# KEP-4008: CRD Validation Ratcheting + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [CRD Author Tightens a Field](#crd-author-tightens-a-field) + - [K8s Update Tightens CRD validation](#k8s-update-tightens-crd-validation) + - [K8s Update Widens CRD Validation](#k8s-update-widens-crd-validation) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) + - [Detection of Breaking Schema Changes is Different](#detection-of-breaking-schema-changes-is-different) + - [Mitigation: Use CRD-Schema-Checker](#mitigation-use-crd-schema-checker) + - [Mitigation: Use New Objects](#mitigation-use-new-objects) + - [Not All Rules Can Be Correctly/Easily Ratcheted](#not-all-rules-can-be-correctlyeasily-ratcheted) + - [Mitigation: Blacklisted Validations](#mitigation-blacklisted-validations) + - [Mitigation: Conservative Ratcheting Rule](#mitigation-conservative-ratcheting-rule) +- [Design Details](#design-details) + - [kube-openapi changes](#-changes) + - [Nested Value Validations](#nested-value-validations) + - [Structural-Schema-based validation changes](#structural-schema-based-validation-changes) + - [Correlation of Old and New](#correlation-of-old-and-new) + - [Cel-Validator changes](#cel-validator-changes) +- [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [CRDs Opt-in to Ratcheting Functionality](#crds-opt-in-to-ratcheting-functionality) + - [Offline Pass to Flag Invalid Objects at Rest During Upgrade](#offline-pass-to-flag-invalid-objects-at-rest-during-upgrade) + - [Post-Process Errors](#post-process-errors) + - [Drawbacks](#drawbacks-1) + - [Robustness of paths returned by OpenAPI Schema Validator](#robustness-of-paths-returned-by-openapi-schema-validator) + - [Evaluation of JSON Paths](#evaluation-of-json-paths) + - [Correlating Errors To Fields](#correlating-errors-to-fields) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [x] (R) KEP approvers have approved the KEP status as `implementable` +- [x] (R) Design details are appropriately documented +- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [x] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [x] (R) Production readiness review completed +- [x] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + +The ability to shift left validation logic from controllers to the front-end +is a long-term goal for improving the useability of the Kubernetes project. As +it stands today, our treatment of validation for unchanged fields stands as a +barrier to both CRD authors and Kubernetes developers to adopting the available +validation features. + +## Motivation + + + +CRD authors today are thrown into the deep end when it comes to validation of +the values in their schemas. Generally authors understand to increment version +when adjusting the field layout of a CRD, but to do so when only modifying value +validations is cumbersome, and more trouble than it is worth. + +Modifying a value validation on a CRD today means that you risk breaking the +workflow of all your users, this high price to pay limits adoption, and degrades +the Kubernetes user experience: we are prevented from shifting validation logic left. + +Additionally, this sad state of affairs stunts the advancement of Kubernetes itself. +[KEP-3937](https://github.com/kubernetes/enhancements/pull/3938) proposes to add +declarative validation to Kubernetes, but to do that would require adding new +`format` types. This would also break existing Kubernetes user workflows. + +See User Stories for more detailed description of +the motivation behind this KEP. + +### Goals + + + +1. Remove barriers blocking CRD authors from widening value validations +2. Remove barriers blocking CRD authors from tightening value validations +3. Remove barriers blocking Kubernetes from widening validations +4. Remove barriers blocking Kubernetes from tightening validations +5. Do this automatically for all CRDs installed into clusters with the feature enabled +6. Performance Goals: + - Constant/negligible persistent overhead + - Up to 5% time overhead for resource writes + +### Non-Goals + + +1. Implement complicated logic that can unravel the dependencies of CEL rules ([#116779](https://github.com/kubernetes/kubernetes/pull/116779) is a CEL-specific ratcheting feature) +2. Completeness (not every rule will be able to be ratcheted) + +## Proposal + + + +New Feature Flag: `CRDValidationRatcheting` + +When enabled, this flag allows updates to custom resources that fail validation to succeed if the validation errors were on unchanged keypaths. + +For example: + +Take this CRD schema: + +```yaml +properties: + myField: + type: string + myOtherField: + type: string +``` + +Assume we have applied the following resource to our cluster: + +```yaml +apiVersion: stable.example.com/v1 +kind: MyCRD +myField: "" +``` + +Now, lets assume the CRD schema was changed to tighten validation: + +```yaml +properties: + myField: + type: string + minLength: 2 + myOtherField: + type: string +``` + +When a controller attempts to change one field they get an error: + +```yaml +apiVersion: stable.example.com/v1 +kind: MyCRD +myField: "" +myOtherField: newly added field +``` + +``` +* myField: Invalid value: "": myField in body should be at least 2 chars long +``` + +Even more sadly, users of SSA submitting just a patch with the new field get +rejected for fields that were not included in their patch. The following patch +yields the same error for SSA users: +```yaml +myOtherField: newly added field +``` + + +The feature proposed in this KEP would allow this change, since `myField` was +unaltered from the original object. If `myField` was changed, the new value would have to pass the new validation rule to be written. + +### User Stories (Optional) + + + +#### CRD Author Tightens a Field + +Consider the current situation that plays when when a CRD author attempts to +shift left validation logic done by their controller by adding a validation to +their CRD: + +1. CRD author gets complaints from users that they don't know their CRs have +invalid values until the CR is processed by the controller and updates the status. +2. CRD author notices OpenAPI schemas support adding value validations, and happily annotates their schema. +3. Since only validation was changed: no fields were removed or had their types changed, the author considers the change too minor to require a new +CRD version. +4. During testing users apply the updated CRD. Some users have pre-existing resources that passed validation before, and now fail. +5. Controllers break since they can't update the Status of the custom resource: it fails validation. +6. User's entire workflow is now broken until they repair all broken custom resources. +7. User downgrades their CRD and complains to the CRD author +8. CRD author abandons their attempt to improve useability of their CR. + +CRD authors tightening simple validation rules that their controllers tolerate +shouldn't require a version bump. + +With the new feature the story becomes a success: + +1. Users complain to CRD author. +2. CRD author annotates schema +3. CRD author does not bump version +4. Users apply updated CRD schema +5. Controllers can successfully update the status field of the CR +6. User's workflow keeps humming +7. User compliments cunning CRD author for their brilliance +8. CRD author remembers to user value validations in their schemas more in the future. + +#### K8s Update Tightens CRD validation + +In this case the user has a CRD with a field with the following schema: + +```yaml +type: string +format: dns1123subdomain +``` + +Kubernetes does not recognize this as a format. Today, Kubernetes treats +[unrecognized formats as a no-op](https://github.com/kubernetes/kubernetes/blob/f9f9e7177a328ae47bac6b823d57a99dfbfd310b/staging/src/k8s.io/apiextensions-apiserver/pkg/apiserver/validation/formats.go#L53-L65) and ignores them. So users would be able +to write fields not conforming to this format to custom resources. + +If Kubernetes wishes to support this format, users workflows will break upon +upgrade until the offending resources and identified and repaired. + +With this CRD Validation Ratcheting feature, Kubernetes would be able to +adopt new format strings safely without breaking most user workflows which +primarily use UPDATE. Certain workflows using CREATE may still encounter a +breaking api change upon validation tightening. + +#### K8s Update Widens CRD Validation + +In this case the user has a CRD with a field with following schema: + +```yaml +type: string +format: byte +``` + +In k8s 1.27 and below, this format disallows empty strings due to an implementation detail. However, in native types like `Secret`, byte is disallowed. If Kubernetes had an update to CRDs to allow empty string for `byte`, this widens the validation for those CRDS which use it. + +This is fine when upgrading to new versions if Kubernetes: all existing resources still pass validation. But downgrading from an update that widens validation can be understood +semantically as an update that tightens validating. + +For example if they had unreletated issues with the new Kubernetes update and +wanted to downgrade: + +1. User updates to new Kubernetes version +2. New CRs that were previously disallowed are allowed to be stored +3. User is disappointed with K8s version and downgrades +4. Controllers fail to update some resources, since they fail validation. + +If the CRD Validation Ratcheting feature were enabled the following occurs: + +1. User updates to new Kubernetes version +2. New CRs that were previously disallowed are allowed to be stored +3. User is disappointed with K8s version and downgrades +4. Controllers are allowed to update their resources, as long as all their changes are valid. + +### Notes/Constraints/Caveats (Optional) + + + +This KEP is scoped to CRDs, but if [KEP-3937](https://github.com/kubernetes/enhancements/pull/3938) is implemented then we should expect it to also affect native type validation. + +### Risks and Mitigations + +#### Detection of Breaking Schema Changes is Different + +Some users might have a process to detect breaking schema changes by applying +their CRD and poking an existing resource. This KEP would make that process +no longer reliable. After this change, authors of CRs will need to do this test +with a CREATE rather than an UPDATE. + +##### Mitigation: Use CRD-Schema-Checker + +[CRD-Schema-Checker](https://github.com/openshift/crd-schema-checker) may be +used to lint the CRD schema for breaking changes before applying it to a cluster. + +##### Mitigation: Use New Objects + +Alternatively, users may create new objects rather than updating old ones to +test for breaking schema changes. This feature has no effect on the creation +code path. + + +#### Not All Rules Can Be Correctly/Easily Ratcheted + +Some usages of the OpenAPI schema are not possible to reliably ratchet without +creating confusing exceptions, or rewriting completely the validation code used +by Kubernetes. + +For others it is not clear to see why ratcheting them will yield correct results. + +##### Mitigation: Blacklisted Validations + +To mitigate this, validation failures in rules which have an +ancestor in the schema of one of these types will not be ratcheted: + +- not: Supporting negation of rules for ratcheting would require a complete +rewrite of the validation system. +- oneOf: Requires EXACTLY one alternative, so it is not allowed for the same reason as `not` rules. +- allOf: Blacklisted until we can understand the semantics better and see a large need to add support. +- anyOf: Blacklisted until we can understand the semantics better and see a large need to add support. +- x-kubernetes-validations which make use of `oldSelf`: To get ratcheting to +work in an intuitive way with transition rules would require a copy of the +resource two states back. This is not feasible. Also, users can write ratcheting +logic directly in CEL as a transition rule if that was required. + +##### Mitigation: Conservative Ratcheting Rule + +For alpha our rule for ratcheting is fairly conservative: + +If a field does not change, validation rules attached to that rule which fail will +be ignored. + +This is a correct rule due to the following observations: + +1. Each validation rule is attributed to a field. +2. Each rule does not reference fields outside that field to which it is attributed. +This is true by construction for the case of the OpenAPI-schema validation fields, +and validated for the `x-kubernetes-...` CEL rules. +3. Errors raised by validation rules can be attributed to the rules they are attached to +(despite `fieldPath` on CEL Rules we can still determine the true field of the rule) +4. Some rules are attached to a field which captures more data than is referenced +by the rule, but that is OK it just makes the rule more conservative. + +From this we can conclude that a given validation rule is completely dependent +upon a subset of the information contained in the field to which it is attached. +If this subset of data did not change over the update, then the failure of the +rule was definitely due to pre-existing data. + +For now we use DeepEqual to check equality since it is conservative. A semantic +check may be employed if it can be done cleanly and performantly. + +For beta and beyond we will have an investigation and discussion of a more +permissive semantic. + +There are a few cases which, to support ratcheting may require a less strict +ratcheting rule: + +1. `maxItems` on an array - should we consider the "size" of the array to remain +unchanged if elements within it change without adding/removing anything? +The current rule does not allow for this to be racheted. +2. `maxProperties` - similarly to above +3. `required` on an object: this rule is attached to an object, but pertains +to specific keys. Should we allow a non-existing field to ratchet if it is newly +made to be `required`? The current rule does not allow this. +## Design Details + + + +At a high level, CR validation in kubernetes has four components: +1. `kube-openapi`-based `Validate` using `*spec.Schema` +2. StructuralSchema-based Meta, TypeMeta, ListSet/Map validation +3. CEL Validators using a CEL schema interface type +4. Embedded Objects + +Each of these validation paths will need to have their `ValidateUpdate(old, new)` +codepath modified to: + +1. Take an option to enable the feature +2. Ignore errors which occur both on the `old` and `new` objects +3. Return ignored errors in a separate list + +### `kube-openapi` changes + +The `kube-openapi` schema validator validates the standard OpenAPI/JSONSchema +elements of the CRD schema. This is basically everything except CEL rules, +and listset/map types. + +The current SchemaValidator will be wrapped with a `RatchetingSchemaValidator` +which compares old/new values and ignores all errors on its subpath if they are +equal. + +Supporting validation code within kube-openapi which spawn a new `SchemaValidator` +will need to be refactored to find the old value and use `RatchetingSchemaValidator` +when `RatchetingSchemaValidator.ValidateUpdate` is in its callstack. + +anyOf, allOf, not, allOf sections of logic will remain using the normal +`SchemaValidator`. + +#### Nested Value Validations + +OpenAPI-schemas support nested validations: +- not +- oneOf +- anyOf +- allOf + + +To keep the semantics simple and implementation clear, none of these constructs +nor any of their subschemas will be ratcheted. Our intuition is that in the wild +CRD authors do not make much use of them, and when they do the logic is +complicated and references multiple fields. + +### Structural-Schema-based validation changes + +The two structural-schema based validation methods may need to +be refactored to support `ValidateUpdate` returning separate lists of errors. + +[`ValidateListSetsAndMaps(fldPath *field.Path, s *schema.Structural, obj map[string]interface{}) field.ErrorList`](https://github.com/kubernetes/kubernetes/blob/694698ca3801fe5aa258d58c1e27ac7f88c29d35/staging/src/k8s.io/apiextensions-apiserver/pkg/apiserver/schema/listtype/validation.go#L28) + +[`objectmeta.Validate(pth *field.Path, obj interface{}, s *structuralschema.Structural, isResourceRoot bool) field.ErrorList`](https://github.com/kubernetes/kubernetes/blob/92042fe6eac61b6dd4092f5c6737050a4a1c43e5/staging/src/k8s.io/apiextensions-apiserver/pkg/apiserver/schema/objectmeta/validation.go#L32) + +> Note: ObjectMeta.Validate is a core validation function in kube with high risk +when modifying. For Alpha we will only ratchet errors from this function if the +entire metadata did not change, rather than field-by-field basis. Experimenting will +be done to gauge the feasibility/risk of adding ratcheting to metadata for beta. + +#### Correlation of Old and New + +For comparison of `old`/`new` `maps`/`lists`/`sets`, we need a method for +correlation of the old and new elements. Whether that is normalizing the fields, +creating a map representation of the object, or otherwise. This may be tricky +to optimize, so care should be taken to meet performance goals. + +### Cel-Validator changes + +[`cel.Validate`](https://github.com/kubernetes/kubernetes/blob/2a50ef677ebf0729f176e852d0ad16950e122f6e/staging/src/k8s.io/apiextensions-apiserver/pkg/apiserver/schema/cel/validation.go#L148) is already written to validate an update, +correlating old and new fields, so would only need to be modified to return an +ignored error list when applicable. + +## Test Plan + + + +[x] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +###### Schema-Changing Tests May Need Refactoring + +This change could cause some previously valid tests of invalid schema changes to +fail. Those tests will have to be updated by changing to use create, or by +changing the invalid fields during an update. + +##### Unit tests + + + + + +- `k8s.io/apiextensions-apiserver/pkg/apiserver/schema/cel`: `06/14/2023` - `83.2` +- `k8s.io/apiextensions-apiserver/pkg/apiserver/schema/listtype`: `06/14/2023` - `95.8` +- `k8s.io/apiextensions-apiserver/pkg/apiserver/schema/objectmeta`: `06/14/2023` - `83.9` +- `k8s.io/apiextensions-apiserver/pkg/registry/customresource`: `06/14/2023` - `57.5` +- `k8s.io/apiextensions-apiserver/pkg/apiserver/validation`: `06/14/2023` - `85.8` +- `k8s.io/kube-openapi/pkg/validation/validate` : `06/14/2023` - `97.2` + +##### Integration tests + + + + + +1. Show normal creation path is unaffected +2. Tests that ratchet different classes of validations: + - JSON-Schema fields (minItems, dependencies, etc.) + - CRD-Validation Rules + - ListSets/Maps +3. Tests that certain classes of valdiations are not ratcheted: + - Nested Validations (not, oneOf, allOf, anyOf) + - Transition Rules + +##### e2e tests + + + +It would be nice to have a e2e test that tests ratcheting over a k8s upgrade +which tightens a validation. + +- : + +### Graduation Criteria + + +#### Alpha +- Feature Flag added +- Integration Tests implemented and enabled +- Incomplete-but-correct subset of ratcheting rules implemented under flag + +#### Beta +- Gather feedback from developers +- Additional Testing added as needed +- CEL Validation Rules Ratcheting Implemented +- Schema ratcheting rules implemented under flag +- Metadata ratcheting rules implemented under flag +- Detailed analysis of supported validation rules, whether/how they are ratcheted, and discussion +- Investigation and discussion of weaker forms of equality + +#### GA +- Upgrade/Downgrade e2e tests +- Scalability Tests + + + +### Upgrade / Downgrade Strategy + + +No change in how users upgrade/downgrade their clusters. This feature may remove +complexity by removing risk that a tightened validation on Kubernetes' part +does not break workflow. + + +### Version Skew Strategy + + +N/A + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: `CRDValidationRatcheting` + - Components depending on the feature gate: `apiextensions-apiserver`, `kube-apiserver` +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + + + +Yes, enabling the feature will silence errors for unchanged fields. + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +Yes, doing so will restore prior functionality. Resources that were successfully updated while the feature was enabled may still be invalid at rest, but this condition is unchanged from before the feature was enabled, and +the system should behave as expected. + +###### What happens if we reenable the feature if it was previously rolled back? + +Nothing. Future updates to Custom Resources that may have failed validation +before will now succeed if they did not change the failing fields. + +###### Are there any tests for feature enablement/disablement? + + + +We will add an integration test to ensure that the feature is disabled when the feature gate is off. +Additionally an integration test that shows CRs which would be ratcheted can +have their `status` changed when the feature is enabled. + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + +No. + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +We may consider adding a metric in a future release. + +In the absence of a metric, an operator may test if the feature is present by +creating a CRD, and CR, updating the CRD, and checking if an update to the CR +ratchets. + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [x] Other (treat as last resort) + - Details: User can update a CRD to tighten the requirements of one of its fields. If an update to an object that would now fail the new validation succeeds, the feature is enabled. + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + +No. + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + +No. + +###### Will enabling / using this feature result in introducing new API types? + + +No. + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + +No. Possibly only indirectly as a result of writes succeeding instead of failing. + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + +No. + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + +Performance of any writes to a custom resource may be impacted. Benchmarks must +be taken to measure magniture. + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + +Should have benchmarks before beta, but expecting to see some measureable impact to writes only to CRDs. + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + +No. + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +The same way any write to apiserver would. + +###### What are other known failure modes? + + +None. + +###### What steps should be taken if SLOs are not being met to determine the problem? + +Disable the feature. + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +### CRDs Opt-in to Ratcheting Functionality + +One alternative to appling ratcheting generally to all crds when the feature +is enabled would be to add something like an `x-kubernetes-...` extension to +CRDs to allow the individual CRDs to decide to ratchet themselves. + +This solves one of the goals of this KEP of allowing CRD authors more flexibility +in modifying their schemas. But this does not solve the other goal of enabling +Kubernetes developers from fixing bugs with/extending the supported OpenAPI +schema. + +For example: + +1. The discrepency between `byte` as a format for CRDs, and `byte` used by native +types cannot be fixed if this feature is opt-in. +2. New string formats cannot be added like `dns1123subdomain`, since to add it +would break any CRD not opted into ratcheting. + +Another drawback is `x-kubernetes-ratcheting: true` cannot be disabled without +problems for the user if other rules were added at the same time. + +One might say to explicitly only ratchet things that Kubernetes' developers have +modified but that increases maintenance burden on Kubernetes developers, and +causes confusing behavior on the user side when parts of your CRD are ratcheting +that you did not enable. + +### Offline Pass to Flag Invalid Objects at Rest During Upgrade + +An alternative suggested to this process would be to provide a tool to scan +all objects in etcd, and flag to the user which ones are currently failing validation +at rest. This would help operators clean up a broken workflow caused by objects made +invalid by a validation tightening. + +This is undesierable for a few reasons related to UX: + +1. Workflows are still broken for a period after the update. +2. Updates to CRD Schemas are still very disruptive, remaining a barrier to validation adoption +3. Operators who may not be the original author of a resource will have to correct the objects themselves manually. + +### Post-Process Errors + +The basic idea is to add a post-processing step to the `Validate(Update)` methods of the CRD strategy. Given a list of `StatusError`, we check the field path associated with the error, if it is unchanged from old to new then the error is changed into a warning. + +#### Drawbacks +##### Robustness of paths returned by OpenAPI Schema Validator + +This approach would make the field paths returned by the OpenAPI schema validator +important for correctness. Previously they have only been used informationally +to the user. + +##### Evaluation of JSON Paths + +This approach also introduces complexity evaluating the JSON paths. It may take +a refactor of schema validator to return paths good enough to use to evaluate +against old and new objects. + +##### Correlating Errors To Fields + +There are a few problems with using the FieldPath associated with the error +to potentially ignore errors: + +1. Prepending item to atomic list. + +This case changes the index of all items, and the list is atomic, so it is not possible to ratchet individual items in an atomic list. + +2. Field Path containing list-type=map still contains a position index relative to the new object. + +We will have to look up the map key in the new object to find the element's map key. Using the map key we can find the associated element in the old object. + +3. Field Path in Errors from CEL Rules may be a lie: https://github.com/kubernetes/kubernetes/pull/118041 + +The field to which the CEL rule in its `x-kubernetes-validations` list must +contain all fields referred by the CEL rule; so the CEL error logic can have to +be modified to also include the rule's original location. + +## Infrastructure Needed (Optional) + + \ No newline at end of file diff --git a/keps/sig-api-machinery/4008-crd-ratcheting/kep.yaml b/keps/sig-api-machinery/4008-crd-ratcheting/kep.yaml new file mode 100644 index 000000000000..35510cbcfe73 --- /dev/null +++ b/keps/sig-api-machinery/4008-crd-ratcheting/kep.yaml @@ -0,0 +1,48 @@ +title: CRD Validation Ratcheting +kep-number: 4008 +authors: + - "@alexzielenski" +owning-sig: sig-api-machinery +participating-sigs: +status: implementable +creation-date: 2023-05-15 +reviewers: + - "@jpbetz" + - "@apelisse" + - '@stts' +approvers: + - "@jpbetz" + - "@deads2k" + +##### WARNING !!! ###### +# prr-approvers has been moved to its own location +# You should create your own in keps/prod-readiness +# Please make a copy of keps/prod-readiness/template/nnnn.yaml +# to keps/prod-readiness/sig-xxxxx/00000.yaml (replace with kep number) +#prr-approvers: + +see-also: + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.28" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.28" + + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: CRDValidationRatcheting + components: + - kube-apiserver +disable-supported: true + +# The following PRR answers are required at beta release +metrics: []