diff --git a/.github/workflows/docs-gen-and-push.yaml b/.github/workflows/docs-gen-and-push.yaml index 53477b83cd0..570713d98af 100644 --- a/.github/workflows/docs-gen-and-push.yaml +++ b/.github/workflows/docs-gen-and-push.yaml @@ -15,7 +15,6 @@ on: - "docs/**" - "pkg/**" - ".github/workflows/docs-gen-and-push.yaml" - - "hack/deploy-docs.sh" permissions: contents: write diff --git a/Makefile b/Makefile index 956d04c7ea4..cb653abfd58 100644 --- a/Makefile +++ b/Makefile @@ -174,12 +174,12 @@ REQUIREMENTS_TXT=docs/requirements.txt .PHONY: serve-docs serve-docs: venv . $(VENV)/activate; \ - VENV=$(VENV) REMOTE=$(REMOTE) BRANCH=$(BRANCH) hack/serve-docs.sh + VENV=$(VENV) REMOTE=$(REMOTE) BRANCH=$(BRANCH) docs/scripts/serve-docs.sh .PHONY: deploy-docs deploy-docs: venv . $(VENV)/activate; \ - REMOTE=$(REMOTE) BRANCH=$(BRANCH) hack/deploy-docs.sh + REMOTE=$(REMOTE) BRANCH=$(BRANCH) docs/scripts/deploy-docs.sh vendor: ## Vendor the dependencies go mod tidy diff --git a/docs/content/concepts/apis-in-kcp.md b/docs/content/concepts/apis-in-kcp.md index b2c6ae8ba98..7b87f5b2cb0 100644 --- a/docs/content/concepts/apis-in-kcp.md +++ b/docs/content/concepts/apis-in-kcp.md @@ -1,3 +1,8 @@ +--- +description: > + What APIs come standard, how to share APIs with others, how to consume shared APIs. +--- + # APIs in kcp ## Overview diff --git a/docs/content/concepts/authorization.md b/docs/content/concepts/authorization.md index d46c5a79570..43b723c2a7f 100644 --- a/docs/content/concepts/authorization.md +++ b/docs/content/concepts/authorization.md @@ -1,6 +1,6 @@ --- description: > - How to authorize requests to kcp + How to authorize requests to kcp. --- # Authorization diff --git a/docs/content/concepts/cluster-mapper.md b/docs/content/concepts/cluster-mapper.md index 87dd786218c..237d356c2cd 100644 --- a/docs/content/concepts/cluster-mapper.md +++ b/docs/content/concepts/cluster-mapper.md @@ -1,6 +1,6 @@ --- description: > - How to use the cluster mapper + How to use the cluster mapper. --- # Cluster Mapper diff --git a/docs/content/concepts/index.md b/docs/content/concepts/index.md index f550d93b4aa..e8c89b69a91 100644 --- a/docs/content/concepts/index.md +++ b/docs/content/concepts/index.md @@ -1,116 +1,3 @@ ---- -description: > - Contains the definitions shared across design documents around prototyping a kube-like control plane (in KCP). This is - a derivative work of other design documents intended to frame terminology. All future statements that may be changed by - designs is covered by those designs, and not duplicated here. ---- +# kcp concepts -# Terminology for kcp - -## Logical cluster - -A logical cluster is a way to subdivide a single kube-apiserver + etcd storage into multiple clusters (different APIs, -separate semantics for access, policy, and control) without requiring multiple instances. A logical cluster is a -mechanism for achieving separation, but may be modelled differently in different use cases. A logical cluster is -similar to a virtual cluster as defined by sig-multicluster, but is able to amortize the cost of a new cluster to be -zero or near-zero memory and storage so that we can create tens of millions of empty clusters cheaply. - -A logical cluster is a storage level concept that adds an additional attribute to an object’s identifier on a -kube-apiserver. Regular servers identify objects by (group, version, resource, optional namespace, name). A logical -cluster enriches an identifier: (group, version, resource, **logical cluster name**, optional namespace, name). 
- -## Workload Cluster - -A physical cluster is a “real Kubernetes cluster”, i.e. one that can run Kubernetes workloads and accepts standard -Kubernetes API objects. For the near term, it is assumed that a physical cluster is a distribution of Kubernetes and -passes the conformance tests and exposes the behavior a regular Kubernetes admin or user expects. - -## Workspace - -A workspace models a set of user-facing APIs for CRUD. Each workspace is backed by a logical cluster, but not all -logical clusters may be exposed as workspaces. Creating a Workspace object results in a logical cluster being available -via a URL for the client to connect and create resources supported by the APIs in that workspace. There could be -multiple different models that result in logical clusters being created, with different policies or lifecycles, but -Workspace is intended to be the most generic representation of the concept with the broadest possible utility to anyone -building control planes. - -A workspace binds APIs and makes them accessible inside the logical cluster, allocates capacity for creating instances -of those APIs (quota), and defines how multi-workspace operations can be performed by users, clients, and controller -integrations. - -To a user, a workspace appears to be a Kubernetes cluster minus all the container orchestration specific resources. It -has its own discovery, its own OpenAPI spec, and follows the kube-like constraints about uniqueness of -Group-Version-Resource and its behaviour (no two GVRs with different schemas can exist per workspace, but workspaces can -have different schemas). A user can define a workspace as a context in a kubeconfig file and `kubectl get all -A` would -return all objects in all namespaces of that workspace. - -Workspace naming is chosen to be aligned with the Kubernetes Namespace object - a Namespace subdivides a workspace by -name, a workspace subdivides the universe into chunks of meaningful work. - -Workspaces are the containers for all API objects, so users orient by viewing lists of workspaces from APIs. - -## Workspace type - -Workspaces have types, which are mostly oriented around a set of default or optional APIs exposed. For instance, a -workspace intended for use deploying Kube applications might expose the same API objects a user would encounter on a -physical cluster. A workspace intended for building functions might expose only the knative serving APIs, config maps -and secrets, and optionally enable knative eventing APIs. - -At the current time there is no decision on whether a workspace type represents an inheritance or composition model, -although in general we prefer composition approaches. We also do not have a fully resolved design. - -## Virtual Workspace - -An API object has one source of truth (is stored transactionally in one system), but may be exposed to different use -cases with different fields or schemas. Since a workspace is the user facing interaction with an API object, if we want -to deal with Workspaces in aggregate, we need to be able to list them. Since a user may have access to workspaces in -multiple different contexts, or for different use cases (a workspace that belongs to the user personally, or one that -belongs to a business organization), the list of “all workspaces” itself needs to be exposed as an API object to an end -user inside a workspace. That workspace is “virtual” - it adapts or transforms the underlying source of truth for the -object and potentially the schema the user sees. - -## Index (e.g. 
Workspace Index) - -An index is the authoritative list of a particular API in their source of truth across the system. For instance, in -order for a user to see all the workspaces they have available, they must consult the workspace index to return a list -of their workspaces. It is expected that indices are suitable for consistent LIST/WATCHing (in the kubernetes sense) so -that integrations can be built to view the list of those objects. - -Index in the control plane sense should not be confused with secondary indices (in the database sense), which may be -used to enable a particular index. - -## Shard - -A failure domain within the larger control plane service that cuts across the primary functionality. Most distributed -systems must separate functionality across shards to mitigate failures, and typically users interact with shards through -some transparent serving infrastructure. Since the primary problem of building distributed systems is reasoning about -failure domains and dependencies across them, it is critical to allow operators to effectively match shards, understand -dependencies, and bring them together. - -A control plane should be shardable in a way that maximizes application SLO - gives users a tool that allows them to -better define their applications not to fail. - -## API Binding - -The act of associating a set of APIs with a given logical cluster. The Workspace model defines one particular -implementation of the lifecycle of a logical cluster and the APIs within it. Because APIs and the implementations that -back an API evolve over time, it is important that the binding be introspectable and orchestrate-able - that a consumer -can provide a rolling deployment of a new API or new implementation across hundreds or thousands of workspaces. - -There are likely a few objects involved in defining the APIs exposed within a workspace, but in general they probably -define a spec (which APIs / implementations to associate with) and a status (the chosen APIs / implementations that are -currently bound), allow a user to bulk associate APIs (i.e. multiple APIs at the same time, like “all knative serving -APIs”), and may be defaulted based on some attributes of a workspace type (all workspaces of this “type” get the default -Kube APIs, this other “type” get the knative apis). - -The evolution of an API within a workspace and across workspaces is of key importance. - -## Syncer - -A syncer is installed on a SyncTarget and is responsible for synchronizing data between kcp and that cluster. - -## Location - -A collection of SyncTargets that describe runtime characteristics that allow placement of applications. -Characteristics are not limited but could describe things like GPU, supported storage, compliance or -regulatory fulfillment, or geographical placement. +{% include "partials/section-overview.html" %} diff --git a/docs/content/concepts/kubectl-kcp-plugin.md b/docs/content/concepts/kubectl-kcp-plugin.md index e67bd1f535f..af268f418e4 100644 --- a/docs/content/concepts/kubectl-kcp-plugin.md +++ b/docs/content/concepts/kubectl-kcp-plugin.md @@ -1,6 +1,6 @@ --- description: > - How to use the kubectl kcp plugin + How to use the kubectl kcp plugin. --- # kubectl kcp plugin diff --git a/docs/content/concepts/partitions.md b/docs/content/concepts/partitions.md index c510d448687..2932fd62905 100644 --- a/docs/content/concepts/partitions.md +++ b/docs/content/concepts/partitions.md @@ -1,6 +1,6 @@ --- description: > - How to create shard partitions + How to create shard partitions. 
--- # Partition API @@ -69,4 +69,4 @@ status: It is to note that a `Partition` is created only if it matches at least one shard. With the provided example if there is no shard in the cloud provider `aliyun` in the region `europe` no `Partition` will be created for it. -An example of a `Partition` generated by this `PartitionSet` can be found above. The `dimensions` are translated into `matchLabels` with values specific to each `Partition`. An owner reference of the `Partition` will be set to the `PartitionSet`. \ No newline at end of file +An example of a `Partition` generated by this `PartitionSet` can be found above. The `dimensions` are translated into `matchLabels` with values specific to each `Partition`. An owner reference of the `Partition` will be set to the `PartitionSet`. diff --git a/docs/content/concepts/locations-and-scheduling.md b/docs/content/concepts/placement-locations-and-scheduling.md similarity index 100% rename from docs/content/concepts/locations-and-scheduling.md rename to docs/content/concepts/placement-locations-and-scheduling.md diff --git a/docs/content/concepts/quickstart-tenancy-and-apis.md b/docs/content/concepts/quickstart-tenancy-and-apis.md index e2b1afcd30e..e49fe9bfcdb 100644 --- a/docs/content/concepts/quickstart-tenancy-and-apis.md +++ b/docs/content/concepts/quickstart-tenancy-and-apis.md @@ -1,6 +1,6 @@ --- description: > - How to create a new API and use it with tenancy + How to create a new API and use it with tenancy. --- # Quickstart: Tenancy and APIs diff --git a/docs/content/concepts/syncer.md b/docs/content/concepts/registering-kubernetes-clusters-using-syncer.md similarity index 99% rename from docs/content/concepts/syncer.md rename to docs/content/concepts/registering-kubernetes-clusters-using-syncer.md index 2b2f27ad787..3914fb78fbc 100644 --- a/docs/content/concepts/syncer.md +++ b/docs/content/concepts/registering-kubernetes-clusters-using-syncer.md @@ -1,6 +1,6 @@ --- description: > - How to register Kubernetes clusters using syncer + How to register Kubernetes clusters using syncer. --- # Registering Kubernetes Clusters using syncer @@ -134,7 +134,7 @@ kubectl kcp bind compute --location-selectors=env=test ``` this command will create a `Placement` selecting a `Location` with label `env=test` and bind the selected `Location` to namespaces with -label `purpose=workload`. See more details of placement and location [here](locations-and-scheduling.md) +label `purpose=workload`. See more details of placement and location [here](placement-locations-and-scheduling.md) ### Running a workload diff --git a/docs/content/concepts/storage.md b/docs/content/concepts/storage.md index f76a38875a0..43f9adcbc4a 100644 --- a/docs/content/concepts/storage.md +++ b/docs/content/concepts/storage.md @@ -1,92 +1,193 @@ +--- +description: > + How kcp handles storage and statefulness. +--- + # Storage and stateful applications ## Overview -KCP provides a control plane that implements the concept of Transparent Multi Cluster (TMC) for compute, network, and storage. In order to give the illusion of transparent storage in KCP, it exposes the same Kubernetes APIs for storage (PVC/PV), so users and workloads do not need to be aware of the coordinations taken by the control plane behind the scenes. +KCP provides a control plane that implements the concept of Transparent Multi Cluster (TMC) for compute, network, and +storage. 
In order to give the illusion of transparent storage in KCP, it exposes the same Kubernetes APIs for storage ( +PVC/PV), so users and workloads do not need to be aware of the coordinations taken by the control plane behind the +scenes. -Placement for storage in KCP uses the same [concepts used for compute](locations-and-scheduling.md#main-concepts): "`SyncTargets` in a `Location` are transparent to the user, and workloads should be able to seamlessly move from one `SyncTarget` to another within a `Location`, based on operational concerns of the compute service provider, like decommissioning a cluster, rebalancing capacity, or due to an outage of a cluster. It is the compute service's responsibility to ensure that for workloads in a location, to the user it looks like ONE cluster." +Placement for storage in KCP uses the +same [concepts used for compute](placement-locations-and-scheduling.md#main-concepts): "`SyncTargets` in a `Location` +are transparent to the user, and workloads should be able to seamlessly move from one `SyncTarget` to another within +a `Location`, based on operational concerns of the compute service provider, like decommissioning a cluster, rebalancing +capacity, or due to an outage of a cluster. It is the compute service's responsibility to ensure that for workloads in a +location, to the user it looks like ONE cluster." -KCP will provide the basic controllers and coordination logic for moving volumes, as efficiently as possible, using the underlying storage topology and capabilities. It will use the `SyncTargets` storage APIs to manage volumes, and not require direct access from the control plane to the storage itself. For more advanced or custom solutions, KCP will allow external coordinators to take over. +KCP will provide the basic controllers and coordination logic for moving volumes, as efficiently as possible, using the +underlying storage topology and capabilities. It will use the `SyncTargets` storage APIs to manage volumes, and not +require direct access from the control plane to the storage itself. For more advanced or custom solutions, KCP will +allow external coordinators to take over. ## Main concepts - [Transparent multi-cluster](../investigations/transparent-multi-cluster.md) - describes the TMC concepts. -- [Placement, Locations and Scheduling](locations-and-scheduling.md) - describes the KCP APIs and mechanisms used to control compute placement, which will be used for storage as well. Refer to the concepts of `SyncTarget`, `Location`, and `Placement`. +- [Placement, Locations and Scheduling](placement-locations-and-scheduling.md) - describes the KCP APIs and mechanisms + used to control compute placement, which will be used for storage as well. Refer to the concepts + of `SyncTarget`, `Location`, and `Placement`. -- [Kubernetes storage concepts](https://kubernetes.io/docs/concepts/storage/) - documentation of storage APIs in Kubernetes. +- [Kubernetes storage concepts](https://kubernetes.io/docs/concepts/storage/) - documentation of storage APIs in + Kubernetes. -- [Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) - PVCs are the main storage APIs used to request storage resources for applications. PVs are invisible to users, and used by administrators or privileged controllers to provision storage to user claims, and will be coordinated by KCP to support transparent multi-cluster storage. 
+- [Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) - PVCs are the main storage APIs + used to request storage resources for applications. PVs are invisible to users, and used by administrators or + privileged controllers to provision storage to user claims, and will be coordinated by KCP to support transparent + multi-cluster storage. -- [Kubernetes CSI](https://kubernetes-csi.github.io/docs/) - The Container Storage Interface (CSI) is a standard for exposing arbitrary block and file storage systems to containerized workloads. The list of [drivers](https://kubernetes-csi.github.io/docs/drivers.html) provides a "menu" of storage systems integrated with kubernetes and their properties. +- [Kubernetes CSI](https://kubernetes-csi.github.io/docs/) - The Container Storage Interface (CSI) is a standard for + exposing arbitrary block and file storage systems to containerized workloads. The list + of [drivers](https://kubernetes-csi.github.io/docs/drivers.html) provides a "menu" of storage systems integrated with + kubernetes and their properties. -- [StatefulSets volumeClaimTemplates](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#volume-claim-templates) - workload definition used to manage “sharded” stateful applications. Specifying `volumeClaimTemplates` in the statefulset spec will provide stable storage by creating a PVC per instance. +- [StatefulSets volumeClaimTemplates](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#volume-claim-templates) - + workload definition used to manage “sharded” stateful applications. Specifying `volumeClaimTemplates` in the + statefulset spec will provide stable storage by creating a PVC per instance. ## Volume types -Each physical-cluster (aka "pcluster") brings its own storage to multi-cluster environments, and in order to make efficient coordination decisions, KCP will identify the following types: +Each physical-cluster (aka "pcluster") brings its own storage to multi-cluster environments, and in order to make +efficient coordination decisions, KCP will identify the following types: #### Shared network-volumes -These volumes are provisioned from an external storage system that is available to all/some of the pclusters over an infrastructure network. These volumes are typically provided by a shared-filesystem (aka [NAS](https://en.wikipedia.org/wiki/Network-attached_storage)), with access-mode of ReadWriteMany (RWX) or ReadOnlyMany (ROX). A shared volume can be used by any pod from any pcluster (that can reach it) at the same time. The application is responsible for the consistency of its data (for example with eventual consistency semantics, or stronger synchronization services like zookeeper). Examples of such storage are generic-NFS/SMB, AWS-EFS, Azure-File, GCP-Filestore, CephFS, GlusterFS, NetApp, GPFS, etc. + +These volumes are provisioned from an external storage system that is available to all/some of the pclusters over an +infrastructure network. These volumes are typically provided by a shared-filesystem ( +aka [NAS](https://en.wikipedia.org/wiki/Network-attached_storage)), with access-mode of ReadWriteMany (RWX) or +ReadOnlyMany (ROX). A shared volume can be used by any pod from any pcluster (that can reach it) at the same time. The +application is responsible for the consistency of its data (for example with eventual consistency semantics, or stronger +synchronization services like zookeeper). 
Examples of such storage are generic-NFS/SMB, AWS-EFS, Azure-File, +GCP-Filestore, CephFS, GlusterFS, NetApp, GPFS, etc. #### Owned network-volumes -These volumes are provisioned from an external storage system that is available to all/some of the pclusters over an infrastructure network. However unlike shared volumes, owned volumes require that only a single node/pod will mount the volume at a time. These volumes are typically provided by a block-level storage system, with access-mode of ReadWriteOnce (RWO) or ReadWriteOncePod (RWOP). It is possible to *move* the ownership between pclusters (that have access to that storage), by detaching from the current owner, and then attaching to the new owner. But it would have to guarantee a single owner to prevent data inconsistencies or corruptions, and even work if the owner pcluster is offline (see forcing detach with “fencing” below). Examples of such storage are AWS-EBS, Azure-Disk, Ceph-RBD, etc. + +These volumes are provisioned from an external storage system that is available to all/some of the pclusters over an +infrastructure network. However unlike shared volumes, owned volumes require that only a single node/pod will mount the +volume at a time. These volumes are typically provided by a block-level storage system, with access-mode of +ReadWriteOnce (RWO) or ReadWriteOncePod (RWOP). It is possible to *move* the ownership between pclusters (that have +access to that storage), by detaching from the current owner, and then attaching to the new owner. But it would have to +guarantee a single owner to prevent data inconsistencies or corruptions, and even work if the owner pcluster is +offline (see forcing detach with “fencing” below). Examples of such storage are AWS-EBS, Azure-Disk, Ceph-RBD, etc. #### Internal volumes -These volumes are provisioned inside the pcluster itself, and rely on its internal resources (aka hyper-converged or software-defined storage). This means that the availability of the pcluster also determines the availability of the volume. In some systems these volumes are bound to a single node in the pcluster, because the storage is physically attached to a host. However, advanced clustered/distributed systems make efforts to overcome temporary and permanent node failures by adding data redundancy over multiple nodes. These volumes can have any type of access-mode (RWO/RWOP/RWX/ROX), but their strong dependency on the pcluster itself is the key difference from network volumes. Examples of such storage are host-path/local-drives, TopoLVM, Ceph-rook, Portworx, OpenEBS, etc. + +These volumes are provisioned inside the pcluster itself, and rely on its internal resources (aka hyper-converged or +software-defined storage). This means that the availability of the pcluster also determines the availability of the +volume. In some systems these volumes are bound to a single node in the pcluster, because the storage is physically +attached to a host. However, advanced clustered/distributed systems make efforts to overcome temporary and permanent +node failures by adding data redundancy over multiple nodes. These volumes can have any type of access-mode ( +RWO/RWOP/RWX/ROX), but their strong dependency on the pcluster itself is the key difference from network volumes. +Examples of such storage are host-path/local-drives, TopoLVM, Ceph-rook, Portworx, OpenEBS, etc. 
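To make the volume-type distinction above concrete, here is a minimal sketch (editorial illustration, not part of the kcp docs being patched) of the only storage API a workload author actually touches: the claim's access mode is what decides whether the volume behaves as shared (RWX/ROX) or owned (RWO/RWOP). The `shared-data` name and `shared-nfs` class are assumptions for illustration.

```bash
# Minimal sketch: create a claim whose access mode marks it as a shared network volume (RWX).
# Swap ReadWriteMany for ReadWriteOnce/ReadWriteOncePod to model an owned volume instead.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: shared-data              # hypothetical claim name
spec:
  accessModes:
    - ReadWriteMany              # RWX -> shared network volume (NFS/CephFS class of storage)
  resources:
    requests:
      storage: 10Gi
  storageClassName: shared-nfs   # hypothetical class backed by a shared filesystem
EOF
```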
## Topology and locations #### Regular topology -A regular storage topology is one where every `Location` is defined so that all of its `SyncTargets` are connected to the same storage system. This makes it trivial to move network volumes transparently between `SyncTargets` inside the same location. + +A regular storage topology is one where every `Location` is defined so that all of its `SyncTargets` are connected to +the same storage system. This makes it trivial to move network volumes transparently between `SyncTargets` inside the +same location. #### Multi-zone cluster -A more complex topology is where pclusters contain nodes from several availability-zones, for the sake of being resilient to a zone failure. Since volumes are bound to a single zone (where they were provisioned), then a volume will not be able to move between `SyncTargets` without nodes on that zone. This is ok if all the `SyncTargets` of the `Location` span over the same set of zones, but if the zones are different, or the capacity per zone is too limited, copying to another zone might be necessary. + +A more complex topology is where pclusters contain nodes from several availability-zones, for the sake of being +resilient to a zone failure. Since volumes are bound to a single zone (where they were provisioned), then a volume will +not be able to move between `SyncTargets` without nodes on that zone. This is ok if all the `SyncTargets` of +the `Location` span over the same set of zones, but if the zones are different, or the capacity per zone is too limited, +copying to another zone might be necessary. #### Internal volumes -Internal volumes are always confined to one pcluster, which means it has to be copied outside of the pcluster continuously to keep the application available even in the case where the pcluster fails entirely (network split, region issue, etc). This is similar to how DR solutions work between locations. + +Internal volumes are always confined to one pcluster, which means it has to be copied outside of the pcluster +continuously to keep the application available even in the case where the pcluster fails entirely (network split, region +issue, etc). This is similar to how DR solutions work between locations. #### Disaster recover between locations -A regular Disaster Recovery (DR) topology will create pairs of `Locations` so that one is “primary” and the other is “secondary” (sometimes this relation is mutual). For volumes to be able to move between these locations, their storage systems would need to be configured to mirror/replicate/backup/snapshot (whichever approach is more appropriate depends on the case) every volume to its secondary. With such a setup, KCP would need to be able to map between the volumes on the primary and the secondary, so that it could failover and move workloads to the secondary and reconnect to the last copied volume state. See more on the DR section below. + +A regular Disaster Recovery (DR) topology will create pairs of `Locations` so that one is “primary” and the other is +“secondary” (sometimes this relation is mutual). For volumes to be able to move between these locations, their storage +systems would need to be configured to mirror/replicate/backup/snapshot (whichever approach is more appropriate depends +on the case) every volume to its secondary. With such a setup, KCP would need to be able to map between the volumes on +the primary and the secondary, so that it could failover and move workloads to the secondary and reconnect to the last +copied volume state. 
See more on the DR section below. ## Provisioning volumes -Volume provisioning in Kubernetes involves the CSI controllers and sidecar, as well as a custom storage driver. It reconciles PVCs by dynamically creating a PV for a PVC, and binding them together. This process depends on the CSI driver to be running on the `SyncTarget` compute resources, and would not be able to run on KCP workspaces. Instead, KCP will pick a designated `SyncTarget` for the workload placement, which will include the storage claims (PVCs), and the CSI driver on the `SyncTarget` will perform the storage provisioning. +Volume provisioning in Kubernetes involves the CSI controllers and sidecar, as well as a custom storage driver. It +reconciles PVCs by dynamically creating a PV for a PVC, and binding them together. This process depends on the CSI +driver to be running on the `SyncTarget` compute resources, and would not be able to run on KCP workspaces. Instead, KCP +will pick a designated `SyncTarget` for the workload placement, which will include the storage claims (PVCs), and the +CSI driver on the `SyncTarget` will perform the storage provisioning. -In order to support changing workload placement overtime, even if the provisioning `SyncTarget` is offline, KCP will have to retrieve the volume information from that `SyncTarget`, and keep it in the KCP workspace for future coordination. The volume information inside the PV is expected to be transferable between `SyncTargets` that connect to the same storage system and drivers, although some transformations would be required. +In order to support changing workload placement overtime, even if the provisioning `SyncTarget` is offline, KCP will +have to retrieve the volume information from that `SyncTarget`, and keep it in the KCP workspace for future +coordination. The volume information inside the PV is expected to be transferable between `SyncTargets` that connect to +the same storage system and drivers, although some transformations would be required. -To retrieve the volume information and maintain it in KCP, a special sync state is required that will sync **UP** the PV from a `SyncTarget` to KCP. This state is referred to as `Upsync` - see [Resource Upsyncing](locations-and-scheduling.md#resource-upsyncing). +To retrieve the volume information and maintain it in KCP, a special sync state is required that will sync **UP** the PV +from a `SyncTarget` to KCP. This state is referred to as `Upsync` - +see [Resource Upsyncing](placement-locations-and-scheduling.md#resource-upsyncing). -The provisioning flow includes: (A) PVC synced to `SyncTarget`, (B) CSI provisioning on the pcluster, (C) Syncer detects PVC binding and initiates PV `Upsync`. Transformations would be applied in KCP virtual workspace to make sure that the PVC and PV would appear bound in KCP, similar to how it is in a single cluster. Once provisioning itself is complete, coordination logic will switch to a normal `Sync` state, to allow multiple `SyncTargets` to share the same volume, and for owned volumes to move ownership to another `SyncTarget`. +The provisioning flow includes: (A) PVC synced to `SyncTarget`, (B) CSI provisioning on the pcluster, (C) Syncer detects +PVC binding and initiates PV `Upsync`. Transformations would be applied in KCP virtual workspace to make sure that the +PVC and PV would appear bound in KCP, similar to how it is in a single cluster. 
Once provisioning itself is complete, +coordination logic will switch to a normal `Sync` state, to allow multiple `SyncTargets` to share the same volume, and +for owned volumes to move ownership to another `SyncTarget`. ## Moving shared volumes -Shared volume can easily move to any `SyncTarget` in the same `Location` by syncing the PVC and PV together, so they bind only to each other on the pcluster. Syncing will transform their mutual references so that the `PVC.volumeName = PV.name` and `PV.claimRef = { PVC.name, PVC.namespace }` are set appropriately for the `SyncTarget`, since the downstream `PVC.namespace` and `PV.name` will not be the same as upstream. +Shared volume can easily move to any `SyncTarget` in the same `Location` by syncing the PVC and PV together, so they +bind only to each other on the pcluster. Syncing will transform their mutual references so that +the `PVC.volumeName = PV.name` and `PV.claimRef = { PVC.name, PVC.namespace }` are set appropriately for +the `SyncTarget`, since the downstream `PVC.namespace` and `PV.name` will not be the same as upstream. -Moving volumes will set the volume's `reclaimPolicy` to always `Retain`, to avoid unintended deletion by any one of the `SyncTargets` while others use it. Once deletion of the upstream PVC is initiated, the coordination controller will transform the `reclaimPolicy` to `Delete` for one of the `SyncTargets`. See more in the section on deleting volumes. +Moving volumes will set the volume's `reclaimPolicy` to always `Retain`, to avoid unintended deletion by any one of +the `SyncTargets` while others use it. Once deletion of the upstream PVC is initiated, the coordination controller will +transform the `reclaimPolicy` to `Delete` for one of the `SyncTargets`. See more in the section on deleting volumes. ## Moving owned volumes > **TBD** - this section is a work in progress... #### Detach from owner -Owned volumes require that *at most one* pcluster can use them at any given time. As placement changes, the coordination controller is responsible to serialize the state changes of the volume to move the ownership of the volume safely. First, it will detach the volume from the current owner, and wait for it to acknowledge that it successfully removed it, and only then will sync the volume to a new target. + +Owned volumes require that *at most one* pcluster can use them at any given time. As placement changes, the coordination +controller is responsible to serialize the state changes of the volume to move the ownership of the volume safely. +First, it will detach the volume from the current owner, and wait for it to acknowledge that it successfully removed it, +and only then will sync the volume to a new target. #### Forcing detach with fencing -However, in case the owner is not able to acknowledge that it detached the volume, a forced-detach flow might be possible. The storage system has to support a CSI extension for network fencing, effectively blocking an entire pcluster from accessing the storage until fencing is removed. Once the failed pcluster recovers, and can acknowledge that it detached from the moved volumes, fencing will be removed from the storage and that pcluster can recover the rest of its workloads. + +However, in case the owner is not able to acknowledge that it detached the volume, a forced-detach flow might be +possible. The storage system has to support a CSI extension for network fencing, effectively blocking an entire pcluster +from accessing the storage until fencing is removed. 
Once the failed pcluster recovers, and can acknowledge that it +detached from the moved volumes, fencing will be removed from the storage and that pcluster can recover the rest of its +workloads. - [kubernetes-csi-addons](https://github.com/csi-addons/kubernetes-csi-addons) -- [NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/main/docs/networkfence.md) (currently implemented only by ceph-csi). +- [NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/main/docs/networkfence.md) (currently + implemented only by ceph-csi). ## Storage classes > **TBD** - this section is a work in progress... -Storage classes can be thought of as templates to PVs, which allow pclusters to support multiple storage providers, or configure different policies for the same provider. Just like PVs are invisible to users, so do storage classes. However, users may choose a storage class by name when specifying their PVCs. When the storage class field is left unspecified (which is common), the pcluster will use its default storage class. However, the default storage class is a bit limited for multi-tenancy because it is one class per the entire pcluster. +Storage classes can be thought of as templates to PVs, which allow pclusters to support multiple storage providers, or +configure different policies for the same provider. Just like PVs are invisible to users, so do storage classes. +However, users may choose a storage class by name when specifying their PVCs. When the storage class field is left +unspecified (which is common), the pcluster will use its default storage class. However, the default storage class is a +bit limited for multi-tenancy because it is one class per the entire pcluster. -Matching storage classes between `SyncTargets` in the same `Location` would be a simple way to ensure that storage can be moved transparently. However KCP should be able to verify the storage classes match across the `Location` and warn when this is not the case, to prevent future issues. +Matching storage classes between `SyncTargets` in the same `Location` would be a simple way to ensure that storage can +be moved transparently. However KCP should be able to verify the storage classes match across the `Location` and warn +when this is not the case, to prevent future issues. #### Open questions + - How to match classes and make sure the same storage system is used in the location? - How to support multiple classes per pcluster (eg. RWO + RWX)? - Maybe a separate `SyncTarget` per class? @@ -96,13 +197,21 @@ Matching storage classes between `SyncTargets` in the same `Location` would be a > **TBD** - this section is a work in progress... -[Persistent-volumes reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming) allows volumes to be configured how to behave when they are reclaimed. By default, storage classes will apply a `reclaimPolicy: Delete` to dynamically provisioned PVs unless explicitly specified to `Retain`. This means that volumes there were provisioned, will also get de-provisioned and their storage will be deleted. However, admins can modify the class to `Retain` volumes, and invoke cleanup on their own schedule. +[Persistent-volumes reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming) allows +volumes to be configured how to behave when they are reclaimed. By default, storage classes will apply +a `reclaimPolicy: Delete` to dynamically provisioned PVs unless explicitly specified to `Retain`. 
This means that +volumes there were provisioned, will also get de-provisioned and their storage will be deleted. However, admins can +modify the class to `Retain` volumes, and invoke cleanup on their own schedule. -While moving volumes, either shared or owned, the volume's `reclaimPolicy` will be set to `Retain` to prevent any `SyncTarget` from releasing the volume storage on scheduling changes. +While moving volumes, either shared or owned, the volume's `reclaimPolicy` will be set to `Retain` to prevent +any `SyncTarget` from releasing the volume storage on scheduling changes. -Once the PVC is marked for deletion on KCP, the coordination controller will first pick one `SyncTarget` as owner (or use the current owner for owned volumes) and make sure to remove all sharers, and wait for their sync state to be cleared. Then it will set the owner's volume `reclaimPolicy` to `Delete` so that it will release the volume storage. +Once the PVC is marked for deletion on KCP, the coordination controller will first pick one `SyncTarget` as owner (or +use the current owner for owned volumes) and make sure to remove all sharers, and wait for their sync state to be +cleared. Then it will set the owner's volume `reclaimPolicy` to `Delete` so that it will release the volume storage. -Setting a PV to `Retain` on KCP itself should also be respected by the controllers and allow manual cleanup of the volume in KCP, instead of automatically with the PVC. +Setting a PV to `Retain` on KCP itself should also be respected by the controllers and allow manual cleanup of the +volume in KCP, instead of automatically with the PVC. ## Copying volumes @@ -124,11 +233,14 @@ Setting a PV to `Retain` on KCP itself should also be respected by the controlle > **TBD** - this section is a work in progress... #### Shared NFS storage + - NFS server running in every location, external to the `SyncTarget`, but available over the network. -- Note that high-availability and data-protection of the storage itself is out of scope and would be handled by storage admin or provided by enterprise products. +- Note that high-availability and data-protection of the storage itself is out of scope and would be handled by storage + admin or provided by enterprise products. - Workloads allow volumes with RWX access-mode. - KCP picks one `SyncTarget` to be the provisioner and syncs up the volume information. -- After provisioning completes, sync down to any `SyncTarget` in the `Location` that the workload decides to be placed to allow moving transparently as needed when clusters become offline or drained. +- After provisioning completes, sync down to any `SyncTarget` in the `Location` that the workload decides to be placed + to allow moving transparently as needed when clusters become offline or drained. - Once the PVC is deleted, the deletion of the volume itself is performed by one of the `SyncTargets`. ## Roadmap diff --git a/docs/content/concepts/terminology.md b/docs/content/concepts/terminology.md new file mode 100644 index 00000000000..f550d93b4aa --- /dev/null +++ b/docs/content/concepts/terminology.md @@ -0,0 +1,116 @@ +--- +description: > + Contains the definitions shared across design documents around prototyping a kube-like control plane (in KCP). This is + a derivative work of other design documents intended to frame terminology. All future statements that may be changed by + designs is covered by those designs, and not duplicated here. 
+--- + +# Terminology for kcp + +## Logical cluster + +A logical cluster is a way to subdivide a single kube-apiserver + etcd storage into multiple clusters (different APIs, +separate semantics for access, policy, and control) without requiring multiple instances. A logical cluster is a +mechanism for achieving separation, but may be modelled differently in different use cases. A logical cluster is +similar to a virtual cluster as defined by sig-multicluster, but is able to amortize the cost of a new cluster to be +zero or near-zero memory and storage so that we can create tens of millions of empty clusters cheaply. + +A logical cluster is a storage level concept that adds an additional attribute to an object’s identifier on a +kube-apiserver. Regular servers identify objects by (group, version, resource, optional namespace, name). A logical +cluster enriches an identifier: (group, version, resource, **logical cluster name**, optional namespace, name). + +## Workload Cluster + +A physical cluster is a “real Kubernetes cluster”, i.e. one that can run Kubernetes workloads and accepts standard +Kubernetes API objects. For the near term, it is assumed that a physical cluster is a distribution of Kubernetes and +passes the conformance tests and exposes the behavior a regular Kubernetes admin or user expects. + +## Workspace + +A workspace models a set of user-facing APIs for CRUD. Each workspace is backed by a logical cluster, but not all +logical clusters may be exposed as workspaces. Creating a Workspace object results in a logical cluster being available +via a URL for the client to connect and create resources supported by the APIs in that workspace. There could be +multiple different models that result in logical clusters being created, with different policies or lifecycles, but +Workspace is intended to be the most generic representation of the concept with the broadest possible utility to anyone +building control planes. + +A workspace binds APIs and makes them accessible inside the logical cluster, allocates capacity for creating instances +of those APIs (quota), and defines how multi-workspace operations can be performed by users, clients, and controller +integrations. + +To a user, a workspace appears to be a Kubernetes cluster minus all the container orchestration specific resources. It +has its own discovery, its own OpenAPI spec, and follows the kube-like constraints about uniqueness of +Group-Version-Resource and its behaviour (no two GVRs with different schemas can exist per workspace, but workspaces can +have different schemas). A user can define a workspace as a context in a kubeconfig file and `kubectl get all -A` would +return all objects in all namespaces of that workspace. + +Workspace naming is chosen to be aligned with the Kubernetes Namespace object - a Namespace subdivides a workspace by +name, a workspace subdivides the universe into chunks of meaningful work. + +Workspaces are the containers for all API objects, so users orient by viewing lists of workspaces from APIs. + +## Workspace type + +Workspaces have types, which are mostly oriented around a set of default or optional APIs exposed. For instance, a +workspace intended for use deploying Kube applications might expose the same API objects a user would encounter on a +physical cluster. A workspace intended for building functions might expose only the knative serving APIs, config maps +and secrets, and optionally enable knative eventing APIs. 
+ +At the current time there is no decision on whether a workspace type represents an inheritance or composition model, +although in general we prefer composition approaches. We also do not have a fully resolved design. + +## Virtual Workspace + +An API object has one source of truth (is stored transactionally in one system), but may be exposed to different use +cases with different fields or schemas. Since a workspace is the user facing interaction with an API object, if we want +to deal with Workspaces in aggregate, we need to be able to list them. Since a user may have access to workspaces in +multiple different contexts, or for different use cases (a workspace that belongs to the user personally, or one that +belongs to a business organization), the list of “all workspaces” itself needs to be exposed as an API object to an end +user inside a workspace. That workspace is “virtual” - it adapts or transforms the underlying source of truth for the +object and potentially the schema the user sees. + +## Index (e.g. Workspace Index) + +An index is the authoritative list of a particular API in their source of truth across the system. For instance, in +order for a user to see all the workspaces they have available, they must consult the workspace index to return a list +of their workspaces. It is expected that indices are suitable for consistent LIST/WATCHing (in the kubernetes sense) so +that integrations can be built to view the list of those objects. + +Index in the control plane sense should not be confused with secondary indices (in the database sense), which may be +used to enable a particular index. + +## Shard + +A failure domain within the larger control plane service that cuts across the primary functionality. Most distributed +systems must separate functionality across shards to mitigate failures, and typically users interact with shards through +some transparent serving infrastructure. Since the primary problem of building distributed systems is reasoning about +failure domains and dependencies across them, it is critical to allow operators to effectively match shards, understand +dependencies, and bring them together. + +A control plane should be shardable in a way that maximizes application SLO - gives users a tool that allows them to +better define their applications not to fail. + +## API Binding + +The act of associating a set of APIs with a given logical cluster. The Workspace model defines one particular +implementation of the lifecycle of a logical cluster and the APIs within it. Because APIs and the implementations that +back an API evolve over time, it is important that the binding be introspectable and orchestrate-able - that a consumer +can provide a rolling deployment of a new API or new implementation across hundreds or thousands of workspaces. + +There are likely a few objects involved in defining the APIs exposed within a workspace, but in general they probably +define a spec (which APIs / implementations to associate with) and a status (the chosen APIs / implementations that are +currently bound), allow a user to bulk associate APIs (i.e. multiple APIs at the same time, like “all knative serving +APIs”), and may be defaulted based on some attributes of a workspace type (all workspaces of this “type” get the default +Kube APIs, this other “type” get the knative apis). + +The evolution of an API within a workspace and across workspaces is of key importance. 
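Since the binding object is described only speculatively in this document ("likely", "probably"), the following is a purely illustrative sketch of the spec/status split described above; the API group and field names are hypothetical and are not the shipped kcp API.

```bash
# Purely illustrative sketch of a binding object: spec says which APIs to associate with,
# status reports what is currently bound. All names below are hypothetical.
cat <<'EOF'
apiVersion: bindings.example.dev/v1alpha1   # hypothetical API group
kind: APIBinding
metadata:
  name: knative-serving
spec:
  reference:
    export: knative-serving                 # bulk-associate "all knative serving APIs"
status:
  boundAPIs:                                # filled in by the binding controller once bound
    - group: serving.knative.dev
      resource: services
EOF
```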
+ +## Syncer + +A syncer is installed on a SyncTarget and is responsible for synchronizing data between kcp and that cluster. + +## Location + +A collection of SyncTargets that describe runtime characteristics that allow placement of applications. +Characteristics are not limited but could describe things like GPU, supported storage, compliance or +regulatory fulfillment, or geographical placement. diff --git a/docs/content/concepts/workspaces.md b/docs/content/concepts/workspaces.md index 7796acc9cf1..8ac6cd15365 100644 --- a/docs/content/concepts/workspaces.md +++ b/docs/content/concepts/workspaces.md @@ -1,6 +1,6 @@ --- description: > - What are workspaces and how to use them + What are workspaces and how to use them. --- # Workspaces diff --git a/docs/content/developers/inspecting-e2e-metrics.md b/docs/content/developers/inspecting-e2e-metrics.md index 9de474b098f..09c57121dc9 100644 --- a/docs/content/developers/inspecting-e2e-metrics.md +++ b/docs/content/developers/inspecting-e2e-metrics.md @@ -1,6 +1,6 @@ --- description: > - Inspecting Prometheus metrics of e2e runs + Inspecting Prometheus metrics of e2e runs. --- # Inspecting Prometheus metrics for e2e runs diff --git a/docs/content/developers/publishing-a-new-kcp-release.md b/docs/content/developers/publishing-a-new-kcp-release.md index 25aaba7629a..4e8a9a28f2a 100644 --- a/docs/content/developers/publishing-a-new-kcp-release.md +++ b/docs/content/developers/publishing-a-new-kcp-release.md @@ -1,6 +1,6 @@ --- description: > - Information on the kcp release process + Information on the kcp release process. --- # Publishing a new kcp release diff --git a/docs/content/developers/replicate-new-resource.md b/docs/content/developers/replicate-new-resource.md index 1314540e80f..f95be44b9fa 100644 --- a/docs/content/developers/replicate-new-resource.md +++ b/docs/content/developers/replicate-new-resource.md @@ -1,6 +1,6 @@ --- description: > - How to add a new resource for replication by the cache server + How to add a new resource for replication by the cache server. --- # Replicating new resources in the cache server diff --git a/docs/content/developers/using-kcp-as-a-library.md b/docs/content/developers/using-kcp-as-a-library.md index acc01c87f57..0b174b5c525 100644 --- a/docs/content/developers/using-kcp-as-a-library.md +++ b/docs/content/developers/using-kcp-as-a-library.md @@ -1,7 +1,7 @@ --- linkTitle: "Library Usage" description: > - How to use kcp as a library + How to use kcp as a library. --- # Using kcp as a library diff --git a/docs/content/developers/writing-kcp-aware-controllers.md b/docs/content/developers/writing-kcp-aware-controllers.md index 5b137e64b3f..254c45ea3ae 100644 --- a/docs/content/developers/writing-kcp-aware-controllers.md +++ b/docs/content/developers/writing-kcp-aware-controllers.md @@ -2,7 +2,7 @@ linkTitle: "kcp-aware controllers" weight: 1 description: > - How to write a kcp-aware controller + How to write a kcp-aware controller. 
--- # Writing kcp-aware controllers diff --git a/docs/content/index.md b/docs/content/index.md index dfa048ef92e..2049b32255a 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -168,8 +168,8 @@ documentation: - [Concepts](concepts) - a high level overview of kcp concepts - [Workspaces](concepts/workspaces.md) - a more thorough introduction on kcp's workspaces -- [Locations & scheduling](concepts/locations-and-scheduling.md) - details on kcp's primitives that abstract over clusters -- [Syncer](concepts/syncer.md) - information on running the kcp agent that syncs content between kcp and a physical cluster +- [Locations & scheduling](concepts/placement-locations-and-scheduling.md) - details on kcp's primitives that abstract over clusters +- [Syncer](concepts/registering-kubernetes-clusters-using-syncer.md) - information on running the kcp agent that syncs content between kcp and a physical cluster - [kubectl plugin](concepts/kubectl-kcp-plugin.md) - [Authorization](concepts/authorization.md) - how kcp manages access control to workspaces and content - [Virtual workspaces](concepts/virtual-workspaces.md) - details on kcp's mechanism for virtual views of workspace content diff --git a/hack/deploy-docs.sh b/docs/scripts/deploy-docs.sh similarity index 78% rename from hack/deploy-docs.sh rename to docs/scripts/deploy-docs.sh index 74e3195d827..b82b7b2d476 100755 --- a/hack/deploy-docs.sh +++ b/docs/scripts/deploy-docs.sh @@ -19,7 +19,7 @@ set -o nounset set -o pipefail set -o xtrace -REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) cd "$REPO_ROOT/docs" if [[ "${GITHUB_EVENT_NAME:-}" == "pull_request" ]]; then @@ -40,7 +40,6 @@ else fi fi - MIKE_OPTIONS=() if [[ -n "${REMOTE:-}" ]]; then @@ -51,13 +50,15 @@ if [[ -n "${BRANCH:-}" ]]; then MIKE_OPTIONS+=(--branch "$BRANCH") fi -git config user.name kcp-docs-bot -git config user.email no-reply@kcp.io - -# Only push to gh-pages if we're in GitHub Actions (CI is set) and we have a non-PR event. -if [[ -n "${CI:-}" && "${GITHUB_EVENT_NAME:-}" == "push" ]]; then - MIKE_OPTIONS+=(--push) +if [[ -n "${CI:-}" ]]; then + if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then + # Only push to gh-pages if we're in GitHub Actions (CI is set) and we have a non-PR event. + MIKE_OPTIONS+=(--push) + fi + # Always set git user info in CI because even if we're not pushing, we need it + git config user.name kcp-docs-bot + git config user.email no-reply@kcp.io fi mike deploy "${MIKE_OPTIONS[@]}" "$VERSION" diff --git a/hack/serve-docs.sh b/docs/scripts/serve-docs.sh similarity index 93% rename from hack/serve-docs.sh rename to docs/scripts/serve-docs.sh index 8bd9c3f671c..45aa61908dd 100755 --- a/hack/serve-docs.sh +++ b/docs/scripts/serve-docs.sh @@ -18,7 +18,7 @@ set -o errexit set -o nounset set -o pipefail -REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) cd "$REPO_ROOT/docs" MIKE_OPTIONS=()
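For reference, a rough sketch of driving the relocated scripts by hand, mirroring what `make serve-docs` and `make deploy-docs` do after this change; the `.venv` path and the `origin`/`gh-pages` values are assumptions standing in for the Makefile's `VENV`, `REMOTE` and `BRANCH` variables.

```bash
#!/usr/bin/env bash
# Rough sketch only - assumes `make venv` has created a virtualenv whose activate script
# lives at .venv/activate (a stand-in for the Makefile's $(VENV)).
set -o errexit -o nounset -o pipefail

. .venv/activate

# Serve the docs locally through the relocated script (blocks while serving).
VENV=.venv REMOTE=origin BRANCH=gh-pages docs/scripts/serve-docs.sh

# Or build a local deployment instead; without CI set and GITHUB_EVENT_NAME=push,
# deploy-docs.sh adds no --push flag, so nothing is pushed to gh-pages.
# REMOTE=origin BRANCH=gh-pages docs/scripts/deploy-docs.sh
```

Because `git config user.name`/`user.email` are now set only when `CI` is set, a local run of `deploy-docs.sh` relies on whatever git identity is already configured in the clone.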