diff --git a/LICENSE.md b/LICENSE.md index 1319ef937..cbfddd387 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -299,6 +299,12 @@ Alibaba Cloud Go Software Development Kit. https://github.com/aliyun/alibaba-cloud-sdk-go. Apache 2 license (https://github.com/aliyun/alibaba-cloud-sdk-go/blob/master/LICENSE) +gRPC-Go. +https://github.com/grpc/grpc-go. +Copyright 2017 gRPC authors. +Apache 2 license (https://github.com/grpc/grpc-go/blob/master/LICENSE) + + ------ ## BSD 3-clause "New" or "Revised" License diff --git a/Makefile b/Makefile index 16986dda4..6834d8654 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ start: --machine-safety-apiserver-statuscheck-period=1m \ --machine-safety-orphan-vms-period=30m \ --machine-safety-overshooting-period=1m \ - --v=2 + --v=3 ################################################################# # Rules related to binary build, Docker image build and release # diff --git a/README.md b/README.md index 15e0ba34f..d51fbe27e 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ kubectl create/get/delete machine vm1 Nodes/Machines/VMs are different terminologies used to represent similar things. We use these terms in the following way -1. VM: A virtual machine running on any cloud provider. +1. VM: A virtual machine running on any cloud provider. It could also refer to a physical machine (PM) in case of a bare metal setup. 1. Node: Native kubernetes node objects. The objects you get to see when you do a *"kubectl get nodes"*. Although nodes can be either physical/virtual machines, for the purposes of our discussions it refers to a VM. 1. Machine: A VM that is provisioned/managed by the Machine Controller Manager. diff --git a/cmd/machine-controller-manager/app/controllermanager.go b/cmd/machine-controller-manager/app/controllermanager.go index 6c09a1c10..760da0748 100644 --- a/cmd/machine-controller-manager/app/controllermanager.go +++ b/cmd/machine-controller-manager/app/controllermanager.go @@ -34,8 +34,9 @@ import ( machinescheme "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/scheme" machineinformers "github.com/gardener/machine-controller-manager/pkg/client/informers/externalversions" - machinecontroller "github.com/gardener/machine-controller-manager/pkg/controller" - corecontroller "github.com/gardener/machine-controller-manager/pkg/util/controller" + mcmcontroller "github.com/gardener/machine-controller-manager/pkg/controller" + corecontroller "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/core" + machinecontroller "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/machine" coreinformers "k8s.io/client-go/informers" kubescheme "k8s.io/client-go/kubernetes/scheme" @@ -63,12 +64,11 @@ const ( controllerManagerAgentName = "machine-controller-manager" ) -var openStackGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "openstackmachineclasses"} -var awsGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "awsmachineclasses"} -var azureGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "azuremachineclasses"} -var gcpGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "gcpmachineclasses"} -var alicloudGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "alicloudmachineclasses"} -var packetGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", 
Resource: "packetmachineclasses"} +var ( + machineGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "machines"} + machineSetGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "machinesets"} + machineDeploymentGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "machinedeployments"} +) // Run runs the MCMServer. This should never exit. func Run(s *options.MCMServer) error { @@ -224,7 +224,7 @@ func StartControllers(s *options.MCMServer, klog.Fatal(err) } - if availableResources[awsGVR] || availableResources[azureGVR] || availableResources[gcpGVR] || availableResources[openStackGVR] || availableResources[alicloudGVR] || availableResources[packetGVR] { + if availableResources[machineGVR] || availableResources[machineSetGVR] || availableResources[machineDeploymentGVR] { klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory( @@ -250,7 +250,7 @@ func StartControllers(s *options.MCMServer, machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1() klog.V(5).Infof("Creating controllers...") - machineController, err := machinecontroller.NewController( + mcmcontroller, err := mcmcontroller.NewController( s.Namespace, controlMachineClient, controlCoreClient, @@ -283,10 +283,10 @@ func StartControllers(s *options.MCMServer, targetCoreInformerFactory.Start(stop) klog.V(5).Info("Running controller") - go machineController.Run(int(s.ConcurrentNodeSyncs), stop) + go mcmcontroller.Run(int(s.ConcurrentNodeSyncs), stop) } else { - return fmt.Errorf("unable to start machine controller: API GroupVersion %q or %q or %q or %q or %q or %q is not available; found %#v", awsGVR, azureGVR, gcpGVR, openStackGVR, alicloudGVR, packetGVR, availableResources) + return fmt.Errorf("unable to start machine controller: API GroupVersion %q or %q or %q is not available; \nFound: %#v", machineGVR, machineSetGVR, machineDeploymentGVR, availableResources) } select {} diff --git a/docs/README.md b/docs/README.md index 72da77fda..704f052f1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,18 +1,25 @@ # Documentation Index -## Development +## Using Out-of-Tree (External) provider support (Recommended) +#### Development +* [Adding support for a new cloud provider](development/cp_support_new.md) + +## Using In-Tree provider support (:warning: DEPRECATED!) 
+
+#### Development
+
+* [Adding support for a new cloud provider](development/cp_support_old.md)
 * [Setting up a local development environment](development/local_setup.md)
 * [Testing and Dependency Management](development/testing_and_dependencies.md)
-* [Adding support for a new cloud provider](development/new_cp_support.md)
 
-## Usage
+#### Usage
 
 * [Setting up your usage environment](usage/prerequisite.md)
 * [Creating/Deleting machines (VM)](usage/machine.md)
 * [Maintaining machine replicas using machines-sets](usage/machine_set.md)
 * [Updating machines using machines-deployments](usage/machine_deployment.md)
 
-## Deployment
+#### Deployment
 
-* [Deploying the Machine Controller Manager into a Kubernetes cluster](deployment/kubernetes.md)
+* [Deploying the MCM into a Kubernetes cluster using IN-TREE providers](deployment/kubernetes.md)
diff --git a/docs/deployment/kubernetes.md b/docs/deployment/kubernetes.md
index 4f543524e..697279270 100644
--- a/docs/deployment/kubernetes.md
+++ b/docs/deployment/kubernetes.md
@@ -37,19 +37,19 @@ $ make docker-image
 $ make push
 ```
 
-- Now you can deploy this docker image to your cluster. A sample development [file is given at](/kubernetes/deployment/deployment.yaml). By default, the deployment manages the cluster it is running in. Optionally, the kubeconfig could also be passed as a flag as described in `/kubernetes/deployment/deployment.yaml`. This is done when you want your controller running outside the cluster to be managed from.
+- Now you can deploy this docker image to your cluster. A sample deployment file is given [here](/kubernetes/deployment/in-tree/deployment.yaml). By default, the deployment manages the cluster it is running in. Optionally, the kubeconfig could also be passed as a flag as described in `/kubernetes/deployment/in-tree/deployment.yaml`. This is useful when you want to run the controller outside the cluster that it manages.
 
 ```bash
-$ kubectl apply -f kubernetes/deployment/deployment.yaml
+$ kubectl apply -f kubernetes/deployment/in-tree/deployment.yaml
 ```
 
 - Also deploy the required clusterRole and clusterRoleBindings
 
 ```bash
-$ kubectl apply -f kubernetes/deployment/clusterrole.yaml
-$ kubectl apply -f kubernetes/deployment/clusterrolebinding.yaml
+$ kubectl apply -f kubernetes/deployment/in-tree/clusterrole.yaml
+$ kubectl apply -f kubernetes/deployment/in-tree/clusterrolebinding.yaml
 ```
 
 ## Configuring optional parameters while deploying
 
-Machine-controller-manager supports several configurable parameters while deploying. Refer to [the following lines](/kubernetes/deployment/deployment.yaml#L21-L30), to know how each parameter can be configured, and what it's purpose is for.
+Machine-controller-manager supports several configurable parameters while deploying. Refer to [the following lines](/kubernetes/deployment/in-tree/deployment.yaml#L21-L30) to see how each parameter can be configured and what its purpose is.
 
 ## Usage
diff --git a/docs/development/cp_support_new.md b/docs/development/cp_support_new.md
new file mode 100644
index 000000000..3c7544682
--- /dev/null
+++ b/docs/development/cp_support_new.md
@@ -0,0 +1,144 @@
+# Adding support for a new provider
+
+The steps to be followed while implementing a new (hyperscale) provider are mentioned below. This is the easiest way to add support for a new provider, as it starts from a blueprint codebase.
+
+However, you may also develop your machine controller from scratch, which would give you more flexibility.
+However, make sure that your custom machine controller adheres to the `Machine.Status` struct defined in the [MachineAPIs](/pkg/apis/machine/types.go), so that the MCM can interoperate with higher-level controllers like the MachineSet and MachineDeployment controllers. The key field is `Machine.Status.CurrentStatus.Phase`, which indicates the status of the machine object.
+
+Our strong recommendation is to follow the steps below, as they provide most of the flexibility required to support machine management for new providers. And if you feel the need to extend the functionality, feel free to update our [machine controller libraries](/pkg/util/provider).
+
+## Setting up your repository
+
+1. Create a new empty repository named `machine-controller-manager-provider-{provider-name}` on your github username/project. Do not initialize this repository with a README.
+1. Copy the remote repository `URL` (HTTPS/SSH) that is displayed once you create the repository.
+1. Now on your local system, create directories as required. {your-github-username} given below could also be {github-project} depending on where you have created the new repository.
+    ```bash
+    mkdir -p $GOPATH/src/github.com/{your-github-username}
+    ```
+1. Navigate to this created directory.
+    ```bash
+    cd $GOPATH/src/github.com/{your-github-username}
+    ```
+1. Clone [this repository](https://github.com/gardener/machine-controller-manager-provider-sampleprovider) on your local machine.
+    ```bash
+    git clone git@github.com:gardener/machine-controller-manager-provider-sampleprovider.git
+    ```
+1. Rename the directory from `machine-controller-manager-provider-sampleprovider` to `machine-controller-manager-provider-{provider-name}`.
+    ```bash
+    mv machine-controller-manager-provider-sampleprovider machine-controller-manager-provider-{provider-name}
+    ```
+1. Navigate into the newly created directory.
+    ```bash
+    cd machine-controller-manager-provider-{provider-name}
+    ```
+1. Update the remote `origin` URL to the newly created repository's URL you copied above.
+    ```bash
+    git remote set-url origin git@github.com:{your-github-username}/machine-controller-manager-provider-{provider-name}.git
+    ```
+1. Rename the github project from `gardener` to `{github-org/your-github-username}` wherever you have cloned the repository above. Use the hack script given below to do the same.
+    ```bash
+    make rename-project PROJECT_NAME={github-org/your-github-username}
+    eg:
+        make rename-project PROJECT_NAME=gardener (or)
+        make rename-project PROJECT_NAME=githubusername
+    ```
+1. Rename all files and code from `SampleProvider` to your desired `{provider-name}`. Use the hack script given below to do the same. {provider-name} is case-sensitive.
+    ```bash
+    make rename-provider PROVIDER_NAME={provider-name}
+    eg:
+        make rename-provider PROVIDER_NAME=AmazonWebServices (or)
+        make rename-provider PROVIDER_NAME=AWS
+    ```
+1. Now commit your changes and push them upstream.
+    ```bash
+    git add -A
+    git commit -m "Renamed SampleProvider to {provider-name}"
+    git push origin master
+    ```
+
+## Code changes required
+
+The contract between the Machine Controller Manager (MCM) and the Machine Controller (MC), a.k.a. the driver, is [documented here](machine_error_codes.md), and the [machine error codes can be found here](/pkg/util/provider/machinecodes/codes/codes.go). You may refer to them for any queries.
+
+:warning:
+- Keep in mind that **there should be a unique way to map between machine objects and VMs**.
+This can be done by mapping machine object names to VM names, tags, or other metadata.
+- Optionally, there should also be a unique way to map a VM to its machine class object. This can be done by tagging VM objects with tags/resource-groups associated with the machine class.
+
+#### Steps to integrate
+
+1. Update the `pkg/provider/apis/provider_spec.go` specification file to reflect the structure of the `ProviderSpec` blob. It typically contains the machine template details in the `MachineClass` object. Follow the sample spec provided already in the file. A sample provider specification can be found [here](https://github.com/gardener/machine-controller-manager-provider-aws/blob/master/pkg/aws/apis/aws_provider_spec.go), and a minimal sketch is given after this list.
+1. Fill in the methods described at `pkg/provider/core.go` to manage VMs on your cloud provider. Comments are provided above each method to help you fill them up with the desired `REQUEST` and `RESPONSE` parameters.
+    - A sample provider implementation for these methods can be found [here](https://github.com/gardener/machine-controller-manager-provider-aws/blob/master/pkg/aws/core.go).
+    - Fill in the required methods `CreateMachine()` and `DeleteMachine()`.
+    - Optionally fill in methods like `GetMachineStatus()`, `ListMachines()`, and `GetVolumeIDs()`. You may choose to fill these in once the required methods are working.
+    - `GetVolumeIDs()` expects VolumeIDs to be decoded from the volumeSpec based on the cloud provider.
+1. Perform validation of the APIs that you have described and make it a part of your methods as required for each request.
+1. Write unit tests for your implementation and run them with `make test`.
+    ```bash
+    make test
+    ```
+1. Re-generate the vendored code to pick up any newly imported dependencies.
+    ```bash
+    make revendor
+    ```
+1. Update the sample YAML files in the `kubernetes/` directory to provide sample files through which the working of the machine controller can be tested.
+1. Update `README.md` to reflect any additional changes.
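+
+For illustration, a minimal `provider_spec.go` could look like the sketch below. This is not the authoritative structure — the field names (`vmPool`, `size`, `rootFsSize`, `tags`) are hypothetical and should be replaced with whatever your provider's create-VM API actually needs.
+
+```go
+package api
+
+// ProviderSpec is a sketch of the provider-specific machine template that
+// is embedded as a raw JSON blob in the MachineClass object. All field
+// names below are examples only.
+type ProviderSpec struct {
+	// VMPool is the name of the VM pool to create machines in.
+	VMPool string `json:"vmPool"`
+
+	// Size is the VM size, e.g. "small" or "xsmall".
+	Size string `json:"size"`
+
+	// RootFsSize is the optional root (/) filesystem size in GB.
+	RootFsSize int `json:"rootFsSize,omitempty"`
+
+	// Tags are attached to the VM; they are typically used to map the VM
+	// back to its machine object and cluster.
+	Tags map[string]string `json:"tags,omitempty"`
+}
+```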
+
+## Testing your code changes
+
+Make sure `$TARGET_KUBECONFIG` points to the cluster where you wish to manage machines. `$CONTROL_NAMESPACE` represents the namespace where the MCM looks for machine CR objects, and `$CONTROL_KUBECONFIG` points to the cluster which holds these machine CRs.
+
+1. On the first terminal running at `$GOPATH/src/github.com/{github-org/your-github-username}/machine-controller-manager-provider-{provider-name}`,
+    - Run the machine controller (driver) using the command below.
+        ```bash
+        make start
+        ```
+1. On the second terminal pointing to `$GOPATH/src/github.com/gardener`,
+    - Clone the [latest MCM code](https://github.com/gardener/machine-controller-manager)
+        ```bash
+        git clone git@github.com:gardener/machine-controller-manager.git
+        ```
+    - Navigate to the newly created directory.
+        ```bash
+        cd machine-controller-manager
+        ```
+    - Deploy the required CRDs from the machine-controller-manager repo,
+        ```bash
+        kubectl apply -f kubernetes/crds.yaml
+        ```
+    - Run the machine-controller-manager in the `cmi-client` branch
+        ```bash
+        make start
+        ```
+1. On the third terminal pointing to `$GOPATH/src/github.com/{github-org/your-github-username}/machine-controller-manager-provider-{provider-name}`
+    - Fill in the object files given below and deploy them as described below.
+    - Deploy the `machine-class`
+        ```bash
+        kubectl apply -f kubernetes/machine-class.yaml
+        ```
+    - Deploy the `kubernetes secret` if required.
+        ```bash
+        kubectl apply -f kubernetes/secret.yaml
+        ```
+    - Deploy the `machine` object and make sure it joins the cluster successfully.
+        ```bash
+        kubectl apply -f kubernetes/machine.yaml
+        ```
+    - Once the machine joins, you can test scaling by deploying a machine-deployment.
+    - Deploy the `machine-deployment` object and make sure the machines it creates join the cluster successfully.
+        ```bash
+        kubectl apply -f kubernetes/machine-deployment.yaml
+        ```
+    - Make sure to delete both the `machine` and `machine-deployment` objects after use.
+        ```bash
+        kubectl delete -f kubernetes/machine.yaml
+        kubectl delete -f kubernetes/machine-deployment.yaml
+        ```
+
+## Releasing your docker image
+
+1. Make sure you have logged into gcloud/docker using the CLI.
+2. To release your docker image, run the following.
+    ```bash
+    make release IMAGE_REPOSITORY=
+    ```
+3. A sample kubernetes deploy file can be found at `kubernetes/deployment.yaml`. Update it (with your desired MCM and MC images) to deploy your MCM pod.
\ No newline at end of file
diff --git a/docs/development/new_cp_support.md b/docs/development/cp_support_old.md
similarity index 94%
rename from docs/development/new_cp_support.md
rename to docs/development/cp_support_old.md
index cb5d0e11c..af0e8bac3 100644
--- a/docs/development/new_cp_support.md
+++ b/docs/development/cp_support_old.md
@@ -15,5 +15,5 @@ For adding support for a new cloud provider in the Machine Controller Manager, f
 1. Update `pkg/controller/controller.go` to add new providerMachineClassLister, providerMachineClassQueue, awsMachineClassSynced into the controller struct. Also initialize them in NewController() method.
 1. Add a new file `pkg/controller/providermachineclass.go` that allows re-queuing of machines which refer to an modified providerMachineClass.
 1. Update `pkg/controller/controller.go` to extend `WaitForCacheSync` and `.Shutdown()` similar to other cloud providers.
-1. Update the example ClusterRole in `kubernetes/deployment/clusterrole.yaml` to allow operations on your new machine class.
+1. Update the example ClusterRole in `kubernetes/deployment/in-tree/clusterrole.yaml` to allow operations on your new machine class.
 1. Update `pkg/controller/controller.go`, `pkg/controller/secret.go`, `pkg/controller/secret_util.go` to add event handlers to add/remove finalizers referenced by your machine Class. Refer [this commit](https://github.com/gardener/machine-controller-manager/pull/104/commits/013f70726b1057aed1cf7fe0f0449922ab9a256a).
diff --git a/docs/development/local_setup.md b/docs/development/local_setup.md
index d74e049e0..f2b302057 100644
--- a/docs/development/local_setup.md
+++ b/docs/development/local_setup.md
@@ -77,9 +77,11 @@ I1227 11:08:19.963638 55523 controllermanager.go:204] Starting shared informer
 I1227 11:08:20.766085 55523 controller.go:247] Starting machine-controller-manager
 ```
 
+:warning: The file `dev/target-kubeconfig.yaml` points to the cluster whose nodes you want to manage. `dev/control-kubeconfig.yaml` points to the cluster from where you want to manage the nodes. However, `dev/control-kubeconfig.yaml` is optional.
+
 The Machine Controller Manager should now be ready to manage the VMs in your kubernetes cluster.
 
-:warning: The file `dev/target-kubeconfig.yaml` points to the cluster whose nodes you want to manage. `dev/control-kubeconfig.yaml` points to the cluster from where you want to manage the nodes from. However, `dev/control-kubeconfig.yaml` is optional.
+
+:warning: This assumes that your MCM is built to manage machines for the in-tree supported providers. There is a new way to deploy and manage out-of-tree (external) provider support, whose development documentation can be [found here](cp_support_new.md).
 
 ## Testing Machine Classes
diff --git a/docs/development/machine_error_codes.md b/docs/development/machine_error_codes.md
new file mode 100644
index 000000000..4d2c8b7c7
--- /dev/null
+++ b/docs/development/machine_error_codes.md
@@ -0,0 +1,450 @@
+# Machine Error code handling
+
+## Notational Conventions
+
+The keywords "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" are to be interpreted as described in [RFC 2119](http://tools.ietf.org/html/rfc2119) (Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, March 1997).
+
+The key words "unspecified", "undefined", and "implementation-defined" are to be interpreted as described in the [rationale for the C99 standard](http://www.open-std.org/jtc1/sc22/wg14/www/C99RationaleV5.10.pdf#page=18).
+
+An implementation is not compliant if it fails to satisfy one or more of the MUST, REQUIRED, or SHALL requirements for the protocols it implements.
+An implementation is compliant if it satisfies all the MUST, REQUIRED, and SHALL requirements for the protocols it implements.
+
+## Terminology
+
+| Term | Definition |
+|-----------------|--------------------------------------------------|
+| CR | Custom Resource (CR) is defined by a cluster admin using the Kubernetes Custom Resource Definition primitive. |
+| VM | A Virtual Machine (VM) provisioned and managed by a provider. It could also refer to a physical machine in case of a bare metal provider. |
+| Machine | Machine refers to a VM that is provisioned/managed by MCM. It typically describes the metadata used to store/represent a Virtual Machine. |
+| Node | Native kubernetes `Node` object. The objects you get to see when you do a "kubectl get nodes". Although nodes can be either physical/virtual machines, for the purposes of our discussions it refers to a VM. |
+| MCM | [Machine Controller Manager (MCM)](https://github.com/gardener/machine-controller-manager) is the controller used to manage higher-level Machine Custom Resources (CRs) such as the machine-set and machine-deployment CRs. |
+| Provider/Driver/MC | `Provider` (or) `Driver` (or) `Machine Controller (MC)` is the driver responsible for managing the machine objects present in the cluster and the VMs backing them. A simple example could be the creation/deletion of a VM on the provider. |
+
+## Pre-requisite
+
+### MachineClass Resources
+
+MCM introduces the CRD `MachineClass`. This is a blueprint for creating machines that join a certain cluster as nodes in a certain role. The provider only works with `MachineClass` resources that have the structure described here.
+
+#### ProviderSpec
+
+The `MachineClass` resource contains a `providerSpec` field that is passed in the `ProviderSpec` request field to CMI methods such as [CreateMachine](#createmachine). The `ProviderSpec` can be thought of as a machine template from which the VM specification must be adopted. It can contain key-value pairs of these specs. An example of such key-value pairs is given below.
+
+| Parameter | Mandatory | Type | Description |
+|---|---|---|---|
+| `vmPool` | Yes | `string` | VM pool name, e.g. `TEST-WORKER-POOL` |
+| `size` | Yes | `string` | VM size, e.g. `xsmall`, `small`, etc. Each size maps to a number of CPUs and a memory size. |
+| `rootFsSize` | No | `int` | Root (`/`) filesystem size in GB |
+| `tags` | Yes | `map` | Tags to be put on the created VM |
+
+Most of the `ProviderSpec` fields are not mandatory. If not specified, the provider passes an empty value in the respective `Create VM` parameter.
+
+The `tags` can be used to map a VM to its corresponding machine object's Name.
+
+The `ProviderSpec` is validated by the methods that receive it as a request field, for the presence of all mandatory parameters and tags, and for the validity of all parameters.
+
+#### Secrets
+
+The `MachineClass` resource also contains a `secretRef` field that contains a reference to a secret. The keys of this secret are passed in the `Secrets` request field to CMI methods; a sketch of reading them follows below.
+
+The secret can contain sensitive data such as
+- `cloud-credentials` secret data used to authenticate at the provider
+- `cloud-init` scripts used to initialize a new VM. The cloud-init script is expected to contain scripts to initialize the Kubelet and make it join the cluster.
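+
+For illustration, a provider method might extract these keys from the supplied `corev1.Secret` roughly as sketched below. The key names (`providerAccessKey`, `userData`) are hypothetical; use whatever keys your MachineClass's secret actually carries.
+
+```go
+package provider
+
+import (
+	"fmt"
+
+	corev1 "k8s.io/api/core/v1"
+)
+
+// readSecret is a hypothetical helper that extracts the provider
+// credentials and the cloud-init script from the referenced secret.
+func readSecret(secret *corev1.Secret) (credentials, userData string, err error) {
+	cred, ok := secret.Data["providerAccessKey"]
+	if !ok {
+		return "", "", fmt.Errorf("secret is missing the providerAccessKey key")
+	}
+	ud, ok := secret.Data["userData"]
+	if !ok {
+		return "", "", fmt.Errorf("secret is missing the userData (cloud-init) key")
+	}
+	return string(cred), string(ud), nil
+}
+```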
+#### Identifying Cluster Machines
+
+To implement certain methods, the provider should be able to identify all machines associated with a particular Kubernetes cluster. This can be achieved using one or more of the ways mentioned below:
+
+* Names of VMs created by the provider are prefixed by the cluster ID specified in the ProviderSpec.
+* VMs created by the provider are tagged with special tags like `kubernetes.io/cluster` (for the cluster ID) and `kubernetes.io/role` (for the role), specified in the ProviderSpec.
+* Mapping `Resource Groups` to individual clusters.
+
+### Error Scheme
+
+All provider API calls defined in this spec MUST return a [machine error status](/pkg/util/provider/machinecodes/codes/codes.go), which is very similar to the [standard gRPC status](https://github.com/grpc/grpc/blob/master/src/proto/grpc/status/status.proto).
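+
+As a sketch, and assuming the `machinecodes` packages keep the layout linked above, a provider would construct such an error status as follows:
+
+```go
+package provider
+
+import (
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
+)
+
+// errVMNotFound shows how a provider reports a machine error status.
+// The MCM decodes the code (here codes.NotFound) to pick the recovery
+// behavior described in the tables below.
+func errVMNotFound(machineName string) error {
+	return status.Error(codes.NotFound, "no VM found backing machine object "+machineName)
+}
+```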
+
+### Machine Provider Interface
+
+- The provider MUST have a unique way to map a `machine object` to a `VM`, which triggers the deletion of the corresponding VM backing the machine object.
+- The provider SHOULD have a unique way to map the `ProviderSpec` of a machine class to a unique `Cluster`. This avoids the deletion of other machines not backed by the MCM.
+
+#### `CreateMachine`
+
+A Provider is REQUIRED to implement this interface method.
+This interface method will be called by the MCM to provision a new VM on behalf of the requesting machine object.
+
+- This call requests the provider to create a VM backing the machine object.
+- If a VM backing the `Machine.Name` already exists, and is compatible with the specified `Machine` object in the `CreateMachineRequest`, the Provider MUST reply `0 OK` with the corresponding `CreateMachineResponse`.
+- The provider can OPTIONALLY make use of the `MachineClass` supplied in the `CreateMachineRequest` to communicate with the provider.
+- The provider can OPTIONALLY make use of the secrets supplied in the `Secret` in the `CreateMachineRequest` to communicate with the provider.
+- The provider can OPTIONALLY make use of the `Status.LastKnownState` in the `Machine` object to decode the state of the VM operation based on the last known state of the VM. This can be useful to restart/continue operations that are meant to be atomic.
+- The provider MUST have a unique way to map a `machine object` to a `VM`. This could be achieved implicitly by letting you set VM names, or explicitly by using appropriate tags to map the same.
+- This operation SHOULD be idempotent.
+
+- The `CreateMachineResponse` returned by this method is expected to return
+    - `ProviderID` that uniquely identifies the VM at the provider. This is expected to match the `node.Spec.ProviderID` on the node object.
+    - `NodeName` that is the expected name of the machine when it joins the cluster. It must match the node name.
+    - `LastKnownState` is an OPTIONAL field that can store details of the last known state of the VM. It can be used by future operation calls to determine the current infrastructure state. This state is saved on the machine object.
+
+```go
+// CreateMachine call is responsible for VM creation on the provider
+CreateMachine(context.Context, *CreateMachineRequest) (*CreateMachineResponse, error)
+
+// CreateMachineRequest is the create request for VM creation
+type CreateMachineRequest struct {
+	// Machine object for which a VM is to be created
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// CreateMachineResponse is the create response for VM creation
+type CreateMachineResponse struct {
+	// ProviderID is the unique identification of the VM at the cloud provider.
+	// ProviderID typically matches with the node.Spec.ProviderID on the node object.
+	// Eg: gce://project-name/region/vm-ID
+	ProviderID string
+
+	// NodeName is the name of the node-object registered to kubernetes.
+	NodeName string
+
+	// LastKnownState represents the last state of the VM during a creation/deletion error
+	LastKnownState string
+}
+```
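+
+Putting these together, a minimal `CreateMachine` sketch on a hypothetical `Provider` type could look as follows. `decodeProviderSpec` and `createVM` are hypothetical placeholders for your provider logic, and the request/response types are assumed to come from the MCM's `driver` package.
+
+```go
+package provider
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
+)
+
+// Provider implements the driver interface for a hypothetical cloud.
+type Provider struct{}
+
+// CreateMachine provisions a VM backing the requesting machine object.
+func (p *Provider) CreateMachine(ctx context.Context, req *driver.CreateMachineRequest) (*driver.CreateMachineResponse, error) {
+	// Decode and validate the raw ProviderSpec blob from the MachineClass.
+	// decodeProviderSpec is a hypothetical helper.
+	spec, err := decodeProviderSpec(req.MachineClass.ProviderSpec.Raw)
+	if err != nil {
+		// Invalid ProviderSpec: the MCM will not retry (see table below).
+		return nil, status.Error(codes.InvalidArgument, err.Error())
+	}
+
+	// Call the provider API (createVM is a hypothetical helper); tag the VM
+	// with the machine name so machine object and VM map uniquely.
+	providerID, err := createVM(ctx, spec, req.Machine.Name, req.Secret)
+	if err != nil {
+		// Unknown failures are retried by the MCM after some time.
+		return nil, status.Error(codes.Unknown, fmt.Sprintf("VM creation failed: %v", err))
+	}
+
+	return &driver.CreateMachineResponse{
+		ProviderID: providerID,
+		NodeName:   req.Machine.Name,
+	}, nil
+}
+```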
+
+##### CreateMachine Errors
+
+If the provider is unable to complete the CreateMachine call successfully, it MUST return a non-OK machine code in the machine status.
+If the conditions defined below are encountered, the provider MUST return the specified machine error code.
+The MCM MUST implement the specified error recovery behavior when it encounters the machine error code.
+
+| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required |
+|-----------|-----------|-------------|-------------------|------------|
+| 0 OK | Successful | The call was successful in creating/adopting a VM that matches the supplied creation request. The `CreateMachineResponse` is returned with desired values | | N |
+| 1 CANCELED | Cancelled | Call was cancelled. Perform any pending clean-up tasks and return the call | | N |
+| 2 UNKNOWN | Something went wrong | Not enough information on what went wrong | Retry operation after sometime | Y |
+| 3 INVALID_ARGUMENT | Re-check supplied parameters | Re-check the supplied `Machine.Name` and `ProviderSpec`. Make sure all parameters are in the permitted range of values. Exact issue to be given in `.message` | Update providerSpec to fix issues. | N |
+| 4 DEADLINE_EXCEEDED | Timeout | The call processing exceeded the supplied deadline | Retry operation after sometime | Y |
+| 5 NOT_FOUND | | | | |
+| 6 ALREADY_EXISTS | Already exists but desired parameters don't match | Parameters of the existing VM don't match the ProviderSpec | Create machine with a different name | N |
+| 7 PERMISSION_DENIED | Insufficient permissions | The requestor doesn't have enough permissions to create a VM and its required dependencies | Update requestor permissions to grant the same | N |
+| 8 RESOURCE_EXHAUSTED | Resource limits have been reached | The requestor doesn't have enough resource limits to process this creation request | Enhance resource limits associated with the user/account to process this | N |
+| 9 PRECONDITION_FAILED | VM is in inconsistent state | The VM is in a state that is invalid for this operation | Manual intervention might be needed to fix the state of the VM | N |
+| 10 ABORTED | Operation is pending | Indicates that there is already an operation pending for the specified machine | Wait until previous pending operation is processed | Y |
+| 11 OUT_OF_RANGE | Resources were out of range | The requested number of CPUs, memory size, or FS size in ProviderSpec falls outside of the corresponding valid range | Update request parameters to request valid resource requests | N |
+| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Retry with an alternate logic or implement this method at the provider. Most methods by default are in this state | N |
+| 13 INTERNAL | Major error | Means some invariants expected by the underlying system have been broken. If you see one of these errors, something is very broken. | Needs manual intervention to fix this | N |
+| 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y |
+| 15 DATALOSS | | | | |
+| 16 UNAUTHENTICATED | Missing provider credentials | Request does not have valid authentication credentials for the operation | Fix the provider credentials | N |
+
+The status `message` MUST contain a human readable description of the error, if the status `code` is not `OK`.
+This string MAY be surfaced by MCM to end users.
+
+#### `DeleteMachine`
+
+A Provider is REQUIRED to implement this driver call.
+This driver call will be called by the MCM to deprovision/delete/terminate the VM backing the requesting machine object.
+
+- If a VM corresponding to the specified machine-object's name does not exist, or the artifacts associated with the VM do not exist anymore (after deletion), the Provider MUST reply `0 OK`.
+- The provider SHALL only act on machines belonging to the cluster-id/cluster-name obtained from the `ProviderSpec`.
+- The provider can OPTIONALLY make use of the secrets supplied in the `Secrets` map in the `DeleteMachineRequest` to communicate with the provider.
+- The provider can OPTIONALLY make use of the `Spec.ProviderID` in the `Machine` object.
+- The provider can OPTIONALLY make use of the `Status.LastKnownState` in the `Machine` object to decode the state of the VM operation based on the last known state of the VM. This can be useful to restart/continue operations that are meant to be atomic.
+- This operation SHOULD be idempotent; a sketch follows the type definitions below.
+- The provider MUST have a unique way to map a `machine object` to a `VM`, which triggers the deletion of the corresponding VM backing the machine object.
+
+- The `DeleteMachineResponse` returned by this method is expected to return
+    - `LastKnownState` is an OPTIONAL field that can store details of the last known state of the VM. It can be used by future operation calls to determine the current infrastructure state. This state is saved on the machine object.
+
+```go
+// DeleteMachine call is responsible for VM deletion/termination on the provider
+DeleteMachine(context.Context, *DeleteMachineRequest) (*DeleteMachineResponse, error)
+
+// DeleteMachineRequest is the delete request for VM deletion
+type DeleteMachineRequest struct {
+	// Machine object for which the VM is to be deleted
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// DeleteMachineResponse is the delete response for VM deletion
+type DeleteMachineResponse struct {
+	// LastKnownState represents the last state of the VM during a creation/deletion error
+	LastKnownState string
+}
+```
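+
+As the promised sketch of the idempotency requirement, deletion can first look the VM up and treat "not found" as success. `findVM` and `deleteVM` are hypothetical helpers, and the snippet continues the `Provider` sketch (and imports) from the `CreateMachine` example above.
+
+```go
+// DeleteMachine terminates the VM backing the machine object.
+func (p *Provider) DeleteMachine(ctx context.Context, req *driver.DeleteMachineRequest) (*driver.DeleteMachineResponse, error) {
+	vm, err := findVM(ctx, req.Machine.Name, req.Secret) // hypothetical helper
+	if err != nil {
+		return nil, status.Error(codes.Unknown, err.Error())
+	}
+	if vm == nil {
+		// The VM is already gone: reply 0 OK to keep the call idempotent.
+		return &driver.DeleteMachineResponse{}, nil
+	}
+	if err := deleteVM(ctx, vm); err != nil { // hypothetical helper
+		return nil, status.Error(codes.Unknown, err.Error())
+	}
+	return &driver.DeleteMachineResponse{}, nil
+}
+```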
+
+##### DeleteMachine Errors
+
+If the provider is unable to complete the DeleteMachine call successfully, it MUST return a non-OK machine code in the machine status.
+If the conditions defined below are encountered, the provider MUST return the specified machine error code.
+
+| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required |
+|-----------|-----------|-------------|-------------------|------------|
+| 0 OK | Successful | The call was successful in deleting a VM that matches the supplied deletion request. | | N |
+| 1 CANCELED | Cancelled | Call was cancelled. Perform any pending clean-up tasks and return the call | | N |
+| 2 UNKNOWN | Something went wrong | Not enough information on what went wrong | Retry operation after sometime | Y |
+| 3 INVALID_ARGUMENT | Re-check supplied parameters | Re-check the supplied `Machine.Name` and make sure that it is in the desired format and not a blank value. Exact issue to be given in `.message` | Update `Machine.Name` to fix issues. | N |
+| 4 DEADLINE_EXCEEDED | Timeout | The call processing exceeded the supplied deadline | Retry operation after sometime | Y |
+| 5 NOT_FOUND | | | | |
+| 6 ALREADY_EXISTS | | | | |
+| 7 PERMISSION_DENIED | Insufficient permissions | The requestor doesn't have enough permissions to delete a VM and its required dependencies | Update requestor permissions to grant the same | N |
+| 8 RESOURCE_EXHAUSTED | | | | |
+| 9 PRECONDITION_FAILED | VM is in inconsistent state | The VM is in a state that is invalid for this operation | Manual intervention might be needed to fix the state of the VM | N |
+| 10 ABORTED | Operation is pending | Indicates that there is already an operation pending for the specified machine | Wait until previous pending operation is processed | Y |
+| 11 OUT_OF_RANGE | | | | |
+| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Retry with an alternate logic or implement this method at the provider. Most methods by default are in this state | N |
+| 13 INTERNAL | Major error | Means some invariants expected by the underlying system have been broken. If you see one of these errors, something is very broken. | Needs manual intervention to fix this | N |
+| 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y |
+| 15 DATALOSS | | | | |
+| 16 UNAUTHENTICATED | Missing provider credentials | Request does not have valid authentication credentials for the operation | Fix the provider credentials | N |
+
+The status `message` MUST contain a human readable description of the error, if the status `code` is not `OK`.
+This string MAY be surfaced by MCM to end users.
+
+#### `GetMachineStatus`
+
+A Provider can OPTIONALLY implement this driver call. Otherwise it SHOULD return an `UNIMPLEMENTED` status in error.
+This call will be invoked by the MCM to get the status of a machine.
+This optional driver call helps in optimizing the working of the provider by avoiding unwanted calls to `CreateMachine()` and `DeleteMachine()`.
+
+- If a VM corresponding to the specified machine object's `Machine.Name` exists on the provider, the `GetMachineStatusResponse` fields are to be filled in similarly to the `CreateMachineResponse`.
+- The provider SHALL only act on machines belonging to the cluster-id/cluster-name obtained from the `ProviderSpec`.
+- The provider can OPTIONALLY make use of the secrets supplied in the `Secrets` map in the `GetMachineStatusRequest` to communicate with the provider.
+- The provider can OPTIONALLY make use of the VM unique ID (returned by the provider on machine creation) passed in the `ProviderID` field of the `GetMachineStatusRequest`.
+- This operation MUST be idempotent.
+
+```go
+// GetMachineStatus call gets the status of the VM backing the machine object on the provider
+GetMachineStatus(context.Context, *GetMachineStatusRequest) (*GetMachineStatusResponse, error)
+
+// GetMachineStatusRequest is the get request for VM info
+type GetMachineStatusRequest struct {
+	// Machine object for which the VM status is to be fetched
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// GetMachineStatusResponse is the get response for VM info
+type GetMachineStatusResponse struct {
+	// ProviderID is the unique identification of the VM at the cloud provider.
+	// ProviderID typically matches with the node.Spec.ProviderID on the node object.
+	// Eg: gce://project-name/region/vm-ID
+	ProviderID string
+
+	// NodeName is the name of the node-object registered to kubernetes.
+	NodeName string
+}
+```
+
+##### GetMachineStatus Errors
+
+If the provider is unable to complete the GetMachineStatus call successfully, it MUST return a non-OK machine code in the machine status.
+If the conditions defined below are encountered, the provider MUST return the specified machine error code.
+
+| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required |
+|-----------|-----------|-------------|-------------------|------------|
+| 0 OK | Successful | The call was successful in getting machine details for the given machine `Machine.Name` | | N |
+| 1 CANCELED | Cancelled | Call was cancelled. Perform any pending clean-up tasks and return the call | | N |
+| 2 UNKNOWN | Something went wrong | Not enough information on what went wrong | Retry operation after sometime | Y |
+| 3 INVALID_ARGUMENT | Re-check supplied parameters | Re-check the supplied `Machine.Name` and make sure that it is in the desired format and not a blank value. Exact issue to be given in `.message` | Update `Machine.Name` to fix issues. | N |
+| 4 DEADLINE_EXCEEDED | Timeout | The call processing exceeded the supplied deadline | Retry operation after sometime | Y |
+| 5 NOT_FOUND | Machine isn't found at provider | The machine could not be found at the provider | Not required | N |
+| 6 ALREADY_EXISTS | | | | |
+| 7 PERMISSION_DENIED | Insufficient permissions | The requestor doesn't have enough permissions to get details for the VM and its required dependencies | Update requestor permissions to grant the same | N |
+| 8 RESOURCE_EXHAUSTED | | | | |
+| 9 PRECONDITION_FAILED | VM is in inconsistent state | The VM is in a state that is invalid for this operation | Manual intervention might be needed to fix the state of the VM | N |
+| 10 ABORTED | | | | |
+| 11 OUT_OF_RANGE | Multiple VMs found | Multiple VMs found with matching machine object names | Orphan VM handler to clean up orphan VMs / Manual intervention may be required if the orphan VM handler isn't enabled. | Y |
+| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Retry with an alternate logic or implement this method at the provider. Most methods by default are in this state | N |
+| 13 INTERNAL | Major error | Means some invariants expected by the underlying system have been broken. If you see one of these errors, something is very broken. | Needs manual intervention to fix this | N |
+| 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y |
+| 15 DATALOSS | | | | |
+| 16 UNAUTHENTICATED | Missing provider credentials | Request does not have valid authentication credentials for the operation | Fix the provider credentials | N |
+
+The status `message` MUST contain a human readable description of the error, if the status `code` is not `OK`.
+This string MAY be surfaced by MCM to end users.
+
+#### `ListMachines`
+
+A Provider can OPTIONALLY implement this driver call. Otherwise it SHOULD return an `UNIMPLEMENTED` status in error.
+The Provider SHALL return the information about all the machines associated with the `MachineClass`.
+Make sure to use appropriate filters to achieve the same, so as to avoid data transfer overheads.
+This optional driver call helps in cleaning up orphan VMs present in the cluster. If not implemented, any orphan VM that might have been created incorrectly by the MCM/Provider (due to bugs in code/infra) might require manual clean up.
+
+- If the Provider succeeded in returning a list of `Machine.Name`s with their corresponding `ProviderID`s, then return `0 OK`.
+- The `ListMachineResponse` contains a map of `MachineList` whose
+    - Key is expected to contain the `ProviderID` and
+    - Value is expected to contain the `Machine.Name` corresponding to its kubernetes machine CR object
+- The provider can OPTIONALLY make use of the secrets supplied in the `Secrets` map in the `ListMachinesRequest` to communicate with the provider.
+
+```go
+// ListMachines lists all the machines that might have been created by the supplied machineClass
+ListMachines(context.Context, *ListMachinesRequest) (*ListMachinesResponse, error)
+
+// ListMachinesRequest is the request object to get a list of VMs belonging to a machineClass
+type ListMachinesRequest struct {
+	// MachineClass object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// ListMachinesResponse is the response object of the list of VMs belonging to a machineClass
+type ListMachinesResponse struct {
+	// MachineList is the map of the list of machines. The format of the map
+	// should be map[ProviderID]MachineName.
+	MachineList map[string]string
+}
+```
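+
+A sketch of building this response, again on the hypothetical `Provider` type from the earlier examples; `listVMsByTag` is a hypothetical helper that filters VMs on the provider by the cluster/role tags described earlier.
+
+```go
+// ListMachines returns all VMs created from the supplied MachineClass.
+func (p *Provider) ListMachines(ctx context.Context, req *driver.ListMachinesRequest) (*driver.ListMachinesResponse, error) {
+	vms, err := listVMsByTag(ctx, req.MachineClass, req.Secret) // hypothetical helper
+	if err != nil {
+		return nil, status.Error(codes.Unknown, err.Error())
+	}
+
+	machineList := make(map[string]string, len(vms))
+	for _, vm := range vms {
+		// Key: ProviderID of the VM; value: name of the machine object.
+		machineList[vm.ProviderID] = vm.MachineName
+	}
+	return &driver.ListMachinesResponse{MachineList: machineList}, nil
+}
+```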
+
+##### ListMachines Errors
+
+If the provider is unable to complete the ListMachines call successfully, it MUST return a non-OK machine code in the machine status.
+If the conditions defined below are encountered, the provider MUST return the specified machine error code.
+The MCM MUST implement the specified error recovery behavior when it encounters the machine error code.
+
+| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required |
+|-----------|-----------|-------------|-------------------|------------|
+| 0 OK | Successful | The call for listing all VMs associated with the `ProviderSpec` was successful. | | N |
+| 1 CANCELED | Cancelled | Call was cancelled. Perform any pending clean-up tasks and return the call | | N |
+| 2 UNKNOWN | Something went wrong | Not enough information on what went wrong | Retry operation after sometime | Y |
+| 3 INVALID_ARGUMENT | Re-check supplied parameters | Re-check the supplied `ProviderSpec` and make sure that all required fields are present in their desired value format. Exact issue to be given in `.message` | Update `ProviderSpec` to fix issues. | N |
+| 4 DEADLINE_EXCEEDED | Timeout | The call processing exceeded the supplied deadline | Retry operation after sometime | Y |
+| 5 NOT_FOUND | | | | |
+| 6 ALREADY_EXISTS | | | | |
+| 7 PERMISSION_DENIED | Insufficient permissions | The requestor doesn't have enough permissions to list VMs and their required dependencies | Update requestor permissions to grant the same | N |
+| 8 RESOURCE_EXHAUSTED | | | | |
+| 9 PRECONDITION_FAILED | | | | |
+| 10 ABORTED | | | | |
+| 11 OUT_OF_RANGE | | | | |
+| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Retry with an alternate logic or implement this method at the provider. Most methods by default are in this state | N |
+| 13 INTERNAL | Major error | Means some invariants expected by the underlying system have been broken. If you see one of these errors, something is very broken. | Needs manual intervention to fix this | N |
+| 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y |
+| 15 DATALOSS | | | | |
+| 16 UNAUTHENTICATED | Missing provider credentials | Request does not have valid authentication credentials for the operation | Fix the provider credentials | N |
+
+The status `message` MUST contain a human readable description of the error, if the status `code` is not `OK`.
+This string MAY be surfaced by MCM to end users.
+
+#### `GetVolumeIDs`
+
+A Provider can OPTIONALLY implement this driver call. Otherwise it SHOULD return an `UNIMPLEMENTED` status in error.
+This driver call will be called by the MCM to get the `VolumeIDs` for the list of `PersistentVolumes (PVs)` supplied.
+This OPTIONAL (but recommended) driver call helps in the serialized eviction of pods with PVs while draining machines. This implies that applications backed by PVs would be evicted one by one, leading to shorter application downtimes.
+
+- On successfully returning a list of `Volume-IDs` for all supplied `PVSpecs`, the Provider MUST reply `0 OK`.
+- The `GetVolumeIDsResponse` is expected to return a repeated list of `strings` consisting of the `VolumeIDs` for each `PVSpec` that could be extracted.
+- If for any `PV` the Provider wasn't able to identify the `Volume-ID`, the provider MAY choose to ignore it and return the `Volume-IDs` for the rest of the `PVs` for which the `Volume-ID` was found.
+- Getting the `VolumeID` from the `PVSpec` depends on the cloud provider. You can extract this information by parsing the `PVSpec` based on the `ProviderType`; a sketch is given at the end of this section.
+    - https://github.com/kubernetes/api/blob/release-1.15/core/v1/types.go#L297-L339
+    - https://github.com/kubernetes/api/blob/release-1.15/core/v1/types.go#L175-L257
+- This operation MUST be idempotent.
+
+```go
+// GetVolumeIDsRequest is the request object to get a list of VolumeIDs for a PVSpec
+type GetVolumeIDsRequest struct {
+	// PVSpecsList is a list of PV specs for which volume-IDs are required.
+	// Plugin should parse this raw data into a pre-defined list of PVSpecs.
+	PVSpecs []*corev1.PersistentVolumeSpec
+}
+
+// GetVolumeIDsResponse is the response object of the list of VolumeIDs for a PVSpec
+type GetVolumeIDsResponse struct {
+	// VolumeIDs is a list of VolumeIDs.
+	VolumeIDs []string
+}
+```
+
+##### GetVolumeIDs Errors
+
+| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required |
+|-----------|-----------|-------------|-------------------|------------|
+| 0 OK | Successful | The call getting the list of `VolumeIDs` for the list of `PersistentVolumes` was successful. | | N |
+| 1 CANCELED | Cancelled | Call was cancelled. Perform any pending clean-up tasks and return the call | | N |
+| 2 UNKNOWN | Something went wrong | Not enough information on what went wrong | Retry operation after sometime | Y |
+| 3 INVALID_ARGUMENT | Re-check supplied parameters | Re-check the supplied `PVSpecList` and make sure that it is in the desired format. Exact issue to be given in `.message` | Update `PVSpecList` to fix issues. | N |
+| 4 DEADLINE_EXCEEDED | Timeout | The call processing exceeded the supplied deadline | Retry operation after sometime | Y |
+| 5 NOT_FOUND | | | | |
+| 6 ALREADY_EXISTS | | | | |
+| 7 PERMISSION_DENIED | | | | |
+| 8 RESOURCE_EXHAUSTED | | | | |
+| 9 PRECONDITION_FAILED | | | | |
+| 10 ABORTED | | | | |
+| 11 OUT_OF_RANGE | | | | |
+| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Retry with an alternate logic or implement this method at the provider. Most methods by default are in this state | N |
+| 13 INTERNAL | Major error | Means some invariants expected by the underlying system have been broken. If you see one of these errors, something is very broken. | Needs manual intervention to fix this | N |
+| 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y |
+| 15 DATALOSS | | | | |
+| 16 UNAUTHENTICATED | | | | |
+
+The status `message` MUST contain a human readable description of the error, if the status `code` is not `OK`.
+This string MAY be surfaced by MCM to end users.
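+
+As the promised sketch, extracting a volume ID from a `PersistentVolumeSpec` switches on the in-tree volume sources. Only two sources are shown here; the others linked above are analogous.
+
+```go
+package provider
+
+import (
+	corev1 "k8s.io/api/core/v1"
+)
+
+// getVolumeID extracts the provider volume ID from a PV spec. It returns
+// false when the spec carries a volume source this provider ignores, in
+// which case the PV is skipped rather than failing the whole call.
+func getVolumeID(spec *corev1.PersistentVolumeSpec) (string, bool) {
+	switch {
+	case spec.AWSElasticBlockStore != nil:
+		return spec.AWSElasticBlockStore.VolumeID, true
+	case spec.GCEPersistentDisk != nil:
+		return spec.GCEPersistentDisk.PDName, true
+	default:
+		return "", false
+	}
+}
+```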
+
+## Configuration and Operation
+
+### Supervised Lifecycle Management
+
+* For Providers packaged in software form:
+    * Provider Packages SHOULD use a well-documented container image format (e.g., Docker, OCI).
+    * The chosen package image format MAY expose configurable Provider properties as environment variables, unless otherwise indicated in the section below. Variables so exposed SHOULD be assigned default values in the image manifest.
+    * A Provider Supervisor MAY programmatically evaluate or otherwise scan a Provider Package's image manifest in order to discover configurable environment variables.
+    * A Provider SHALL NOT assume that an operator or Provider Supervisor will scan an image manifest for environment variables.
+
+#### Environment Variables
+
+* Variables defined by this specification SHALL be identifiable by their `MC_` name prefix.
+* Configuration properties not defined by the MC specification SHALL NOT use the same `MC_` name prefix; this prefix is reserved for common configuration properties defined by the MC specification.
+* The Provider Supervisor SHOULD supply all RECOMMENDED MC environment variables to a Provider.
+* The Provider Supervisor SHALL supply all REQUIRED MC environment variables to a Provider.
+
+##### Logging
+
+* Providers SHOULD generate log messages ONLY to standard output and/or standard error.
+    * In this case the Provider Supervisor SHALL assume responsibility for all log lifecycle management.
+* Provider implementations that deviate from the above recommendation SHALL clearly and unambiguously document the following:
+    * Logging configuration flags and/or variables, including working sample configurations.
+    * Default log destination(s) (where do the logs go if no configuration is specified?)
+    * Log lifecycle management ownership and related guidance (size limits, rate limits, rolling, archiving, expunging, etc.) applicable to the logging mechanism embedded within the Provider.
+* Providers SHOULD NOT write potentially sensitive data to logs (e.g. secrets).
+
+##### Available Services
+
+* Provider Packages MAY support all or a subset of CMI services; service combinations MAY be configurable at runtime by the Provider Supervisor.
+    * This specification does not dictate the mechanism by which the mode of operation MUST be discovered, and instead places that burden upon the VM Provider.
+* Misconfigured provider software SHOULD fail-fast with an OS-appropriate error code.
+
+##### Linux Capabilities
+
+* Providers SHOULD clearly document any additionally required capabilities and/or security context.
+
+##### Cgroup Isolation
+
+* A Provider MAY be constrained by cgroups.
+
+##### Resource Requirements
+
+* VM Providers SHOULD unambiguously document all of a Provider's resource requirements.
+
+### Deploying
+
+* **Recommended:** The MCM and the Provider are typically expected to run as two containers inside a common `Pod`.
+* However, for security reasons they could run in separate Pods, provided they have a secure way to exchange data between them.
\ No newline at end of file diff --git a/kubernetes/crds.yaml b/kubernetes/crds.yaml index 3f77a532f..3f01229c2 100644 --- a/kubernetes/crds.yaml +++ b/kubernetes/crds.yaml @@ -1,315 +1,471 @@ # The set of CRDs used by machine-controller-manager -apiVersion: apiextensions.k8s.io/v1beta1 +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: machineclasses.machine.sapcloud.io +spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: MachineClass + listKind: MachineClassList + plural: machineclasses + shortNames: + - machcls + singular: machineclass + scope: Namespaced + versions: + - name: v1alpha1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + x-kubernetes-preserve-unknown-fields: true + subresources: + status: {} + +--- + +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: openstackmachineclasses.machine.sapcloud.io spec: + conversion: + strategy: None group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced names: kind: OpenStackMachineClass + listKind: OpenStackMachineClassList plural: openstackmachineclasses - singular: openstackmachineclass shortNames: - oscls - subresources: - status: {} - additionalPrinterColumns: - - name: Flavor - type: string - JSONPath: .spec.flavorName - - name: Image - type: string - JSONPath: .spec.imageName - - name: Region - type: string - priority: 1 - JSONPath: .spec.region - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + singular: openstackmachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.flavorName + name: Flavor + type: string + - jsonPath: .spec.imageName + name: Image + type: string + - jsonPath: .spec.region + name: Region + priority: 1 + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: awsmachineclasses.machine.sapcloud.io spec: + conversion: + strategy: None group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced names: kind: AWSMachineClass + listKind: AWSMachineClassList plural: awsmachineclasses - singular: awsmachineclass shortNames: - awscls - subresources: - status: {} - additionalPrinterColumns: - - name: Machine Type - type: string - JSONPath: .spec.machineType - - name: AMI - type: string - JSONPath: .spec.ami - - name: Region - type: string - priority: 1 - JSONPath: .spec.region - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + singular: awsmachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.machineType + name: Machine Type + type: string + - jsonPath: .spec.ami + name: AMI + type: string + - jsonPath: .spec.region + name: Region + priority: 1 + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: azuremachineclasses.machine.sapcloud.io spec: + conversion: + strategy: None group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced names: kind: AzureMachineClass + listKind: AzureMachineClassList plural: azuremachineclasses - singular: azuremachineclass shortNames: - azurecls - subresources: - status: {} - additionalPrinterColumns: - - name: VM size - type: string - JSONPath: .spec.properties.hardwareProfile.vmSize - - name: Location - type: string - priority: 1 - JSONPath: .spec.location - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + singular: azuremachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.properties.hardwareProfile.vmSize + name: VM Size + type: string + - jsonPath: .spec.location + name: Location + priority: 1 + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: gcpmachineclasses.machine.sapcloud.io spec: + conversion: + strategy: None group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced names: kind: GCPMachineClass + listKind: GCPMachineClassList plural: gcpmachineclasses - singular: gcpmachineclass shortNames: - gcpcls - subresources: - status: {} - additionalPrinterColumns: - - name: Machine Type - type: string - JSONPath: .spec.machineType - - name: Region - type: string - priority: 1 - JSONPath: .spec.region - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + singular: gcpmachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.machineType + name: Machine Type + type: string + - jsonPath: .spec.region + name: Region + priority: 1 + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} + --- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: alicloudmachineclasses.machine.sapcloud.io -spec: - group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced - names: - kind: AlicloudMachineClass - plural: alicloudmachineclasses - singular: alicloudmachineclass - shortNames: - - alicloudcls - subresources: - status: {} - additionalPrinterColumns: - - name: Instance Type - type: string - JSONPath: .spec.instanceType - - name: Region - type: string - priority: 1 - JSONPath: .spec.region - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + metadata: + name: alicloudmachineclasses.machine.sapcloud.io + spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: AlicloudMachineClass + listKind: AlicloudMachineClassList + plural: alicloudmachineclasses + shortNames: + - alicloudcls + singular: alicloudmachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.instanceType + name: Instance Type + type: string + - jsonPath: .spec.region + name: Region + priority: 1 + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: packetmachineclasses.machine.sapcloud.io -spec: - group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced - names: - kind: PacketMachineClass - plural: packetmachineclasses - singular: packetmachineclass - shortNames: - - packetcls - subresources: - status: {} + apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + metadata: + name: packetmachineclasses.machine.sapcloud.io + spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: PacketMachineClass + listKind: PacketMachineClassList + plural: packetmachineclasses + shortNames: + - packetcls + singular: packetmachineclass + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. 
It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: machines.machine.sapcloud.io -spec: - group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced - names: - kind: Machine - plural: machines - singular: machine - shortNames: - - mach - subresources: - status: {} - additionalPrinterColumns: - - name: Status - type: string - description: Current status of the machine. - JSONPath: .status.currentStatus.phase - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + metadata: + name: machines.machine.sapcloud.io + spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: Machine + listKind: MachineList + plural: machines + shortNames: + - mach + singular: machine + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Current status of the machine. + jsonPath: .status.currentStatus.phase + name: Status + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} + - additionalPrinterColumns: + - description: Current status of the machine. + jsonPath: .status.currentStatus.phase + name: Status + type: string + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha2 + served: true + storage: false + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: machinesets.machine.sapcloud.io -spec: - group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced - names: - kind: MachineSet - plural: machinesets - singular: machineset - shortNames: - - machset - subresources: - status: {} - additionalPrinterColumns: - - name: Desired - type: integer - description: Number of desired replicas. 
- JSONPath: .spec.replicas - - name: Current - type: integer - description: Number of actual replicas. - JSONPath: .status.replicas - - name: Ready - type: integer - description: Number of ready replicas for this machine set. - JSONPath: .status.readyReplicas - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + metadata: + name: machinesets.machine.sapcloud.io + spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: MachineSet + listKind: MachineSetList + plural: machinesets + shortNames: + - machset + singular: machineset + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Number of desired replicas. + jsonPath: .spec.replicas + name: Desired + type: integer + - description: Number of actual replicas. + jsonPath: .status.replicas + name: Current + type: integer + - description: Number of ready replicas for this machine set. + jsonPath: .status.readyReplicas + name: Ready + type: integer + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} + - additionalPrinterColumns: + - description: Number of desired replicas. + jsonPath: .spec.replicas + name: Desired + type: integer + - description: Number of actual replicas. + jsonPath: .status.replicas + name: Current + type: integer + - description: Number of ready replicas for this machine set. + jsonPath: .status.readyReplicas + name: Ready + type: integer + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha2 + served: true + storage: false + subresources: + status: {} --- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: machinedeployments.machine.sapcloud.io -spec: - group: machine.sapcloud.io - version: v1alpha1 - scope: Namespaced - names: - kind: MachineDeployment - plural: machinedeployments - singular: machinedeployment - shortNames: - - machdeploy - subresources: - status: {} - additionalPrinterColumns: - - name: Ready - type: integer - description: Total number of ready machines targeted by this machine deployment. 
- JSONPath: .status.readyReplicas - - name: Desired - type: integer - description: Number of desired machines. - JSONPath: .spec.replicas - - name: Up-to-date - type: integer - description: Total number of non-terminated machines targeted by this machine deployment that have the desired template spec. - JSONPath: .status.updatedReplicas - - name: Available - type: integer - description: Total number of available machines (ready for at least minReadySeconds) targeted by this machine deployment. - JSONPath: .status.availableReplicas - - name: Age - type: date - description: > - CreationTimestamp is a timestamp representing the server time when this object was created. - It is not guaranteed to be set in happens-before order across separate operations. - Clients may not set this value. It is represented in RFC3339 form and is in UTC. - Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#metadata - JSONPath: .metadata.creationTimestamp + apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + metadata: + name: machinedeployments.machine.sapcloud.io + spec: + conversion: + strategy: None + group: machine.sapcloud.io + names: + kind: MachineDeployment + listKind: MachineDeploymentList + plural: machinedeployments + shortNames: + - machdeploy + singular: machinedeployment + preserveUnknownFields: true + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Total number of ready machines targeted by this machine deployment. + jsonPath: .status.readyReplicas + name: Ready + type: integer + - description: Number of desired machines. + jsonPath: .spec.replicas + name: Desired + type: integer + - description: Total number of non-terminated machines targeted by this machine + deployment that have the desired template spec. + jsonPath: .status.updatedReplicas + name: Up-to-date + type: integer + - description: Total number of available machines (ready for at least minReadySeconds) + targeted by this machine deployment. + jsonPath: .status.availableReplicas + name: Available + type: integer + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + served: true + storage: true + subresources: + status: {} + - additionalPrinterColumns: + - description: Total number of ready machines targeted by this machine deployment. + jsonPath: .status.readyReplicas + name: Ready + type: integer + - description: Number of desired machines. + jsonPath: .spec.replicas + name: Desired + type: integer + - description: Total number of non-terminated machines targeted by this machine + deployment that have the desired template spec. + jsonPath: .status.updatedReplicas + name: Up-to-date + type: integer + - description: Total number of available machines (ready for at least minReadySeconds) + targeted by this machine deployment. + jsonPath: .status.availableReplicas + name: Available + type: integer + - description: |- + CreationTimestamp is a timestamp representing the server time when this object was created. 
It is not guaranteed to be set in happens-before order across separate operations. Clients may not set this value. It is represented in RFC3339 form and is in UTC. + Populated by the system. Read-only. Null for lists. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha2 + served: true + storage: false + subresources: + status: {} \ No newline at end of file diff --git a/kubernetes/deployment/control-cluster-role-binding.yaml b/kubernetes/deployment/in-tree/control-cluster-role-binding.yaml similarity index 100% rename from kubernetes/deployment/control-cluster-role-binding.yaml rename to kubernetes/deployment/in-tree/control-cluster-role-binding.yaml diff --git a/kubernetes/deployment/control-cluster-role.yaml b/kubernetes/deployment/in-tree/control-cluster-role.yaml similarity index 100% rename from kubernetes/deployment/control-cluster-role.yaml rename to kubernetes/deployment/in-tree/control-cluster-role.yaml diff --git a/kubernetes/deployment/deployment.yaml b/kubernetes/deployment/in-tree/deployment.yaml similarity index 69% rename from kubernetes/deployment/deployment.yaml rename to kubernetes/deployment/in-tree/deployment.yaml index 20d20703c..2aee856cf 100644 --- a/kubernetes/deployment/deployment.yaml +++ b/kubernetes/deployment/in-tree/deployment.yaml @@ -1,6 +1,6 @@ # Sample deployment file, used to run Machine Controller Manager on your cluster -apiVersion: apps/v1beta1 # Version may change based on kubernetes version +apiVersion: apps/v1 # Version may change based on kubernetes version kind: Deployment metadata: name: machine-controller-manager @@ -11,24 +11,26 @@ spec: labels: app: machine-controller-manager spec: + securityContext: {} + serviceAccount: machine-controller-manager + serviceAccountName: machine-controller-manager containers: - name: machine-controller-manager - image: eu.gcr.io/gardener-project/gardener/machine-controller-manager:0.5.0 + image: eu.gcr.io/gardener-project/gardener/machine-controller-manager:v0.28.0 imagePullPolicy: Always command: - ./machine-controller-manager - - --v=2 - --target-kubeconfig=$(TARGET_KUBECONFIG) # Mandatory Parameter - Filepath to the target cluster's kubeconfig where node objects are expected to join. - --control-kubeconfig=$(CONTROL_KUBECONFIG) # Optional Parameter - Default value is same as target-kubeconfig - Filepath to the control cluster's kubeconfig where machine objects would be created. Optionally you could also use "inClusterConfig" when pod is running inside control kubeconfig. - --namespace=$(CONTROL_NAMESPACE) # Optional Parameter - Default value for namespace is 'default' - The control namespace where the controller watches for it's machine objects. - --safety-up=2 # Optional Parameter - Default value 2 - The number of excess machine objects permitted for any machineSet/machineDeployment beyond its expected number of replicas based on desired and max-surge, we call this the upper-limit. When this upper-limit is reached, the objects are frozen until the number of objects reduce. upper-limit = desired + maxSurge (if applicable) + safetyUp. - --safety-down=1 # Optional Parameter - Default value 1 - Upper-limit minus safety-down value gives the lower-limit. This is the limits below which any temporarily frozen machineSet/machineDeployment object is unfrozen. lower-limit = desired + maxSurge (if applicable) + safetyUp - safetyDown. 
- - --machine-drain-timeout=5 # Optional Parameter - Timeout (in minutes) used while draining of machine before deletion, beyond which MCM forcefully deletes machine. - - --machine-health-timeout=10 # Optional Parameter - Default value 10mins - Timeout (in minutes) used while joining (during creation) or re-joining (in case of temporary health issues) of machine before it is declared as failed. - - --machine-set-scale-timeout=20 # Optional Parameter - Default value 20mins - Timeout (in minutes) used while scaling machineSet if timeout occurs machineSet is frozen. - - --machine-safety-orphan-vms-period=30 # Optional Parameter - Default value 30mins - Time period (in minutes) used to poll for orphan VMs by safety controller. - - --machine-safety-overshooting-period=1 # Optional Parameter - Default value 1min - Time period (in minutes) used to poll for overshooting of machine objects backing a machineSet by safety controller. + - --machine-drain-timeout=5m # Optional Parameter - Timeout (as duration) used while draining of machine before deletion, beyond which MCM forcefully deletes machine. + - --machine-health-timeout=10m # Optional Parameter - Default value 10mins - Timeout (as duration) used while joining (during creation) or re-joining (in case of temporary health issues) of machine before it is declared as failed. + - --machine-safety-orphan-vms-period=30m # Optional Parameter - Default value 30mins - Time period (as duration) used to poll for orphan VMs by safety controller. + - --machine-safety-overshooting-period=1m # Optional Parameter - Default value 1min - Time period (as duration) used to poll for overshooting of machine objects backing a machineSet by safety controller. - --node-conditions=ReadonlyFilesystem,KernelDeadlock,DiskPressure # List of comma-separated/case-sensitive node-conditions which when set to True will change machine to a failed state after MachineHealthTimeout duration. It may further be replaced with a new machine if the machine is backed by a machine-set object.
+ - --v=2 livenessProbe: failureThreshold: 3 httpGet: diff --git a/kubernetes/deployment/target-cluster-role-binding.yaml b/kubernetes/deployment/in-tree/target-cluster-role-binding.yaml similarity index 100% rename from kubernetes/deployment/target-cluster-role-binding.yaml rename to kubernetes/deployment/in-tree/target-cluster-role-binding.yaml diff --git a/kubernetes/deployment/target-cluster-role.yaml b/kubernetes/deployment/in-tree/target-cluster-role.yaml similarity index 100% rename from kubernetes/deployment/target-cluster-role.yaml rename to kubernetes/deployment/in-tree/target-cluster-role.yaml diff --git a/kubernetes/deployment/out-of-tree/control-cluster-role-binding.yaml b/kubernetes/deployment/out-of-tree/control-cluster-role-binding.yaml new file mode 100644 index 000000000..5d6e83abc --- /dev/null +++ b/kubernetes/deployment/out-of-tree/control-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: machine-controller-manager-control +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: machine-controller-manager-control +subjects: +- kind: ServiceAccount + name: default + namespace: default \ No newline at end of file diff --git a/kubernetes/deployment/out-of-tree/control-cluster-role.yaml b/kubernetes/deployment/out-of-tree/control-cluster-role.yaml new file mode 100644 index 000000000..b03bb7f0a --- /dev/null +++ b/kubernetes/deployment/out-of-tree/control-cluster-role.yaml @@ -0,0 +1,43 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: machine-controller-manager-control +rules: +- apiGroups: + - machine.sapcloud.io + resources: + - awsmachineclasses + - azuremachineclasses + - gcpmachineclasses + - openstackmachineclasses + - alicloudmachineclasses + - packetmachineclasses + - machinedeployments + - machines + - machinesets + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - nodes + - configmaps + - secrets + - endpoints + - events + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch diff --git a/kubernetes/deployment/out-of-tree/deployment.yaml b/kubernetes/deployment/out-of-tree/deployment.yaml new file mode 100644 index 000000000..bf271ea70 --- /dev/null +++ b/kubernetes/deployment/out-of-tree/deployment.yaml @@ -0,0 +1,81 @@ +# Sample deployment file, used to run Machine Controller Manager on your cluster + +apiVersion: apps/v1 # Version may change based on kubernetes version +kind: Deployment +metadata: + name: machine-controller-manager +spec: + replicas: 1 + selector: + matchLabels: + app: machine-controller-manager + template: + metadata: + labels: + app: machine-controller-manager + spec: + containers: + - name: machine-controller-manager + image: eu.gcr.io/gardener-project/gardener/machine-controller-manager:v0.28.0-dev-793b105c41adfc434e07107decaf1ac67fc1fd3f + imagePullPolicy: Always + command: + - ./machine-controller-manager + - --target-kubeconfig=$(TARGET_KUBECONFIG) # Mandatory Parameter - Filepath to the target cluster's kubeconfig where node objects are expected to join. + - --control-kubeconfig=$(CONTROL_KUBECONFIG) # Optional Parameter - Default value is same as target-kubeconfig - Filepath to the control cluster's kubeconfig where machine objects would be created. Optionally you could also use "inClusterConfig" when pod is running inside control kubeconfig.
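+ # Example for the safety flags below: with desired=3 and maxSurge=1, safety-up=2 gives upper-limit = 3+1+2 = 6, and safety-down=1 gives lower-limit = 6-1 = 5.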
+ - --namespace=$(CONTROL_NAMESPACE) # Optional Parameter - Default value for namespace is 'default' - The control namespace where the controller watches for its machine objects. + - --safety-up=2 # Optional Parameter - Default value 2 - The number of excess machine objects permitted for any machineSet/machineDeployment beyond its expected number of replicas based on desired and max-surge, we call this the upper-limit. When this upper-limit is reached, the objects are frozen until the number of objects reduces. upper-limit = desired + maxSurge (if applicable) + safetyUp. + - --safety-down=1 # Optional Parameter - Default value 1 - Upper-limit minus safety-down value gives the lower-limit. This is the limit below which any temporarily frozen machineSet/machineDeployment object is unfrozen. lower-limit = desired + maxSurge (if applicable) + safetyUp - safetyDown. + - --machine-drain-timeout=5m # Optional Parameter - Timeout (as duration) used while draining of machine before deletion, beyond which MCM forcefully deletes machine. + - --machine-health-timeout=10m # Optional Parameter - Default value 10mins - Timeout (as duration) used while joining (during creation) or re-joining (in case of temporary health issues) of machine before it is declared as failed. + - --machine-safety-orphan-vms-period=30m # Optional Parameter - Default value 30mins - Time period (as duration) used to poll for orphan VMs by safety controller. + - --machine-safety-overshooting-period=1m # Optional Parameter - Default value 1min - Time period (as duration) used to poll for overshooting of machine objects backing a machineSet by safety controller. + - --node-conditions=ReadonlyFilesystem,KernelDeadlock,DiskPressure # List of comma-separated/case-sensitive node-conditions which when set to True will change machine to a failed state after MachineHealthTimeout duration. It may further be replaced with a new machine if the machine is backed by a machine-set object. + - --v=2 + livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10258 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + - command: + - ./machine-controller + - --control-kubeconfig=$(CONTROL_KUBECONFIG) # Optional Parameter - Default value is same as target-kubeconfig - Filepath to the control cluster's kubeconfig where machine objects would be created. Optionally you could also use "inClusterConfig" when pod is running inside control kubeconfig. + - --target-kubeconfig=$(TARGET_KUBECONFIG) # Mandatory Parameter - Filepath to the target cluster's kubeconfig where node objects are expected to join. + - --namespace=$(CONTROL_NAMESPACE) # Optional Parameter - Default value for namespace is 'default' - The control namespace where the controller watches for its machine objects. + - --machine-drain-timeout=5m # Optional Parameter - Timeout (as duration) used while draining of machine before deletion, beyond which MCM forcefully deletes machine. + - --machine-health-timeout=10m # Optional Parameter - Default value 10mins - Timeout (as duration) used while joining (during creation) or re-joining (in case of temporary health issues) of machine before it is declared as failed. + - --machine-safety-orphan-vms-period=30m # Optional Parameter - Default value 30mins - Time period (as duration) used to poll for orphan VMs by safety controller.
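+ # This second container runs the provider-specific (out-of-tree) machine-controller, which performs the actual VM operations; the machine-controller-manager container above only orchestrates machineDeployments and machineSets. All timeout/period flags are Go durations and must carry a unit suffix (for example 30m, not 30).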
+ - --node-conditions=ReadonlyFilesystem,KernelDeadlock,DiskPressure # List of comma-separated/case-sensitive node-conditions which when set to True will change machine to a failed state after MachineHealthTimeout duration. It may further be replaced with a new machine if the machine is backed by a machine-set object. + - --v=3 + image: gcr.io/gardener-project/gardener/machine-controller-manager-provider-aws:0.1.0-dev-47bc8cf5b02affba97bfb7b0e57202947d397b4c + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10259 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + name: machine-controller + ports: + - containerPort: 10259 + name: metrics + protocol: TCP + resources: + limits: + cpu: "3" + memory: 3000Mi + requests: + cpu: 50m + memory: 64Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + securityContext: {} + serviceAccount: machine-controller-manager + serviceAccountName: machine-controller-manager + dnsPolicy: ClusterFirst + restartPolicy: Always diff --git a/kubernetes/deployment/out-of-tree/target-cluster-role-binding.yaml b/kubernetes/deployment/out-of-tree/target-cluster-role-binding.yaml new file mode 100644 index 000000000..a2a05fec2 --- /dev/null +++ b/kubernetes/deployment/out-of-tree/target-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: machine-controller-manager-target +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: machine-controller-manager-target +subjects: +- kind: ServiceAccount + name: default + namespace: default \ No newline at end of file diff --git a/kubernetes/deployment/out-of-tree/target-cluster-role.yaml b/kubernetes/deployment/out-of-tree/target-cluster-role.yaml new file mode 100644 index 000000000..4b52bdf3a --- /dev/null +++ b/kubernetes/deployment/out-of-tree/target-cluster-role.yaml @@ -0,0 +1,52 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: machine-controller-manager-target +rules: +- apiGroups: + - "" + resources: + - nodes + - endpoints + - replicationcontrollers + - pods + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create +- apiGroups: + - extensions + - apps + resources: + - replicasets + - statefulsets + - daemonsets + - deployments + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + - persistentvolumes + verbs: + - list + - watch diff --git a/pkg/apis/machine/types.go b/pkg/apis/machine/types.go index 6b6427d0b..355bcd1f7 100644 --- a/pkg/apis/machine/types.go +++ b/pkg/apis/machine/types.go @@ -157,6 +157,11 @@ type MachineStatus struct { // Current status of the machine object CurrentStatus CurrentStatus + + // LastKnownState can store details of the last known state of the VM by the plugins. 
+ // It can be used by future operation calls to determine current infrastructure state + // +optional + LastKnownState string } // LastOperation suggests the last operation performed on the object diff --git a/pkg/apis/machine/v1alpha1/types.go b/pkg/apis/machine/v1alpha1/types.go index 7c9f2630f..f5b9a7fc7 100644 --- a/pkg/apis/machine/v1alpha1/types.go +++ b/pkg/apis/machine/v1alpha1/types.go @@ -167,6 +167,11 @@ type MachineStatus struct { // Current status of the machine object CurrentStatus CurrentStatus `json:"currentStatus,omitempty"` + + // LastKnownState can store details of the last known state of the VM by the plugins. + // It can be used by future operation calls to determine current infrastructure state + // +optional + LastKnownState string `json:"lastKnownState,omitempty"` } // LastOperation suggests the last operation performed on the object diff --git a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go index 82ca1afe0..56b582aa7 100644 --- a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go @@ -2524,6 +2524,7 @@ func autoConvert_v1alpha1_MachineStatus_To_machine_MachineStatus(in *MachineStat if err := Convert_v1alpha1_CurrentStatus_To_machine_CurrentStatus(&in.CurrentStatus, &out.CurrentStatus, s); err != nil { return err } + out.LastKnownState = in.LastKnownState return nil } @@ -2541,6 +2542,7 @@ func autoConvert_machine_MachineStatus_To_v1alpha1_MachineStatus(in *machine.Mac if err := Convert_machine_CurrentStatus_To_v1alpha1_CurrentStatus(&in.CurrentStatus, &out.CurrentStatus, s); err != nil { return err } + out.LastKnownState = in.LastKnownState return nil } diff --git a/pkg/controller/deployment.go b/pkg/controller/deployment.go index 05fb231c1..7bb8c2394 100644 --- a/pkg/controller/deployment.go +++ b/pkg/controller/deployment.go @@ -466,14 +466,6 @@ func (dc *controller) reconcileClusterMachineDeployment(key string) error { return nil } - if deployment.DeletionTimestamp == nil { - // Validate MachineClass if the machineDeployment is not triggerred for deletion - _, secretRef, err := dc.validateMachineClass(&deployment.Spec.Template.Spec.Class) - if err != nil || secretRef == nil { - return err - } - } - // Resync the MachineDeployment after 10 minutes to avoid missing out on missed out events defer dc.enqueueMachineDeploymentAfter(deployment, 10*time.Minute) diff --git a/pkg/controller/deployment_rollback.go b/pkg/controller/deployment_rollback.go index 03bd0c5ae..9189ff4a0 100644 --- a/pkg/controller/deployment_rollback.go +++ b/pkg/controller/deployment_rollback.go @@ -27,11 +27,11 @@ import ( "time" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - "k8s.io/klog" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/klog" ) // rollback the deployment to the specified revision. In any case cleanup the rollback spec.
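The LastKnownState field introduced above is a free-form string that a provider plugin can use to checkpoint VM state between reconciles. A minimal sketch of how a plugin might round-trip it (not part of this change-set; the rememberState helper is hypothetical, only the field itself comes from this diff):

package main

import (
	"fmt"

	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

// rememberState checkpoints provider-specific details of the last known VM
// state so that a later operation call can resume from it.
func rememberState(m *v1alpha1.Machine, state string) {
	m.Status.LastKnownState = state
}

func main() {
	m := &v1alpha1.Machine{}
	rememberState(m, "instance-requested")
	fmt.Println(m.Status.LastKnownState) // prints: instance-requested
}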
diff --git a/pkg/controller/deployment_sync.go b/pkg/controller/deployment_sync.go index 33b53eb98..a3fb0a016 100644 --- a/pkg/controller/deployment_sync.go +++ b/pkg/controller/deployment_sync.go @@ -30,13 +30,13 @@ import ( "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" labelsutil "github.com/gardener/machine-controller-manager/pkg/util/labels" - "k8s.io/klog" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/klog" ) // syncStatusOnly only updates Deployments Status and doesn't take any mutating actions. diff --git a/pkg/controller/machine.go b/pkg/controller/machine.go index 09a36d6ae..11fb8051b 100644 --- a/pkg/controller/machine.go +++ b/pkg/controller/machine.go @@ -55,39 +55,55 @@ const ( Machine controller - Machine add, update, delete watches */ func (c *controller) addMachine(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - klog.Errorf("Couldn't get key for object %+v: %v", obj, err) - return - } - klog.V(4).Infof("Add/Update/Delete machine object %q", key) - c.machineQueue.Add(key) + klog.V(4).Infof("Adding machine object") + c.enqueueMachine(obj) } func (c *controller) updateMachine(oldObj, newObj interface{}) { klog.V(4).Info("Updating machine object") - c.addMachine(newObj) + c.enqueueMachine(newObj) } func (c *controller) deleteMachine(obj interface{}) { klog.V(4).Info("Deleting machine object") - c.addMachine(obj) + c.enqueueMachine(obj) } func (c *controller) enqueueMachine(obj interface{}) { key, err := cache.MetaNamespaceKeyFunc(obj) if err != nil { + klog.Errorf("Couldn't get key for object %+v: %v", obj, err) return } - c.machineQueue.Add(key) + + machine := obj.(*v1alpha1.Machine) + switch machine.Spec.Class.Kind { + case AlicloudMachineClassKind, AWSMachineClassKind, AzureMachineClassKind, GCPMachineClassKind, OpenStackMachineClassKind, PacketMachineClassKind: + // Enqueue the machine object only if its machineClass kind is processed by MCM + klog.V(4).Infof("Adding machine object to the queue %q", key) + c.machineQueue.Add(key) + default: + klog.V(4).Infof("ClassKind %q not found. Machine may be processed by an external controller", machine.Spec.Class.Kind) + } } func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) { key, err := cache.MetaNamespaceKeyFunc(obj) if err != nil { + klog.Errorf("Couldn't get key for object %+v: %v", obj, err) return } - c.machineQueue.AddAfter(key, after) + + machine := obj.(*v1alpha1.Machine) + + switch machine.Spec.Class.Kind { + case AlicloudMachineClassKind, AWSMachineClassKind, AzureMachineClassKind, GCPMachineClassKind, OpenStackMachineClassKind, PacketMachineClassKind: + // Enqueue the machine object only if its machineClass kind is processed by MCM + klog.V(4).Infof("Adding machine object to the queue %q after %s", key, after) + c.machineQueue.AddAfter(key, after) + default: + klog.V(4).Infof("ClassKind %q not found.
Machine may be processed by an external controller", machine.Spec.Class.Kind) + } } func (c *controller) reconcileClusterMachineKey(key string) error { diff --git a/pkg/controller/machine_util.go b/pkg/controller/machine_util.go index c92ef0be9..388622342 100644 --- a/pkg/controller/machine_util.go +++ b/pkg/controller/machine_util.go @@ -87,7 +87,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf var secretRef *v1.Secret switch classSpec.Kind { - case "AWSMachineClass": + case AWSMachineClassKind: AWSMachineClass, err := c.awsMachineClassLister.AWSMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("AWSMachineClass %q/%q not found. Skipping. %v", c.namespace, classSpec.Name, err) @@ -115,7 +115,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf klog.V(2).Info("Secret reference not found") return MachineClass, secretRef, err } - case "AzureMachineClass": + case AzureMachineClassKind: AzureMachineClass, err := c.azureMachineClassLister.AzureMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("AzureMachineClass %q not found. Skipping. %v", classSpec.Name, err) @@ -144,7 +144,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf return MachineClass, secretRef, err } - case "GCPMachineClass": + case GCPMachineClassKind: GCPMachineClass, err := c.gcpMachineClassLister.GCPMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("GCPMachineClass %q not found. Skipping. %v", classSpec.Name, err) @@ -172,7 +172,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf klog.V(2).Info("Secret reference not found") return MachineClass, secretRef, err } - case "OpenStackMachineClass": + case OpenStackMachineClassKind: OpenStackMachineClass, err := c.openStackMachineClassLister.OpenStackMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("OpenStackMachineClass %q not found. Skipping. %v", classSpec.Name, err) @@ -200,7 +200,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf klog.V(2).Info("Secret reference not found") return MachineClass, secretRef, err } - case "AlicloudMachineClass": + case AlicloudMachineClassKind: AlicloudMachineClass, err := c.alicloudMachineClassLister.AlicloudMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("AlicloudMachineClass %q/%q not found. Skipping. %v", c.namespace, classSpec.Name, err) @@ -228,7 +228,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf klog.V(2).Info("Secret reference not found") return MachineClass, secretRef, err } - case "PacketMachineClass": + case PacketMachineClassKind: PacketMachineClass, err := c.packetMachineClassLister.PacketMachineClasses(c.namespace).Get(classSpec.Name) if err != nil { klog.V(2).Infof("PacketMachineClass %q/%q not found. Skipping. %v", c.namespace, classSpec.Name, err) @@ -257,7 +257,7 @@ func (c *controller) validateMachineClass(classSpec *v1alpha1.ClassSpec) (interf return MachineClass, secretRef, err } default: - klog.V(2).Infof("ClassKind %q not found", classSpec.Kind) + klog.V(2).Infof("ClassKind %q not found.
Machine may be processed by an external controller", classSpec.Kind) } return MachineClass, secretRef, nil diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 1a2687698..ddf529dfe 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -463,12 +463,6 @@ func (c *controller) reconcileClusterMachineSet(key string) error { } if machineSet.DeletionTimestamp == nil { - // Validate MachineClass - _, secretRef, err := c.validateMachineClass(&machineSet.Spec.Template.Spec.Class) - if err != nil || secretRef == nil { - return err - } - // Manipulate finalizers c.addMachineSetFinalizers(machineSet) } diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go index 6849ab473..4a9d6b6b7 100644 --- a/pkg/openapi/openapi_generated.go +++ b/pkg/openapi/openapi_generated.go @@ -3004,6 +3004,13 @@ func schema_pkg_apis_machine_v1alpha1_MachineStatus(ref common.ReferenceCallback Ref: ref("github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1.CurrentStatus"), }, }, + "lastKnownState": { + SchemaProps: spec.SchemaProps{ + Description: "LastKnownState can store details of the last known state of the VM by the plugins. It can be used by future operation calls to determine current infrastructure state", + Type: []string{"string"}, + Format: "", + }, + }, }, }, }, diff --git a/pkg/util/controller/client_builder.go b/pkg/util/clientbuilder/core/client_builder.go similarity index 100% rename from pkg/util/controller/client_builder.go rename to pkg/util/clientbuilder/core/client_builder.go diff --git a/pkg/controller/client_builder.go b/pkg/util/clientbuilder/machine/client_builder.go similarity index 100% rename from pkg/controller/client_builder.go rename to pkg/util/clientbuilder/machine/client_builder.go diff --git a/pkg/util/provider/app/app.go b/pkg/util/provider/app/app.go new file mode 100644 index 000000000..ae8927533 --- /dev/null +++ b/pkg/util/provider/app/app.go @@ -0,0 +1,373 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This file was copied and modified from the kubernetes/kubernetes project +https://github.com/kubernetes/kubernetes/release-1.8/cmd/kube-controller-manager/app/controllermanager.go + +Modifications Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+*/ + +package app + +import ( + "context" + "fmt" + "net" + "net/http" + "net/http/pprof" + "os" + goruntime "runtime" + "strconv" + "time" + + machinescheme "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/scheme" + machineinformers "github.com/gardener/machine-controller-manager/pkg/client/informers/externalversions" + coreclientbuilder "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/core" + machineclientbuilder "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/machine" + machinecontroller "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecontroller" + coreinformers "k8s.io/client-go/informers" + kubescheme "k8s.io/client-go/kubernetes/scheme" + + "github.com/gardener/machine-controller-manager/pkg/handlers" + "github.com/gardener/machine-controller-manager/pkg/util/configz" + "github.com/gardener/machine-controller-manager/pkg/util/provider/app/options" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/discovery" + "k8s.io/client-go/kubernetes" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/pkg/version" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/client-go/tools/record" + "k8s.io/klog" +) + +const ( + controllerManagerAgentName = "machine-controller" +) + +var ( + machineGVR = schema.GroupVersionResource{Group: "machine.sapcloud.io", Version: "v1alpha1", Resource: "machines"} +) + +// Run runs the MCServer. This should never exit. +func Run(s *options.MCServer, driver driver.Driver) error { + // To help debugging, immediately log version + klog.V(4).Infof("Version: %+v", version.Get()) + if err := s.Validate(); err != nil { + return err + } + + var err error + + // kubeconfig for the cluster for which machine-controller will create machines. + targetkubeconfig, err := clientcmd.BuildConfigFromFlags("", s.TargetKubeconfig) + if err != nil { + return err + } + + controlkubeconfig := targetkubeconfig + + if s.ControlKubeconfig != "" { + if s.ControlKubeconfig == "inClusterConfig" { + // use inClusterConfig when the controller is running inside the cluster + controlkubeconfig, err = clientcmd.BuildConfigFromFlags("", "") + } else { + // kubeconfig for the seed cluster where MachineCRDs are supposed to be registered.
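+ // Resolution order: the literal "inClusterConfig" selects the in-cluster config, a non-empty path is loaded from disk, and an empty --control-kubeconfig falls back to the target kubeconfig.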
+ controlkubeconfig, err = clientcmd.BuildConfigFromFlags("", s.ControlKubeconfig) + } + if err != nil { + return err + } + } + + // PROTOBUF WONT WORK + // kubeconfig.ContentConfig.ContentType = s.ContentType + // Override kubeconfig qps/burst settings from flags + targetkubeconfig.QPS = s.KubeAPIQPS + controlkubeconfig.QPS = s.KubeAPIQPS + targetkubeconfig.Burst = int(s.KubeAPIBurst) + controlkubeconfig.Burst = int(s.KubeAPIBurst) + + kubeClientControl, err := kubernetes.NewForConfig( + rest.AddUserAgent(controlkubeconfig, "machine-controller"), + ) + if err != nil { + klog.Fatalf("Invalid API configuration for kubeconfig-control: %v", err) + } + + leaderElectionClient := kubernetes.NewForConfigOrDie(rest.AddUserAgent(controlkubeconfig, "machine-leader-election")) + klog.V(4).Info("Starting http server and mux") + go startHTTP(s) + + recorder := createRecorder(kubeClientControl) + + run := func(ctx context.Context) { + var stop <-chan struct{} + // Control plane client used to interact with machine APIs + controlMachineClientBuilder := machineclientbuilder.SimpleClientBuilder{ + ClientConfig: controlkubeconfig, + } + // Control plane client used to interact with core kubernetes objects + controlCoreClientBuilder := coreclientbuilder.SimpleControllerClientBuilder{ + ClientConfig: controlkubeconfig, + } + // Target plane client used to interact with core kubernetes objects + targetCoreClientBuilder := coreclientbuilder.SimpleControllerClientBuilder{ + ClientConfig: targetkubeconfig, + } + + err := StartControllers( + s, + controlkubeconfig, + targetkubeconfig, + controlMachineClientBuilder, + controlCoreClientBuilder, + targetCoreClientBuilder, + driver, + recorder, + stop, + ) + + klog.Fatalf("error running controllers: %v", err) + panic("unreachable") + + } + + if !s.LeaderElection.LeaderElect { + run(nil) + panic("unreachable") + } + + id, err := os.Hostname() + if err != nil { + return err + } + + rl, err := resourcelock.New( + s.LeaderElection.ResourceLock, + s.Namespace, + "machine-controller", + leaderElectionClient.CoreV1(), + leaderElectionClient.CoordinationV1(), + resourcelock.ResourceLockConfig{ + Identity: id, + EventRecorder: recorder, + }, + ) + if err != nil { + klog.Fatalf("error creating lock: %v", err) + } + + ctx := context.TODO() + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: rl, + LeaseDuration: s.LeaderElection.LeaseDuration.Duration, + RenewDeadline: s.LeaderElection.RenewDeadline.Duration, + RetryPeriod: s.LeaderElection.RetryPeriod.Duration, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: run, + OnStoppedLeading: func() { + klog.Fatalf("leaderelection lost") + }, + }, + }) + panic("unreachable") +} + +// StartControllers starts all the controllers which are a part of machine-controller +func StartControllers(s *options.MCServer, + controlCoreKubeconfig *rest.Config, + targetCoreKubeconfig *rest.Config, + controlMachineClientBuilder machineclientbuilder.ClientBuilder, + controlCoreClientBuilder coreclientbuilder.ClientBuilder, + targetCoreClientBuilder coreclientbuilder.ClientBuilder, + driver driver.Driver, + recorder record.EventRecorder, + stop <-chan struct{}) error { + + klog.V(5).Info("Getting available resources") + availableResources, err := getAvailableResources(controlCoreClientBuilder) + if err != nil { + return err + } + + controlMachineClient := controlMachineClientBuilder.ClientOrDie(controllerManagerAgentName).MachineV1alpha1() + + controlCoreKubeconfig = rest.AddUserAgent(controlCoreKubeconfig, 
controllerManagerAgentName) + controlCoreClient, err := kubernetes.NewForConfig(controlCoreKubeconfig) + if err != nil { + klog.Fatal(err) + } + + targetCoreKubeconfig = rest.AddUserAgent(targetCoreKubeconfig, controllerManagerAgentName) + targetCoreClient, err := kubernetes.NewForConfig(targetCoreKubeconfig) + if err != nil { + klog.Fatal(err) + } + + if availableResources[machineGVR] { + klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) + + controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory( + controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"), + s.MinResyncPeriod.Duration, + s.Namespace, + nil, + ) + + controlCoreInformerFactory := coreinformers.NewFilteredSharedInformerFactory( + controlCoreClientBuilder.ClientOrDie("control-core-shared-informers"), + s.MinResyncPeriod.Duration, + s.Namespace, + nil, + ) + + targetCoreInformerFactory := coreinformers.NewSharedInformerFactory( + targetCoreClientBuilder.ClientOrDie("target-core-shared-informers"), + s.MinResyncPeriod.Duration, + ) + + // All shared informers are v1alpha1 API level + machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1() + + klog.V(5).Infof("Creating controllers...") + machineController, err := machinecontroller.NewController( + s.Namespace, + controlMachineClient, + controlCoreClient, + targetCoreClient, + driver, + targetCoreInformerFactory.Core().V1().PersistentVolumeClaims(), + targetCoreInformerFactory.Core().V1().PersistentVolumes(), + controlCoreInformerFactory.Core().V1().Secrets(), + targetCoreInformerFactory.Core().V1().Nodes(), + machineSharedInformers.MachineClasses(), + machineSharedInformers.Machines(), + recorder, + s.SafetyOptions, + s.NodeConditions, + s.BootstrapTokenAuthExtraGroups, + ) + if err != nil { + return err + } + klog.V(1).Info("Starting shared informers") + + controlMachineInformerFactory.Start(stop) + controlCoreInformerFactory.Start(stop) + targetCoreInformerFactory.Start(stop) + + klog.V(5).Info("Running controller") + go machineController.Run(int(s.ConcurrentNodeSyncs), stop) + + } else { + return fmt.Errorf("unable to start machine controller: API GroupVersion %q is not available; \nFound: %#v", machineGVR, availableResources) + } + + select {} +} + +// TODO: In general, any controller checking this needs to be dynamic so +// users don't have to restart their controller manager if they change the apiserver. +// Until we get there, the structure here needs to be exposed for the construction of a proper ControllerContext. +func getAvailableResources(clientBuilder coreclientbuilder.ClientBuilder) (map[schema.GroupVersionResource]bool, error) { + var discoveryClient discovery.DiscoveryInterface + + var healthzContent string + // If apiserver is not running we should wait for some time and fail only then. This is particularly + // important when we start apiserver and controller manager at the same time. + err := wait.PollImmediate(time.Second, 10*time.Second, func() (bool, error) { + client, err := clientBuilder.Client("controller-discovery") + if err != nil { + klog.Errorf("Failed to get api versions from server: %v", err) + return false, nil + } + + healthStatus := 0 + resp := client.Discovery().RESTClient().Get().AbsPath("/healthz").Do().StatusCode(&healthStatus) + if healthStatus != http.StatusOK { + klog.Errorf("Server isn't healthy yet. 
Waiting a little while.") + return false, nil + } + content, _ := resp.Raw() + healthzContent = string(content) + + discoveryClient = client.Discovery() + return true, nil + }) + if err != nil { + return nil, fmt.Errorf("failed to get api versions from server: %v: %v", healthzContent, err) + } + + resourceMap, err := discoveryClient.ServerResources() + if err != nil { + utilruntime.HandleError(fmt.Errorf("unable to get all supported resources from server: %v", err)) + } + if len(resourceMap) == 0 { + return nil, fmt.Errorf("unable to get any supported resources from server") + } + + allResources := map[schema.GroupVersionResource]bool{} + for _, apiResourceList := range resourceMap { + version, err := schema.ParseGroupVersion(apiResourceList.GroupVersion) + if err != nil { + return nil, err + } + for _, apiResource := range apiResourceList.APIResources { + allResources[version.WithResource(apiResource.Name)] = true + } + } + + return allResources, nil +} + +func createRecorder(kubeClient *kubernetes.Clientset) record.EventRecorder { + machinescheme.AddToScheme(kubescheme.Scheme) + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(klog.Infof) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events("")}) + return eventBroadcaster.NewRecorder(kubescheme.Scheme, v1.EventSource{Component: controllerManagerAgentName}) +} + +func startHTTP(s *options.MCServer) { + mux := http.NewServeMux() + if s.EnableProfiling { + mux.HandleFunc("/debug/pprof/", pprof.Index) + mux.HandleFunc("/debug/pprof/profile", pprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", pprof.Trace) + if s.EnableContentionProfiling { + goruntime.SetBlockProfileRate(1) + } + } + configz.InstallHandler(mux) + mux.Handle("/metrics", prometheus.Handler()) + handlers.UpdateHealth(true) + mux.HandleFunc("/healthz", handlers.Healthz) + + server := &http.Server{ + Addr: net.JoinHostPort(s.Address, strconv.Itoa(int(s.Port))), + Handler: mux, + } + klog.Fatal(server.ListenAndServe()) +} diff --git a/pkg/util/provider/app/options/options.go b/pkg/util/provider/app/options/options.go new file mode 100644 index 000000000..855c86a2e --- /dev/null +++ b/pkg/util/provider/app/options/options.go @@ -0,0 +1,120 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This file was copied and modified from the kubernetes/kubernetes project +https://github.com/kubernetes/kubernetes/release-1.8/cmd/kube-controller-manager/app/options/options.go + +Modifications Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. 
+*/
+
+package options
+
+import (
+	"time"
+
+	drain "github.com/gardener/machine-controller-manager/pkg/util/provider/drain"
+	machineconfig "github.com/gardener/machine-controller-manager/pkg/util/provider/options"
+	"github.com/spf13/pflag"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+
+	"github.com/gardener/machine-controller-manager/pkg/util/client/leaderelectionconfig"
+
+	// add the machine feature gates
+	_ "github.com/gardener/machine-controller-manager/pkg/features"
+)
+
+// MCServer is the main context object for the machine controller.
+type MCServer struct {
+	machineconfig.MachineControllerConfiguration
+
+	ControlKubeconfig string
+	TargetKubeconfig  string
+}
+
+// NewMCServer creates a new MCServer with a default config.
+func NewMCServer() *MCServer {
+
+	s := MCServer{
+		// Some of these default values are also present in 'cmd/cloud-controller-manager/app/options/options.go'.
+		// Please keep them in sync when updating.
+		MachineControllerConfiguration: machineconfig.MachineControllerConfiguration{
+			Port:                    10259,
+			Namespace:               "default",
+			Address:                 "0.0.0.0",
+			ConcurrentNodeSyncs:     5,
+			ContentType:             "application/vnd.kubernetes.protobuf",
+			NodeConditions:          "KernelDeadlock,ReadonlyFilesystem,DiskPressure",
+			MinResyncPeriod:         metav1.Duration{Duration: 12 * time.Hour},
+			KubeAPIQPS:              20.0,
+			KubeAPIBurst:            30,
+			LeaderElection:          leaderelectionconfig.DefaultLeaderElectionConfiguration(),
+			ControllerStartInterval: metav1.Duration{Duration: 0 * time.Second},
+			SafetyOptions: machineconfig.SafetyOptions{
+				MachineCreationTimeout:                   metav1.Duration{Duration: 20 * time.Minute},
+				MachineHealthTimeout:                     metav1.Duration{Duration: 10 * time.Minute},
+				MachineDrainTimeout:                      metav1.Duration{Duration: drain.DefaultMachineDrainTimeout},
+				MaxEvictRetries:                          drain.DefaultMaxEvictRetries,
+				PvDetachTimeout:                          metav1.Duration{Duration: 2 * time.Minute},
+				MachineSafetyOrphanVMsPeriod:             metav1.Duration{Duration: 30 * time.Minute},
+				MachineSafetyAPIServerStatusCheckPeriod:  metav1.Duration{Duration: 1 * time.Minute},
+				MachineSafetyAPIServerStatusCheckTimeout: metav1.Duration{Duration: 30 * time.Second},
+			},
+		},
+	}
+	s.LeaderElection.LeaderElect = true
+	return &s
+}
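For orientation, a minimal sketch of how a provider binary would typically wire these options together; the surrounding main function and the final app.Run equivalent are assumptions, not part of this diff:

	s := options.NewMCServer()
	s.AddFlags(pflag.CommandLine)
	pflag.Parse()
	if err := s.Validate(); err != nil {
		klog.Fatalf("invalid machine controller options: %v", err)
	}
	// s would then be handed to the provider's run entry point.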
+// AddFlags adds flags for a specific MCServer to the specified FlagSet
+func (s *MCServer) AddFlags(fs *pflag.FlagSet) {
+	fs.Int32Var(&s.Port, "port", s.Port, "The port that the controller-manager's http service runs on")
+	fs.Var(machineconfig.IPVar{Val: &s.Address}, "address", "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
+	fs.StringVar(&s.CloudProvider, "cloud-provider", s.CloudProvider, "The provider for cloud services. Empty string for no provider.")
+	fs.Int32Var(&s.ConcurrentNodeSyncs, "concurrent-syncs", s.ConcurrentNodeSyncs, "The number of nodes that are allowed to sync concurrently. Larger number = more responsive service management, but more CPU (and network) load")
+	fs.DurationVar(&s.MinResyncPeriod.Duration, "min-resync-period", s.MinResyncPeriod.Duration, "The resync period in reflectors will be random between MinResyncPeriod and 2*MinResyncPeriod")
+	fs.BoolVar(&s.EnableProfiling, "profiling", true, "Enable profiling via web interface host:port/debug/pprof/")
+	fs.BoolVar(&s.EnableContentionProfiling, "contention-profiling", false, "Enable lock contention profiling, if profiling is enabled")
+	fs.StringVar(&s.TargetKubeconfig, "target-kubeconfig", s.TargetKubeconfig, "Filepath to the target cluster's kubeconfig where node objects are expected to join")
+	fs.StringVar(&s.ControlKubeconfig, "control-kubeconfig", s.ControlKubeconfig, "Filepath to the control cluster's kubeconfig where machine objects would be created. Optionally you could also use 'inClusterConfig' when the pod is running inside the control cluster. (Default value is same as target-kubeconfig)")
+	fs.StringVar(&s.Namespace, "namespace", s.Namespace, "Name of the namespace in the control cluster where the controller would look for CRDs and Kubernetes objects")
+	fs.StringVar(&s.ContentType, "kube-api-content-type", s.ContentType, "Content type of requests sent to apiserver.")
+	fs.Float32Var(&s.KubeAPIQPS, "kube-api-qps", s.KubeAPIQPS, "QPS to use while talking with kubernetes apiserver")
+	fs.Int32Var(&s.KubeAPIBurst, "kube-api-burst", s.KubeAPIBurst, "Burst to use while talking with kubernetes apiserver")
+	fs.DurationVar(&s.ControllerStartInterval.Duration, "controller-start-interval", s.ControllerStartInterval.Duration, "Interval between starting controller managers.")
+
+	fs.DurationVar(&s.SafetyOptions.MachineCreationTimeout.Duration, "machine-creation-timeout", s.SafetyOptions.MachineCreationTimeout.Duration, "Timeout (in duration) used while joining (during creation) of machine before it is declared as failed.")
+	fs.DurationVar(&s.SafetyOptions.MachineHealthTimeout.Duration, "machine-health-timeout", s.SafetyOptions.MachineHealthTimeout.Duration, "Timeout (in duration) used while re-joining (in case of temporary health issues) of machine before it is declared as failed.")
+	fs.DurationVar(&s.SafetyOptions.MachineDrainTimeout.Duration, "machine-drain-timeout", drain.DefaultMachineDrainTimeout, "Timeout (in duration) used while draining of machine before deletion, beyond which MCM forcefully deletes the machine.")
+	fs.Int32Var(&s.SafetyOptions.MaxEvictRetries, "machine-max-evict-retries", drain.DefaultMaxEvictRetries, "Maximum number of times eviction is attempted on a pod before it is forcibly deleted during draining of a machine.")
+	fs.DurationVar(&s.SafetyOptions.PvDetachTimeout.Duration, "machine-pv-detach-timeout", s.SafetyOptions.PvDetachTimeout.Duration, "Timeout (in duration) used while waiting for detach of PV while evicting/deleting pods")
+	fs.DurationVar(&s.SafetyOptions.MachineSafetyAPIServerStatusCheckTimeout.Duration, "machine-safety-apiserver-statuscheck-timeout", s.SafetyOptions.MachineSafetyAPIServerStatusCheckTimeout.Duration, "Timeout (in duration) for which the APIServer can be down before the safety controller declares the machine controller frozen")
+
+	fs.DurationVar(&s.SafetyOptions.MachineSafetyOrphanVMsPeriod.Duration, "machine-safety-orphan-vms-period", s.SafetyOptions.MachineSafetyOrphanVMsPeriod.Duration, "Time period (in duration) used to poll for orphan VMs by the safety controller.")
+	fs.DurationVar(&s.SafetyOptions.MachineSafetyAPIServerStatusCheckPeriod.Duration,
"machine-safety-apiserver-statuscheck-period", s.SafetyOptions.MachineSafetyAPIServerStatusCheckPeriod.Duration, "Time period (in duration) used to poll for APIServer's health by safety controller") + fs.StringVar(&s.NodeConditions, "node-conditions", s.NodeConditions, "List of comma-separated/case-sensitive node-conditions which when set to True will change machine to a failed state after MachineHealthTimeout duration. It may further be replaced with a new machine if the machine is backed by a machine-set object.") + fs.StringVar(&s.BootstrapTokenAuthExtraGroups, "bootstrap-token-auth-extra-groups", s.BootstrapTokenAuthExtraGroups, "Comma-separated list of groups to set bootstrap token's \"auth-extra-groups\" field to") + + leaderelectionconfig.BindFlags(&s.LeaderElection, fs) + // TODO: DefaultFeatureGate is global and it adds all k8s flags + // utilfeature.DefaultFeatureGate.AddFlag(fs) +} + +// Validate is used to validate the options and config before launching the controller manager +func (s *MCServer) Validate() error { + var errs []error + // TODO add validation + return utilerrors.NewAggregate(errs) +} diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go new file mode 100644 index 000000000..d6e950d26 --- /dev/null +++ b/pkg/util/provider/drain/drain.go @@ -0,0 +1,958 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This file was copied and modified from the kubernetes/kubernetes project +https://github.com/kubernetes/kubernetes/blob/release-1.8/pkg/kubectl/cmd/drain.go + +Modifications Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. +*/ + +// Package drain is used to drain nodes +package drain + +import ( + "context" + "errors" + "fmt" + "io" + "regexp" + "sort" + "strings" + "time" + + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + api "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" + policy "k8s.io/api/policy/v1beta1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + corelisters "k8s.io/client-go/listers/core/v1" + "k8s.io/klog" +) + +// Options are configurable options while draining a node before deletion +type Options struct { + client kubernetes.Interface + ForceDeletePods bool + IgnorePodsWithoutControllers bool + GracePeriodSeconds int + IgnoreDaemonsets bool + Timeout time.Duration + MaxEvictRetries int32 + PvDetachTimeout time.Duration + DeleteLocalData bool + nodeName string + Out io.Writer + ErrOut io.Writer + Driver driver.Driver + pvcLister corelisters.PersistentVolumeClaimLister + pvLister corelisters.PersistentVolumeLister + drainStartedOn time.Time + drainEndedOn time.Time +} + +// Takes a pod and returns a bool indicating whether or not to operate on the +// pod, an optional warning message, and an optional fatal error. 
+type podFilter func(api.Pod) (include bool, w *warning, f *fatal)
+type warning struct {
+	string
+}
+type fatal struct {
+	string
+}
+
+const (
+	// EvictionKind is the kind used for eviction
+	EvictionKind = "Eviction"
+	// EvictionSubresource is the subresource used for evicting pods
+	EvictionSubresource = "pods/eviction"
+
+	// DefaultMachineDrainTimeout is the default value for MachineDrainTimeout
+	DefaultMachineDrainTimeout = 12 * time.Hour
+
+	// PodsWithoutPVDrainGracePeriod defines the grace period to wait for the pods without PV during machine drain.
+	// This is in addition to the maximum terminationGracePeriodSeconds of the pods.
+	PodsWithoutPVDrainGracePeriod = 3 * time.Minute
+
+	// Interval is the default Poll interval
+	Interval = time.Second * 5
+
+	// PodEvictionRetryInterval is the interval in which to retry eviction for pods
+	PodEvictionRetryInterval = time.Second * 20
+
+	// GetPvDetailsRetryInterval is the interval in which to retry getting PV details
+	GetPvDetailsRetryInterval = time.Second * 5
+	// GetPvDetailsMaxRetries is the number of max retries to get PV details
+	GetPvDetailsMaxRetries = 3
+	// VolumeDetachPollInterval is the interval in which to recheck if the volume is detached from the node
+	VolumeDetachPollInterval = time.Second * 5
+
+	daemonsetFatal      = "DaemonSet-managed pods (use --ignore-daemonsets to ignore)"
+	daemonsetWarning    = "Ignoring DaemonSet-managed pods"
+	localStorageFatal   = "pods with local storage (use --delete-local-data to override)"
+	localStorageWarning = "Deleting pods with local storage"
+	unmanagedFatal      = "pods not managed by ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet (use --force to override)"
+	unmanagedWarning    = "Deleting pods not managed by ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet"
+)
+
+var (
+	// DefaultMaxEvictRetries is the default value for MaxEvictRetries
+	DefaultMaxEvictRetries = int32(DefaultMachineDrainTimeout.Seconds() / PodEvictionRetryInterval.Seconds())
+)
+
+// NewDrainOptions creates a new Options struct and returns a pointer to it
+func NewDrainOptions(
+	client kubernetes.Interface,
+	timeout time.Duration,
+	maxEvictRetries int32,
+	pvDetachTimeout time.Duration,
+	nodeName string,
+	gracePeriodSeconds int,
+	forceDeletePods bool,
+	ignorePodsWithoutControllers bool,
+	ignoreDaemonsets bool,
+	deleteLocalData bool,
+	out io.Writer,
+	errOut io.Writer,
+	driver driver.Driver,
+	pvcLister corelisters.PersistentVolumeClaimLister,
+	pvLister corelisters.PersistentVolumeLister,
+) *Options {
+
+	return &Options{
+		client:                       client,
+		ForceDeletePods:              forceDeletePods,
+		IgnorePodsWithoutControllers: ignorePodsWithoutControllers,
+		GracePeriodSeconds:           gracePeriodSeconds,
+		IgnoreDaemonsets:             ignoreDaemonsets,
+		MaxEvictRetries:              maxEvictRetries,
+		Timeout:                      timeout,
+		PvDetachTimeout:              pvDetachTimeout,
+		DeleteLocalData:              deleteLocalData,
+		nodeName:                     nodeName,
+		Out:                          out,
+		ErrOut:                       errOut,
+		Driver:                       driver,
+		pvcLister:                    pvcLister,
+		pvLister:                     pvLister,
+	}
+
+}
+
+// RunDrain runs the 'drain' command
+func (o *Options) RunDrain() error {
+	o.drainStartedOn = time.Now()
+	klog.V(4).Infof(
+		"Machine drain started on %s for %q",
+		o.drainStartedOn,
+		o.nodeName,
+	)
+
+	defer func() {
+		o.drainEndedOn = time.Now()
+		klog.Infof(
+			"Machine drain ended on %s and took %s for %q",
+			o.drainEndedOn,
+			o.drainEndedOn.Sub(o.drainStartedOn),
+			o.nodeName,
+		)
+	}()
+
+	if err := o.RunCordonOrUncordon(true); err != nil {
+		klog.Errorf("Drain Error: Cordoning of node failed with error:
%v", err) + return err + } + + err := o.deleteOrEvictPodsSimple() + return err +} + +func (o *Options) deleteOrEvictPodsSimple() error { + pods, err := o.getPodsForDeletion() + if err != nil { + return err + } + + err = o.deleteOrEvictPods(pods) + if err != nil { + pendingPods, newErr := o.getPodsForDeletion() + if newErr != nil { + return newErr + } + fmt.Fprintf(o.ErrOut, "There are pending pods when an error occurred: %v\n", err) + for _, pendingPod := range pendingPods { + fmt.Fprintf(o.ErrOut, "%s/%s\n", pendingPod.Namespace, pendingPod.Name) + } + } + return err +} + +func (o *Options) getPodController(pod api.Pod) *metav1.OwnerReference { + return metav1.GetControllerOf(&pod) +} + +func (o *Options) unreplicatedFilter(pod api.Pod) (bool, *warning, *fatal) { + // any finished pod can be removed + if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed { + return true, nil, nil + } + + controllerRef := o.getPodController(pod) + if controllerRef != nil { + return true, nil, nil + } + if !o.IgnorePodsWithoutControllers { + return false, nil, &fatal{unmanagedFatal} + } + return true, &warning{unmanagedWarning}, nil +} + +func (o *Options) daemonsetFilter(pod api.Pod) (bool, *warning, *fatal) { + // Note that we return false in cases where the pod is DaemonSet managed, + // regardless of flags. We never delete them, the only question is whether + // their presence constitutes an error. + // + // TODO: Might need to revisit this. This feature is ignored for now + // The exception is for pods that are orphaned (the referencing + // management resource - including DaemonSet - is not found). + // Such pods will be deleted if --force is used. + controllerRef := o.getPodController(pod) + if controllerRef == nil || controllerRef.Kind != "DaemonSet" { + return true, nil, nil + } + if !o.IgnoreDaemonsets { + return false, nil, &fatal{daemonsetFatal} + } + return false, &warning{daemonsetWarning}, nil +} + +func mirrorPodFilter(pod api.Pod) (bool, *warning, *fatal) { + if _, found := pod.ObjectMeta.Annotations[corev1.MirrorPodAnnotationKey]; found { + return false, nil, nil + } + return true, nil, nil +} + +func hasLocalStorage(pod api.Pod) bool { + for _, volume := range pod.Spec.Volumes { + if volume.EmptyDir != nil { + return true + } + } + + return false +} + +func (o *Options) localStorageFilter(pod api.Pod) (bool, *warning, *fatal) { + if !hasLocalStorage(pod) { + return true, nil, nil + } + if !o.DeleteLocalData { + return false, nil, &fatal{localStorageFatal} + } + return true, &warning{localStorageWarning}, nil +} + +// Map of status message to a list of pod names having that status. +type podStatuses map[string][]string + +func (ps podStatuses) Message() string { + msgs := []string{} + + for key, pods := range ps { + msgs = append(msgs, fmt.Sprintf("%s: %s", key, strings.Join(pods, ", "))) + } + return strings.Join(msgs, "; ") +} + +// getPodsForDeletion returns all the pods we're going to delete. If there are +// any pods preventing us from deleting, we return that list in an error. 
+func (o *Options) getPodsForDeletion() (pods []api.Pod, err error) {
+	podList, err := o.client.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{
+		FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": o.nodeName}).String()})
+	if err != nil {
+		return pods, err
+	}
+
+	ws := podStatuses{}
+	fs := podStatuses{}
+
+	for _, pod := range podList.Items {
+		podOk := true
+		for _, filt := range []podFilter{mirrorPodFilter, o.localStorageFilter, o.unreplicatedFilter, o.daemonsetFilter} {
+			filterOk, w, f := filt(pod)
+
+			podOk = podOk && filterOk
+			if w != nil {
+				ws[w.string] = append(ws[w.string], pod.Name)
+			}
+			if f != nil {
+				fs[f.string] = append(fs[f.string], pod.Name)
+			}
+		}
+		if podOk {
+			pods = append(pods, pod)
+		}
+	}
+
+	if len(fs) > 0 {
+		return []api.Pod{}, errors.New(fs.Message())
+	}
+	if len(ws) > 0 {
+		fmt.Fprintf(o.ErrOut, "WARNING: %s\n", ws.Message())
+	}
+	return pods, nil
+}
+
+func (o *Options) deletePod(pod *api.Pod) error {
+	deleteOptions := &metav1.DeleteOptions{}
+	gracePeriodSeconds := int64(0)
+	deleteOptions.GracePeriodSeconds = &gracePeriodSeconds
+
+	klog.V(3).Infof("Attempting to force-delete the pod:%q from node %q", pod.Name, o.nodeName)
+	return o.client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, deleteOptions)
+}
+
+func (o *Options) evictPod(pod *api.Pod, policyGroupVersion string) error {
+	deleteOptions := &metav1.DeleteOptions{}
+	if o.GracePeriodSeconds >= 0 {
+		gracePeriodSeconds := int64(o.GracePeriodSeconds)
+		deleteOptions.GracePeriodSeconds = &gracePeriodSeconds
+	}
+	eviction := &policy.Eviction{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: policyGroupVersion,
+			Kind:       EvictionKind,
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      pod.Name,
+			Namespace: pod.Namespace,
+		},
+		DeleteOptions: deleteOptions,
+	}
+	klog.V(3).Infof("Attempting to evict the pod:%q from node %q", pod.Name, o.nodeName)
+	// TODO: Remember to change the URL manipulation func when Eviction's version changes
+	return o.client.PolicyV1beta1().Evictions(eviction.Namespace).Evict(eviction)
+}
+
+// deleteOrEvictPods deletes or evicts the pods on the api server
+func (o *Options) deleteOrEvictPods(pods []api.Pod) error {
+	if len(pods) == 0 {
+		return nil
+	}
+
+	policyGroupVersion, err := SupportEviction(o.client)
+	if err != nil {
+		return err
+	}
+
+	getPodFn := func(namespace, name string) (*api.Pod, error) {
+		return o.client.CoreV1().Pods(namespace).Get(name, metav1.GetOptions{})
+	}
+
+	attemptEvict := !o.ForceDeletePods && len(policyGroupVersion) > 0
+
+	return o.evictPods(attemptEvict, pods, policyGroupVersion, getPodFn)
+}
+
+func volIsPvc(vol *corev1.Volume) bool {
+	return vol.PersistentVolumeClaim != nil
+}
+
+func filterPodsWithPv(pods []api.Pod) ([]*api.Pod, []*api.Pod) {
+	podsWithPv, podsWithoutPv := []*api.Pod{}, []*api.Pod{}
+
+	for i := range pods {
+		hasPv := false
+		pod := &pods[i]
+		vols := pod.Spec.Volumes
+		for k := range vols {
+			vol := &vols[k]
+			hasPv = volIsPvc(vol)
+			if hasPv {
+				podsWithPv = append(podsWithPv, pod)
+				// No need to process rest of the volumes
+				break
+			}
+		}
+		if !hasPv {
+			podsWithoutPv = append(podsWithoutPv, pod)
+		}
+	}
+	return podsWithPv, podsWithoutPv
+}
+
+func (o *Options) getTerminationGracePeriod(pod *api.Pod) time.Duration {
+	if pod == nil || pod.Spec.TerminationGracePeriodSeconds == nil {
+		return time.Duration(0)
+	}
+
+	return time.Duration(*pod.Spec.TerminationGracePeriodSeconds) * time.Second
+}
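The next helper folds these per-pod grace periods into a single upper bound. As a worked example (an in-package sketch against the constants above, not part of the patch): if the longest terminationGracePeriodSeconds among the PV-less pods is 60s, the drain waits at most 60s plus the fixed 3-minute PodsWithoutPVDrainGracePeriod:

	// Illustrative sketch mirroring getGlobalTimeoutForPodsWithoutPV below.
	func exampleGlobalTimeout() time.Duration {
		tgpsMax := 60 * time.Second                    // longest grace period among the pods
		return tgpsMax + PodsWithoutPVDrainGracePeriod // 60s + 3m = 4m
	}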
+func (o *Options) getGlobalTimeoutForPodsWithoutPV(pods []*api.Pod) time.Duration {
+	var tgpsMax time.Duration
+	for _, pod := range pods {
+		tgps := o.getTerminationGracePeriod(pod)
+		if tgps > tgpsMax {
+			tgpsMax = tgps
+		}
+	}
+
+	return tgpsMax + PodsWithoutPVDrainGracePeriod
+}
+
+func (o *Options) evictPods(attemptEvict bool, pods []api.Pod, policyGroupVersion string, getPodFn func(namespace, name string) (*api.Pod, error)) error {
+	returnCh := make(chan error, len(pods))
+	defer close(returnCh)
+
+	if o.ForceDeletePods {
+		podsToDrain := make([]*api.Pod, len(pods))
+		for i := range pods {
+			podsToDrain[i] = &pods[i]
+		}
+
+		klog.V(3).Infof("Forceful eviction of pods on the node: %q", o.nodeName)
+
+		// evict all pods in parallel without waiting for pods or volume detachment
+		go o.evictPodsWithoutPv(attemptEvict, podsToDrain, policyGroupVersion, getPodFn, returnCh)
+	} else {
+		podsWithPv, podsWithoutPv := filterPodsWithPv(pods)
+
+		klog.V(3).Infof("Normal eviction of pods on the node: %q", o.nodeName)
+
+		// evict all pods without PV in parallel and with PV in serial (waiting for vol detachment)
+		go o.evictPodsWithPv(attemptEvict, podsWithPv, policyGroupVersion, getPodFn, returnCh)
+		go o.evictPodsWithoutPv(attemptEvict, podsWithoutPv, policyGroupVersion, getPodFn, returnCh)
+	}
+
+	doneCount := 0
+	var errors []error
+
+	numPods := len(pods)
+	for doneCount < numPods {
+		err := <-returnCh
+		doneCount++
+		if err != nil {
+			errors = append(errors, err)
+		}
+	}
+	return utilerrors.NewAggregate(errors)
+}
+
+func (o *Options) evictPodsWithoutPv(attemptEvict bool, pods []*corev1.Pod,
+	policyGroupVersion string,
+	getPodFn func(namespace, name string) (*api.Pod, error),
+	returnCh chan error,
+) {
+	for _, pod := range pods {
+		go o.evictPodWithoutPVInternal(attemptEvict, pod, policyGroupVersion, getPodFn, returnCh)
+	}
+	return
+}
+
+func sortPodsByPriority(pods []*corev1.Pod) {
+	// Treat a nil Spec.Priority as zero to avoid a nil-pointer dereference.
+	prio := func(p *corev1.Pod) int32 {
+		if p.Spec.Priority != nil {
+			return *p.Spec.Priority
+		}
+		return 0
+	}
+	sort.Slice(pods, func(i, j int) bool { return prio(pods[i]) > prio(pods[j]) })
+}
+
+// doAccountingOfPvs returns a map with the pod's namespace/name as key and the IDs of its attached volumes as value
+func (o *Options) doAccountingOfPvs(pods []*corev1.Pod) map[string][]string {
+	volMap := make(map[string][]string)
+	pvMap := make(map[string][]string)
+
+	for _, pod := range pods {
+		podPVs, _ := o.getPvs(pod)
+		pvMap[pod.Namespace+"/"+pod.Name] = podPVs
+	}
+	klog.V(4).Info("PV map: ", pvMap)
+
+	filterSharedPVs(pvMap)
+
+	for i := range pvMap {
+		pvList := pvMap[i]
+		vols, err := o.getVolIDsFromDriver(pvList)
+		if err != nil {
+			// In case of error, log and skip this set of volumes
+			klog.Errorf("Error getting volume ID from cloud provider. Skipping volumes for pod: %v. Err: %v", i, err)
+			continue
+		}
+		volMap[i] = vols
+	}
+	klog.V(4).Info("Volume map: ", volMap)
+	return volMap
+}
+
+// filterSharedPVs filters out the PVs that are shared among pods.
+func filterSharedPVs(pvMap map[string][]string) {
+	sharedVol := make(map[string]bool)
+	sharedVolumesFound := false
+
+	// Create hash map of volumes:
+	// Key: volume name
+	// Value: 'true' if any other pod shares this volume, else 'false'
+	for _, vols := range pvMap {
+		for _, vol := range vols {
+			if _, ok := sharedVol[vol]; !ok {
+				sharedVol[vol] = false
+			} else {
+				sharedVol[vol] = true
+				sharedVolumesFound = true
+			}
+		}
+	}
+
+	if !sharedVolumesFound {
+		klog.V(4).Info("No shared volumes found.")
+		return
+	}
+
+	// Recreate the values of pvMap. Append a volume only if it is not shared.
+	for pod, vols := range pvMap {
+		volList := []string{}
+		for _, vol := range vols {
+			if !sharedVol[vol] {
+				volList = append(volList, vol)
+			}
+		}
+		pvMap[pod] = volList
+	}
+	klog.V(3).Info("Removed shared volumes. Filtered list of pods with volumes: ", pvMap)
+}
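To make the shared-volume filtering concrete, a small in-package sketch (hypothetical pod and volume names, not part of the patch) of what filterSharedPVs does to its input:

	func exampleFilterSharedPVs() {
		pvMap := map[string][]string{
			"ns/pod-a": {"pv-exclusive-1", "pv-shared"},
			"ns/pod-b": {"pv-shared", "pv-exclusive-2"},
		}
		filterSharedPVs(pvMap)
		// "pv-shared" appears under both pods, so it is dropped from both:
		// map[ns/pod-a:[pv-exclusive-1] ns/pod-b:[pv-exclusive-2]]
		fmt.Println(pvMap)
	}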
+func (o *Options) evictPodsWithPv(attemptEvict bool, pods []*corev1.Pod,
+	policyGroupVersion string,
+	getPodFn func(namespace, name string) (*api.Pod, error),
+	returnCh chan error,
+) {
+	sortPodsByPriority(pods)
+
+	volMap := o.doAccountingOfPvs(pods)
+
+	var (
+		remainingPods []*api.Pod
+		fastTrack     bool
+		nretries      = int(o.MaxEvictRetries)
+	)
+
+	if attemptEvict {
+		for i := 0; i < nretries; i++ {
+			remainingPods, fastTrack = o.evictPodsWithPVInternal(attemptEvict, pods, volMap, policyGroupVersion, getPodFn, returnCh)
+			if fastTrack || len(remainingPods) == 0 {
+				// Either all pods got evicted or we need to fast-track the return (node deletion detected)
+				break
+			}
+
+			klog.V(4).Infof(
+				"Eviction/deletion for some pods will be retried after %s for node %q",
+				PodEvictionRetryInterval,
+				o.nodeName,
+			)
+			pods = remainingPods
+			time.Sleep(PodEvictionRetryInterval)
+		}
+
+		if !fastTrack && len(remainingPods) > 0 {
+			// Force delete the pods remaining after evict retries.
+			pods = remainingPods
+			remainingPods, _ = o.evictPodsWithPVInternal(false, pods, volMap, policyGroupVersion, getPodFn, returnCh)
+		}
+	} else {
+		remainingPods, _ = o.evictPodsWithPVInternal(false, pods, volMap, policyGroupVersion, getPodFn, returnCh)
+	}
+
+	// Report a final status for any pods that could not be processed: success
+	// if the node is gone (fastTrack), an eviction/deletion error otherwise.
+	for _, pod := range remainingPods {
+		klog.V(4).Infof("Reporting status for remaining pods on node %q", o.nodeName)
+		if fastTrack {
+			// This is executed when the node is not found anymore.
+			// Return success to the caller for all non-processed pods so that the caller function can move on.
+			returnCh <- nil
+		} else if attemptEvict {
+			returnCh <- fmt.Errorf("Error evicting pod %s/%s from node %q", pod.Namespace, pod.Name, pod.Spec.NodeName)
+		} else {
+			returnCh <- fmt.Errorf("Error deleting pod %s/%s from node %q", pod.Namespace, pod.Name, pod.Spec.NodeName)
+		}
+	}
+
+	return
+}
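The retry loop above is bounded by MaxEvictRetries, whose default is derived from the constants in this file. A quick sketch of the arithmetic (illustrative only, not part of the patch):

	// DefaultMachineDrainTimeout (12h) divided by PodEvictionRetryInterval (20s)
	// gives the default eviction retry budget.
	func exampleRetryBudget() int32 {
		return int32(DefaultMachineDrainTimeout.Seconds() / PodEvictionRetryInterval.Seconds()) // 2160
	}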
Error:", err) + retryPods = append(retryPods, pod) + continue + } else if apierrors.IsNotFound(err) { + klog.V(3).Info("\t", pod.Name, " from node ", pod.Spec.NodeName, " is already gone") + returnCh <- nil + continue + } else if err != nil { + klog.V(4).Infof("Error when evicting pod: %v/%v from node %v. Will be retried. Err: %v", pod.Namespace, pod.Name, pod.Spec.NodeName, err) + retryPods = append(retryPods, pod) + continue + } + + // Eviction was successful. Wait for pvs for this pod to detach + klog.V(3).Infof( + "Pod eviction/deletion for Pod %s/%s in Node %q and took %v. Now waiting for volume detachment.", + pod.Namespace, + pod.Name, + pod.Spec.NodeName, + time.Since(podEvictionStartTime), + ) + + pvs := volMap[pod.Namespace+"/"+pod.Name] + ctx, cancelFn := context.WithTimeout(mainContext, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout) + err = o.waitForDetach(ctx, pvs, o.nodeName) + cancelFn() + + if apierrors.IsNotFound(err) { + klog.V(3).Info("Node not found anymore") + returnCh <- nil + return append(retryPods, pods[i+1:]...), true + } else if err != nil { + klog.Errorf("Error when waiting for volume to detach from node. Err: %v", err) + returnCh <- err + continue + } + klog.V(3).Infof( + "Volume detached for Pod %s/%s in Node %q and took %v (including pod eviction/deletion time).", + pod.Namespace, + pod.Name, + pod.Spec.NodeName, + time.Since(podEvictionStartTime), + ) + returnCh <- nil + } + + return retryPods, false +} + +func (o *Options) getPvs(pod *corev1.Pod) ([]string, error) { + pvs := []string{} + for i := range pod.Spec.Volumes { + vol := &pod.Spec.Volumes[i] + + if vol.PersistentVolumeClaim != nil { + try := 0 + + for { + pvc, err := o.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(vol.PersistentVolumeClaim.ClaimName) + + if apierrors.IsNotFound(err) { + // If this PVC is not found, move on to the next PVC + break + } else if err != nil { + try++ + + if try == GetPvDetailsMaxRetries { + // Log warning, and skip trying this volume anymore + klog.Errorf("Error getting PVC. Err: %v", err) + break + } + // In case of error, try again after few seconds + time.Sleep(GetPvDetailsRetryInterval) + continue + } + + // Found PVC; append and exit + pvs = append(pvs, pvc.Spec.VolumeName) + break + } + } + } + return pvs, nil +} + +func (o *Options) waitForDetach(ctx context.Context, volumeIDs []string, nodeName string) error { + if volumeIDs == nil || len(volumeIDs) == 0 || nodeName == "" { + // If volume or node name is not available, nothing to do. Just log this as warning + klog.Warningf("Node name: %q, list of pod PVs to wait for detach: %v", nodeName, volumeIDs) + return nil + } + + klog.V(4).Info("Waiting for following volumes to detach: ", volumeIDs) + + found := true + + for found { + select { + case <-ctx.Done(): + klog.Warningf("Timeout occurred while waiting for PVs to detach from node %q", nodeName) + return fmt.Errorf("Timeout while waiting for PVs to detach from node") + default: + } + + found = false + + node, err := o.client.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + klog.V(4).Info("Node not found: ", nodeName) + return err + } else if err != nil { + klog.Errorf("Error getting details for node: %q. 
Err: %v", nodeName, err) + return err + } + + klog.V(4).Infof("No of attached volumes for node %q is %s", nodeName, node.Status.VolumesAttached) + attachedVols := node.Status.VolumesAttached + if len(attachedVols) == 0 { + klog.V(4).Infof("No volumes attached to the node %q", nodeName) + return nil + } + + LookUpVolume: + for i := range volumeIDs { + volumeID := &volumeIDs[i] + + for j := range attachedVols { + attachedVol := &attachedVols[j] + + found, _ = regexp.MatchString(*volumeID, string(attachedVol.Name)) + + if found { + klog.V(4).Infof( + "Found volume:%s still attached to node %q. Will re-check in %s", + *volumeID, + nodeName, + VolumeDetachPollInterval, + ) + time.Sleep(VolumeDetachPollInterval) + break LookUpVolume + } + } + } + } + + klog.V(4).Infof("Detached volumes:%s from node %q", volumeIDs, nodeName) + return nil +} + +func (o *Options) getVolIDsFromDriver(pvNames []string) ([]string, error) { + pvSpecs := []*corev1.PersistentVolumeSpec{} + + for _, pvName := range pvNames { + try := 0 + + for { + pv, err := o.pvLister.Get(pvName) + + if apierrors.IsNotFound(err) { + break + } else if err != nil { + try++ + if try == GetPvDetailsMaxRetries { + break + } + // In case of error, try again after few seconds + time.Sleep(GetPvDetailsRetryInterval) + continue + } + + // Found PV; append and exit + pvSpecs = append(pvSpecs, &pv.Spec) + break + } + } + + response, err := o.Driver.GetVolumeIDs(context.TODO(), &driver.GetVolumeIDsRequest{PVSpecs: pvSpecs}) + return response.VolumeIDs, err +} + +func (o *Options) evictPodWithoutPVInternal(attemptEvict bool, pod *corev1.Pod, policyGroupVersion string, getPodFn func(namespace, name string) (*api.Pod, error), returnCh chan error) { + var err error + klog.V(3).Infof( + "Evicting pod %s/%s from node %q ", + pod.Namespace, + pod.Name, + pod.Spec.NodeName, + ) + + nretries := int(o.MaxEvictRetries) + for i := 0; ; i++ { + if i >= nretries { + attemptEvict = false + } + + if attemptEvict { + err = o.evictPod(pod, policyGroupVersion) + } else { + err = o.deletePod(pod) + } + + if err == nil { + break + } else if apierrors.IsNotFound(err) { + klog.V(3).Info("\t", pod.Name, " evicted from node ", pod.Spec.NodeName) + returnCh <- nil + return + } else if attemptEvict && apierrors.IsTooManyRequests(err) { + // Pod couldn't be evicted because of PDB violation + time.Sleep(PodEvictionRetryInterval) + } else { + returnCh <- fmt.Errorf("error when evicting pod %q: %v scheduled on node %v", pod.Name, err, pod.Spec.NodeName) + return + } + } + + if o.ForceDeletePods { + // Skip waiting for pod termination in case of forced drain + if err == nil { + returnCh <- nil + } else { + returnCh <- err + } + return + } + + podArray := []*api.Pod{pod} + + timeout := o.getTerminationGracePeriod(pod) + if timeout > o.Timeout { + klog.V(3).Infof("Overriding large termination grace period (%s) for the pod %s/%s and setting it to %s", timeout.String(), pod.Namespace, pod.Name, o.Timeout) + timeout = o.Timeout + } + + bufferPeriod := 30 * time.Second + podArray, err = o.waitForDelete(podArray, Interval, timeout+bufferPeriod, true, getPodFn) + if err == nil { + if len(podArray) > 0 { + returnCh <- fmt.Errorf("timeout expired while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName) + } else { + returnCh <- nil + } + } else { + returnCh <- fmt.Errorf("error when waiting for pod %q/%v terminating: %v", pod.Name, pod.Spec.NodeName, err) + } +} + +func (o *Options) waitForDelete(pods []*api.Pod, interval, timeout time.Duration, usingEviction 
+func (o *Options) waitForDelete(pods []*api.Pod, interval, timeout time.Duration, usingEviction bool, getPodFn func(string, string) (*api.Pod, error)) ([]*api.Pod, error) {
+	err := wait.PollImmediate(interval, timeout, func() (bool, error) {
+		pendingPods := []*api.Pod{}
+		for i, pod := range pods {
+			p, err := getPodFn(pod.Namespace, pod.Name)
+			if apierrors.IsNotFound(err) || (p != nil && p.ObjectMeta.UID != pod.ObjectMeta.UID) {
+				// The pod is gone, or it was replaced by a new pod with the same name; treat it as deleted.
+				continue
+			} else if err != nil {
+				return false, err
+			} else {
+				pendingPods = append(pendingPods, pods[i])
+			}
+		}
+		pods = pendingPods
+		if len(pendingPods) > 0 {
+			return false, nil
+		}
+		return true, nil
+	})
+	return pods, err
+}
+
+// SupportEviction uses the Discovery API to find out if the server supports the eviction subresource.
+// If supported, it returns the eviction subresource's groupVersion; otherwise it returns the empty string.
+func SupportEviction(clientset kubernetes.Interface) (string, error) {
+	discoveryClient := clientset.Discovery()
+	groupList, err := discoveryClient.ServerGroups()
+	if err != nil {
+		return "", err
+	}
+	foundPolicyGroup := false
+	var policyGroupVersion string
+	for _, group := range groupList.Groups {
+		if group.Name == "policy" {
+			foundPolicyGroup = true
+			policyGroupVersion = group.PreferredVersion.GroupVersion
+			break
+		}
+	}
+	if !foundPolicyGroup {
+		return "", nil
+	}
+	resourceList, err := discoveryClient.ServerResourcesForGroupVersion("v1")
+	if err != nil {
+		return "", err
+	}
+	for _, resource := range resourceList.APIResources {
+		if resource.Name == EvictionSubresource && resource.Kind == EvictionKind {
+			return policyGroupVersion, nil
+		}
+	}
+	return "", nil
+}
+
+// RunCordonOrUncordon runs either Cordon or Uncordon. The desired value for
+// "Unschedulable" is passed as the first arg.
+func (o *Options) RunCordonOrUncordon(desired bool) error {
+	node, err := o.client.CoreV1().Nodes().Get(o.nodeName, metav1.GetOptions{})
+	if err != nil {
+		// Deletion could be triggered when machine is just being created, no node present then
+		return nil
+	}
+	unsched := node.Spec.Unschedulable
+	if unsched == desired {
+		klog.V(3).Infof("Scheduling state for node %q is already in desired state", node.Name)
+	} else {
+		clone := node.DeepCopy()
+		clone.Spec.Unschedulable = desired
+
+		_, err = o.client.CoreV1().Nodes().Update(clone)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
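With drain.go complete, a caller drives the whole flow through NewDrainOptions followed by RunDrain. A minimal, hypothetical usage sketch (kubeClient, cloudDriver, and the two listers are assumed to be initialized elsewhere; none of these names are defined in this diff):

	drainOptions := drain.NewDrainOptions(
		kubeClient,                   // kubernetes.Interface for the target cluster
		30*time.Minute,               // overall drain timeout
		drain.DefaultMaxEvictRetries, // eviction retry budget
		2*time.Minute,                // PV detach timeout
		"node-1",                     // node to drain
		-1,                           // negative => use each pod's own grace period
		false,                        // forceDeletePods
		true,                         // ignorePodsWithoutControllers
		true,                         // ignoreDaemonsets
		true,                         // deleteLocalData
		os.Stdout, os.Stderr,
		cloudDriver, // driver.Driver implementation
		pvcLister, pvLister,
	)
	if err := drainOptions.RunDrain(); err != nil {
		klog.Errorf("drain of node-1 failed: %v", err)
	}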
diff --git a/pkg/util/provider/drain/drain_suite_test.go b/pkg/util/provider/drain/drain_suite_test.go
new file mode 100644
index 000000000..b2d299eaf
--- /dev/null
+++ b/pkg/util/provider/drain/drain_suite_test.go
@@ -0,0 +1,13 @@
+package drain_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+func TestDrain(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Drain Suite")
+}
diff --git a/pkg/util/provider/drain/drain_test.go b/pkg/util/provider/drain/drain_test.go
new file mode 100644
index 000000000..f1398d26c
--- /dev/null
+++ b/pkg/util/provider/drain/drain_test.go
@@ -0,0 +1,953 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package drain is used to drain nodes
+package drain
+
+/*
+TODO: Fix timeout issue for tests
+import (
+	"context"
+	"fmt"
+	"regexp"
+	"sync"
+	"time"
+
+	"github.com/gardener/machine-controller-manager/pkg/fakeclient"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/ginkgo/extensions/table"
+	. "github.com/onsi/gomega"
+	api "k8s.io/api/core/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/kubernetes"
+	k8stesting "k8s.io/client-go/testing"
+)
+
+var _ = Describe("drain", func() {
+	const testNodeName = "node"
+	const terminationGracePeriodShort = 5 * time.Second
+	const terminationGracePeriodShortBy4 = terminationGracePeriodShort / 4
+	const terminationGracePeriodShortBy8 = terminationGracePeriodShort / 8
+	const terminationGracePeriodMedium = 10 * time.Second
+	const terminationGracePeriodDefault = 20 * time.Second
+	const terminationGracePeriodLong = 2 * time.Minute
+	const testNamespace = "test"
+
+	type stats struct {
+		nPodsWithoutPV                int
+		nPodsWithOnlyExclusivePV      int
+		nPodsWithOnlySharedPV         int
+		nPodsWithExclusiveAndSharedPV int
+	}
+	type setup struct {
+		stats
+		attemptEviction        bool
+		maxEvictRetries        int32
+		terminationGracePeriod time.Duration
+		force                  bool
+		evictError             error
+		deleteError            error
+	}
+
+	type expectation struct {
+		stats
+		timeout          time.Duration
+		drainTimeout     bool
+		drainError       error
+		nEvictions       int
+		minDrainDuration time.Duration
+	}
+
+	type podDrainHandler func(client kubernetes.Interface, pod *api.Pod, detachExclusiveVolumesCh chan<- *api.Pod) error
+
+	run := func(setup *setup, podDrainHandlers []podDrainHandler, expected *expectation) {
+		stop := make(chan struct{})
+		defer close(stop)
+
+		wg := sync.WaitGroup{}
+
+		podsWithoutPV := getPodsWithoutPV(setup.nPodsWithoutPV, testNamespace, "nopv-", testNodeName, setup.terminationGracePeriod, map[string]string{
+			"volumes": "none",
+		})
+		podsWithOnlyExclusivePV := getPodsWithPV(setup.nPodsWithOnlyExclusivePV, setup.nPodsWithOnlyExclusivePV, 0, testNamespace, "expv-", "expv-", "", testNodeName, setup.terminationGracePeriod, map[string]string{
+			"volumes": "only-exclusive",
+		})
+		podsWithOnlySharedPV := getPodsWithPV(setup.nPodsWithOnlySharedPV, 0, setup.nPodsWithOnlySharedPV/2, testNamespace, "shpv-", "", "shpv-", testNodeName, setup.terminationGracePeriod, map[string]string{
+			"volumes": "only-shared",
+		})
+		nPodsWithExclusiveAndSharedPV := getPodsWithPV(setup.nPodsWithExclusiveAndSharedPV, setup.nPodsWithExclusiveAndSharedPV, setup.nPodsWithExclusiveAndSharedPV/2, testNamespace, "exshpv-", "exshexpv-", "exshshpv-", testNodeName, setup.terminationGracePeriod, map[string]string{
+			"volumes": "exclusive-and-shared",
+		})
+
+		var pods []*api.Pod
+		pods = append(pods, podsWithoutPV...)
+		pods = append(pods, podsWithOnlyExclusivePV...)
+		pods = append(pods, podsWithOnlySharedPV...)
+		pods = append(pods, nPodsWithExclusiveAndSharedPV...)
+ + pvcs := getPVCs(pods) + pvs := getPVs(pvcs) + nodes := []*corev1.Node{getNode(testNodeName, pvs)} + + var targetCoreObjects []runtime.Object + targetCoreObjects = appendPods(targetCoreObjects, pods) + targetCoreObjects = appendPVCs(targetCoreObjects, pvcs) + targetCoreObjects = appendPVs(targetCoreObjects, pvs) + targetCoreObjects = appendNodes(targetCoreObjects, nodes) + fakeTargetCoreClient, fakePVLister, fakePVCLister, tracker := createFakeController( + stop, testNamespace, targetCoreObjects, + ) + defer tracker.Stop() + + //Expect(cache.WaitForCacheSync(stop, fakePVCLister)).To(BeTrue()) + + //fakeDriver := driver.NewFakeDriver(driver.FakeDriver{Err: nil,}) + maxEvictRetries := setup.maxEvictRetries + if maxEvictRetries <= 0 { + maxEvictRetries = 3 + } + d := &Options{ + DeleteLocalData: true, + Driver: &drainDriver{}, + ErrOut: GinkgoWriter, + ForceDeletePods: setup.force, + IgnorePodsWithoutControllers: true, + GracePeriodSeconds: 30, + IgnoreDaemonsets: true, + MaxEvictRetries: maxEvictRetries, + Out: GinkgoWriter, + PvDetachTimeout: 3 * time.Minute, + Timeout: time.Minute, + client: fakeTargetCoreClient, + nodeName: testNodeName, + pvLister: fakePVLister, + pvcLister: fakePVCLister, + } + + // Get the pod directly from the ObjectTracker to avoid locking issues in the Fake object. + getPod := func(gvr schema.GroupVersionResource, ns, name string) (*api.Pod, error) { + ro, err := tracker.Get(gvr, ns, name) + if err != nil { + return nil, err + } + + return ro.(*api.Pod), nil + } + + // Serialize volume detachment to avoid concurrency issues during node update. + detachExclusiveVolumesCh := make(chan *api.Pod) + defer close(detachExclusiveVolumesCh) + + runPodDrainHandlers := func(pod *api.Pod) { + var err error + for _, handler := range podDrainHandlers { + err = handler(d.client, pod, detachExclusiveVolumesCh) + if err != nil { + break + } + } + + if err != nil { + fmt.Fprintf(GinkgoWriter, "Error simulating eviction for the pod %s/%s: %s", pod.Namespace, pod.Name, err) + } + } + + // Serialize volume detachment to avoid concurrency issues during node update. 
+		go func() {
+			for pod := range detachExclusiveVolumesCh {
+				nodes := d.client.CoreV1().Nodes()
+				node, err := nodes.Get(pod.Spec.NodeName, metav1.GetOptions{})
+				if err != nil {
+					fmt.Fprintln(GinkgoWriter, err)
+					continue
+				}
+
+				node = node.DeepCopy()
+				nodeUpdateRequired := false
+				{
+					remainingVolumesAttached := []corev1.AttachedVolume{}
+					pvcs := getPVCs([]*api.Pod{pod})
+					pvs := getPVs(pvcs)
+					for i := range node.Status.VolumesAttached {
+						va := &node.Status.VolumesAttached[i]
+						if matched, err := regexp.Match("expv-", []byte(va.Name)); err != nil || !matched {
+							// Detach only exclusive volumes
+							remainingVolumesAttached = append(remainingVolumesAttached, *va)
+							continue
+						}
+
+						found := false
+						for _, pv := range pvs {
+							if va.Name == corev1.UniqueVolumeName(getDrainTestVolumeName(&pv.Spec)) {
+								found = true
+								break
+							}
+						}
+						if !found {
+							remainingVolumesAttached = append(remainingVolumesAttached, *va)
+						}
+					}
+					if nodeUpdateRequired = len(remainingVolumesAttached) != len(node.Status.VolumesAttached); nodeUpdateRequired {
+						node.Status.VolumesAttached = remainingVolumesAttached
+					}
+				}
+
+				if !nodeUpdateRequired {
+					continue
+				}
+
+				_, err = nodes.Update(node)
+				fmt.Fprintln(GinkgoWriter, err)
+			}
+		}()
+
+		ctx, cancelCtx := context.WithTimeout(context.Background(), expected.timeout)
+		defer cancelCtx()
+
+		nEvictions := 0
+		if setup.attemptEviction {
+			fakeTargetCoreClient := fakeTargetCoreClient.(*fakeclient.Clientset)
+			fakeTargetCoreClient.FakeDiscovery.Resources = []*metav1.APIResourceList{
+				{
+					GroupVersion: "policy/v1",
+				},
+				{
+					GroupVersion: "v1",
+					APIResources: []metav1.APIResource{
+						{
+							Name: EvictionSubresource,
+							Kind: EvictionKind,
+						},
+					},
+				},
+			}
+
+			// Fake eviction
+			fakeTargetCoreClient.PrependReactor("post", "pods", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
+				if setup.evictError != nil {
+					return true, nil, setup.evictError
+				}
+
+				start := time.Now()
+				switch ga := action.(type) {
+				case k8stesting.GetAction:
+					if ga.GetSubresource() != "eviction" {
+						return
+					}
+
+					var pod *api.Pod
+					pod, err = getPod(action.GetResource(), ga.GetNamespace(), ga.GetName())
+					if err != nil {
+						return
+					}
+
+					// Delete the pod asynchronously to work around the lock problems in testing.Fake
+					wg.Add(1)
+					go func() {
+						defer wg.Done()
+						runPodDrainHandlers(pod)
+						fmt.Fprintf(GinkgoWriter, "Drained pod %s/%s in %s\n", pod.Namespace, pod.Name, time.Since(start).String())
+					}()
+
+					nEvictions++
+					return
+				default:
+					err = fmt.Errorf("Expected type k8stesting.GetAction but got %T", action)
+					return
+				}
+			})
+		} else {
+			// Work-around: Use a non-handling reactor in place of watch (because watch is not working).
+			fakeTargetCoreClient := fakeTargetCoreClient.(*fakeclient.Clientset)
+			fakeTargetCoreClient.PrependReactor("delete", "pods", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
+				if setup.deleteError != nil {
+					return true, nil, setup.deleteError
+				}
+
+				start := time.Now()
+				switch ga := action.(type) {
+				case k8stesting.DeleteAction:
+					var pod *api.Pod
+					pod, err = getPod(action.GetResource(), ga.GetNamespace(), ga.GetName())
+					if err != nil {
+						return
+					}
+
+					// Delete the pod asynchronously to work around the lock problems in testing.Fake
+					wg.Add(1)
+					go func() {
+						defer wg.Done()
+						runPodDrainHandlers(pod)
+						fmt.Fprintf(GinkgoWriter, "Drained pod %s/%s in %s\n", pod.Namespace, pod.Name, time.Since(start).String())
+					}()
+				default:
+					err = fmt.Errorf("Expected type k8stesting.DeleteAction but got %T", action)
+				}
+
+				return
+			})
+		}
+
+		var drainErr error
+		var drainStart, drainEnd *time.Time
+		go func() {
+			start := time.Now()
+			drainStart = &start
+			drainErr = d.RunDrain()
+			end := time.Now()
+			drainEnd = &end
+			cancelCtx()
+		}()
+
+		// Wait for the context to complete or timeout.
+		<-ctx.Done()
+
+		if expected.drainTimeout {
+			Expect(ctx.Err()).To(Equal(context.DeadlineExceeded))
+
+			// TODO Find a way to validate rest of the details in case of an expected timeout.
+			return
+		}
+
+		Expect(ctx.Err()).ToNot(Equal(context.DeadlineExceeded))
+
+		if expected.drainError == nil {
+			Expect(drainErr).ShouldNot(HaveOccurred())
+		} else {
+			Expect(drainErr).To(Equal(expected.drainError))
+		}
+
+		wg.Wait()
+
+		Expect(nEvictions).To(Equal(expected.nEvictions))
+
+		if expected.minDrainDuration > 0 {
+			Expect(drainStart).ToNot(BeNil())
+			Expect(drainEnd).ToNot(BeNil())
+			Expect(drainEnd.Sub(*drainStart)).To(BeNumerically(">=", expected.minDrainDuration))
+		}
+
+		validatePodCount := func(labelSelector string, nExpected int) {
+			podList, err := d.client.CoreV1().Pods(testNamespace).List(metav1.ListOptions{LabelSelector: labelSelector})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(podList).ToNot(BeNil())
+			Expect(podList.Items).To(HaveLen(nExpected))
+		}
+
+		validatePodCount("volumes=none", expected.nPodsWithoutPV)
+		validatePodCount("volumes=only-exclusive", expected.nPodsWithOnlyExclusivePV)
+		validatePodCount("volumes=only-shared", expected.nPodsWithOnlySharedPV)
+		validatePodCount("volumes=exclusive-and-shared", expected.nPodsWithExclusiveAndSharedPV)
+	}
+
+	sleepFor := func(d time.Duration) podDrainHandler {
+		return func(client kubernetes.Interface, pod *api.Pod, detachExclusiveVolumesCh chan<- *api.Pod) error {
+			time.Sleep(d)
+			return nil
+		}
+	}
+
+	deletePod := func(client kubernetes.Interface, pod *api.Pod, detachExclusiveVolumesCh chan<- *api.Pod) error {
+		return client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, nil)
+	}
+
+	detachExclusiveVolumes := func(client kubernetes.Interface, pod *api.Pod, detachExclusiveVolumesCh chan<- *api.Pod) error {
+		detachExclusiveVolumesCh <- pod
+		return nil
+	}
+
+	DescribeTable("RunDrain", run,
+		Entry("Successful drain without support for eviction of pods without volume",
+			&setup{
+				stats: stats{
+					nPodsWithoutPV:                10,
+					nPodsWithOnlyExclusivePV:      0,
+					nPodsWithOnlySharedPV:         0,
+					nPodsWithExclusiveAndSharedPV: 0,
+				},
+				attemptEviction:        false,
+				terminationGracePeriod: terminationGracePeriodShort,
+			},
+			nil,
+			&expectation{
+				stats: stats{
+					nPodsWithoutPV:                0,
+					nPodsWithOnlyExclusivePV:      0,
+					nPodsWithOnlySharedPV:         0,
+					nPodsWithExclusiveAndSharedPV: 0,
+				},
+				timeout:      terminationGracePeriodShort,
+				drainTimeout:
false, + drainError: nil, + nEvictions: 0, + minDrainDuration: 0, + }), + Entry("Successful drain with support for eviction of pods without volume", + &setup{ + stats: stats{ + nPodsWithoutPV: 10, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: true, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{deletePod}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDelete polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodMedium, + drainTimeout: false, + drainError: nil, + nEvictions: 10, + // Because waitForDelete polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodShort, + }), + Entry("Successful drain without support for eviction of pods with exclusive volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 2, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: false, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodDefault, + drainTimeout: false, + drainError: nil, + nEvictions: 0, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodMedium, + }), + Entry("Successful drain with support for eviction of pods with exclusive volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 2, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: true, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{deletePod, sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodDefault, + drainTimeout: false, + drainError: nil, + nEvictions: 2, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodMedium, + }), + Entry("Successful drain without support for eviction of pods with shared volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 2, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: false, + terminationGracePeriod: terminationGracePeriodShort, + }, + nil, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + timeout: terminationGracePeriodShort, + drainTimeout: false, + drainError: nil, + nEvictions: 0, + minDrainDuration: 0, + }), + Entry("Successful drain with support for eviction of pods with shared volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 2, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: true, + 
terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{sleepFor(terminationGracePeriodShortBy4), deletePod}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + timeout: terminationGracePeriodShort, + drainTimeout: false, + drainError: nil, + nEvictions: 2, + minDrainDuration: 0, + }), + Entry("Successful drain without support for eviction of pods with exclusive and shared volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 2, + }, + attemptEviction: false, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodDefault, + drainTimeout: false, + drainError: nil, + nEvictions: 0, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodMedium, + }), + Entry("Successful drain with support for eviction of pods with exclusive and shared volumes", + &setup{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 2, + }, + attemptEviction: true, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{deletePod, sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodDefault, + drainTimeout: false, + drainError: nil, + nEvictions: 2, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodMedium, + }), + Entry("Successful drain without support for eviction of pods with and without volume", + &setup{ + stats: stats{ + nPodsWithoutPV: 10, + nPodsWithOnlyExclusivePV: 2, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: false, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + nPodsWithoutPV: 0, + nPodsWithOnlyExclusivePV: 0, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + timeout: terminationGracePeriodDefault, + drainTimeout: false, + drainError: nil, + nEvictions: 0, + // Because waitForDetach polling Interval is equal to terminationGracePeriodShort + minDrainDuration: terminationGracePeriodMedium, + }), + Entry("Successful drain with support for eviction of pods with and without volume", + &setup{ + stats: stats{ + nPodsWithoutPV: 10, + nPodsWithOnlyExclusivePV: 2, + nPodsWithOnlySharedPV: 0, + nPodsWithExclusiveAndSharedPV: 0, + }, + attemptEviction: true, + terminationGracePeriod: terminationGracePeriodShort, + }, + []podDrainHandler{deletePod, sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes}, + &expectation{ + stats: stats{ + 
nPodsWithoutPV: 0,
+				nPodsWithOnlyExclusivePV: 0,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
+			timeout: terminationGracePeriodDefault,
+			drainTimeout: false,
+			drainError: nil,
+			nEvictions: 12,
+			// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
+			minDrainDuration: terminationGracePeriodMedium,
+		}),
+	Entry("Successful forced drain without support for eviction of pods with and without volume",
+		&setup{
+			stats: stats{
+				nPodsWithoutPV: 10,
+				nPodsWithOnlyExclusivePV: 2,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			attemptEviction: false,
+			terminationGracePeriod: terminationGracePeriodShort,
+			force: true,
+		},
+		nil,
+		&expectation{
+			stats: stats{
+				nPodsWithoutPV: 0,
+				nPodsWithOnlyExclusivePV: 0,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			timeout: terminationGracePeriodShort,
+			drainTimeout: false,
+			drainError: nil,
+			nEvictions: 0,
+			minDrainDuration: 0,
+		}),
+	Entry("Successful forced drain with support for eviction of pods with and without volume",
+		&setup{
+			stats: stats{
+				nPodsWithoutPV: 10,
+				nPodsWithOnlyExclusivePV: 2,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			attemptEviction: true,
+			terminationGracePeriod: terminationGracePeriodShort,
+			force: true,
+		},
+		[]podDrainHandler{deletePod},
+		&expectation{
+			stats: stats{
+				nPodsWithoutPV: 0,
+				nPodsWithOnlyExclusivePV: 0,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			timeout: terminationGracePeriodShort,
+			drainTimeout: false,
+			drainError: nil,
+			nEvictions: 0,
+			minDrainDuration: 0,
+		}),
+	Entry("Successful forced drain with support for eviction of pods with and without volume when eviction fails",
+		&setup{
+			stats: stats{
+				nPodsWithoutPV: 10,
+				nPodsWithOnlyExclusivePV: 2,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			maxEvictRetries: 1,
+			attemptEviction: true,
+			terminationGracePeriod: terminationGracePeriodShort,
+			force: true,
+			evictError: apierrors.NewTooManyRequestsError(""),
+		},
+		nil,
+		&expectation{
+			stats: stats{
+				nPodsWithoutPV: 0,
+				nPodsWithOnlyExclusivePV: 0,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			timeout: terminationGracePeriodMedium,
+			drainTimeout: false,
+			drainError: nil,
+			nEvictions: 0,
+			minDrainDuration: 0,
+		}),
+	Entry("Successful drain for pods with long termination grace period",
+		&setup{
+			stats: stats{
+				nPodsWithoutPV: 10,
+				nPodsWithOnlyExclusivePV: 2,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			attemptEviction: true,
+			terminationGracePeriod: terminationGracePeriodLong,
+		},
+		[]podDrainHandler{deletePod, sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes},
+		&expectation{
+			stats: stats{
+				nPodsWithoutPV: 0,
+				nPodsWithOnlyExclusivePV: 0,
+				nPodsWithOnlySharedPV: 0,
+				nPodsWithExclusiveAndSharedPV: 0,
+			},
+			// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
+			timeout: terminationGracePeriodLong,
+			drainTimeout: false,
+			drainError: nil,
+			nEvictions: 12,
+			// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
+			minDrainDuration: terminationGracePeriodMedium,
+		}),
+	)
+})
+
+func getPodWithoutPV(ns, name, nodeName string, terminationGracePeriod time.Duration, labels map[string]string) *corev1.Pod {
+	controller := true
+	priority := int32(0)
+	tgps := int64(terminationGracePeriod / time.Second)
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+			Namespace: ns,
+			Labels: labels,
+			OwnerReferences: []metav1.OwnerReference{
+				{Controller: &controller},
+			},
+		},
+		Spec: corev1.PodSpec{
+			NodeName: nodeName,
+			TerminationGracePeriodSeconds: &tgps,
+			Priority: &priority,
+		},
+	}
+}
+
+func getPodsWithoutPV(n int, ns, podPrefix, nodeName string, terminationGracePeriod time.Duration, labels map[string]string) []*corev1.Pod {
+	pods := make([]*corev1.Pod, n)
+	for i := range pods {
+		pods[i] = getPodWithoutPV(ns, fmt.Sprintf("%s%d", podPrefix, i), nodeName, terminationGracePeriod, labels)
+	}
+	return pods
+}
+
+func getPodWithPV(ns, name, exclusivePV, sharedPV, nodeName string, terminationGracePeriod time.Duration, labels map[string]string) *corev1.Pod {
+	pod := getPodWithoutPV(ns, name, nodeName, terminationGracePeriod, labels)
+
+	appendVolume := func(pod *corev1.Pod, vol string) {
+		pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
+			Name: vol,
+			VolumeSource: corev1.VolumeSource{
+				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+					ClaimName: vol,
+				},
+			},
+		})
+	}
+
+	if exclusivePV != "" {
+		appendVolume(pod, exclusivePV)
+	}
+	if sharedPV != "" {
+		appendVolume(pod, sharedPV)
+	}
+	return pod
+}
+
+func getPodsWithPV(nPod, nExclusivePV, nSharedPV int, ns, podPrefix, exclusivePVPrefix, sharedPVPrefix, nodeName string, terminationGracePeriod time.Duration, labels map[string]string) []*corev1.Pod {
+	pods := make([]*corev1.Pod, nPod)
+	for i := range pods {
+		var (
+			podName = fmt.Sprintf("%s%d", podPrefix, i)
+			exclusivePV string
+			sharedPV string
+		)
+		if nExclusivePV > 0 {
+			exclusivePV = fmt.Sprintf("%s%d", exclusivePVPrefix, i%nExclusivePV)
+		}
+		if nSharedPV > 0 {
+			sharedPV = fmt.Sprintf("%s%d", sharedPVPrefix, i%nSharedPV)
+		}
+		pods[i] = getPodWithPV(ns, podName, exclusivePV, sharedPV, nodeName, terminationGracePeriod, labels)
+	}
+	return pods
+}
+
+func getPVCs(pods []*corev1.Pod) []*corev1.PersistentVolumeClaim {
+	m := make(map[string]*corev1.PersistentVolumeClaim)
+	for _, pod := range pods {
+		for i := range pod.Spec.Volumes {
+			vol := &pod.Spec.Volumes[i]
+			if vol.PersistentVolumeClaim != nil {
+				pvc := vol.PersistentVolumeClaim
+
+				if _, ok := m[pvc.ClaimName]; ok {
+					continue
+				}
+
+				m[pvc.ClaimName] = &corev1.PersistentVolumeClaim{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: pvc.ClaimName,
+						Namespace: pod.Namespace,
+					},
+					Spec: corev1.PersistentVolumeClaimSpec{
+						VolumeName: pvc.ClaimName,
+					},
+				}
+			}
+		}
+	}
+
+	pvcs := make([]*corev1.PersistentVolumeClaim, len(m))
+	var i = 0
+	for _, pvc := range m {
+		pvcs[i] = pvc
+		i++
+	}
+	return pvcs
+}
+
+func getPVs(pvcs []*corev1.PersistentVolumeClaim) []*corev1.PersistentVolume {
+	m := make(map[string]*corev1.PersistentVolume)
+	for _, pvc := range pvcs {
+		if _, ok := m[pvc.Spec.VolumeName]; ok {
+			continue
+		}
+
+		m[pvc.Spec.VolumeName] = &corev1.PersistentVolume{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: pvc.Spec.VolumeName,
+			},
+			Spec: corev1.PersistentVolumeSpec{
+				PersistentVolumeSource: corev1.PersistentVolumeSource{
+					CSI: &corev1.CSIPersistentVolumeSource{
+						VolumeHandle: pvc.Spec.VolumeName,
+					},
+				},
+			},
+		}
+	}
+
+	pvs := make([]*corev1.PersistentVolume, len(m))
+	var i = 0
+	for _, pv := range m {
+		pvs[i] = pv
+		i++
+	}
+	return pvs
+}
+
+func getNode(name string, pvs []*corev1.PersistentVolume) *corev1.Node {
+	n := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+		},
+	}
+
+	vols := make([]corev1.AttachedVolume, len(pvs))
+	for i, pv := range pvs {
+		vols[i] = corev1.AttachedVolume{
+			Name: corev1.UniqueVolumeName(getDrainTestVolumeName(&pv.Spec)),
+		}
+	}
+
+	n.Status.VolumesAttached = vols
+
+	return n
+}
+
+func getDrainTestVolumeName(pvSpec *corev1.PersistentVolumeSpec) string {
+	if pvSpec.CSI == nil {
+		return ""
+	}
+	return pvSpec.CSI.VolumeHandle
+}
+
+type drainDriver struct {
+	*driver.FakeDriver
+}
+
+func (d *drainDriver) GetVolumeIDs(ctx context.Context, req *driver.GetVolumeIDsRequest) (*driver.GetVolumeIDsResponse, error) {
+	volNames := make([]string, len(req.PVSpecs))
+	for i := range req.PVSpecs {
+		volNames[i] = getDrainTestVolumeName(req.PVSpecs[i])
+	}
+	return &driver.GetVolumeIDsResponse{
+		VolumeIDs: volNames,
+	}, nil
+}
+
+func appendPods(objects []runtime.Object, pods []*corev1.Pod) []runtime.Object {
+	for _, pod := range pods {
+		objects = append(objects, pod)
+	}
+	return objects
+}
+
+func appendPVCs(objects []runtime.Object, pvcs []*corev1.PersistentVolumeClaim) []runtime.Object {
+	for _, pvc := range pvcs {
+		objects = append(objects, pvc)
+	}
+	return objects
+}
+
+func appendPVs(objects []runtime.Object, pvs []*corev1.PersistentVolume) []runtime.Object {
+	for _, pv := range pvs {
+		objects = append(objects, pv)
+	}
+	return objects
+}
+
+func appendNodes(objects []runtime.Object, nodes []*corev1.Node) []runtime.Object {
+	for _, n := range nodes {
+		objects = append(objects, n)
+	}
+	return objects
+}
+*/
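A note on the volume bookkeeping in the commented-out fixtures above: the drain tests resolve volumes purely by reading the CSI `VolumeHandle` off each PV spec, via `getDrainTestVolumeName` and the embedded `drainDriver`. A standalone sketch of that mapping, using stand-in types and hypothetical handles rather than the real `corev1` structs:

```go
package main

import "fmt"

// csiSource stands in for corev1.CSIPersistentVolumeSource.
type csiSource struct{ VolumeHandle string }

// volumeName mirrors getDrainTestVolumeName: empty for non-CSI PVs,
// otherwise the CSI volume handle.
func volumeName(csi *csiSource) string {
	if csi == nil {
		return ""
	}
	return csi.VolumeHandle
}

func main() {
	specs := []*csiSource{{VolumeHandle: "vol-0"}, nil, {VolumeHandle: "vol-1"}}
	for _, s := range specs {
		fmt.Printf("%q\n", volumeName(s)) // "vol-0", "", "vol-1"
	}
}
```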
diff --git a/pkg/util/provider/drain/fake_controller.go b/pkg/util/provider/drain/fake_controller.go
new file mode 100644
index 000000000..de028b0b6
--- /dev/null
+++ b/pkg/util/provider/drain/fake_controller.go
@@ -0,0 +1,55 @@
+/*
+Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+*/
+
+// Package drain is used to drain nodes
+package drain
+
+import (
+	"time"
+
+	customfake "github.com/gardener/machine-controller-manager/pkg/fakeclient"
+	"k8s.io/apimachinery/pkg/runtime"
+	coreinformers "k8s.io/client-go/informers"
+	"k8s.io/client-go/kubernetes"
+	corelisters "k8s.io/client-go/listers/core/v1"
+)
+
+func createFakeController(
+	stop <-chan struct{},
+	namespace string,
+	targetCoreObjects []runtime.Object,
+) (kubernetes.Interface, corelisters.PersistentVolumeLister, corelisters.PersistentVolumeClaimLister, *customfake.FakeObjectTracker) {
+
+	fakeTargetCoreClient, targetCoreObjectTracker := customfake.NewCoreClientSet(targetCoreObjects...)
+	go targetCoreObjectTracker.Start()
+
+	coreTargetInformerFactory := coreinformers.NewFilteredSharedInformerFactory(
+		fakeTargetCoreClient,
+		100*time.Millisecond,
+		namespace,
+		nil,
+	)
+	defer coreTargetInformerFactory.Start(stop)
+	coreTargetSharedInformers := coreTargetInformerFactory.Core().V1()
+	pvcs := coreTargetSharedInformers.PersistentVolumeClaims()
+	pvs := coreTargetSharedInformers.PersistentVolumes()
+
+	pvcLister := pvcs.Lister()
+	pvLister := pvs.Lister()
+
+	return fakeTargetCoreClient, pvLister, pvcLister, targetCoreObjectTracker
+}
diff --git a/pkg/util/provider/driver/driver.go b/pkg/util/provider/driver/driver.go
new file mode 100644
index 000000000..8a5ca1e36
--- /dev/null
+++ b/pkg/util/provider/driver/driver.go
@@ -0,0 +1,134 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package driver contains the cloud provider specific implementations to manage machines
+package driver
+
+import (
+	"context"
+
+	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	corev1 "k8s.io/api/core/v1"
+)
+
+// Driver is the common interface for creation/deletion of the VMs over different cloud-providers.
+type Driver interface {
+	// CreateMachine call is responsible for VM creation on the provider
+	CreateMachine(context.Context, *CreateMachineRequest) (*CreateMachineResponse, error)
+	// DeleteMachine call is responsible for VM deletion/termination on the provider
+	DeleteMachine(context.Context, *DeleteMachineRequest) (*DeleteMachineResponse, error)
+	// GetMachineStatus call gets the status of the VM backing the machine object on the provider
+	GetMachineStatus(context.Context, *GetMachineStatusRequest) (*GetMachineStatusResponse, error)
+	// ListMachines lists all the machines that might have been created by the supplied machineClass
+	ListMachines(context.Context, *ListMachinesRequest) (*ListMachinesResponse, error)
+	// GetVolumeIDs returns a list of volumeIDs for the list of PVSpecs
+	GetVolumeIDs(context.Context, *GetVolumeIDsRequest) (*GetVolumeIDsResponse, error)
+}
+
+// CreateMachineRequest is the create request for VM creation
+type CreateMachineRequest struct {
+	// Machine object from which the VM is to be created
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// CreateMachineResponse is the create response for VM creation
+type CreateMachineResponse struct {
+	// ProviderID is the unique identification of the VM at the cloud provider.
+	// ProviderID typically matches with the node.Spec.ProviderID on the node object.
+	// Eg: gce://project-name/region/vm-ID
+	ProviderID string
+
+	// NodeName is the name of the node-object registered to kubernetes.
+	NodeName string
+
+	// LastKnownState represents the last state of the VM during a creation/deletion error
+	LastKnownState string
+}
+
+// DeleteMachineRequest is the delete request for VM deletion
+type DeleteMachineRequest struct {
+	// Machine object from which the VM is to be deleted
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// DeleteMachineResponse is the delete response for VM deletion
+type DeleteMachineResponse struct {
+	// LastKnownState represents the last state of the VM during a creation/deletion error
+	LastKnownState string
+}
+
+// GetMachineStatusRequest is the get request for VM info
+type GetMachineStatusRequest struct {
+	// Machine object from which the VM status is to be fetched
+	Machine *v1alpha1.Machine
+
+	// MachineClass backing the machine object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// GetMachineStatusResponse is the get response for VM info
+type GetMachineStatusResponse struct {
+	// ProviderID is the unique identification of the VM at the cloud provider.
+	// ProviderID typically matches with the node.Spec.ProviderID on the node object.
+	// Eg: gce://project-name/region/vm-ID
+	ProviderID string
+
+	// NodeName is the name of the node-object registered to kubernetes.
+	NodeName string
+}
+
+// ListMachinesRequest is the request object to get a list of VMs belonging to a machineClass
+type ListMachinesRequest struct {
+	// MachineClass object
+	MachineClass *v1alpha1.MachineClass
+
+	// Secret backing the machineClass object
+	Secret *corev1.Secret
+}
+
+// ListMachinesResponse is the response object of the list of VMs belonging to a machineClass
+type ListMachinesResponse struct {
+	// MachineList is the map of list of machines. Format for the map should be map[ProviderID]MachineName.
+	MachineList map[string]string
+}
+
+// GetVolumeIDsRequest is the request object to get a list of VolumeIDs for a PVSpec
+type GetVolumeIDsRequest struct {
+	// PVSpecs is a list of PV specs for which volume-IDs are required
+	// Plugin should parse this raw data into a pre-defined list of PVSpecs
+	PVSpecs []*corev1.PersistentVolumeSpec
+}
+
+// GetVolumeIDsResponse is the response object of the list of VolumeIDs for a PVSpec
+type GetVolumeIDsResponse struct {
+	// VolumeIDs is a list of VolumeIDs.
+	VolumeIDs []string
+}
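For orientation, this is roughly what a provider plugin satisfying the `Driver` interface above would look like. The `myProvider` type, its `region` field, and the URI scheme are hypothetical illustrations, not part of this PR; a real driver would call the provider SDK with credentials taken from the machine class secret:

```go
package main

import (
	"context"
	"fmt"
)

// Trimmed-down stand-ins for the driver package types above.
type CreateMachineRequest struct{ MachineName string }
type CreateMachineResponse struct{ ProviderID, NodeName string }

// myProvider is a hypothetical implementation of the CreateMachine half
// of the Driver contract.
type myProvider struct{ region string }

func (p *myProvider) CreateMachine(ctx context.Context, req *CreateMachineRequest) (*CreateMachineResponse, error) {
	// A real driver would create the VM here and return its provider ID,
	// matching node.Spec.ProviderID (e.g. gce://project-name/region/vm-ID).
	return &CreateMachineResponse{
		ProviderID: fmt.Sprintf("myprovider://%s/%s", p.region, req.MachineName),
		NodeName:   req.MachineName,
	}, nil
}

func main() {
	resp, _ := (&myProvider{region: "eu-west-1"}).CreateMachine(context.TODO(), &CreateMachineRequest{MachineName: "machine-0"})
	fmt.Println(resp.ProviderID, resp.NodeName)
}
```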
diff --git a/pkg/util/provider/driver/fake.go b/pkg/util/provider/driver/fake.go
new file mode 100644
index 000000000..701d42db2
--- /dev/null
+++ b/pkg/util/provider/driver/fake.go
@@ -0,0 +1,92 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package driver contains a fake driver implementation
+package driver
+
+import (
+	"context"
+
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
+)
+
+// FakeDriver is a fake driver returned when none of the actual drivers match
+type FakeDriver struct {
+	VMExists bool
+	ProviderID string
+	NodeName string
+	LastKnownState string
+	Err error
+}
+
+// NewFakeDriver returns a new FakeDriver object
+func NewFakeDriver(fakeDriver *FakeDriver) Driver {
+	return fakeDriver
+}
+
+// CreateMachine makes a call to the driver to create the machine.
+func (d *FakeDriver) CreateMachine(ctx context.Context, createMachineRequest *CreateMachineRequest) (*CreateMachineResponse, error) {
+	if d.Err == nil {
+		d.VMExists = true
+		return &CreateMachineResponse{
+			ProviderID: d.ProviderID,
+			NodeName: d.NodeName,
+			LastKnownState: d.LastKnownState,
+		}, nil
+	}
+
+	return nil, d.Err
+}
+
+// DeleteMachine makes a call to the driver to delete the machine.
+func (d *FakeDriver) DeleteMachine(ctx context.Context, deleteMachineRequest *DeleteMachineRequest) (*DeleteMachineResponse, error) {
+	d.VMExists = false
+	return &DeleteMachineResponse{
+		LastKnownState: d.LastKnownState,
+	}, d.Err
+}
+
+// GetMachineStatus makes a call to the driver to check the existence of the machine
+func (d *FakeDriver) GetMachineStatus(ctx context.Context, getMachineStatusRequest *GetMachineStatusRequest) (*GetMachineStatusResponse, error) {
+	switch {
+	case !d.VMExists:
+		errMessage := "Fake plugin is returning no VM instances backing this machine object"
+		return nil, status.Error(codes.NotFound, errMessage)
+	case d.Err != nil:
+		return nil, d.Err
+	}
+
+	return &GetMachineStatusResponse{
+		ProviderID: d.ProviderID,
+		NodeName: d.NodeName,
+	}, nil
+}
+
+// ListMachines lists the machines known to the driver
+func (d *FakeDriver) ListMachines(ctx context.Context, listMachinesRequest *ListMachinesRequest) (*ListMachinesResponse, error) {
+	return &ListMachinesResponse{
+		MachineList: map[string]string{},
+	}, d.Err
+}
+
+// GetVolumeIDs returns a list of VolumeIDs for the PV spec list supplied
+func (d *FakeDriver) GetVolumeIDs(ctx context.Context, getVolumeIDs *GetVolumeIDsRequest) (*GetVolumeIDsResponse, error) {
+	return &GetVolumeIDsResponse{
+		VolumeIDs: []string{},
+	}, d.Err
+}
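`FakeDriver` lets tests script provider behaviour through plain struct fields rather than a live endpoint. A sketch of the intended use, as a fragment that assumes the `driver`, `status`, and `codes` packages introduced in this PR are imported:

```go
// VMExists=false makes GetMachineStatus return a codes.NotFound error,
// which tests can decode again with status.FromError.
fakeDriver := driver.NewFakeDriver(&driver.FakeDriver{
	VMExists:   false,
	ProviderID: "fake://vm-0",
	NodeName:   "node-0",
})

_, err := fakeDriver.GetMachineStatus(context.TODO(), &driver.GetMachineStatusRequest{})
if s, ok := status.FromError(err); ok {
	fmt.Println(s.Code()) // NotFound
}
```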
diff --git a/pkg/util/provider/machinecodes/codes/code_string.go b/pkg/util/provider/machinecodes/codes/code_string.go
new file mode 100644
index 000000000..3feb6cea9
--- /dev/null
+++ b/pkg/util/provider/machinecodes/codes/code_string.go
@@ -0,0 +1,68 @@
+/*
+ *
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file was copied and modified from the github.com/grpc/grpc-go project
+ * https://github.com/grpc/grpc-go/blob/v1.29.x/code/code_string.go
+ *
+ * Modifications Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
+ *
+ */
+
+// Package codes defines the canonical error codes used by machineAPIs for gardener/machine-controller-manager
+package codes
+
+import "strconv"
+
+func (c Code) String() string {
+	switch c {
+	case OK:
+		return "OK"
+	case Canceled:
+		return "Canceled"
+	case Unknown:
+		return "Unknown"
+	case InvalidArgument:
+		return "InvalidArgument"
+	case DeadlineExceeded:
+		return "DeadlineExceeded"
+	case NotFound:
+		return "NotFound"
+	case AlreadyExists:
+		return "AlreadyExists"
+	case PermissionDenied:
+		return "PermissionDenied"
+	case ResourceExhausted:
+		return "ResourceExhausted"
+	case FailedPrecondition:
+		return "FailedPrecondition"
+	case Aborted:
+		return "Aborted"
+	case OutOfRange:
+		return "OutOfRange"
+	case Unimplemented:
+		return "Unimplemented"
+	case Internal:
+		return "Internal"
+	case Unavailable:
+		return "Unavailable"
+	case DataLoss:
+		return "DataLoss"
+	case Unauthenticated:
+		return "Unauthenticated"
+	default:
+		return "Code(" + strconv.FormatInt(int64(c), 10) + ")"
+	}
+}
diff --git a/pkg/util/provider/machinecodes/codes/codes.go b/pkg/util/provider/machinecodes/codes/codes.go
new file mode 100644
index 000000000..f69bee923
--- /dev/null
+++ b/pkg/util/provider/machinecodes/codes/codes.go
@@ -0,0 +1,177 @@
+/*
+ *
+ * Copyright 2014 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file was copied and modified from the github.com/grpc/grpc-go project
+ * https://github.com/grpc/grpc-go/blob/v1.29.x/code/code.go
+ *
+ * Modifications Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
+ *
+ */
+
+// Package codes defines the canonical error codes used by machineAPIs for gardener/machine-controller-manager
+package codes
+
+// Code is an unsigned int representing a canonical error code for machineAPIs
+type Code uint32
+
+const (
+	// OK is returned on success.
+	OK Code = 0
+
+	// Canceled indicates the operation was canceled (typically by the caller).
+	Canceled Code = 1
+
+	// Unknown error. An example of where this error may be returned is
+	// if a Status value received from another address space belongs to
+	// an error-space that is not known in this address space. Also
+	// errors raised by APIs that do not return enough error information
+	// may be converted to this error.
+	Unknown Code = 2
+
+	// InvalidArgument indicates client specified an invalid argument.
+	// Note that this differs from FailedPrecondition. It indicates arguments
+	// that are problematic regardless of the state of the system
+	// (e.g., a malformed file name).
+	InvalidArgument Code = 3
+
+	// DeadlineExceeded means operation expired before completion.
+	// For operations that change the state of the system, this error may be
+	// returned even if the operation has completed successfully. For
+	// example, a successful response from a server could have been delayed
+	// long enough for the deadline to expire.
+	DeadlineExceeded Code = 4
+
+	// NotFound means some requested entity (e.g., file or directory) was
+	// not found.
+	NotFound Code = 5
+
+	// AlreadyExists means an attempt to create an entity failed because one
+	// already exists.
+	AlreadyExists Code = 6
+
+	// PermissionDenied indicates the caller does not have permission to
+	// execute the specified operation. It must not be used for rejections
+	// caused by exhausting some resource (use ResourceExhausted
+	// instead for those errors). It must not be
+	// used if the caller cannot be identified (use Unauthenticated
+	// instead for those errors).
+	PermissionDenied Code = 7
+
+	// ResourceExhausted indicates some resource has been exhausted, perhaps
+	// a per-user quota, or perhaps the entire file system is out of space.
+	ResourceExhausted Code = 8
+
+	// FailedPrecondition indicates operation was rejected because the
+	// system is not in a state required for the operation's execution.
+	// For example, directory to be deleted may be non-empty, an rmdir
+	// operation is applied to a non-directory, etc.
+	//
+	// A litmus test that may help a service implementor in deciding
+	// between FailedPrecondition, Aborted, and Unavailable:
+	//  (a) Use Unavailable if the client can retry just the failing call.
+	//  (b) Use Aborted if the client should retry at a higher-level
+	//      (e.g., restarting a read-modify-write sequence).
+	//  (c) Use FailedPrecondition if the client should not retry until
+	//      the system state has been explicitly fixed. E.g., if an "rmdir"
+	//      fails because the directory is non-empty, FailedPrecondition
+	//      should be returned since the client should not retry unless
+	//      they have first fixed up the directory by deleting files from it.
+	//  (d) Use FailedPrecondition if the client performs conditional
+	//      REST Get/Update/Delete on a resource and the resource on the
+	//      server does not match the condition. E.g., conflicting
+	//      read-modify-write on the same resource.
+	FailedPrecondition Code = 9
+
+	// Aborted indicates the operation was aborted, typically due to a
+	// concurrency issue like sequencer check failures, transaction aborts,
+	// etc.
+	//
+	// See litmus test above for deciding between FailedPrecondition,
+	// Aborted, and Unavailable.
+	Aborted Code = 10
+
+	// OutOfRange means operation was attempted past the valid range.
+	// E.g., seeking or reading past end of file.
+	//
+	// Unlike InvalidArgument, this error indicates a problem that may
+	// be fixed if the system state changes. For example, a 32-bit file
+	// system will generate InvalidArgument if asked to read at an
+	// offset that is not in the range [0,2^32-1], but it will generate
+	// OutOfRange if asked to read from an offset past the current
+	// file size.
+	//
+	// There is a fair bit of overlap between FailedPrecondition and
+	// OutOfRange. We recommend using OutOfRange (the more specific
+	// error) when it applies so that callers who are iterating through
+	// a space can easily look for an OutOfRange error to detect when
+	// they are done.
+	OutOfRange Code = 11
+
+	// Unimplemented indicates operation is not implemented or not
+	// supported/enabled in this service.
+	Unimplemented Code = 12
+
+	// Internal errors. Means some invariants expected by underlying
+	// system has been broken. If you see one of these errors,
+	// something is very broken.
+	Internal Code = 13
+
+	// Unavailable indicates the service is currently unavailable.
+	// This is most likely a transient condition and may be corrected
+	// by retrying with a backoff. Note that it is not always safe to retry
+	// non-idempotent operations.
+	//
+	// See litmus test above for deciding between FailedPrecondition,
+	// Aborted, and Unavailable.
+	Unavailable Code = 14
+
+	// DataLoss indicates unrecoverable data loss or corruption.
+	DataLoss Code = 15
+
+	// Unauthenticated indicates the request does not have valid
+	// authentication credentials for the operation.
+	Unauthenticated Code = 16
+
+	_maxCode = 17
+)
+
+var strToCode = map[string]Code{
+	"OK": OK,
+	"Canceled": Canceled,
+	"Unknown": Unknown,
+	"InvalidArgument": InvalidArgument,
+	"DeadlineExceeded": DeadlineExceeded,
+	"NotFound": NotFound,
+	"AlreadyExists": AlreadyExists,
+	"PermissionDenied": PermissionDenied,
+	"ResourceExhausted": ResourceExhausted,
+	"FailedPrecondition": FailedPrecondition,
+	"Aborted": Aborted,
+	"OutOfRange": OutOfRange,
+	"Unimplemented": Unimplemented,
+	"Internal": Internal,
+	"Unavailable": Unavailable,
+	"DataLoss": DataLoss,
+	"Unauthenticated": Unauthenticated,
+}
+
+// StringToCode converts a string into the corresponding Code.
+func StringToCode(input string) Code {
+	if code, ok := strToCode[input]; ok {
+		return code
+	}
+	return Unknown
+}
diff --git a/pkg/util/provider/machinecodes/status/status.go b/pkg/util/provider/machinecodes/status/status.go
new file mode 100644
index 000000000..00058af3c
--- /dev/null
+++ b/pkg/util/provider/machinecodes/status/status.go
@@ -0,0 +1,114 @@
+/*
+ *
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file was copied and modified from the github.com/grpc/grpc-go project
+ * https://github.com/grpc/grpc-go/blob/v1.29.x/status/status.go
+ *
+ * Modifications Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
+ *
+ */
+
+// Package status implements errors returned by MachineAPIs. MachineAPIs
+// service handlers should return an error created by this package,
+// and machineAPIs clients should expect a corresponding error to be
+// returned from the RPC call.
+//
+// This package upholds the invariants that a non-nil error may not
+// contain an OK code, and an OK code must result in a nil error.
+package status
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
+)
+
+// Status implements the error interface and carries a machine error code and message.
+type Status struct {
+	// The status code, which should be an enum value of
+	// ../codes.Code
+	code int32
+	// A developer-facing error message, which should be in English. Any
+	// user-facing error message should be localized and sent in the
+	// [google.rpc.Status.details][google.rpc.Status.details] field, or localized
+	// by the client.
+	message string
+}
+
+// Code returns the status code contained in status.
+func (s *Status) Code() codes.Code {
+	if s == nil {
+		return codes.OK
+	}
+	return codes.Code(s.code)
+}
+
+// Message returns the message contained in status.
+func (s *Status) Message() string {
+	return s.message
+}
+
+// Error returns the error message for the status.
+func (s *Status) Error() string {
+	return fmt.Sprintf("machine codes error: code = [%s] message = [%s]", s.Code(), s.Message())
+}
+
+// New returns a Status representing c and msg.
+func New(c codes.Code, msg string) *Status {
+	return &Status{code: int32(c), message: msg}
+}
+
+// Error returns an error representing c and msg. If c is OK, returns nil.
+func Error(c codes.Code, msg string) error {
+	if c == codes.OK {
+		return nil
+	}
+	return New(c, msg)
+}
+
+// FromError returns a Status representing err if it was produced from this
+// package or has a method `GRPCStatus() *Status`. Otherwise, ok is false and a
+// Status is returned with codes.Unknown and the original error message.
+func FromError(err error) (s *Status, ok bool) {
+	if err == nil {
+		return nil, true
+	}
+
+	if matches, errInFind := findInString(err.Error()); errInFind == nil {
+		code := codes.StringToCode(matches[0])
+		return New(code, matches[1]), true
+	}
+
+	return New(codes.Unknown, err.Error()), false
+}
+
+// findInString extracts the bracketed code and message from an encoded status
+// string. TODO: check if this logic can be optimized.
+func findInString(input string) ([]string, error) {
+	var matches []string
+
+	re := regexp.MustCompile(`\[([^\[\]]*)\]`)
+	submatchall := re.FindAllString(input, -1)
+	if len(submatchall) != 2 {
+		return nil, fmt.Errorf("unable to decode machine code error")
+	}
+
+	for _, element := range submatchall {
+		element = strings.Trim(element, "[")
+		element = strings.Trim(element, "]")
+		matches = append(matches, element)
+	}
+
+	return matches, nil
+}
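The status package round-trips machine codes through plain error strings: `(*Status).Error` encodes code and message in square brackets, and `FromError`/`findInString` parse them back out. A standalone mirror of that decode step (the regexp is copied from `findInString` above; the sample message is made up):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// Shape produced by (*Status).Error above.
	encoded := "machine codes error: code = [NotFound] message = [no VM backing machine-0]"

	// Same pattern as findInString: every bracketed segment.
	re := regexp.MustCompile(`\[([^\[\]]*)\]`)
	parts := re.FindAllString(encoded, -1)
	if len(parts) != 2 {
		fmt.Println("not a machine codes error")
		return
	}
	code := strings.Trim(parts[0], "[]")
	message := strings.Trim(parts[1], "[]")
	fmt.Println(code, "/", message) // NotFound / no VM backing machine-0
}
```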
diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go
new file mode 100644
index 000000000..b936ec313
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/controller.go
@@ -0,0 +1,305 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	machineinternal "github.com/gardener/machine-controller-manager/pkg/apis/machine"
+	machinev1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	machineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1"
+	machineinformers "github.com/gardener/machine-controller-manager/pkg/client/informers/externalversions/machine/v1alpha1"
+	machinelisters "github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1"
+	"github.com/gardener/machine-controller-manager/pkg/handlers"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/options"
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/apimachinery/pkg/runtime"
+	runtimeutil "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
+	coreinformers "k8s.io/client-go/informers/core/v1"
+	"k8s.io/client-go/kubernetes"
+	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
+	corelisters "k8s.io/client-go/listers/core/v1"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/tools/record"
+	"k8s.io/client-go/util/workqueue"
+	"k8s.io/klog"
+)
+
+const (
+	maxRetries = 15
+
+	// ClassAnnotation is the annotation used to identify a machine class
+	ClassAnnotation = "machine.sapcloud.io/class"
+	// MachineIDAnnotation is the annotation used to identify a machine ID
+	MachineIDAnnotation = "machine.sapcloud.io/id"
+	// DeleteFinalizerName is the finalizer used to identify the controller acting on an object
+	DeleteFinalizerName = "machine.sapcloud.io/machine-controller-manager"
+)
+
+// NewController returns a new machine controller.
+func NewController(
+	namespace string,
+	controlMachineClient machineapi.MachineV1alpha1Interface,
+	controlCoreClient kubernetes.Interface,
+	targetCoreClient kubernetes.Interface,
+	driver driver.Driver,
+	pvcInformer coreinformers.PersistentVolumeClaimInformer,
+	pvInformer coreinformers.PersistentVolumeInformer,
+	secretInformer coreinformers.SecretInformer,
+	nodeInformer coreinformers.NodeInformer,
+	machineClassInformer machineinformers.MachineClassInformer,
+	machineInformer machineinformers.MachineInformer,
+	recorder record.EventRecorder,
+	safetyOptions options.SafetyOptions,
+	nodeConditions string,
+	bootstrapTokenAuthExtraGroups string,
+) (Controller, error) {
+	controller := &controller{
+		namespace: namespace,
+		controlMachineClient: controlMachineClient,
+		controlCoreClient: controlCoreClient,
+		targetCoreClient: targetCoreClient,
+		recorder: recorder,
+		secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"),
+		nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"),
+		machineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineclass"),
+		machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"),
+		machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"),
+		machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"),
+		safetyOptions: safetyOptions,
+		nodeConditions: nodeConditions,
+		bootstrapTokenAuthExtraGroups: bootstrapTokenAuthExtraGroups,
+		driver: driver,
+	}
+
+	controller.internalExternalScheme = runtime.NewScheme()
+
+	if err := machineinternal.AddToScheme(controller.internalExternalScheme); err != nil {
+		return nil, err
+	}
+
+	if err := machinev1alpha1.AddToScheme(controller.internalExternalScheme); err != nil {
+		return nil, err
+	}
+
+	eventBroadcaster := record.NewBroadcaster()
+	eventBroadcaster.StartLogging(klog.Infof)
+	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: typedcorev1.New(controlCoreClient.CoreV1().RESTClient()).Events(namespace)})
+
+	// Controller listers
+	controller.pvcLister = pvcInformer.Lister()
+	controller.pvLister = pvInformer.Lister()
+	controller.secretLister = secretInformer.Lister()
+	controller.machineClassLister = machineClassInformer.Lister()
+	controller.nodeLister = nodeInformer.Lister()
+	controller.machineLister = machineInformer.Lister()
+
+	// Controller syncs
+	controller.secretSynced = secretInformer.Informer().HasSynced
+	controller.machineClassSynced = machineClassInformer.Informer().HasSynced
+	controller.nodeSynced = nodeInformer.Informer().HasSynced
+	controller.machineSynced = machineInformer.Informer().HasSynced
+
+	// Secret Controller Informers
+	secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: controller.secretAdd,
+		DeleteFunc: controller.secretDelete,
+	})
+
+	machineClassInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: controller.machineClassToSecretAdd,
+		UpdateFunc: controller.machineClassToSecretUpdate,
+		DeleteFunc: controller.machineClassToSecretDelete,
+	})
+
+	machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		DeleteFunc: controller.machineToMachineClassDelete,
+	})
+
+	machineClassInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: controller.machineClassAdd,
+		UpdateFunc: controller.machineClassUpdate,
+	})
+
+	// Machine Controller Informers
+	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: controller.addNodeToMachine,
+		UpdateFunc: controller.updateNodeToMachine,
+		DeleteFunc: controller.deleteNodeToMachine,
+	})
+
+	machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: controller.addMachine,
+		UpdateFunc: controller.updateMachine,
+		DeleteFunc: controller.deleteMachine,
+	})
+
+	// MachineSafety Controller Informers
+
+	// We follow the kubernetes way of reconciling the safety controller
+	// by adding empty keys to the queues. We initialize them here to trigger
+	// a run of the different safety loops on MCM startup.
+	controller.machineSafetyOrphanVMsQueue.Add("")
+	controller.machineSafetyAPIServerQueue.Add("")
+
+	machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		// deleteMachineToSafety makes sure that orphan VM handler is invoked
+		DeleteFunc: controller.deleteMachineToSafety,
+	})
+
+	return controller, nil
+}
+
+// Controller describes a controller for machines.
+type Controller interface {
+	// Run runs the controller until the given stop channel can be read from.
+	// workers specifies the number of goroutines, per resource, processing work
+	// from the resource workqueues
+	Run(workers int, stopCh <-chan struct{})
+}
+
+// controller is a concrete Controller.
+type controller struct {
+	namespace string
+	nodeConditions string
+	bootstrapTokenAuthExtraGroups string
+
+	controlMachineClient machineapi.MachineV1alpha1Interface
+	controlCoreClient kubernetes.Interface
+	targetCoreClient kubernetes.Interface
+
+	recorder record.EventRecorder
+	safetyOptions options.SafetyOptions
+	internalExternalScheme *runtime.Scheme
+	driver driver.Driver
+
+	// listers
+	pvcLister corelisters.PersistentVolumeClaimLister
+	pvLister corelisters.PersistentVolumeLister
+	secretLister corelisters.SecretLister
+	nodeLister corelisters.NodeLister
+	machineClassLister machinelisters.MachineClassLister
+	machineLister machinelisters.MachineLister
+	// queues
+	secretQueue workqueue.RateLimitingInterface
+	nodeQueue workqueue.RateLimitingInterface
+	machineClassQueue workqueue.RateLimitingInterface
+	machineQueue workqueue.RateLimitingInterface
+	machineSafetyOrphanVMsQueue workqueue.RateLimitingInterface
+	machineSafetyAPIServerQueue workqueue.RateLimitingInterface
+	// syncs
+	secretSynced cache.InformerSynced
+	nodeSynced cache.InformerSynced
+	machineClassSynced cache.InformerSynced
+	machineSynced cache.InformerSynced
+}
+
+func (c *controller) Run(workers int, stopCh <-chan struct{}) {
+
+	var (
+		waitGroup sync.WaitGroup
+	)
+
+	defer runtimeutil.HandleCrash()
+	defer c.nodeQueue.ShutDown()
+	defer c.secretQueue.ShutDown()
+	defer c.machineClassQueue.ShutDown()
+	defer c.machineQueue.ShutDown()
+	defer c.machineSafetyOrphanVMsQueue.ShutDown()
+	defer c.machineSafetyAPIServerQueue.ShutDown()
+
+	if !cache.WaitForCacheSync(stopCh, c.secretSynced, c.nodeSynced, c.machineClassSynced, c.machineSynced) {
+		runtimeutil.HandleError(fmt.Errorf("Timed out waiting for caches to sync"))
+		return
+	}
+
+	klog.V(1).Info("Starting machine-controller-manager")
+
+	// The controller implements the prometheus.Collector interface and can therefore
+	// be passed to the metrics registry. Collectors added to the registry
+	// collect the metrics exposed via the metrics endpoint of the MCM
+	// every time the endpoint is called.
+	prometheus.MustRegister(c)
+
+	for i := 0; i < workers; i++ {
+		createWorker(c.secretQueue, "ClusterSecret", maxRetries, true, c.reconcileClusterSecretKey, stopCh, &waitGroup)
+		createWorker(c.machineClassQueue, "ClusterMachineClass", maxRetries, true, c.reconcileClusterMachineClassKey, stopCh, &waitGroup)
+		createWorker(c.nodeQueue, "ClusterNode", maxRetries, true, c.reconcileClusterNodeKey, stopCh, &waitGroup)
+		createWorker(c.machineQueue, "ClusterMachine", maxRetries, true, c.reconcileClusterMachineKey, stopCh, &waitGroup)
+		createWorker(c.machineSafetyOrphanVMsQueue, "ClusterMachineSafetyOrphanVMs", maxRetries, true, c.reconcileClusterMachineSafetyOrphanVMs, stopCh, &waitGroup)
+		createWorker(c.machineSafetyAPIServerQueue, "ClusterMachineAPIServer", maxRetries, true, c.reconcileClusterMachineSafetyAPIServer, stopCh, &waitGroup)
+	}
+
+	<-stopCh
+	klog.V(1).Info("Shutting down Machine Controller Manager")
+	handlers.UpdateHealth(false)
+
+	waitGroup.Wait()
+}
+
+// createWorker creates and runs a worker thread that just processes items in the
+// specified queue. The worker will run until stopCh is closed. The worker will be
+// added to the wait group when started and marked done when finished.
+func createWorker(queue workqueue.RateLimitingInterface, resourceType string, maxRetries int, forgetAfterSuccess bool, reconciler func(key string) error, stopCh <-chan struct{}, waitGroup *sync.WaitGroup) {
+	waitGroup.Add(1)
+	go func() {
+		wait.Until(worker(queue, resourceType, maxRetries, forgetAfterSuccess, reconciler), time.Second, stopCh)
+		waitGroup.Done()
+	}()
+}
+
+// worker runs a worker thread that just dequeues items, processes them, and marks them done.
+// If reconciler returns an error, requeue the item up to maxRetries before giving up.
+// It enforces that the reconciler is never invoked concurrently with the same key.
+// If forgetAfterSuccess is true, it will cause the queue to forget the item should reconciliation
+// have no error.
+func worker(queue workqueue.RateLimitingInterface, resourceType string, maxRetries int, forgetAfterSuccess bool, reconciler func(key string) error) func() {
+	return func() {
+		exit := false
+		for !exit {
+			exit = func() bool {
+				key, quit := queue.Get()
+				if quit {
+					return true
+				}
+				defer queue.Done(key)
+
+				err := reconciler(key.(string))
+				if err == nil {
+					if forgetAfterSuccess {
+						queue.Forget(key)
+					}
+					return false
+				}
+
+				if queue.NumRequeues(key) < maxRetries {
+					klog.V(4).Infof("Error syncing %s %v: %v", resourceType, key, err)
+					queue.AddRateLimited(key)
+					return false
+				}
+
+				klog.V(4).Infof("Dropping %s %q out of the queue: %v", resourceType, key, err)
+				queue.Forget(key)
+				return false
+			}()
+		}
+	}
+}
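The `worker` function above centralizes the controller's retry policy: a key whose reconcile fails is rate-limited back onto the queue until `NumRequeues` exceeds `maxRetries` (15), after which it is dropped via `Forget`. A condensed, self-contained sketch of that decision (the helper name is made up; it mirrors the branches in `worker`, not the client-go queue itself):

```go
package main

import "fmt"

// requeueDecision mirrors the branch structure in worker above: forget on
// success, rate-limit the key until maxRetries, then drop it.
func requeueDecision(numRequeues, maxRetries int, reconcileErr error) string {
	if reconcileErr == nil {
		return "forget" // forgetAfterSuccess is true for every queue above
	}
	if numRequeues < maxRetries {
		return "addRateLimited"
	}
	return "drop"
}

func main() {
	fmt.Println(requeueDecision(3, 15, nil))                 // forget
	fmt.Println(requeueDecision(3, 15, fmt.Errorf("boom")))  // addRateLimited
	fmt.Println(requeueDecision(15, 15, fmt.Errorf("boom"))) // drop
}
```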
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "fmt" + "testing" + "time" + + machine_internal "github.com/gardener/machine-controller-manager/pkg/apis/machine" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + faketyped "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake" + machineinformers "github.com/gardener/machine-controller-manager/pkg/client/informers/externalversions" + customfake "github.com/gardener/machine-controller-manager/pkg/fakeclient" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/options" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/watch" + coreinformers "k8s.io/client-go/informers" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog" + "k8s.io/utils/pointer" +) + +func TestMachineControllerManagerSuite(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Machine Controller Manager Suite") +} + +var ( + controllerKindMachine = v1alpha1.SchemeGroupVersion.WithKind("Machine") + MachineClass = "MachineClass" + TestMachineClass = "machineClass-0" +) + +func newMachineDeployment( + specTemplate *v1alpha1.MachineTemplateSpec, + replicas int32, + minReadySeconds int32, + statusTemplate *v1alpha1.MachineDeploymentStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, +) *v1alpha1.MachineDeployment { + return newMachineDeployments(1, specTemplate, replicas, minReadySeconds, statusTemplate, owner, annotations, labels)[0] +} + +func newMachineDeployments( + machineDeploymentCount int, + specTemplate *v1alpha1.MachineTemplateSpec, + replicas int32, + minReadySeconds int32, + statusTemplate *v1alpha1.MachineDeploymentStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, +) []*v1alpha1.MachineDeployment { + + intStr1 := intstr.FromInt(1) + machineDeployments := make([]*v1alpha1.MachineDeployment, machineDeploymentCount) + for i := range machineDeployments { + machineDeployment := &v1alpha1.MachineDeployment{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "machine.sapcloud.io", + Kind: "MachineDeployment", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("machinedeployment-%d", i), + Namespace: testNamespace, + Labels: labels, + }, + Spec: v1alpha1.MachineDeploymentSpec{ + MinReadySeconds: minReadySeconds, + Replicas: replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: deepCopy(specTemplate.ObjectMeta.Labels), + }, + Strategy: v1alpha1.MachineDeploymentStrategy{ + RollingUpdate: &v1alpha1.RollingUpdateMachineDeployment{ + MaxSurge: &intStr1, + MaxUnavailable: &intStr1, + }, + }, + Template: *specTemplate.DeepCopy(), + }, + } + + if statusTemplate != nil { + machineDeployment.Status = *statusTemplate.DeepCopy() + } + + if owner != nil { + machineDeployment.OwnerReferences = append(machineDeployment.OwnerReferences, *owner.DeepCopy()) + } + + if annotations != nil { + machineDeployment.Annotations = annotations + } + + machineDeployments[i] = machineDeployment + 
} + return machineDeployments +} + +func newMachineSetFromMachineDeployment( + machineDeployment *v1alpha1.MachineDeployment, + replicas int32, + statusTemplate *v1alpha1.MachineSetStatus, + annotations map[string]string, + labels map[string]string, +) *v1alpha1.MachineSet { + return newMachineSetsFromMachineDeployment(1, machineDeployment, replicas, statusTemplate, annotations, labels)[0] +} + +func newMachineSetsFromMachineDeployment( + machineSetCount int, + machineDeployment *v1alpha1.MachineDeployment, + replicas int32, + statusTemplate *v1alpha1.MachineSetStatus, + annotations map[string]string, + labels map[string]string, +) []*v1alpha1.MachineSet { + + finalLabels := make(map[string]string) + for k, v := range labels { + finalLabels[k] = v + } + for k, v := range machineDeployment.Spec.Template.Labels { + finalLabels[k] = v + } + + t := &machineDeployment.TypeMeta + + return newMachineSets( + machineSetCount, + &machineDeployment.Spec.Template, + replicas, + machineDeployment.Spec.MinReadySeconds, + statusTemplate, + &metav1.OwnerReference{ + APIVersion: t.APIVersion, + Kind: t.Kind, + Name: machineDeployment.Name, + UID: machineDeployment.UID, + BlockOwnerDeletion: pointer.BoolPtr(true), + Controller: pointer.BoolPtr(true), + }, + annotations, + finalLabels, + ) +} + +func newMachineSet( + specTemplate *v1alpha1.MachineTemplateSpec, + replicas int32, + minReadySeconds int32, + statusTemplate *v1alpha1.MachineSetStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, +) *v1alpha1.MachineSet { + return newMachineSets(1, specTemplate, replicas, minReadySeconds, statusTemplate, owner, annotations, labels)[0] +} + +func newMachineSets( + machineSetCount int, + specTemplate *v1alpha1.MachineTemplateSpec, + replicas int32, + minReadySeconds int32, + statusTemplate *v1alpha1.MachineSetStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, +) []*v1alpha1.MachineSet { + + machineSets := make([]*v1alpha1.MachineSet, machineSetCount) + for i := range machineSets { + ms := &v1alpha1.MachineSet{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "machine.sapcloud.io", + Kind: "MachineSet", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("machineset-%d", i), + Namespace: testNamespace, + Labels: labels, + }, + Spec: v1alpha1.MachineSetSpec{ + MachineClass: *specTemplate.Spec.Class.DeepCopy(), + MinReadySeconds: minReadySeconds, + Replicas: replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: deepCopy(specTemplate.ObjectMeta.Labels), + }, + Template: *specTemplate.DeepCopy(), + }, + } + + if statusTemplate != nil { + ms.Status = *statusTemplate.DeepCopy() + } + + if owner != nil { + ms.OwnerReferences = append(ms.OwnerReferences, *owner.DeepCopy()) + } + + if annotations != nil { + ms.Annotations = annotations + } + + machineSets[i] = ms + } + return machineSets +} + +func deepCopy(m map[string]string) map[string]string { + r := make(map[string]string, len(m)) + for k := range m { + r[k] = m[k] + } + return r +} + +func newMachineFromMachineSet( + machineSet *v1alpha1.MachineSet, + statusTemplate *v1alpha1.MachineStatus, + annotations map[string]string, + labels map[string]string, + addFinalizer bool, +) *v1alpha1.Machine { + return newMachinesFromMachineSet(1, machineSet, statusTemplate, annotations, labels, addFinalizer)[0] +} + +func newMachinesFromMachineSet( + machineCount int, + machineSet *v1alpha1.MachineSet, + statusTemplate *v1alpha1.MachineStatus, + annotations map[string]string, + 
labels map[string]string, + addFinalizer bool, +) []*v1alpha1.Machine { + t := &machineSet.TypeMeta + + finalLabels := make(map[string]string, 0) + for k, v := range labels { + finalLabels[k] = v + } + for k, v := range machineSet.Spec.Template.Labels { + finalLabels[k] = v + } + + return newMachines( + machineCount, + &machineSet.Spec.Template, + statusTemplate, + &metav1.OwnerReference{ + APIVersion: t.APIVersion, + Kind: t.Kind, + Name: machineSet.Name, + UID: machineSet.UID, + BlockOwnerDeletion: boolPtr(true), + Controller: boolPtr(true), + }, + annotations, + finalLabels, + addFinalizer, + ) +} + +func newMachine( + specTemplate *v1alpha1.MachineTemplateSpec, + statusTemplate *v1alpha1.MachineStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, + addFinalizer bool, +) *v1alpha1.Machine { + return newMachines(1, specTemplate, statusTemplate, owner, annotations, labels, addFinalizer)[0] +} + +func newMachines( + machineCount int, + specTemplate *v1alpha1.MachineTemplateSpec, + statusTemplate *v1alpha1.MachineStatus, + owner *metav1.OwnerReference, + annotations map[string]string, + labels map[string]string, + addFinalizer bool, +) []*v1alpha1.Machine { + machines := make([]*v1alpha1.Machine, machineCount) + + if annotations == nil { + annotations = make(map[string]string, 0) + } + if labels == nil { + labels = make(map[string]string, 0) + } + + for i := range machines { + m := &v1alpha1.Machine{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "machine.sapcloud.io", + Kind: "Machine", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("machine-%d", i), + Namespace: testNamespace, + Labels: labels, + Annotations: annotations, + }, + Spec: *newMachineSpec(&specTemplate.Spec, i), + } + finalizers := sets.NewString(m.Finalizers...) 
+ + if addFinalizer { + finalizers.Insert(DeleteFinalizerName) + } + m.Finalizers = finalizers.List() + + if statusTemplate != nil { + m.Status = *newMachineStatus(statusTemplate, i) + } + + if owner != nil { + m.OwnerReferences = append(m.OwnerReferences, *owner.DeepCopy()) + } + + machines[i] = m + } + return machines +} + +func newNode( + nodeCount int, + nodeSpec *corev1.NodeSpec, + nodeStatus *corev1.NodeStatus, +) *corev1.Node { + return newNodes(1, nodeSpec, nodeStatus)[0] +} + +func newNodes( + nodeCount int, + nodeSpec *corev1.NodeSpec, + nodeStatus *corev1.NodeStatus, +) []*corev1.Node { + + nodes := make([]*corev1.Node, nodeCount) + for i := range nodes { + node := &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("node-%d", i), + }, + Spec: *nodeSpec.DeepCopy(), + } + + nodes[i] = node + } + return nodes +} + +func newObjectMeta(meta *metav1.ObjectMeta, index int) *metav1.ObjectMeta { + r := meta.DeepCopy() + + if r.Name != "" { + return r + } + + if r.GenerateName != "" { + r.Name = fmt.Sprintf("%s-%d", r.GenerateName, index) + return r + } + + r.Name = fmt.Sprintf("machine-%d", index) + return r +} + +func newMachineSpec(specTemplate *v1alpha1.MachineSpec, index int) *v1alpha1.MachineSpec { + r := specTemplate.DeepCopy() + if r.ProviderID == "" { + return r + } + + r.ProviderID = fmt.Sprintf("%s-%d", r.ProviderID, index) + return r +} + +func newMachineStatus(statusTemplate *v1alpha1.MachineStatus, index int) *v1alpha1.MachineStatus { + if statusTemplate == nil { + return &v1alpha1.MachineStatus{} + } + + r := statusTemplate.DeepCopy() + if r.Node == "" { + return r + } + + r.Node = fmt.Sprintf("%s-%d", r.Node, index) + return r +} + +func newSecretReference(meta *metav1.ObjectMeta, index int) *corev1.SecretReference { + r := &corev1.SecretReference{ + Namespace: meta.Namespace, + } + + if meta.Name != "" { + r.Name = meta.Name + return r + } + + if meta.GenerateName != "" { + r.Name = fmt.Sprintf("%s-%d", meta.GenerateName, index) + return r + } + + r.Name = fmt.Sprintf("machine-%d", index) + return r +} + +func boolPtr(b bool) *bool { + return &b +} + +func createController( + stop <-chan struct{}, + namespace string, + controlMachineObjects, controlCoreObjects, targetCoreObjects []runtime.Object, + fakedriver driver.Driver, +) (*controller, *customfake.FakeObjectTrackers) { + + fakeControlMachineClient, controlMachineObjectTracker := customfake.NewMachineClientSet(controlMachineObjects...) + fakeTypedMachineClient := &faketyped.FakeMachineV1alpha1{ + Fake: &fakeControlMachineClient.Fake, + } + fakeControlCoreClient, controlCoreObjectTracker := customfake.NewCoreClientSet(controlCoreObjects...) + fakeTargetCoreClient, targetCoreObjectTracker := customfake.NewCoreClientSet(targetCoreObjects...) 
+ fakeObjectTrackers := customfake.NewFakeObjectTrackers( + controlMachineObjectTracker, + controlCoreObjectTracker, + targetCoreObjectTracker, + ) + fakeObjectTrackers.Start() + + coreTargetInformerFactory := coreinformers.NewFilteredSharedInformerFactory( + fakeTargetCoreClient, + 100*time.Millisecond, + namespace, + nil, + ) + defer coreTargetInformerFactory.Start(stop) + coreTargetSharedInformers := coreTargetInformerFactory.Core().V1() + nodes := coreTargetSharedInformers.Nodes() + pvcs := coreTargetSharedInformers.PersistentVolumeClaims() + pvs := coreTargetSharedInformers.PersistentVolumes() + secrets := coreTargetSharedInformers.Secrets() + + controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory( + fakeControlMachineClient, + 100*time.Millisecond, + namespace, + nil, + ) + defer controlMachineInformerFactory.Start(stop) + + machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1() + machineClass := machineSharedInformers.MachineClasses() + machines := machineSharedInformers.Machines() + + internalExternalScheme := runtime.NewScheme() + Expect(machine_internal.AddToScheme(internalExternalScheme)).To(Succeed()) + Expect(v1alpha1.AddToScheme(internalExternalScheme)).To(Succeed()) + + safetyOptions := options.SafetyOptions{ + MachineCreationTimeout: metav1.Duration{Duration: 20 * time.Minute}, + MachineHealthTimeout: metav1.Duration{Duration: 10 * time.Minute}, + MachineDrainTimeout: metav1.Duration{Duration: 5 * time.Minute}, + MachineSafetyOrphanVMsPeriod: metav1.Duration{Duration: 30 * time.Minute}, + MachineSafetyAPIServerStatusCheckPeriod: metav1.Duration{Duration: 1 * time.Minute}, + MachineSafetyAPIServerStatusCheckTimeout: metav1.Duration{Duration: 30 * time.Second}, + } + + controller := &controller{ + namespace: namespace, + driver: fakedriver, + safetyOptions: safetyOptions, + machineClassLister: machineClass.Lister(), + machineClassSynced: machineClass.Informer().HasSynced, + targetCoreClient: fakeTargetCoreClient, + controlCoreClient: fakeControlCoreClient, + controlMachineClient: fakeTypedMachineClient, + internalExternalScheme: internalExternalScheme, + nodeLister: nodes.Lister(), + pvcLister: pvcs.Lister(), + secretLister: secrets.Lister(), + pvLister: pvs.Lister(), + machineLister: machines.Lister(), + machineSynced: machines.Informer().HasSynced, + nodeSynced: nodes.Informer().HasSynced, + secretSynced: secrets.Informer().HasSynced, + secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"), + nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"), + machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"), + machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"), + machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"), + recorder: record.NewBroadcaster().NewRecorder(nil, corev1.EventSource{Component: ""}), + } + + // controller.internalExternalScheme = runtime.NewScheme() + + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(klog.Infof) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(fakeControlCoreClient.CoreV1().RESTClient()).Events(namespace)}) + + return controller, fakeObjectTrackers +} + +func waitForCacheSync(stop <-chan struct{}, controller *controller) { + 
Expect(cache.WaitForCacheSync( + stop, + controller.machineClassSynced, + controller.machineSynced, + controller.secretSynced, + controller.nodeSynced, + )).To(BeTrue()) +} + +var _ = Describe("#createController", func() { + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + Namespace: "test", + } + + It("success", func() { + machine0 := newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *objMeta, + }, nil, nil, nil, nil, false) + + stop := make(chan struct{}) + defer close(stop) + + c, trackers := createController(stop, objMeta.Namespace, nil, nil, nil, nil) + defer trackers.Stop() + + waitForCacheSync(stop, c) + + Expect(c).NotTo(BeNil()) + + allMachineWatch, err := c.controlMachineClient.Machines(objMeta.Namespace).Watch(metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + defer allMachineWatch.Stop() + + machine0Watch, err := c.controlMachineClient.Machines(objMeta.Namespace).Watch(metav1.ListOptions{ + FieldSelector: fmt.Sprintf("metadata.name=%s", machine0.Name), + }) + Expect(err).NotTo(HaveOccurred()) + defer machine0Watch.Stop() + + go func() { + _, err := c.controlMachineClient.Machines(objMeta.Namespace).Create(machine0) + if err != nil { + fmt.Printf("Error creating machine: %s", err) + } + }() + + var event watch.Event + Eventually(allMachineWatch.ResultChan()).Should(Receive(&event)) + Expect(event.Type).To(Equal(watch.Added)) + Expect(event.Object).To(Equal(machine0)) + + Eventually(machine0Watch.ResultChan()).Should(Receive(&event)) + Expect(event.Type).To(Equal(watch.Added)) + Expect(event.Object).To(Equal(machine0)) + }) +}) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go new file mode 100644 index 000000000..3af19ce28 --- /dev/null +++ b/pkg/util/provider/machinecontroller/machine.go @@ -0,0 +1,510 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+
+	machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
+	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	"github.com/gardener/machine-controller-manager/pkg/apis/machine/validation"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog"
+)
+
+/*
+	SECTION
+	Machine controller - Machine add, update, delete watches
+*/
+func (c *controller) addMachine(obj interface{}) {
+	klog.V(4).Info("Adding machine object")
+	c.enqueueMachine(obj)
+}
+
+func (c *controller) updateMachine(oldObj, newObj interface{}) {
+	klog.V(4).Info("Updating machine object")
+	c.enqueueMachine(newObj)
+}
+
+func (c *controller) deleteMachine(obj interface{}) {
+	klog.V(4).Info("Deleting machine object")
+	c.enqueueMachine(obj)
+}
+
+func (c *controller) enqueueMachine(obj interface{}) {
+	key, err := cache.MetaNamespaceKeyFunc(obj)
+	if err != nil {
+		klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
+		return
+	}
+
+	klog.V(4).Infof("Adding machine object to the queue %q", key)
+	c.machineQueue.Add(key)
+}
+
+func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) {
+	key, err := cache.MetaNamespaceKeyFunc(obj)
+	if err != nil {
+		klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
+		return
+	}
+
+	klog.V(4).Infof("Adding machine object to the queue %q after %s", key, after)
+	c.machineQueue.AddAfter(key, after)
+}
+
+func (c *controller) reconcileClusterMachineKey(key string) error {
+	_, name, err := cache.SplitMetaNamespaceKey(key)
+	if err != nil {
+		return err
+	}
+
+	machine, err := c.machineLister.Machines(c.namespace).Get(name)
+	if apierrors.IsNotFound(err) {
+		klog.V(4).Infof("Machine %q: Not doing work because it is not found", key)
+		return nil
+	}
+	if err != nil {
+		klog.Errorf("Machine %q: Unable to retrieve object from store: %v", key, err)
+		return err
+	}
+
+	durationToNextSync := 10 * time.Minute
+	retryStatus, err := c.reconcileClusterMachine(machine)
+	if err != nil && retryStatus == machineutils.RetryOp {
+		// Reconcile failed with a retriable error, hence sync again sooner
+		durationToNextSync = 15 * time.Second
+	}
+
+	klog.V(4).Infof("Machine %q: retry status %v, error %v, next sync in %s", key, retryStatus, err, durationToNextSync)
+	c.enqueueMachineAfter(machine, durationToNextSync)
+
+	return nil
+}
+
+func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) (machineutils.Retry, error) {
+	klog.V(4).Infof("Start Reconciling machine %q", machine.Name)
+	defer klog.V(4).Infof("Stop Reconciling machine %q", machine.Name)
+
+	if c.safetyOptions.MachineControllerFrozen && machine.DeletionTimestamp == nil {
+		// If the machine controller is frozen and the machine is not set
+		// for termination, don't process it
+		err := fmt.Errorf("Machine controller is frozen. Retrying reconcile after resync period")
+		klog.Error(err)
+		return machineutils.DoNotRetryOp, err
+	}
+
+	internalMachine := &machineapi.Machine{}
+	if err := c.internalExternalScheme.Convert(machine, internalMachine, nil); err != nil {
+		klog.Error(err)
+		return machineutils.DoNotRetryOp, err
+	}
+
+	validationerr := validation.ValidateMachine(internalMachine)
+	if validationerr.ToAggregate() != nil && len(validationerr.ToAggregate().Errors()) > 0 {
+		err := fmt.Errorf("Validation of Machine failed: %s", validationerr.ToAggregate().Error())
+		klog.Error(err)
+		return machineutils.DoNotRetryOp, err
+	}
+
+	machineClass, secret, err := c.ValidateMachineClass(&machine.Spec.Class)
+	if err != nil {
+		return machineutils.DoNotRetryOp, err
+	}
+
+	/*
+		NOT NEEDED?
+		else if actualProviderID == "fake" {
+			klog.Warning("Fake driver type")
+			return false, nil
+		}
+		// Get the latest version of the machine so that we can avoid conflicts
+		machine, err = c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
+		if err != nil {
+			klog.Errorf("Could not GET machine object: %s", err)
+			return machineutils.RetryOp, err
+		}
+	*/
+
+	if machine.DeletionTimestamp != nil {
+		// Process a delete event
+		return c.triggerDeletionFlow(&driver.DeleteMachineRequest{
+			Machine:      machine,
+			MachineClass: machineClass,
+			Secret:       secret,
+		})
+	}
+
+	if machine.Status.Node != "" {
+		// If a reference to the node object exists, reconcile the machine's
+		// health and sync its node templates
+
+		retry, err := c.reconcileMachineHealth(machine)
+		if err != nil {
+			return retry, err
+		}
+
+		retry, err = c.syncMachineNodeTemplates(machine)
+		if err != nil {
+			return retry, err
+		}
+	}
+
+	/*
+		if machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed {
+			// If machine status is failed, ignore it
+			return machineutils.DoNotRetryOp, nil
+		} else
+	*/
+
+	if machine.Spec.ProviderID == "" || machine.Status.CurrentStatus.Phase == "" {
+		return c.triggerCreationFlow(&driver.CreateMachineRequest{
+			Machine:      machine,
+			MachineClass: machineClass,
+			Secret:       secret,
+		})
+	}
+
+	/*
+		TODO: re-introduce this when in-place updates can be done.
+ else if actualProviderID != machine.Spec.ProviderID { + // If provider-ID has changed, update the machine + return c.triggerUpdationFlow(machine, actualProviderID) + } + */ + + return machineutils.DoNotRetryOp, nil +} + +/* + SECTION + Machine controller - nodeToMachine +*/ +func (c *controller) addNodeToMachine(obj interface{}) { + + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + if err != nil { + klog.Errorf("Couldn't get key for object %+v: %v", obj, err) + return + } + + machine, err := c.getMachineFromNode(key) + if err != nil { + klog.Errorf("Couldn't fetch machine %s, Error: %s", key, err) + return + } else if machine == nil { + return + } + + klog.V(4).Infof("Add machine object backing node %q", machine.Name) + c.enqueueMachine(machine) +} + +func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { + c.addNodeToMachine(newObj) +} + +func (c *controller) deleteNodeToMachine(obj interface{}) { + c.addNodeToMachine(obj) +} + +/* + SECTION + NodeToMachine operations +*/ + +func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, error) { + var ( + list = []string{nodeName} + selector = labels.NewSelector() + req, _ = labels.NewRequirement("node", selection.Equals, list) + ) + + selector = selector.Add(*req) + machines, _ := c.machineLister.List(selector) + + if len(machines) > 1 { + return nil, errors.New("Multiple machines matching node") + } else if len(machines) < 1 { + return nil, nil + } + + return machines[0], nil +} + +/* + Move to update method? + clone := machine.DeepCopy() + if clone.Labels == nil { + clone.Labels = make(map[string]string) + } + + if _, ok := clone.Labels["node"]; !ok { + clone.Labels["node"] = machine.Status.Node + machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(clone) + if err != nil { + klog.Warningf("Machine update failed. Retrying, error: %s", err) + return machine, err + } + } +*/ + +/* + SECTION + Machine operations - Create, Update, Delete +*/ + +func (c *controller) triggerCreationFlow(createMachineRequest *driver.CreateMachineRequest) (machineutils.Retry, error) { + var ( + machine = createMachineRequest.Machine + machineName = createMachineRequest.Machine.Name + nodeName = "" + providerID = "" + ) + // Add finalizers if not present + retry, err := c.addMachineFinalizers(createMachineRequest.Machine) + if err != nil { + return retry, err + } + + // Find out if VM exists on provider for this machine object + getMachineStatusResponse, err := c.driver.GetMachineStatus(context.TODO(), &driver.GetMachineStatusRequest{ + Machine: machine, + MachineClass: createMachineRequest.MachineClass, + Secret: createMachineRequest.Secret, + }) + if err == nil { + // Found VM with required machine name + klog.V(2).Infof("Found VM with required machine name. Adopting existing machine: %q with ProviderID: %s", machineName, getMachineStatusResponse.ProviderID) + nodeName = getMachineStatusResponse.NodeName + providerID = getMachineStatusResponse.ProviderID + } else { + // VM with required name is not found. + + machineErr, ok := status.FromError(err) + if !ok { + // Error occurred with decoding gRPC error status, abort with retry. 
+ klog.Errorf("Error occurred while decoding machine error for machine %q: %s", machine.Name, err) + return machineutils.RetryOp, err + } + + klog.Info(machineErr.Code()) + + // Decoding gRPC error code + switch machineErr.Code() { + case codes.NotFound, codes.Unimplemented: + // Either VM is not found + // or GetMachineStatus() call is not implemented + // In this case, invoke a CreateMachine() call + klog.V(2).Infof("Creating a VM for machine %q, please wait!", machine.Name) + if _, present := machine.Labels["node"]; !present { + // If node label is not present + createMachineResponse, err := c.driver.CreateMachine(context.TODO(), createMachineRequest) + if err != nil { + // Create call returned an error. + klog.Errorf("Error while creating machine %s: %s", machine.Name, err.Error()) + return c.machineCreateErrorHandler(machine, createMachineResponse, err) + } + nodeName = createMachineResponse.NodeName + providerID = createMachineResponse.ProviderID + } else { + nodeName = machine.Labels["node"] + } + + // Creation was successful + klog.V(2).Infof("Created new VM for machine: %q with ProviderID: %s", machine.Name, providerID) + break + + case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: + // GetMachineStatus() returned with one of the above error codes. + // Retry operation. + return machineutils.RetryOp, err + + default: + return machineutils.DoNotRetryOp, err + } + } + _, machineNodeLabelPresent := createMachineRequest.Machine.Labels["node"] + _, machinePriorityAnnotationPresent := createMachineRequest.Machine.Annotations[machineutils.MachinePriority] + + if !machineNodeLabelPresent || !machinePriorityAnnotationPresent || machine.Spec.ProviderID == "" { + clone := machine.DeepCopy() + if clone.Labels == nil { + clone.Labels = make(map[string]string) + } + clone.Labels["node"] = nodeName + if clone.Annotations == nil { + clone.Annotations = make(map[string]string) + } + if clone.Annotations[machineutils.MachinePriority] == "" { + clone.Annotations[machineutils.MachinePriority] = "3" + } + + clone.Spec.ProviderID = providerID + _, err := c.controlMachineClient.Machines(clone.Namespace).Update(clone) + if err != nil { + klog.Warningf("Machine UPDATE failed for %q. Retrying, error: %s", machine.Name, err) + } else { + klog.V(2).Infof("Machine labels/annotations UPDATE for %q", machine.Name) + + // Return error even when machine object is updated + err = fmt.Errorf("Machine creation in process. Machine UPDATE successful") + } + return machineutils.RetryOp, err + } + + if machine.Status.Node != nodeName || machine.Status.CurrentStatus.Phase == "" { + clone := machine.DeepCopy() + + clone.Status.Node = nodeName + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: "Creating machine on cloud provider", + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachinePending, + TimeoutActive: true, + LastUpdateTime: metav1.Now(), + } + + _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if err != nil { + klog.Warningf("Machine/status UPDATE failed for %q. Retrying, error: %s", machine.Name, err) + } else { + klog.V(2).Infof("Machine/status UPDATE for %q during creation", machine.Name) + + // Return error even when machine object is updated + err = fmt.Errorf("Machine creation in process. 
Machine/Status UPDATE successful") + } + + return machineutils.RetryOp, err + } + + return machineutils.DoNotRetryOp, nil +} + +func (c *controller) triggerUpdationFlow(machine *v1alpha1.Machine, actualProviderID string) (machineutils.Retry, error) { + klog.V(2).Infof("Setting ProviderID of %s to %s", machine.Name, actualProviderID) + + for { + machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{}) + if err != nil { + klog.Warningf("Machine GET failed. Retrying, error: %s", err) + continue + } + + clone := machine.DeepCopy() + clone.Spec.ProviderID = actualProviderID + machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(clone) + if err != nil { + klog.Warningf("Machine UPDATE failed. Retrying, error: %s", err) + continue + } + + clone = machine.DeepCopy() + lastOperation := v1alpha1.LastOperation{ + Description: "Updated provider ID", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationUpdate, + LastUpdateTime: metav1.Now(), + } + clone.Status.LastOperation = lastOperation + _, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if err != nil { + klog.Warningf("Machine/status UPDATE failed. Retrying, error: %s", err) + continue + } + // Update went through, exit out of infinite loop + break + } + + return machineutils.DoNotRetryOp, nil +} + +func (c *controller) triggerDeletionFlow(deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.Retry, error) { + var ( + machine = deleteMachineRequest.Machine + finalizers = sets.NewString(machine.Finalizers...) + ) + + switch { + case !finalizers.Has(DeleteFinalizerName): + // If Finalizers are not present on machine + err := fmt.Errorf("Machine %q is missing finalizers. Deletion cannot proceed", machine.Name) + return machineutils.DoNotRetryOp, err + + case machine.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating: + return c.setMachineTerminationStatus(deleteMachineRequest) + + case strings.Contains(machine.Status.LastOperation.Description, machineutils.GetVMStatus): + return c.getVMStatus(&driver.GetMachineStatusRequest{ + Machine: deleteMachineRequest.Machine, + MachineClass: deleteMachineRequest.MachineClass, + Secret: deleteMachineRequest.Secret, + }) + + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateDrain): + return c.drainNode(deleteMachineRequest) + + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateVMDeletion): + return c.deleteVM(deleteMachineRequest) + + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateNodeDeletion): + return c.deleteNodeObject(machine) + + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateFinalizerRemoval): + _, err := c.deleteMachineFinalizers(machine) + if err != nil { + // Keep retrying until update goes through + klog.Errorf("Machine finalizer REMOVAL failed for machine %q. Retrying, error: %s", machine.Name, err) + return machineutils.RetryOp, err + } + + default: + err := fmt.Errorf("Unable to decode deletion flow state for machine %q. 
Re-initiate termination", machine.Name)
+		klog.Warning(err)
+		return c.setMachineTerminationStatus(deleteMachineRequest)
+	}
+
+	/*
+		// Delete machine object
+		err := c.controlMachineClient.Machines(machine.Namespace).Delete(machine.Name, &metav1.DeleteOptions{})
+		if err != nil && !apierrors.IsNotFound(err) {
+			// If it's any error other than object-not-found
+			klog.Errorf("Deletion of Machine Object %q failed due to error: %s", machine.Name, err)
+			return machineutils.RetryOp, err
+		}
+	*/
+
+	klog.V(2).Infof("Machine %q deleted successfully", machine.Name)
+	return machineutils.DoNotRetryOp, nil
+}
diff --git a/pkg/util/provider/machinecontroller/machine_bootstrap_token.go b/pkg/util/provider/machinecontroller/machine_bootstrap_token.go
new file mode 100644
index 000000000..6033e43bb
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/machine_bootstrap_token.go
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Modifications Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"math/big"
+	"strings"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	bootstraptokenapi "k8s.io/cluster-bootstrap/token/api"
+	bootstraptokenutil "k8s.io/cluster-bootstrap/token/util"
+	"k8s.io/klog"
+)
+
+// placeholder is the marker in the user-data that is replaced by a freshly
+// generated bootstrap token
+const placeholder = "<<BOOTSTRAP_TOKEN>>"
+
+func (c *controller) addBootstrapTokenToUserData(machineName string, secret *corev1.Secret) error {
+	var (
+		userDataB []byte
+		userDataS string
+		exists    bool
+	)
+
+	if userDataB, exists = secret.Data["userData"]; !exists {
+		// If the userData key is not found
+		return fmt.Errorf("Userdata field not found in secret for machine %q", machineName)
+	}
+	userDataS = string(userDataB)
+
+	klog.V(4).Info("Creating bootstrap token")
+	bootstrapTokenSecret, err := c.getBootstrapTokenOrCreateIfNotExist(machineName)
+	if err != nil {
+		return err
+	}
+	token := bootstraptokenutil.TokenFromIDAndSecret(
+		string(bootstrapTokenSecret.Data[bootstraptokenapi.BootstrapTokenIDKey]),
+		string(bootstrapTokenSecret.Data[bootstraptokenapi.BootstrapTokenSecretKey]),
+	)
+
+	// Avoid logging the token itself, as it is a credential
+	klog.V(4).Infof("replacing placeholder %q with the bootstrap token in user-data", placeholder)
+	userDataS = strings.ReplaceAll(userDataS, placeholder, token)
+	secret.Data["userData"] = []byte(userDataS)
+
+	return nil
+}
+
+func (c *controller) getBootstrapTokenOrCreateIfNotExist(machineName string) (secret *corev1.Secret, err error) {
+	tokenID, secretName := getTokenIDAndSecretName(machineName)
+
+	secret, err = c.targetCoreClient.CoreV1().Secrets(metav1.NamespaceSystem).Get(secretName, metav1.GetOptions{})
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			bootstrapTokenSecretKey, err := generateRandomStringFromCharset(16, "0123456789abcdefghijklmnopqrstuvwxyz")
+			if err != nil {
+				return nil, err
+			}
+			data := map[string][]byte{
+				bootstraptokenapi.BootstrapTokenDescriptionKey:      []byte("A bootstrap token generated by MachineControllerManager."),
+				bootstraptokenapi.BootstrapTokenIDKey:               []byte(tokenID),
+				bootstraptokenapi.BootstrapTokenSecretKey:           []byte(bootstrapTokenSecretKey),
+				bootstraptokenapi.BootstrapTokenExpirationKey:       []byte(metav1.Now().Add(c.safetyOptions.MachineCreationTimeout.Duration).Format(time.RFC3339)),
+				bootstraptokenapi.BootstrapTokenUsageAuthentication: []byte("true"),
+				bootstraptokenapi.BootstrapTokenUsageSigningKey:     []byte("true"),
+				bootstraptokenapi.BootstrapTokenExtraGroupsKey:      []byte(c.bootstrapTokenAuthExtraGroups),
+			}
+
+			secret = &corev1.Secret{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      secretName,
+					Namespace: metav1.NamespaceSystem,
+				},
+				Type: bootstraptokenapi.SecretTypeBootstrapToken,
+				Data: data,
+			}
+
+			return c.targetCoreClient.CoreV1().Secrets(metav1.NamespaceSystem).Create(secret)
+		}
+		return nil, err
+	}
+
+	return secret, nil
+}
+
+func (c *controller) deleteBootstrapToken(machineName string) error {
+	_, secretName := getTokenIDAndSecretName(machineName)
+	err := c.targetCoreClient.CoreV1().Secrets(metav1.NamespaceSystem).Delete(secretName, &metav1.DeleteOptions{})
+	if err != nil && apierrors.IsNotFound(err) {
+		// Object no longer exists and has been deleted
+		return nil
+	}
+	return err
+}
+
+// generateRandomStringFromCharset generates a cryptographically secure random string of the specified length.
+// The set of allowed characters can be specified. Returns an error if there was a problem during the random generation.
+func generateRandomStringFromCharset(n int, allowedCharacters string) (string, error) {
+	output := make([]byte, n)
+	max := new(big.Int).SetInt64(int64(len(allowedCharacters)))
+	for i := range output {
+		randomCharacter, err := rand.Int(rand.Reader, max)
+		if err != nil {
+			return "", err
+		}
+		output[i] = allowedCharacters[randomCharacter.Int64()]
+	}
+	return string(output), nil
+}
+
+// getTokenIDAndSecretName derives a 6-character token ID from the hex encoding
+// of the last 5 bytes of the machine name and returns it together with the
+// corresponding bootstrap token secret name. It assumes machine names are at
+// least 5 characters long.
+func getTokenIDAndSecretName(machineName string) (string, string) {
+	tokenID := hex.EncodeToString([]byte(machineName)[len(machineName)-5:])[:6]
+	secretName := bootstraptokenutil.BootstrapTokenSecretName(tokenID)
+	return tokenID, secretName
+}
diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go
new file mode 100644
index 000000000..22abfbac2
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/machine_safety.go
@@ -0,0 +1,268 @@
+/*
+Copyright (c) 2018 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+	"context"
+	"time"
+
+	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog"
+)
+
+const (
+	// OverShootingReplicaCount is the freeze reason when the replica count overshoots
+	OverShootingReplicaCount = "OverShootingReplicaCount"
+	// MachineDeploymentStateSync is the freeze reason when a machineDeployment is found in an inconsistent state
+	MachineDeploymentStateSync = "MachineDeploymentStateSync"
+	// TimeoutOccurred is the freeze reason when a machineSet timeout occurs
+	TimeoutOccurred = "MachineSetTimeoutOccurred"
+	// UnfreezeAnnotation instructs the controllers to unfreeze this object
+	UnfreezeAnnotation = "safety.machine.sapcloud.io/unfreeze"
+)
+
+// reconcileClusterMachineSafetyOrphanVMs checks for any orphan VMs and deletes them
+func (c *controller) reconcileClusterMachineSafetyOrphanVMs(key string) error {
+	reSyncAfter := c.safetyOptions.MachineSafetyOrphanVMsPeriod.Duration
+	defer c.machineSafetyOrphanVMsQueue.AddAfter("", reSyncAfter)
+
+	klog.V(3).Infof("reconcileClusterMachineSafetyOrphanVMs: Start")
+	defer klog.V(3).Infof("reconcileClusterMachineSafetyOrphanVMs: End, reSync-Period: %v", reSyncAfter)
+
+	retry, err := c.checkMachineClasses()
+	if err != nil {
+		klog.Errorf("reconcileClusterMachineSafetyOrphanVMs: Error occurred while checking for orphan VMs: %s", err)
+		if retry {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// reconcileClusterMachineSafetyAPIServer checks whether the APIServers of the
+// control and target clusters are reachable. If either is unreachable for
+// longer than the configured timeout, it sets the MachineControllerFrozen flag.
+func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error {
+	statusCheckTimeout := c.safetyOptions.MachineSafetyAPIServerStatusCheckTimeout.Duration
+	statusCheckPeriod := c.safetyOptions.MachineSafetyAPIServerStatusCheckPeriod.Duration
+
+	klog.V(4).Infof("reconcileClusterMachineSafetyAPIServer: Start")
+	defer klog.V(4).Infof("reconcileClusterMachineSafetyAPIServer: Stop")
+
+	if c.safetyOptions.MachineControllerFrozen {
+		// MachineController is frozen
+		if c.isAPIServerUp() {
+			// APIServer is up again, hence we need to reset all machine health checks (to avoid unwanted freezes) and unfreeze
+			machines, err := c.machineLister.List(labels.Everything())
+			if err != nil {
+				klog.Error("SafetyController: Unable to LIST machines. Error:", err)
+				return err
+			}
+			for _, machine := range machines {
+				if machine.Status.CurrentStatus.Phase == v1alpha1.MachineUnknown {
+					machine, err := c.controlMachineClient.Machines(c.namespace).Get(machine.Name, metav1.GetOptions{})
+					if err != nil {
+						klog.Error("SafetyController: Unable to GET machines. Error:", err)
+						return err
+					}
+
+					machine.Status.CurrentStatus = v1alpha1.CurrentStatus{
+						Phase:          v1alpha1.MachineRunning,
+						TimeoutActive:  false,
+						LastUpdateTime: metav1.Now(),
+					}
+					machine.Status.LastOperation = v1alpha1.LastOperation{
+						Description:    "Machine Health Timeout was reset due to APIServer being unreachable",
+						LastUpdateTime: metav1.Now(),
+						State:          v1alpha1.MachineStateSuccessful,
+						Type:           v1alpha1.MachineOperationHealthCheck,
+					}
+					_, err = c.controlMachineClient.Machines(c.namespace).UpdateStatus(machine)
+					if err != nil {
+						klog.Error("SafetyController: Unable to UPDATE machine/status. Error:", err)
+						return err
+					}
+
+					klog.V(2).Info("SafetyController: Reinitializing machine health check for ", machine.Name)
+				}
+
+				// Enqueue after 30 seconds to ensure all machine states are reconciled
+				c.enqueueMachineAfter(machine, 30*time.Second)
+			}
+
+			c.safetyOptions.MachineControllerFrozen = false
+			c.safetyOptions.APIserverInactiveStartTime = time.Time{}
+			klog.V(2).Infof("SafetyController: UnFreezing Machine Controller")
+		}
+	} else {
+		// MachineController is not frozen
+		if !c.isAPIServerUp() {
+			// The APIServer is not up
+			if c.safetyOptions.APIserverInactiveStartTime.IsZero() {
+				// The timeout has not started yet
+				c.safetyOptions.APIserverInactiveStartTime = time.Now()
+			}
+			if time.Since(c.safetyOptions.APIserverInactiveStartTime) > statusCheckTimeout {
+				// The APIServer has been down for more than statusCheckTimeout
+				c.safetyOptions.MachineControllerFrozen = true
+				klog.V(2).Infof("SafetyController: Freezing Machine Controller")
+			}
+
+			// Re-enqueue the safety check more often while the APIServer is inactive and not yet frozen
+			defer c.machineSafetyAPIServerQueue.AddAfter("", statusCheckTimeout/5)
+			return nil
+		}
+	}
+
+	defer c.machineSafetyAPIServerQueue.AddAfter("", statusCheckPeriod)
+	return nil
+}
+
+// isAPIServerUp returns true if both the control and the target APIServers are reachable
+func (c *controller) isAPIServerUp() bool {
+	// Dummy GET call to check if the control APIServer is reachable
+	_, err := c.controlMachineClient.Machines(c.namespace).Get("dummy_name", metav1.GetOptions{})
+	if err != nil && !apierrors.IsNotFound(err) {
+		// GET returned an error other than NotFound, so assume the APIServer is unreachable
+		klog.Error("SafetyController: Unable to GET on machine objects ", err)
+		return false
+	}
+
+	// Dummy GET call to check if the target APIServer is reachable
+	_, err = c.targetCoreClient.CoreV1().Nodes().Get("dummy_name", metav1.GetOptions{})
+	if err != nil && !apierrors.IsNotFound(err) {
+		// GET returned an error other than NotFound, so assume the APIServer is unreachable
+		klog.Error("SafetyController: Unable to GET on node objects ", err)
+		return false
+	}
+
+	return true
+}
+
+// checkMachineClasses checks all MachineClasses for orphan VMs
+func (c *controller) checkMachineClasses() (machineutils.Retry, error) {
+	machineClasses, err := c.machineClassLister.List(labels.Everything())
+	if err != nil {
+		klog.Error("Safety-Net: Error getting machineClasses")
+		return machineutils.DoNotRetryOp, err
+	}
+
+	for _, machineClass := range machineClasses {
+		retry, err := c.checkMachineClass(
+			machineClass,
+			machineClass.SecretRef,
+		)
+		if err != nil {
+			return retry, err
+		}
+	}
+
+	return machineutils.DoNotRetryOp, nil
+}
+
+// checkMachineClass checks a particular machineClass for orphan instances
+func (c *controller) checkMachineClass(
+	machineClass *v1alpha1.MachineClass,
+	secretRef *corev1.SecretReference) (machineutils.Retry, error) {
+
+	// Get the secret referenced by the machineClass
+	secret, err := c.getSecret(secretRef, machineClass.Name)
+	if err != nil || secret == nil {
+		klog.Errorf("SafetyController: Secret reference not found for MachineClass: %q", machineClass.Name)
+		return machineutils.DoNotRetryOp, err
+	}
+
+	listMachineResponse, err := c.driver.ListMachines(context.TODO(), &driver.ListMachinesRequest{
+		MachineClass: machineClass,
+		Secret:       secret,
+	})
+	if err != nil {
+		klog.Errorf("SafetyController: Failed to LIST VMs at provider. Error: %s", err)
+		return machineutils.RetryOp, err
+	}
+
+	// Make sure these are not VMs that are just being created, whose machine
+	// objects have not yet reached the API server cache
+	if len(listMachineResponse.MachineList) > 1 {
+		stopCh := make(chan struct{})
+		defer close(stopCh)
+
+		if !cache.WaitForCacheSync(stopCh, c.machineSynced) {
+			syncErr := fmt.Errorf("SafetyController: Timed out waiting for machine caches to sync")
+			klog.Error(syncErr)
+			return machineutils.RetryOp, syncErr
+		}
+	}
+
+	for machineID, machineName := range listMachineResponse.MachineList {
+		machine, err := c.machineLister.Machines(c.namespace).Get(machineName)
+
+		if err != nil && !apierrors.IsNotFound(err) {
+			// Any other types of errors
+			klog.Errorf("SafetyController: Error while trying to GET machines. Error: %s", err)
+		} else if err != nil || machine.Spec.ProviderID != machineID {
+
+			// If the machine exists and is still being processed by the machine controller, skip it
+			if err == nil &&
+				machine.Status.CurrentStatus.Phase == "" {
+				klog.V(3).Infof("SafetyController: Machine object %q is being processed by the machine controller, hence skipping", machine.Name)
+				continue
+			}
+
+			// Creating a dummy machine object to create a deleteMachineRequest
+			machine = &v1alpha1.Machine{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: machineName,
+				},
+				Spec: v1alpha1.MachineSpec{
+					ProviderID: machineID,
+				},
+			}
+
+			_, err := c.driver.DeleteMachine(context.TODO(), &driver.DeleteMachineRequest{
+				Machine:      machine,
+				MachineClass: machineClass,
+				Secret:       secret,
+			})
+			if err != nil {
+				klog.Errorf("SafetyController: Error while trying to DELETE VM on cloud provider - %s. Shall retry in next safety controller sync.", err)
+			} else {
+				klog.V(2).Infof("SafetyController: Orphan VM found and terminated VM: %s, %s", machineName, machineID)
+			}
+		}
+	}
+	return machineutils.DoNotRetryOp, nil
+}
+
+// deleteMachineToSafety enqueues into machineSafetyOrphanVMsQueue when a machine is deleted
+func (c *controller) deleteMachineToSafety(obj interface{}) {
+	machine := obj.(*v1alpha1.Machine)
+	c.enqueueMachineSafetyOrphanVMsKey(machine)
+}
+
+// enqueueMachineSafetyOrphanVMsKey enqueues into machineSafetyOrphanVMsQueue
+func (c *controller) enqueueMachineSafetyOrphanVMsKey(obj interface{}) {
+	c.machineSafetyOrphanVMsQueue.Add("")
+}
diff --git a/pkg/util/provider/machinecontroller/machine_safety_test.go b/pkg/util/provider/machinecontroller/machine_safety_test.go
new file mode 100644
index 000000000..29a07ef6a
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/machine_safety_test.go
@@ -0,0 +1,138 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package controller + +import ( + "time" + + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + machinev1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + . "github.com/onsi/ginkgo" + . "github.com/onsi/ginkgo/extensions/table" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +var _ = Describe("#machine_safety", func() { + + const ( + zeroDuration = time.Duration(0) + fiveSecondsDuration = 5 * time.Second + fiveMinutesDuration = 5 * time.Minute + ) + DescribeTable("##reconcileClusterMachineSafetyAPIServer", + func( + controlAPIServerIsUp bool, + targetAPIServerIsUp bool, + apiServerInactiveDuration time.Duration, + preMachineControllerIsFrozen bool, + postMachineControllerFrozen bool, + ) { + apiServerInactiveStartTime := time.Now().Add(-apiServerInactiveDuration) + stop := make(chan struct{}) + defer close(stop) + + testMachine := &machinev1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testmachine1", + Namespace: testNamespace, + }, + Status: machinev1.MachineStatus{ + CurrentStatus: machinev1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + }, + }, + } + controlMachineObjects := []runtime.Object{} + controlMachineObjects = append(controlMachineObjects, testMachine) + + c, trackers := createController(stop, testNamespace, controlMachineObjects, nil, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + c.safetyOptions.APIserverInactiveStartTime = apiServerInactiveStartTime + c.safetyOptions.MachineControllerFrozen = preMachineControllerIsFrozen + if !controlAPIServerIsUp { + trackers.ControlMachine.SetError("APIServer is Not Reachable") + trackers.ControlCore.SetError("APIServer is Not Reachable") + } + if !targetAPIServerIsUp { + trackers.TargetCore.SetError("APIServer is Not Reachable") + } + + c.reconcileClusterMachineSafetyAPIServer("") + + Expect(c.safetyOptions.MachineControllerFrozen).Should(Equal(postMachineControllerFrozen)) + }, + + // Both APIServers are reachable + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Inactive, Pre-Frozen: false = Post-Frozen: false", + true, true, zeroDuration, false, false), + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Inactive, Pre-Frozen: true = Post-Frozen: false", + true, true, zeroDuration, true, false), + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Started, Pre-Frozen: false = Post-Frozen: false", + true, true, fiveSecondsDuration, false, false), + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Started, Pre-Frozen: true = Post-Frozen: false", + true, true, fiveSecondsDuration, true, false), + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Elapsed, Pre-Frozen: false = Post-Frozen: false", + true, true, fiveMinutesDuration, false, false), + Entry("Control APIServer: Reachable, Target APIServer: Reachable, Inactive Timer: Elapsed, Pre-Frozen: true = Post-Frozen: false", + true, true, fiveMinutesDuration, true, false), + + // 
Target APIServer is not reachable + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Inactive, Pre-Frozen: false = Post-Frozen: false", + true, false, zeroDuration, false, false), + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Inactive, Pre-Frozen: true = Post-Frozen: true", + true, false, zeroDuration, true, true), + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Started, Pre-Frozen: false = Post-Frozen: false", + true, false, fiveSecondsDuration, false, false), + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Started, Pre-Frozen: true = Post-Frozen: true", + true, false, fiveSecondsDuration, true, true), + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Elapsed, Pre-Frozen: false = Post-Frozen: true", + true, false, fiveMinutesDuration, false, true), + Entry("Control APIServer: Reachable, Target APIServer: UnReachable, Inactive Timer: Elapsed, Pre-Frozen: true = Post-Frozen: true", + true, false, fiveMinutesDuration, true, true), + + // Control APIServer is not reachable + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Inactive, Pre-Frozen: false = Post-Frozen: false", + false, true, zeroDuration, false, false), + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Inactive, Pre-Frozen: true = Post-Frozen: true", + false, true, zeroDuration, true, true), + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Started, Pre-Frozen: false = Post-Frozen: false", + false, true, fiveSecondsDuration, false, false), + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Started, Pre-Frozen: true = Post-Frozen: true", + false, true, fiveSecondsDuration, true, true), + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Elapsed, Pre-Frozen: false = Post-Frozen: true", + false, true, fiveMinutesDuration, false, true), + Entry("Control APIServer: UnReachable, Target APIServer: Reachable, Inactive Timer: Elapsed, Pre-Frozen: true = Post-Frozen: true", + false, true, fiveMinutesDuration, true, true), + + // Both APIServers are not reachable + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Inactive, Pre-Frozen: false = Post-Frozen: false", + false, false, zeroDuration, false, false), + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Inactive, Pre-Frozen: true = Post-Frozen: true", + false, false, zeroDuration, true, true), + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Started, Pre-Frozen: false = Post-Frozen: false", + false, false, fiveSecondsDuration, false, false), + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Started, Pre-Frozen: true = Post-Frozen: true", + false, false, fiveSecondsDuration, true, true), + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Elapsed, Pre-Frozen: false = Post-Frozen: true", + false, false, fiveMinutesDuration, false, true), + Entry("Control APIServer: UnReachable, Target APIServer: UnReachable, Inactive Timer: Elapsed, Pre-Frozen: true = Post-Frozen: true", + false, false, fiveMinutesDuration, true, true), + ) +}) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go new 
file mode 100644 index 000000000..f9e732505 --- /dev/null +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -0,0 +1,2534 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package controller + +import ( + "fmt" + "math" + "time" + + machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/validation" + fakemachineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake" + customfake "github.com/gardener/machine-controller-manager/pkg/fakeclient" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" + . "github.com/onsi/ginkgo" + . "github.com/onsi/ginkgo/extensions/table" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/validation/field" + k8stesting "k8s.io/client-go/testing" +) + +const testNamespace = "test" + +var _ = Describe("machine", func() { + var ( + fakeMachineClient *fakemachineapi.FakeMachineV1alpha1 + c *controller + ) + + Describe("#isHealthy", func() { + BeforeEach(func() { + fakeMachineClient = &fakemachineapi.FakeMachineV1alpha1{ + Fake: &k8stesting.Fake{}, + } + c = &controller{ + controlMachineClient: fakeMachineClient, + nodeConditions: "ReadonlyFilesystem,KernelDeadlock,DiskPressure", + } + }) + + testMachine := v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testmachine", + Namespace: testNamespace, + }, + Status: v1alpha1.MachineStatus{ + Conditions: []corev1.NodeCondition{}, + }, + } + DescribeTable("Checking health of the machine", + func(conditionType corev1.NodeConditionType, conditionStatus corev1.ConditionStatus, expected bool) { + testMachine.Status.Conditions = []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + { + Type: corev1.NodeDiskPressure, + Status: corev1.ConditionFalse, + }, + { + Type: corev1.NodeMemoryPressure, + Status: corev1.ConditionFalse, + }, + { + Type: corev1.NodeNetworkUnavailable, + Status: corev1.ConditionFalse, + }, + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + } + for i, condition := range testMachine.Status.Conditions { + if condition.Type == conditionType { + testMachine.Status.Conditions[i].Status = conditionStatus + break + } + } + Expect(c.isHealthy(&testMachine)).Should(BeIdenticalTo(expected)) + }, + Entry("with NodeReady is True", corev1.NodeReady, corev1.ConditionTrue, true), + Entry("with NodeReady is False", corev1.NodeReady, corev1.ConditionFalse, false), + Entry("with NodeReady is Unknown", corev1.NodeReady, corev1.ConditionUnknown, false), + + Entry("with NodeDiskPressure is True", corev1.NodeDiskPressure, corev1.ConditionTrue, false), 
+ Entry("with NodeDiskPressure is False", corev1.NodeDiskPressure, corev1.ConditionFalse, true), + Entry("with NodeDiskPressure is Unknown", corev1.NodeDiskPressure, corev1.ConditionUnknown, false), + + Entry("with NodeMemoryPressure is True", corev1.NodeMemoryPressure, corev1.ConditionTrue, true), + Entry("with NodeMemoryPressure is Unknown", corev1.NodeMemoryPressure, corev1.ConditionUnknown, true), + Entry("with NodeMemoryPressure is False", corev1.NodeMemoryPressure, corev1.ConditionFalse, true), + + Entry("with NodeNetworkUnavailable is True", corev1.NodeNetworkUnavailable, corev1.ConditionTrue, true), + Entry("with NodeNetworkUnavailable is Unknown", corev1.NodeNetworkUnavailable, corev1.ConditionUnknown, true), + Entry("with NodeNetworkUnavailable is False", corev1.NodeNetworkUnavailable, corev1.ConditionFalse, true), + + Entry("with NodeReady is True", corev1.NodeReady, corev1.ConditionTrue, true), + Entry("with NodeReady is Unknown", corev1.NodeReady, corev1.ConditionUnknown, false), + Entry("with NodeReady is False", corev1.NodeReady, corev1.ConditionFalse, false), + ) + }) + + /* + Describe("##updateMachineConditions", func() { + Describe("Update conditions of a non-existing machine", func() { + It("should return error", func() { + stop := make(chan struct{}) + defer close(stop) + + objects := []runtime.Object{} + c, trackers := createController(stop, testNamespace, objects, nil, nil) + defer trackers.Stop() + + testMachine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testmachine", + Namespace: testNamespace, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + }, + }, + } + conditions := []corev1.NodeCondition{} + var _, err = c.updateMachineConditions(testMachine, conditions) + Expect(err).Should(Not(BeNil())) + }) + }) + DescribeTable("Update conditions of an existing machine", + func(phase v1alpha1.MachinePhase, conditions []corev1.NodeCondition, expectedPhase v1alpha1.MachinePhase) { + stop := make(chan struct{}) + defer close(stop) + + testMachine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testmachine", + Namespace: testNamespace, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: phase, + }, + }, + } + objects := []runtime.Object{} + objects = append(objects, testMachine) + + c, trackers := createController(stop, testNamespace, objects, nil, nil) + defer trackers.Stop() + + var updatedMachine, err = c.updateMachineConditions(testMachine, conditions) + Expect(updatedMachine.Status.Conditions).Should(BeEquivalentTo(conditions)) + Expect(updatedMachine.Status.CurrentStatus.Phase).Should(BeIdenticalTo(expectedPhase)) + Expect(err).Should(BeNil()) + }, + Entry("healthy status but machine terminating", v1alpha1.MachineTerminating, []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, v1alpha1.MachineTerminating), + Entry("unhealthy status but machine running", v1alpha1.MachineRunning, []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionFalse, + }, + }, v1alpha1.MachineUnknown), + Entry("healthy status but machine not running", v1alpha1.MachineAvailable, []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, v1alpha1.MachineRunning), + ) + }) + */ + + Describe("#ValidateMachine", func() { + type data struct { + action machineapi.Machine + expect field.ErrorList + } + DescribeTable("#happy path", + func(data *data) { + errList 
:= validation.ValidateMachine(&data.action) + Expect(errList).To(Equal(data.expect)) + }, + Entry("aws", &data{ + action: machineapi.Machine{ + Spec: machineapi.MachineSpec{ + Class: machineapi.ClassSpec{ + Kind: "AWSMachineClass", + Name: "aws", + }, + }, + }, + expect: field.ErrorList{}, + }), + ) + }) + + Describe("#ValidateMachineClass", func() { + type setup struct { + aws []*v1alpha1.MachineClass + secrets []*corev1.Secret + } + type expect struct { + machineClass interface{} + secret *corev1.Secret + err bool + } + type data struct { + setup setup + action *v1alpha1.ClassSpec + expect expect + } + + objMeta := &metav1.ObjectMeta{ + GenerateName: "class", + Namespace: testNamespace, + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.aws { + machineObjects = append(machineObjects, o) + } + + coreObjects := []runtime.Object{} + for _, o := range data.setup.secrets { + coreObjects = append(coreObjects, o) + } + + controller, trackers := createController(stop, objMeta.Namespace, machineObjects, nil, coreObjects, nil) + defer trackers.Stop() + + waitForCacheSync(stop, controller) + machineClass, secret, err := controller.ValidateMachineClass(data.action) + + if data.expect.machineClass == nil { + Expect(machineClass).To(BeNil()) + } else { + Expect(machineClass).To(Equal(data.expect.machineClass)) + } + if data.expect.secret == nil { + Expect(secret).To(BeNil()) + } else { + Expect(secret).To(Equal(data.expect.secret)) + } + if !data.expect.err { + Expect(err).To(BeNil()) + } else { + Expect(err).To(HaveOccurred()) + } + }, + Entry("non-existing machine class", &data{ + setup: setup{ + aws: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + }, + action: &v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "non-existing", + }, + expect: expect{ + err: true, + }, + }), + Entry("non-existing secret", &data{ + setup: setup{ + secrets: []*corev1.Secret{}, + aws: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + }, + action: &v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "class-0", + }, + expect: expect{ + machineClass: &v1alpha1.MachineClass{ + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + err: false, //TODO Why? 
Create issue + }, + }), + Entry("valid machineClass", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + aws: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + }, + action: &v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "class-0", + }, + expect: expect{ + machineClass: &v1alpha1.MachineClass{ + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + secret: &corev1.Secret{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + err: false, + }, + }), + ) + }) + + Describe("#triggerCreationFlow", func() { + type setup struct { + machineClaasses []*v1alpha1.MachineClass + machines []*v1alpha1.Machine + secrets []*corev1.Secret + fakeResourceActions *customfake.ResourceActions + } + type action struct { + machine string + fakeDriver *driver.FakeDriver + } + type expect struct { + machine *v1alpha1.Machine + err error + retry machineutils.Retry + } + type data struct { + setup setup + action action + expect expect + } + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + Namespace: "test", + } + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.machineClaasses { + machineObjects = append(machineObjects, o) + } + for _, o := range data.setup.machines { + machineObjects = append(machineObjects, o) + } + + coreObjects := []runtime.Object{} + for _, o := range data.setup.secrets { + coreObjects = append(coreObjects, o) + } + + fakedriver := driver.NewFakeDriver(data.action.fakeDriver) + + controller, trackers := createController(stop, objMeta.Namespace, machineObjects, nil, coreObjects, fakedriver) + defer trackers.Stop() + + waitForCacheSync(stop, controller) + + action := data.action + machine, err := controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + machineClass, err := controller.controlMachineClient.MachineClasses(objMeta.Namespace).Get(machine.Spec.Class.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + secret, err := controller.targetCoreClient.CoreV1().Secrets(objMeta.Namespace).Get(machineClass.SecretRef.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + retry, err := controller.triggerCreationFlow( + &driver.CreateMachineRequest{ + Machine: machine, + MachineClass: machineClass, + Secret: secret, + }, + ) + + if data.expect.err != nil || err != nil { + Expect(err).To(HaveOccurred()) + Expect(err).To(Equal(data.expect.err)) + } + + actual, err := controller.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{}) + Expect(err).To(BeNil()) + Expect(actual.Spec.ProviderID).To(Equal(data.expect.machine.Spec.ProviderID)) + Expect(actual.Status.Node).To(Equal(data.expect.machine.Status.Node)) + Expect(actual.Finalizers).To(Equal(data.expect.machine.Finalizers)) + Expect(retry).To(Equal(data.expect.retry)) + }, + + Entry("Machine creation in process. 
Machine finalizers are UPDATED", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClaasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil, false), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: false, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machineClass", + }, + }, + }, nil, nil, nil, nil, true), + err: fmt.Errorf("Machine creation in process. Machine finalizers are UPDATED"), + retry: true, + }, + }), + Entry("Machine creation succeeds with object UPDATE", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClaasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil, true), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: false, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machineClass", + }, + ProviderID: "fakeID", + }, + }, nil, nil, nil, nil, true), + err: fmt.Errorf("Machine creation in process. 
Machine UPDATE successful"), + retry: true, + }, + }), + Entry("Machine creation succeeds with status UPDATE", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClaasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + nil, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + err: fmt.Errorf("Machine creation in process. Machine/Status UPDATE successful"), + retry: true, + }, + }), + Entry("Machine creation has already succeeded, so no update", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClaasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachinePending, + + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Creating machine on cloud provider", + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachinePending, + + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Creating machine on cloud provider", + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + err: nil, + retry: false, + }, + }), + + /* + 
Entry("Machine creation success even on temporary APIServer disruption", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + aws: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil), + fakeResourceActions: &customfake.ResourceActions{ + Machine: customfake.Actions{ + Get: "Failed to GET machine", + }, + }, + }, + action: action{ + machine: "machine-0", + fakeProviderID: "fakeID-0", + fakeNodeName: "fakeNode-0", + fakeError: nil, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, &v1alpha1.MachineStatus{ + Node: "fakeNode", + //TODO conditions + }, nil, nil, nil), + err: false, + }, + }), + */ + ) + }) + + Describe("#triggerDeletionFlow", func() { + type setup struct { + secrets []*corev1.Secret + machineClasses []*v1alpha1.MachineClass + machines []*v1alpha1.Machine + nodes []*corev1.Node + fakeResourceActions *customfake.ResourceActions + } + type action struct { + machine string + forceDeleteLabelPresent bool + fakeMachineStatus *v1alpha1.MachineStatus + fakeDriver *driver.FakeDriver + } + type expect struct { + machine *v1alpha1.Machine + err error + nodeDeleted bool + retry machineutils.Retry + } + type data struct { + setup setup + action action + expect expect + } + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + Namespace: "test", + } + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.machineClasses { + machineObjects = append(machineObjects, o) + } + for _, o := range data.setup.machines { + machineObjects = append(machineObjects, o) + } + + coreObjects := []runtime.Object{} + for _, o := range data.setup.secrets { + coreObjects = append(coreObjects, o) + } + for _, o := range data.setup.nodes { + coreObjects = append(coreObjects, o) + } + + fakeDriver := driver.NewFakeDriver( + data.action.fakeDriver, + ) + + controller, trackers := createController(stop, objMeta.Namespace, machineObjects, nil, coreObjects, fakeDriver) + defer trackers.Stop() + waitForCacheSync(stop, controller) + + action := data.action + machine, err := controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + machineClass, err := controller.controlMachineClient.MachineClasses(objMeta.Namespace).Get(machine.Spec.Class.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + secret, err := controller.targetCoreClient.CoreV1().Secrets(objMeta.Namespace).Get(machineClass.SecretRef.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + if data.setup.fakeResourceActions != nil { + trackers.TargetCore.SetFakeResourceActions(data.setup.fakeResourceActions, math.MaxInt32) + } + + // Deletion of machine is triggered + retry, err := controller.triggerDeletionFlow(&driver.DeleteMachineRequest{ + Machine: machine, + MachineClass: machineClass, + Secret: secret, + }) + if err != nil || data.expect.err != nil { + 
Expect(err).To(Equal(data.expect.err)) + } + Expect(retry).To(Equal(data.expect.retry)) + + machine, err = controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(machine.Spec).To(Equal(data.expect.machine.Spec)) + Expect(machine.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase)) + Expect(machine.Status.LastOperation.State).To(Equal(data.expect.machine.Status.LastOperation.State)) + Expect(machine.Status.LastOperation.Type).To(Equal(data.expect.machine.Status.LastOperation.Type)) + Expect(machine.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description)) + Expect(machine.Finalizers).To(Equal(data.expect.machine.Finalizers)) + + if data.expect.nodeDeleted { + _, nodeErr := controller.targetCoreClient.CoreV1().Nodes().Get(machine.Status.Node, metav1.GetOptions{}) + Expect(nodeErr).To(HaveOccurred()) + } + + }, + Entry("Do not process machine deletion for object without finalizer", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + false, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine \"machine-0\" is missing finalizers. 
Deletion cannot proceed"), + retry: machineutils.DoNotRetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + false, + ), + }, + }), + Entry("Change machine phase to termination successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. 
Phase set to termination"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.GetVMStatus, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Checking existance of VM at provider successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.GetVMStatus, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. 
VM with matching ID found"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Drain machine successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Drain skipping as machine is NotReady for a long time (5 minutes)", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + Conditions: []corev1.NodeCondition{ + corev1.NodeCondition{ + Type: corev1.NodeReady, + Status: corev1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)), + }, + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Drain machine failure, but since force deletion label is present deletion continues", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + "force-deletion": "True", + }, + true, + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + }, + }, + fakeResourceActions: &customfake.ResourceActions{ + Node: customfake.Actions{ + Update: "Failed to update node", + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Failed to update node"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Drain machine failure after drain timeout, hence deletion continues", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.NewTime(time.Now().Add(-2 * time.Hour)), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.NewTime(time.Now().Add(-2 * time.Hour)), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + }, + }, + fakeResourceActions: &customfake.ResourceActions{ + Node: customfake.Actions{ + Update: "Failed to update node", + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Failed to update node"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Drain machine failure", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.InitiateDrain, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + }, + }, + fakeResourceActions: &customfake.ResourceActions{ + Node: customfake.Actions{ + Update: "Failed to update node", + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Failed to update node"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain failed due to - Failed to update node. Will retry in next sync. %s", machineutils.InitiateDrain), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Delete VM successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. VM deletion was successful. " + machineutils.InitiateNodeDeletion), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("VM deletion was successful. %s", machineutils.InitiateNodeDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Delete node object successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("VM deletion was successful. %s", machineutils.InitiateNodeDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. Deletion of node object was succesful"), + retry: machineutils.RetryOp, + nodeDeleted: true, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Deletion of Node Object %q is successful. 
%s", "fakeID-0", machineutils.InitiateFinalizerRemoval), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + Entry("Delete machine finalizer successfully", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Deletion of Node Object %q is successful. %s", "fakeID-0", machineutils.InitiateFinalizerRemoval), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + retry: machineutils.DoNotRetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Deletion of Node Object %q is successful. 
%s", "fakeID-0", machineutils.InitiateFinalizerRemoval), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + false, + ), + }, + }), + Entry("Unable to decode deletion flow state for machine", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Some random last op description", + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. Phase set to termination"), + retry: machineutils.RetryOp, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + Node: "fakeNode", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.GetVMStatus, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + "node": "fakeID-0", + }, + true, + ), + }, + }), + ) + }) + + /* + Describe("#checkMachineTimeout", func() { + type setup struct { + machines []*v1alpha1.Machine + } + type action struct { + machine string + } + type expect struct { + machine *v1alpha1.Machine + err bool + } + type data struct { + setup setup + action action + expect expect + } + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + Namespace: "test", + } + + machineName := "machine-0" + timeOutOccurred := -21 * time.Minute + timeOutNotOccurred := -5 * time.Minute + creationTimeOut := 20 * time.Minute + healthTimeOut := 10 * time.Minute + + DescribeTable("##Machine Timeout Scenarios", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.machines { + machineObjects = append(machineObjects, o) + } + + coreObjects := []runtime.Object{} + + controller, trackers := createController(stop, objMeta.Namespace, machineObjects, nil, coreObjects) + defer trackers.Stop() + waitForCacheSync(stop, controller) + + action := data.action + machine, err := 
controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + //Expect(err).ToNot(HaveOccurred()) + + controller.checkMachineTimeout(machine) + + actual, err := controller.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{}) + Expect(err).To(BeNil()) + Expect(actual.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase)) + Expect(actual.Status.CurrentStatus.//TimeoutActive).To(Equal(data.expect.machine.Status.CurrentStatus.//TimeoutActive)) + Expect(actual.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description)) + Expect(actual.Status.LastOperation.State).To(Equal(data.expect.machine.Status.LastOperation.State)) + Expect(actual.Status.LastOperation.Type).To(Equal(data.expect.machine.Status.LastOperation.Type)) + }, + Entry("Machine is still running", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + //TimeoutActive: false, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutNotOccurred)), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine %s successfully joined the cluster", machineName), + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutNotOccurred)), + }, + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine %s successfully joined the cluster", machineName), + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + }, + }, nil, nil, nil), + }, + }), + Entry("Machine creation has still not timed out", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutNotOccurred)), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown", machineName), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutNotOccurred)), + }, + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown", machineName), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + }, + }, nil, nil, nil), + }, + }), + Entry("Machine creation has timed out", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachinePending, +
LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutOccurred)), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Creating machine on cloud provider", + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutOccurred)), + }, + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineFailed, + + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf( + "Machine %s failed to join the cluster in %s minutes.", + machineName, + creationTimeOut, + ), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationCreate, + }, + }, nil, nil, nil), + }, + }), + Entry("Machine health has timed out", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutOccurred)), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown", machineName), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationHealthCheck, + LastUpdateTime: metav1.NewTime(time.Now().Add(timeOutOccurred)), + }, + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineFailed, + + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf( + "Machine %s is not healthy since %s minutes. Changing status to failed. 
Node Conditions: %+v", + machineName, + healthTimeOut, + []corev1.NodeCondition{}, + ), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationHealthCheck, + }, + }, nil, nil, nil), + }, + }), + ) + }) + + Describe("#updateMachineState", func() { + type setup struct { + machines []*v1alpha1.Machine + nodes []*corev1.Node + } + type action struct { + machine string + } + type expect struct { + machine *v1alpha1.Machine + err bool + } + type data struct { + setup setup + action action + expect expect + } + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + // using default namespace for non-namespaced objects + // as our current fake client is with the assumption + // that all objects are namespaced + Namespace: "", + } + + machineName := "machine-0" + + DescribeTable("##Different machine state update scenrios", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.machines { + machineObjects = append(machineObjects, o) + } + + coreObjects := []runtime.Object{} + for _, o := range data.setup.nodes { + coreObjects = append(coreObjects, o) + } + + controller, trackers := createController(stop, objMeta.Namespace, machineObjects, nil, coreObjects) + defer trackers.Stop() + waitForCacheSync(stop, controller) + + action := data.action + machine, err := controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + controller.updateMachineState(machine) + + actual, err := controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{}) + Expect(err).To(BeNil()) + Expect(actual.Name).To(Equal(data.expect.machine.Name)) + Expect(actual.Status.Node).To(Equal(data.expect.machine.Status.Node)) + Expect(actual.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase)) + Expect(actual.Status.CurrentStatus.//TimeoutActive).To(Equal(data.expect.machine.Status.CurrentStatus.//TimeoutActive)) + Expect(actual.Status.LastOperation.State).To(Equal(data.expect.machine.Status.LastOperation.State)) + Expect(actual.Status.LastOperation.Type).To(Equal(data.expect.machine.Status.LastOperation.Type)) + Expect(actual.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description)) + + if data.expect.machine.Labels != nil { + if _, ok := data.expect.machine.Labels["node"]; ok { + Expect(actual.Labels["node"]).To(Equal(data.expect.machine.Labels["node"])) + } + } + + for i := range actual.Status.Conditions { + Expect(actual.Status.Conditions[i].Type).To(Equal(data.expect.machine.Status.Conditions[i].Type)) + Expect(actual.Status.Conditions[i].Status).To(Equal(data.expect.machine.Status.Conditions[i].Status)) + Expect(actual.Status.Conditions[i].Reason).To(Equal(data.expect.machine.Status.Conditions[i].Reason)) + Expect(actual.Status.Conditions[i].Message).To(Equal(data.expect.machine.Status.Conditions[i].Message)) + } + }, + Entry("Machine does not have a node backing", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{}, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{}, nil, nil, nil), + }, + }), + Entry("Node object backing machine not found and machine conditions are empty", 
&data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "dummy-node", + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "dummy-node", + }, nil, nil, nil), + }, + }), + Entry("Machine is running but node object is lost", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "dummy-node", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + //TimeoutActive: false, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Machine % successfully joined the cluster", machineName), + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + Conditions: []corev1.NodeCondition{ + { + Message: "kubelet is posting ready status", + Reason: "KubeletReady", + Status: "True", + Type: "Ready", + }, + }, + }, nil, nil, nil), + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "dummy-node", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf( + "Node object went missing. Machine %s is unhealthy - changing MachineState to Unknown", + machineName, + ), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationHealthCheck, + LastUpdateTime: metav1.Now(), + }, + Conditions: []corev1.NodeCondition{ + { + Message: "kubelet is posting ready status", + Reason: "KubeletReady", + Status: "True", + Type: "Ready", + }, + }, + }, nil, nil, nil), + }, + }), + Entry("Machine and node both are present and kubelet ready status is updated", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "machine", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachinePending, + + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Creating machine on cloud provider", + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + Conditions: []corev1.NodeCondition{ + { + Message: "kubelet is not ready", + Reason: "KubeletReady", + Status: "False", + Type: "Ready", + }, + }, + }, nil, nil, nil), + nodes: []*corev1.Node{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Message: "kubelet is posting ready status", + Reason: "KubeletReady", + Status: "True", + Type: "Ready", + }, + }, + }, + }, + }, + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "machine", + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + //TimeoutActive: false, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + 
State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + Conditions: []corev1.NodeCondition{ + { + Message: "kubelet is posting ready status", + Reason: "KubeletReady", + Status: "True", + Type: "Ready", + }, + }, + }, nil, nil, nil), + }, + }), + Entry("Machine object does not have node-label and node exists", &data{ + setup: setup{ + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + }, &v1alpha1.MachineStatus{ + Node: "node", + }, nil, nil, nil), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-0", + }, + }, + }, + }, + action: action{ + machine: machineName, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-0", + }, + }, &v1alpha1.MachineStatus{ + Node: "node", + }, nil, nil, + map[string]string{ + "node": "node-0", + }, + ), + }, + }), + ) + }) + */ + +}) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go new file mode 100644 index 000000000..e636e21f5 --- /dev/null +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -0,0 +1,1060 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This file was copied and modified from the kubernetes/kubernetes project +https://github.com/kubernetes/kubernetes/release-1.8/pkg/controller/deployment/util/pod_util.go + +Modifications Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. +*/ + +// Package controller is used to provide the core functionalities of machine-controller-manager +package controller + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "strings" + "time" + + machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/gardener/machine-controller-manager/pkg/util/provider/drain" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog" +) + +var ( + // emptyMap is a dummy emptyMap to compare with + emptyMap = make(map[string]string) +) + +// TODO: use client library instead when it starts to support update retries +// see https://github.com/kubernetes/kubernetes/issues/21479 +type updateMachineFunc func(machine *v1alpha1.Machine) error + +/* +// UpdateMachineWithRetries updates a machine with given applyUpdate function. Note that machine not found error is ignored. 
+// The returned bool value can be used to tell if the machine is actually updated. +func UpdateMachineWithRetries(machineClient v1alpha1client.MachineInterface, machineLister v1alpha1listers.MachineLister, namespace, name string, applyUpdate updateMachineFunc) (*v1alpha1.Machine, error) { + var machine *v1alpha1.Machine + + retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + var err error + machine, err = machineLister.Machines(namespace).Get(name) + if err != nil { + return err + } + machine = machine.DeepCopy() + // Apply the update, then attempt to push it to the apiserver. + if applyErr := applyUpdate(machine); applyErr != nil { + return applyErr + } + machine, err = machineClient.Update(machine) + return err + }) + + // Ignore the precondition violated error, this machine is already updated + // with the desired label. + if retryErr == errorsutil.ErrPreconditionViolated { + klog.V(4).Infof("Machine %s precondition doesn't hold, skip updating it.", name) + retryErr = nil + } + + return machine, retryErr +} +*/ + +func (c *controller) ValidateMachineClass(classSpec *v1alpha1.ClassSpec) (*v1alpha1.MachineClass, *v1.Secret, error) { + var ( + machineClass *v1alpha1.MachineClass + secretRef *v1.Secret + err error + ) + + machineClass, err = c.machineClassLister.MachineClasses(c.namespace).Get(classSpec.Name) + if err != nil { + klog.V(2).Infof("MachineClass %q/%q not found. Skipping. %v", c.namespace, classSpec.Name, err) + return nil, nil, err + } + + internalMachineClass := &machineapi.MachineClass{} + err = c.internalExternalScheme.Convert(machineClass, internalMachineClass, nil) + if err != nil { + klog.Warning("Error in scheme conversion") + return nil, nil, err + } + + // TODO: Perform validation + /* + validationerr := validation.ValidateMachineClass(internalMachineClass) + if validationerr.ToAggregate() != nil && len(validationerr.ToAggregate().Errors()) > 0 { + err = fmt.Errorf("Validation of MachineClass failed %s", validationerr.ToAggregate().Error()) + klog.Warning(err) + return nil, nil, err + } + */ + + secretRef, err = c.getSecret(machineClass.SecretRef, machineClass.Name) + if err != nil { + klog.Warningf("Secret not found for %q", machineClass.SecretRef.Name) + return nil, nil, err + } + + return machineClass, secretRef, nil +} + +// getSecret retrieves the kubernetes secret if found +func (c *controller) getSecret(ref *v1.SecretReference, MachineClassName string) (*v1.Secret, error) { + if ref == nil { + // If no secretRef, return nil + return nil, nil + } + + secretRef, err := c.secretLister.Secrets(ref.Namespace).Get(ref.Name) + if err != nil && apierrors.IsNotFound(err) { + klog.V(3).Infof("No secret %q found for MachineClass %q", ref.Name, MachineClassName) + return nil, nil + } else if err != nil { + klog.Errorf("Unable to get secret %q for MachineClass %q: %v", ref.Name, MachineClassName, err) + return nil, err + } + return secretRef, err +} + +// nodeConditionsHaveChanged compares the machine's stored node conditions with the node's current conditions to see if any condition status has changed +func nodeConditionsHaveChanged(machineConditions []v1.NodeCondition, nodeConditions []v1.NodeCondition) bool { + + if len(machineConditions) != len(nodeConditions) { + return true + } + + for i := range nodeConditions { + if nodeConditions[i].Status != machineConditions[i].Status { + return true + } + } + + return false +} +
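+// Editor's sketch (illustrative, not part of this change): the comparison above is
+// positional and only looks at Status, so two condition slices that carry the same
+// conditions in a different order are also reported as changed:
+//
+//	a := []v1.NodeCondition{{Type: "Ready", Status: "True"}, {Type: "DiskPressure", Status: "False"}}
+//	b := []v1.NodeCondition{{Type: "DiskPressure", Status: "False"}, {Type: "Ready", Status: "True"}}
+//	_ = nodeConditionsHaveChanged(a, b) // true, although both hold the same set of conditions
+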
+// syncMachineNodeTemplates syncs nodeTemplate elements between the machine and its corresponding node object. +// It ensures that every nodeTemplate element present on the Machine is also present on the node object; +// elements that already exist on the node object but are not part of the nodeTemplate are left untouched. +func (c *controller) syncMachineNodeTemplates(machine *v1alpha1.Machine) (machineutils.Retry, error) { + var ( + initializedNodeAnnotation bool + currentlyAppliedALTJSONByte []byte + lastAppliedALT v1alpha1.NodeTemplateSpec + ) + + node, err := c.nodeLister.Get(machine.Status.Node) + if err != nil && apierrors.IsNotFound(err) { + // Don't return an error so that other steps can be executed. + return machineutils.DoNotRetryOp, nil + } + if err != nil { + klog.Errorf("Error occurred while trying to fetch node object - err: %s", err) + return machineutils.DoNotRetryOp, err + } + + nodeCopy := node.DeepCopy() + + // Initialize node annotations if empty + if nodeCopy.Annotations == nil { + nodeCopy.Annotations = make(map[string]string) + initializedNodeAnnotation = true + } + + // Extract the last applied annotations, labels & taints into lastAppliedALT + lastAppliedALTJSONString, exists := node.Annotations[machineutils.LastAppliedALTAnnotation] + if exists { + err = json.Unmarshal([]byte(lastAppliedALTJSONString), &lastAppliedALT) + if err != nil { + klog.Errorf("Error occurred while syncing node annotations, labels & taints: %s", err) + return machineutils.RetryOp, err + } + } + + annotationsChanged := SyncMachineAnnotations(machine, nodeCopy, lastAppliedALT.Annotations) + labelsChanged := SyncMachineLabels(machine, nodeCopy, lastAppliedALT.Labels) + taintsChanged := SyncMachineTaints(machine, nodeCopy, lastAppliedALT.Spec.Taints) + + // Update node-object with latest nodeTemplate elements if elements have changed. + if initializedNodeAnnotation || labelsChanged || annotationsChanged || taintsChanged { + + klog.V(2).Infof( + "Updating machine annotations:%v, labels:%v, taints:%v for machine: %q", + annotationsChanged, + labelsChanged, + taintsChanged, + machine.Name, + ) + + // Update the machineutils.LastAppliedALTAnnotation + lastAppliedALT = machine.Spec.NodeTemplateSpec + currentlyAppliedALTJSONByte, err = json.Marshal(lastAppliedALT) + if err != nil { + klog.Errorf("Error occurred while syncing node annotations, labels & taints: %s", err) + return machineutils.RetryOp, err + } + nodeCopy.Annotations[machineutils.LastAppliedALTAnnotation] = string(currentlyAppliedALTJSONByte) + + _, err := c.targetCoreClient.CoreV1().Nodes().Update(nodeCopy) + if err != nil { + // Keep retrying until update goes through + klog.Errorf("Update failed for node object of machine %q. Retrying, error: %q", machine.Name, err) + } else { + // Return an error even on success so that the machine is requeued once more + err = fmt.Errorf("Machine ALTs have been reconciled") + } + + return machineutils.RetryOp, err + } + + return machineutils.DoNotRetryOp, nil +} +
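+// Editor's sketch (illustrative, not part of this change): the last-applied bookkeeping
+// above is a plain JSON round-trip of the machine's NodeTemplateSpec through a node
+// annotation, conceptually:
+//
+//	altJSON, err := json.Marshal(machine.Spec.NodeTemplateSpec)
+//	if err == nil {
+//		nodeCopy.Annotations[machineutils.LastAppliedALTAnnotation] = string(altJSON)
+//	}
+//
+// On the next sync the annotation is unmarshalled back into lastAppliedALT, which lets
+// deletions on the machine side be told apart from keys added to the node by other controllers.
+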
+// SyncMachineAnnotations syncs the annotations of the machine with node-objects. +// It returns true if an update is needed, else false. +func SyncMachineAnnotations( + machine *v1alpha1.Machine, + node *v1.Node, + lastAppliedAnnotations map[string]string, +) bool { + toBeUpdated := false + mAnnotations, nAnnotations := machine.Spec.NodeTemplateSpec.Annotations, node.Annotations + + // Initialize node annotations if nil + if nAnnotations == nil { + nAnnotations = make(map[string]string) + node.Annotations = nAnnotations + } + // Initialize machine annotations to empty map if nil + if mAnnotations == nil { + mAnnotations = emptyMap + } + + // Delete any annotation that existed in the past but has been deleted now + for lastAppliedAnnotationKey := range lastAppliedAnnotations { + if _, exists := mAnnotations[lastAppliedAnnotationKey]; !exists { + delete(nAnnotations, lastAppliedAnnotationKey) + toBeUpdated = true + } + } + + // Add/Update any key that doesn't exist or whose value has changed + for mKey, mValue := range mAnnotations { + if nValue, exists := nAnnotations[mKey]; !exists || mValue != nValue { + nAnnotations[mKey] = mValue + toBeUpdated = true + } + } + + return toBeUpdated +} + +// SyncMachineLabels syncs the labels of the machine with node-objects. +// It returns true if an update is needed, else false. +func SyncMachineLabels( + machine *v1alpha1.Machine, + node *v1.Node, + lastAppliedLabels map[string]string, +) bool { + toBeUpdated := false + mLabels, nLabels := machine.Spec.NodeTemplateSpec.Labels, node.Labels + + // Initialize node labels if nil + if nLabels == nil { + nLabels = make(map[string]string) + node.Labels = nLabels + } + // Initialize machine labels to empty map if nil + if mLabels == nil { + mLabels = emptyMap + } + + // Delete any labels that existed in the past but have been deleted now + for lastAppliedLabelKey := range lastAppliedLabels { + if _, exists := mLabels[lastAppliedLabelKey]; !exists { + delete(nLabels, lastAppliedLabelKey) + toBeUpdated = true + } + } + + // Add/Update any key that doesn't exist or whose value has changed + for mKey, mValue := range mLabels { + if nValue, exists := nLabels[mKey]; !exists || mValue != nValue { + nLabels[mKey] = mValue + toBeUpdated = true + } + } + + return toBeUpdated +} + +type taintKeyEffect struct { + // Required. The taint key to be applied to a node. + Key string + // Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + Effect v1.TaintEffect +} +
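+// Editor's note (illustrative, hypothetical values): SyncMachineAnnotations and
+// SyncMachineLabels above share the same three-way merge, e.g. for labels:
+//
+//	machine:     {"role": "worker"}                // desired state
+//	node:        {"role": "worker", "ext": "x"}    // "ext" was set by another controller
+//	lastApplied: {"role": "worker", "zone": "a"}   // what this controller applied previously
+//
+// "zone" is removed from the node because it was applied earlier and the machine no longer
+// requests it, while "ext" is left untouched because it never came from this controller.
+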
+
+// SyncMachineTaints syncs the taints of the machine object with the node object.
+// It returns true if an update is needed, else false.
+func SyncMachineTaints(
+	machine *v1alpha1.Machine,
+	node *v1.Node,
+	lastAppliedTaints []v1.Taint,
+) bool {
+	toBeUpdated := false
+	mTaints, nTaints := machine.Spec.NodeTemplateSpec.Spec.Taints, node.Spec.Taints
+	mTaintsMap := make(map[taintKeyEffect]*v1.Taint)
+	nTaintsMap := make(map[taintKeyEffect]*v1.Taint)
+
+	// Convert the taint slices into maps keyed by [key, effect].
+	// This allows indexed lookups.
+	for i := range mTaints {
+		mTaint := &mTaints[i]
+		taintKE := taintKeyEffect{
+			Key:    mTaint.Key,
+			Effect: mTaint.Effect,
+		}
+		mTaintsMap[taintKE] = mTaint
+	}
+	for i := range nTaints {
+		nTaint := &nTaints[i]
+		taintKE := taintKeyEffect{
+			Key:    nTaint.Key,
+			Effect: nTaint.Effect,
+		}
+		nTaintsMap[taintKE] = nTaint
+	}
+
+	// Delete taints that were applied to the machine object in the last update but have since been removed
+	for _, lastAppliedTaint := range lastAppliedTaints {
+
+		lastAppliedKE := taintKeyEffect{
+			Key:    lastAppliedTaint.Key,
+			Effect: lastAppliedTaint.Effect,
+		}
+
+		if _, exists := mTaintsMap[lastAppliedKE]; !exists {
+			delete(nTaintsMap, lastAppliedKE)
+			toBeUpdated = true
+		}
+	}
+
+	// Add any taints that exist on the machine object but not on the node object
+	for mKE, mV := range mTaintsMap {
+		if nV, exists := nTaintsMap[mKE]; !exists || *nV != *mV {
+			nTaintsMap[mKE] = mV
+			toBeUpdated = true
+		}
+	}
+
+	if toBeUpdated {
+		// Convert the map of taints back into a slice of taints
+		nTaints = make([]v1.Taint, len(nTaintsMap))
+		i := 0
+		for _, nV := range nTaintsMap {
+			nTaints[i] = *nV
+			i++
+		}
+		node.Spec.Taints = nTaints
+	}
+
+	return toBeUpdated
+}
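+
+// Editorial sketch, not part of this change: because taints are keyed by
+// [key, effect] rather than by key alone, two taints sharing a key can
+// coexist as long as their effects differ (see the "Multiple taints with
+// same key and value" test entry below). Hypothetical values:
+//
+//	taints := []v1.Taint{
+//		{Key: "key1", Value: "value1", Effect: v1.TaintEffectNoExecute},
+//		{Key: "key1", Value: "value1", Effect: v1.TaintEffectNoSchedule},
+//	}
+//	// Both entries land in mTaintsMap under distinct taintKeyEffect keys,
+//	// so SyncMachineTaints applies both to the node instead of collapsing
+//	// them into one.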
Retrying, error: %s", machine.Name, err) + } else { + klog.V(2).Infof("Machine/status UPDATE for %q during CREATE error", machine.Name) + } + + return retryRequired, err +} + +// reconcileMachineHealth updates the machine object with +// any change in node conditions or health +func (c *controller) reconcileMachineHealth(machine *v1alpha1.Machine) (machineutils.Retry, error) { + var ( + objectRequiresUpdate = false + clone = machine.DeepCopy() + description string + lastOperationType v1alpha1.MachineOperationType + ) + + node, err := c.nodeLister.Get(machine.Status.Node) + if err == nil { + if nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) { + clone.Status.Conditions = node.Status.Conditions + klog.V(3).Infof("Machine %q conditions are changing", machine.Name) + objectRequiresUpdate = true + } + + if !c.isHealthy(clone) && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { + // If machine is not healthy, and current state is running, + // change the machinePhase to unknown and activate health check timeout + description = fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown", clone.Name) + klog.Warning(description) + + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineUnknown, + //TimeoutActive: true, + LastUpdateTime: metav1.Now(), + } + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: description, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationHealthCheck, + LastUpdateTime: metav1.Now(), + } + objectRequiresUpdate = true + + } else if c.isHealthy(clone) && clone.Status.CurrentStatus.Phase != v1alpha1.MachineRunning { + // If machine is healhy and current machinePhase is not running. + // indicates that the machine is not healthy and status needs to be updated. + + if clone.Status.LastOperation.Type == v1alpha1.MachineOperationCreate && + clone.Status.LastOperation.State != v1alpha1.MachineStateSuccessful { + // When machine creation went through + description = fmt.Sprintf("Machine %s successfully joined the cluster", clone.Name) + lastOperationType = v1alpha1.MachineOperationCreate + + // Delete the bootstrap token + err = c.deleteBootstrapToken(clone.Name) + if err != nil { + klog.Warning(err) + } + } else { + // Machine rejoined the cluster after a healthcheck + description = fmt.Sprintf("Machine %s successfully re-joined the cluster", clone.Name) + lastOperationType = v1alpha1.MachineOperationHealthCheck + } + klog.V(2).Info(description) + + // Machine is ready and has joined/re-joined the cluster + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: description, + State: v1alpha1.MachineStateSuccessful, + Type: lastOperationType, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + //TimeoutActive: false, + LastUpdateTime: metav1.Now(), + } + objectRequiresUpdate = true + } + + } else if err != nil && apierrors.IsNotFound(err) { + // Node object is not found + + if len(machine.Status.Conditions) > 0 && + machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { + // If machine has conditions on it, + // and corresponding node object went missing + // and if machine object still reports healthy + description = fmt.Sprintf( + "Node object went missing. 
+
+// reconcileMachineHealth updates the machine object with
+// any change in node conditions or health
+func (c *controller) reconcileMachineHealth(machine *v1alpha1.Machine) (machineutils.Retry, error) {
+	var (
+		objectRequiresUpdate = false
+		clone                = machine.DeepCopy()
+		description          string
+		lastOperationType    v1alpha1.MachineOperationType
+	)
+
+	node, err := c.nodeLister.Get(machine.Status.Node)
+	if err == nil {
+		if nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
+			clone.Status.Conditions = node.Status.Conditions
+			klog.V(3).Infof("Machine %q conditions are changing", machine.Name)
+			objectRequiresUpdate = true
+		}
+
+		if !c.isHealthy(clone) && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning {
+			// If the machine is not healthy while its current phase is running,
+			// change the machinePhase to unknown and activate the health check timeout
+			description = fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown", clone.Name)
+			klog.Warning(description)
+
+			clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
+				Phase: v1alpha1.MachineUnknown,
+				//TimeoutActive:  true,
+				LastUpdateTime: metav1.Now(),
+			}
+			clone.Status.LastOperation = v1alpha1.LastOperation{
+				Description:    description,
+				State:          v1alpha1.MachineStateProcessing,
+				Type:           v1alpha1.MachineOperationHealthCheck,
+				LastUpdateTime: metav1.Now(),
+			}
+			objectRequiresUpdate = true
+
+		} else if c.isHealthy(clone) && clone.Status.CurrentStatus.Phase != v1alpha1.MachineRunning {
+			// If the machine is healthy while its current machinePhase is not running,
+			// the machine has just turned healthy and its status needs to be updated.
+
+			if clone.Status.LastOperation.Type == v1alpha1.MachineOperationCreate &&
+				clone.Status.LastOperation.State != v1alpha1.MachineStateSuccessful {
+				// Machine creation went through and the node joined for the first time
+				description = fmt.Sprintf("Machine %s successfully joined the cluster", clone.Name)
+				lastOperationType = v1alpha1.MachineOperationCreate
+
+				// Delete the bootstrap token
+				err = c.deleteBootstrapToken(clone.Name)
+				if err != nil {
+					klog.Warning(err)
+				}
+			} else {
+				// Machine rejoined the cluster after a failed health check
+				description = fmt.Sprintf("Machine %s successfully re-joined the cluster", clone.Name)
+				lastOperationType = v1alpha1.MachineOperationHealthCheck
+			}
+			klog.V(2).Info(description)
+
+			// Machine is ready and has joined/re-joined the cluster
+			clone.Status.LastOperation = v1alpha1.LastOperation{
+				Description:    description,
+				State:          v1alpha1.MachineStateSuccessful,
+				Type:           lastOperationType,
+				LastUpdateTime: metav1.Now(),
+			}
+			clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
+				Phase: v1alpha1.MachineRunning,
+				//TimeoutActive:  false,
+				LastUpdateTime: metav1.Now(),
+			}
+			objectRequiresUpdate = true
+		}
+
+	} else if err != nil && apierrors.IsNotFound(err) {
+		// Node object was not found
+
+		if len(machine.Status.Conditions) > 0 &&
+			machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning {
+			// If the machine has conditions on it,
+			// its corresponding node object went missing,
+			// and the machine object still reports healthy
+			description = fmt.Sprintf(
+				"Node object went missing. Machine %s is unhealthy - changing MachineState to Unknown",
+				machine.Name,
+			)
+			klog.Warning(description)
+
+			clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
+				Phase: v1alpha1.MachineUnknown,
+				//TimeoutActive:  true,
+				LastUpdateTime: metav1.Now(),
+			}
+			clone.Status.LastOperation = v1alpha1.LastOperation{
+				Description:    description,
+				State:          v1alpha1.MachineStateProcessing,
+				Type:           v1alpha1.MachineOperationHealthCheck,
+				LastUpdateTime: metav1.Now(),
+			}
+			objectRequiresUpdate = true
+		}
+
+	} else {
+		// Any other kind of error while fetching the node object
+		klog.Errorf("Could not fetch node object for machine %q", machine.Name)
+		return machineutils.RetryOp, err
+	}
+
+	if !objectRequiresUpdate &&
+		(machine.Status.CurrentStatus.Phase == v1alpha1.MachinePending ||
+			machine.Status.CurrentStatus.Phase == v1alpha1.MachineUnknown) {
+		var (
+			description     string
+			timeOutDuration time.Duration
+		)
+
+		checkCreationTimeout := machine.Status.CurrentStatus.Phase == v1alpha1.MachinePending
+		sleepTime := 1 * time.Minute
+
+		if checkCreationTimeout {
+			timeOutDuration = c.safetyOptions.MachineCreationTimeout.Duration
+		} else {
+			timeOutDuration = c.safetyOptions.MachineHealthTimeout.Duration
+		}
+
+		// A timeout has occurred if the time elapsed since the last status update exceeds the timeout duration
+		timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
+		if timeOut > 0 {
+			// Machine health timeout occurred while the machine was joining or rejoining
+
+			if checkCreationTimeout {
+				// Timeout occurred during machine creation
+				description = fmt.Sprintf(
+					"Machine %s failed to join the cluster in %s.",
+					machine.Name,
+					timeOutDuration,
+				)
+			} else {
+				// Timeout occurred because the machine has been unhealthy for too long
+				description = fmt.Sprintf(
+					"Machine %s has not been healthy for %s. Changing status to failed. Node Conditions: %+v",
+					machine.Name,
+					timeOutDuration,
+					machine.Status.Conditions,
+				)
+			}
+
+			// Log the error message for machine failure
+			klog.Error(description)
+
+			clone.Status.LastOperation = v1alpha1.LastOperation{
+				Description:    description,
+				State:          v1alpha1.MachineStateFailed,
+				Type:           machine.Status.LastOperation.Type,
+				LastUpdateTime: metav1.Now(),
+			}
+			clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
+				Phase: v1alpha1.MachineFailed,
+				//TimeoutActive:  false,
+				LastUpdateTime: metav1.Now(),
+			}
+			objectRequiresUpdate = true
+		} else {
+			// If the timeout has not occurred, re-enqueue the machine
+			// after the specified sleep time
+			c.enqueueMachineAfter(machine, sleepTime)
+		}
+	}
+
+	if objectRequiresUpdate {
+		_, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone)
+		if err != nil {
+			// Keep retrying until update goes through
+			klog.Errorf("Update failed for machine %q. Retrying, error: %q", machine.Name, err)
+		} else {
+			klog.V(2).Infof("Machine State has been updated for %q", machine.Name)
+			// Return an error so that reconciliation continues in the next iteration
+			err = fmt.Errorf("Machine creation is successful. Machine State has been UPDATED")
+		}
+
+		return machineutils.RetryOp, err
+	}
+
+	return machineutils.DoNotRetryOp, nil
+}
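+
+// Editorial sketch, not part of this change: the timeout test above,
+// timeOut := metav1.Now().Add(-d).Sub(lastUpdate), is positive exactly when
+// more than d has elapsed since lastUpdate. Hypothetical values:
+//
+//	lastUpdate := time.Now().Add(-15 * time.Minute) // phase entered 15m ago
+//	d := 10 * time.Minute                           // configured health timeout
+//	overdue := time.Now().Add(-d).Sub(lastUpdate)   // about +5m  -> timed out
+//	// Were lastUpdate only 5m old, the same expression would yield about
+//	// -5m, so the machine would be re-enqueued after sleepTime instead of
+//	// being moved to the Failed phase.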
Machine State has been UPDATED") + } + + return machineutils.RetryOp, err + } + + return machineutils.DoNotRetryOp, nil +} + +/* + SECTION + Manipulate Finalizers +*/ + +func (c *controller) addMachineFinalizers(machine *v1alpha1.Machine) (machineutils.Retry, error) { + if finalizers := sets.NewString(machine.Finalizers...); !finalizers.Has(DeleteFinalizerName) { + + finalizers.Insert(DeleteFinalizerName) + clone := machine.DeepCopy() + clone.Finalizers = finalizers.List() + _, err := c.controlMachineClient.Machines(clone.Namespace).Update(clone) + if err != nil { + // Keep retrying until update goes through + klog.Errorf("Failed to add finalizers for machine %q: %s", machine.Name, err) + } else { + // Return error even when machine object is updated + klog.V(2).Infof("Added finalizer to machine %q", machine.Name) + err = fmt.Errorf("Machine creation in process. Machine finalizers are UPDATED") + } + + return machineutils.RetryOp, err + } + + return machineutils.DoNotRetryOp, nil +} + +func (c *controller) deleteMachineFinalizers(machine *v1alpha1.Machine) (machineutils.Retry, error) { + if finalizers := sets.NewString(machine.Finalizers...); finalizers.Has(DeleteFinalizerName) { + + finalizers.Delete(DeleteFinalizerName) + clone := machine.DeepCopy() + clone.Finalizers = finalizers.List() + _, err := c.controlMachineClient.Machines(clone.Namespace).Update(clone) + if err != nil { + // Keep retrying until update goes through + klog.Errorf("Failed to delete finalizers for machine %q: %s", machine.Name, err) + return machineutils.RetryOp, err + } + + klog.V(2).Infof("Removed finalizer to machine %q", machine.Name) + return machineutils.DoNotRetryOp, nil + } + + return machineutils.DoNotRetryOp, nil +} + +/* + SECTION + Helper Functions +*/ +func (c *controller) isHealthy(machine *v1alpha1.Machine) bool { + numOfConditions := len(machine.Status.Conditions) + + if numOfConditions == 0 { + // Kubernetes node object for this machine hasn't been received + return false + } + + for _, condition := range machine.Status.Conditions { + if condition.Type == v1.NodeReady && condition.Status != v1.ConditionTrue { + // If Kubelet is not ready + return false + } + conditions := strings.Split(c.nodeConditions, ",") + for _, c := range conditions { + if string(condition.Type) == c && condition.Status != v1.ConditionFalse { + return false + } + } + } + return true +} + +/* + SECTION + Delete machine +*/ + +// setMachineTerminationStatus set's the machine status to terminating +func (c *controller) setMachineTerminationStatus(deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.Retry, error) { + clone := deleteMachineRequest.Machine.DeepCopy() + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: machineutils.GetVMStatus, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + //TimeoutActive: false, + LastUpdateTime: metav1.Now(), + } + + _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if err != nil { + // Keep retrying until update goes through + klog.Errorf("Machine/status UPDATE failed for machine %q. 
Retrying, error: %s", deleteMachineRequest.Machine.Name, err) + } else { + klog.V(2).Infof("Machine %q status updated to terminating ", deleteMachineRequest.Machine.Name) + // Return error even when machine object is updated to ensure reconcilation is restarted + err = fmt.Errorf("Machine deletion in process. Phase set to termination") + } + return machineutils.RetryOp, err +} + +// getVMStatus tries to retrive VM status backed by machine +func (c *controller) getVMStatus(getMachineStatusRequest *driver.GetMachineStatusRequest) (machineutils.Retry, error) { + var ( + retry machineutils.Retry + description string + state v1alpha1.MachineState + phase v1alpha1.MachinePhase + ) + + _, err := c.driver.GetMachineStatus(context.TODO(), getMachineStatusRequest) + if err == nil { + // VM Found + description = machineutils.InitiateDrain + state = v1alpha1.MachineStateProcessing + retry = machineutils.RetryOp + phase = v1alpha1.MachineTerminating + // Return error even when machine object is updated to ensure reconcilation is restarted + err = fmt.Errorf("Machine deletion in process. VM with matching ID found") + + } else { + if grpcErr, ok := status.FromError(err); !ok { + // Error occurred with decoding gRPC error status, aborting without retry. + description = "Error occurred with decoding gRPC error status while getting VM status, aborting without retry. " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + phase = v1alpha1.MachineFailed + retry = machineutils.DoNotRetryOp + + err = fmt.Errorf("Machine deletion has failed. " + description) + } else { + // Decoding gRPC error code + switch grpcErr.Code() { + + case codes.Unimplemented: + // GetMachineStatus() call is not implemented + // In this case, try to drain and delete + description = machineutils.InitiateDrain + state = v1alpha1.MachineStateProcessing + phase = v1alpha1.MachineTerminating + retry = machineutils.RetryOp + + case codes.NotFound: + // VM was not found at provder + description = "VM was not found at provider. " + machineutils.InitiateNodeDeletion + state = v1alpha1.MachineStateProcessing + phase = v1alpha1.MachineTerminating + retry = machineutils.RetryOp + + case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: + description = "Error occurred with decoding gRPC error status while getting VM status, aborting with retry. " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + phase = v1alpha1.MachineTerminating + retry = machineutils.RetryOp + + default: + // Error occurred with decoding gRPC error status, abort with retry. + description = "Error occurred with decoding gRPC error status while getting VM status, aborting without retry. gRPC code: " + grpcErr.Message() + " " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + phase = v1alpha1.MachineTerminating + retry = machineutils.DoNotRetryOp + } + } + + } + + clone := getMachineStatusRequest.Machine.DeepCopy() + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: description, + State: state, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: phase, + LastUpdateTime: metav1.Now(), + } + + _, updateErr := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if updateErr != nil { + // Keep retrying until update goes through + klog.Errorf("Machine/status UPDATE failed for machine %q. 
Retrying, error: %s", getMachineStatusRequest.Machine.Name, updateErr) + } + + return retry, err +} + +// drainNode attempts to drain the node backed by the machine object +func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.Retry, error) { + var ( + // Declarations + err error + forceDeletePods bool + forceDeleteMachine bool + timeOutOccurred bool + skipDrain bool + description string + state v1alpha1.MachineState + phase v1alpha1.MachinePhase + + // Initialization + machine = deleteMachineRequest.Machine + maxEvictRetries = c.safetyOptions.MaxEvictRetries + pvDetachTimeOut = c.safetyOptions.PvDetachTimeout.Duration + timeOutDuration = c.safetyOptions.MachineDrainTimeout.Duration + forceDeleteLabelPresent = machine.Labels["force-deletion"] == "True" + nodeName = machine.Labels["node"] + nodeNotReadyDuration = 5 * time.Minute + ) + + for _, condition := range machine.Status.Conditions { + if condition.Type == v1.NodeReady && condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > nodeNotReadyDuration) { + klog.Warningf("Skipping drain for NotReady machine %q", machine.Name) + err = fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion) + skipDrain = true + } + } + + if skipDrain { + // If not is not ready for over 5 minutes, skip draining this machine + description = fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion) + state = v1alpha1.MachineStateProcessing + phase = v1alpha1.MachineTerminating + + } else { + // Timeout value obtained by subtracting last operation with expected time out period + timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time) + timeOutOccurred = timeOut > 0 + + if forceDeleteLabelPresent || timeOutOccurred { + // To perform forceful machine drain/delete either one of the below conditions must be satified + // 1. force-deletion: "True" label must be present + // 2. Deletion operation is more than drain-timeout minutes old + // 3. Last machine drain had failed + forceDeleteMachine = true + forceDeletePods = true + timeOutDuration = 1 * time.Minute + maxEvictRetries = 1 + + klog.V(2).Infof( + "Force deletion has been triggerred for machine %q due to Label:%t, timeout:%t", + machine.Name, + forceDeleteLabelPresent, + timeOutOccurred, + ) + } + + buf := bytes.NewBuffer([]byte{}) + errBuf := bytes.NewBuffer([]byte{}) + + drainOptions := drain.NewDrainOptions( + c.targetCoreClient, + timeOutDuration, + maxEvictRetries, + pvDetachTimeOut, + nodeName, + -1, + forceDeletePods, + true, + true, + true, + buf, + errBuf, + c.driver, + c.pvcLister, + c.pvLister, + ) + err = drainOptions.RunDrain() + if err == nil { + // Drain successful + klog.V(2).Infof("Drain successful for machine %q. \nBuf:%v \nErrBuf:%v", machine.Name, buf, errBuf) + + description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) + state = v1alpha1.MachineStateProcessing + phase = v1alpha1.MachineTerminating + + // Return error even when machine object is updated + err = fmt.Errorf("Machine deletion in process. " + description) + } else if err != nil && forceDeleteMachine { + // Drain failed on force deletion + klog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err) + + description = fmt.Sprintf("Drain failed due to - %s. 
+			state = v1alpha1.MachineStateProcessing
+			phase = v1alpha1.MachineTerminating
+		} else {
+			klog.Warningf("Drain failed for machine %q. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
+
+			description = fmt.Sprintf("Drain failed due to - %s. Will retry in next sync. %s", err.Error(), machineutils.InitiateDrain)
+			state = v1alpha1.MachineStateFailed
+			phase = v1alpha1.MachineTerminating
+		}
+	}
+
+	clone := machine.DeepCopy()
+	clone.Status.LastOperation = v1alpha1.LastOperation{
+		Description:    description,
+		State:          state,
+		Type:           v1alpha1.MachineOperationDelete,
+		LastUpdateTime: metav1.Now(),
+	}
+	clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
+		Phase:          phase,
+		LastUpdateTime: metav1.Now(),
+	}
+
+	_, updateErr := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone)
+	if updateErr != nil {
+		// Keep retrying until update goes through
+		klog.Errorf("Machine/status UPDATE failed for machine %q. Retrying, error: %s", machine.Name, updateErr)
+	}
+
+	return machineutils.RetryOp, err
+}
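+
+// Editorial sketch, not part of this change: the two force-deletion triggers
+// above and their effect on the drain parameters. Variable names are
+// hypothetical stand-ins for the locals used in drainNode.
+//
+//	force := machine.Labels["force-deletion"] == "True" ||
+//		metav1.Now().Add(-drainTimeout).Sub(lastUpdate.Time) > 0
+//	if force {
+//		// Shorten the drain drastically rather than skipping it outright:
+//		// a one-minute budget, a single eviction retry, and pod force-deletion.
+//		timeOutDuration, maxEvictRetries = 1*time.Minute, 1
+//	}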
" + description) + } + + clone := machine.DeepCopy() + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: description, + State: state, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: phase, + LastUpdateTime: metav1.Now(), + } + + if deleteMachineResponse != nil && deleteMachineResponse.LastKnownState != "" { + clone.Status.LastKnownState = deleteMachineResponse.LastKnownState + } + + _, updateErr := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if updateErr != nil { + // Keep retrying until update goes through + klog.Errorf("Machine/status UPDATE failed for machine %q. Retrying, error: %s", machine.Name, updateErr) + } + + return retryRequired, err +} + +// deleteNodeObject attempts to delete the node object backed by the machine object +func (c *controller) deleteNodeObject(machine *v1alpha1.Machine) (machineutils.Retry, error) { + var ( + err error + description string + state v1alpha1.MachineState + ) + + nodeName := machine.Labels["node"] + + if nodeName != "" { + // Delete node object + err = c.targetCoreClient.CoreV1().Nodes().Delete(nodeName, &metav1.DeleteOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + // If its an error, and anyother error than object not found + description = fmt.Sprintf("Deletion of Node Object %q failed due to error: %s. %s", nodeName, err, machineutils.InitiateNodeDeletion) + state = v1alpha1.MachineStateFailed + } else if err == nil { + description = fmt.Sprintf("Deletion of Node Object %q is successful. %s", nodeName, machineutils.InitiateFinalizerRemoval) + state = v1alpha1.MachineStateProcessing + + err = fmt.Errorf("Machine deletion in process. Deletion of node object was succesful") + } else { + description = fmt.Sprintf("No node object found for %q, continuing deletion flow. %s", nodeName, machineutils.InitiateFinalizerRemoval) + state = v1alpha1.MachineStateProcessing + } + } else { + description = fmt.Sprintf("No node object found for machine, continuing deletion flow. %s", machineutils.InitiateFinalizerRemoval) + state = v1alpha1.MachineStateProcessing + + err = fmt.Errorf("Machine deletion in process. No node object found") + } + + clone := machine.DeepCopy() + clone.Status.LastOperation = v1alpha1.LastOperation{ + Description: description, + State: state, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + } + clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + } + + _, updateErr := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) + if updateErr != nil { + // Keep retrying until update goes through + klog.Errorf("Machine/status UPDATE failed for machine %q. Retrying, error: %s", machine.Name, updateErr) + } + + return machineutils.RetryOp, err +} diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go new file mode 100644 index 000000000..ac6aad02c --- /dev/null +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -0,0 +1,1856 @@ +/* +Copyright (c) 2019 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package controller
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	machinev1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+	"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/ginkgo/extensions/table"
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/klog"
+)
+
+var _ = Describe("machine_util", func() {
+
+	Describe("#syncMachineNodeTemplates", func() {
+
+		type setup struct {
+			machine *machinev1.Machine
+		}
+		type action struct {
+			node *corev1.Node
+		}
+		type expect struct {
+			node *corev1.Node
+			err  error
+		}
+		type data struct {
+			setup  setup
+			action action
+			expect expect
+		}
+
+		DescribeTable("##table",
+			func(data *data) {
+				stop := make(chan struct{})
+				defer close(stop)
+
+				controlObjects := []runtime.Object{}
+				coreObjects := []runtime.Object{}
+
+				machineObject := data.setup.machine
+
+				nodeObject := data.action.node
+				coreObjects = append(coreObjects, nodeObject)
+				controlObjects = append(controlObjects, machineObject)
+
+				c, trackers := createController(stop, testNamespace, controlObjects, nil, coreObjects, nil)
+				defer trackers.Stop()
+				waitForCacheSync(stop, c)
+
+				_, err := c.syncMachineNodeTemplates(machineObject)
+
+				waitForCacheSync(stop, c)
+
+				if data.expect.err == nil {
+					Expect(err).To(BeNil())
+				} else {
+					Expect(err).To(HaveOccurred())
+					Expect(err).To(Equal(data.expect.err))
+				}
+
+				//updatedNodeObject, _ := c.nodeLister.Get(nodeObject.Name)
+				updatedNodeObject, _ := c.targetCoreClient.CoreV1().Nodes().Get(nodeObject.Name, metav1.GetOptions{})
+
+				if data.expect.node != nil {
+					Expect(updatedNodeObject.Spec.Taints).Should(ConsistOf(data.expect.node.Spec.Taints))
+					Expect(updatedNodeObject.Labels).Should(Equal(data.expect.node.Labels))
+
+					// ignore LastAppliedALTAnnotation
+					delete(updatedNodeObject.Annotations, machineutils.LastAppliedALTAnnotation)
+					Expect(updatedNodeObject.Annotations).Should(Equal(data.expect.node.Annotations))
+				}
+			},
+
+			Entry("when nodeTemplate is not updated in node-object", &data{
+				setup: setup{
+					machine: newMachine(
+						&machinev1.MachineTemplateSpec{
+							ObjectMeta: metav1.ObjectMeta{
+								Labels: map[string]string{
+									"test-label": "test-label",
+								},
+							},
+							Spec: machinev1.MachineSpec{
+								Class: machinev1.ClassSpec{
+									Kind: "AWSMachineClass",
+									Name: "test-machine-class",
+								},
+								NodeTemplateSpec: machinev1.NodeTemplateSpec{
+									ObjectMeta: metav1.ObjectMeta{
+										Labels: map[string]string{
+											"key1": "value1",
+										},
+										Annotations: map[string]string{
+											"anno1":                               "anno1",
+											machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}",
+										},
+									},
+									Spec: corev1.NodeSpec{
+										Taints: []corev1.Taint{
+											{
+												Key:    "key1",
+												Value:  "value1",
+												Effect: "NoSchedule",
+											},
+										},
+									},
+								},
+							},
+						},
+						&machinev1.MachineStatus{
+							Node: "test-node",
+						},
+						nil, nil, nil, true),
+				},
+				action: action{
+					node: &corev1.Node{
+
TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + }, + Labels: map[string]string{ + "key1": "value1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Namespace: testNamespace, + Annotations: map[string]string{ + "anno1": "anno1", + }, + Labels: map[string]string{ + "key1": "value1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + err: fmt.Errorf("Machine ALTs have been reconciled"), + }, + }), + + Entry("when nodeTemplate is updated in node-object", &data{ + setup: setup{ + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + Class: machinev1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "test-machine-class", + }, + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + "anno1": "anno1", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"},\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + &machinev1.MachineStatus{ + Node: "test-node", + }, + nil, nil, nil, true), + }, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{}, + }, + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + err: fmt.Errorf("Machine ALTs have been reconciled"), + }, + }), + + Entry("when node object does not exist", &data{ + setup: setup{ + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + Class: machinev1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "test-machine-class", + }, + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + &machinev1.MachineStatus{ + Node: "test-node", + }, + nil, nil, nil, true), + }, + action: action{ + node: &corev1.Node{}, + }, + expect: expect{ + node: &corev1.Node{}, + err: nil, // we should not return error if node-object does not exist to ensure rest of the steps are then executed. 
+ }, + }), + + Entry("Multiple taints with same key and value added to taint", &data{ + setup: setup{ + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + Class: machinev1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "test-machine-class", + }, + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoExecute", + }, + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + &machinev1.MachineStatus{ + Node: "test-node", + }, + nil, nil, nil, true), + }, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"},\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + Labels: map[string]string{ + "key1": "value1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{}, + }, + }, + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Namespace: testNamespace, + Annotations: map[string]string{ + "anno1": "anno1", + }, + Labels: map[string]string{ + "key1": "value1", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "key1", + Value: "value1", + Effect: "NoExecute", + }, + { + Key: "key1", + Value: "value1", + Effect: "NoSchedule", + }, + }, + }, + }, + err: fmt.Errorf("Machine ALTs have been reconciled"), + }, + }), + ) + + }) + + Describe("#SyncMachineLabels", func() { + + type setup struct{} + type action struct { + node *corev1.Node + machine *machinev1.Machine + } + type expect struct { + node *corev1.Node + labelsChanged bool + } + type data struct { + setup setup + action action + expect expect + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + c, trackers := createController(stop, testNamespace, nil, nil, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + testNode := data.action.node + testMachine := data.action.machine + expectedNode := data.expect.node + + var lastAppliedALT v1alpha1.NodeTemplateSpec + lastAppliedALTJSONString, exists := testNode.Annotations[machineutils.LastAppliedALTAnnotation] + if exists { + err := json.Unmarshal([]byte(lastAppliedALTJSONString), &lastAppliedALT) + if err != nil { + klog.Errorf("Error occurred while syncing node annotations, labels & taints: %s", err) + } + } + + labelsChanged := SyncMachineLabels(testMachine, testNode, lastAppliedALT.Labels) + + waitForCacheSync(stop, c) + + Expect(testNode.Labels).Should(Equal(expectedNode.Labels)) + Expect(labelsChanged).To(Equal(data.expect.labelsChanged)) + }, + + Entry("when labels have not been updated", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: 
"{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + labelsChanged: false, + }, + }), + + Entry("when labels values are updated ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "valueChanged", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "valueChanged", + }, + }, + }, + labelsChanged: true, + }, + }), + + Entry("when new label keys are added ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + }, + labelsChanged: true, + }, + }), + + Entry("when label is deleted from machine object", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\", \"key2\":\"value2\"}}}", + }, + }, + }, + machine: newMachine( + 
&machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + labelsChanged: true, + }, + }), + + Entry("when labels values are updated manually on node object", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value2", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + labelsChanged: true, + }, + }), + + Entry("when new labels are added on node-object ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + "keyNew": "valueNew", + }, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + "keyNew": "valueNew", + }, + }, + }, + labelsChanged: false, + }, + }), + + Entry("when existing labels are deleted from node-object ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{}, + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"labels\":{\"key1\":\"value1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + 
NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + }, + labelsChanged: true, + }, + }), + ) + + }) + + Describe("#SyncMachineAnnotations", func() { + + type setup struct{} + type action struct { + node *corev1.Node + machine *machinev1.Machine + } + type expect struct { + node *corev1.Node + annotationsChanged bool + } + type data struct { + setup setup + action action + expect expect + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + c, trackers := createController(stop, testNamespace, nil, nil, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + testNode := data.action.node + testMachine := data.action.machine + expectedNode := data.expect.node + + var lastAppliedALT v1alpha1.NodeTemplateSpec + lastAppliedALTJSONString, exists := testNode.Annotations[machineutils.LastAppliedALTAnnotation] + if exists { + err := json.Unmarshal([]byte(lastAppliedALTJSONString), &lastAppliedALT) + if err != nil { + klog.Errorf("Error occurred while syncing node annotations, labels & taints: %s", err) + } + } + + annotationsChanged := SyncMachineAnnotations(testMachine, testNode, lastAppliedALT.Annotations) + + waitForCacheSync(stop, c) + + // ignore machineutils.LastAppliedALTAnnotation for comparison + delete(testNode.Annotations, machineutils.LastAppliedALTAnnotation) + Expect(testNode.Annotations).Should(Equal(expectedNode.Annotations)) + Expect(annotationsChanged).To(Equal(data.expect.annotationsChanged)) + }, + + Entry("when annotations have not been updated", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + annotationsChanged: false, + }, + }), + + Entry("when annotations values are updated ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + 
"test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "annoChanged", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "annoChanged", + }, + }, + }, + annotationsChanged: true, + }, + }), + + Entry("when new annotation keys are added ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + "anno2": "anno2", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + "anno2": "anno2", + }, + }, + }, + annotationsChanged: true, + }, + }), + + Entry("when annotations are deleted ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + "anno2": "anno2", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\", \"anno2\":\"anno2\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{}, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + annotationsChanged: true, + }, + }), + + Entry("when annotations values are updated manually on node object", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno2", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + }, + }, + 
nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + annotationsChanged: true, + }, + }), + + Entry("when new annotations are added on node-objects.", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + "annoNew": "annoNew", + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + "annoNew": "annoNew", + }, + }, + }, + annotationsChanged: false, + }, + }), + + Entry("when existing annotations are deleted from node-objects. ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null,\"annotations\":{\"anno1\":\"anno1\"}}}", + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "anno1": "anno1", + "anno2": "anno2", + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + "anno1": "anno1", + "anno2": "anno2", + }, + }, + }, + annotationsChanged: true, + }, + }), + ) + + }) + + Describe("#SyncMachineTaints", func() { + + type setup struct{} + type action struct { + node *corev1.Node + machine *machinev1.Machine + } + type expect struct { + node *corev1.Node + taintsChanged bool + } + type data struct { + setup setup + action action + expect expect + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + c, trackers := createController(stop, testNamespace, nil, nil, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + testNode := data.action.node + testMachine := data.action.machine + expectedNode := data.expect.node + + var lastAppliedALT v1alpha1.NodeTemplateSpec + lastAppliedALTJSONString, exists := testNode.Annotations[machineutils.LastAppliedALTAnnotation] + if exists { + err := json.Unmarshal([]byte(lastAppliedALTJSONString), &lastAppliedALT) + if err != nil { + klog.Errorf("Error occurred while syncing node annotations, labels & 
taints: %s", err) + } + } + + taintsChanged := SyncMachineTaints(testMachine, testNode, lastAppliedALT.Spec.Taints) + + waitForCacheSync(stop, c) + + Expect(testNode.Spec.Taints).Should(ConsistOf(expectedNode.Spec.Taints)) + Expect(taintsChanged).To(Equal(data.expect.taintsChanged)) + }, + + Entry("when taints have not been updated", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: false, + }, + }), + + Entry("when taints values are updated ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"OldValue\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "OldValue", + Effect: "NoSchedule", + }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "NewValue", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "NewValue", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: true, + }, + }), + + Entry("when new taints are added ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", 
+ }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key2", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key2", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: true, + }, + }), + + Entry("when taints are deleted ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"},{\"key\":\"Key2\",\"value\":\"Value2\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key2", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: true, + }, + }), + + Entry("when node taint value is overwritten manually & new taint was added with same key & value but different effect", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key1", + Value: "Value1", + Effect: "NoExecute", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: 
&corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key1", + Value: "Value1", + Effect: "NoExecute", + }, + }, + }, + }, + taintsChanged: true, + }, + }), + + Entry("when new taints are added on node-object ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "KeyNew", + Value: "ValueNew", + Effect: "NoSchedule", + }, + }, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "KeyNew", + Value: "ValueNew", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: false, + }, + }), + + Entry("when existing taints are deleted from node-objects ", &data{ + setup: setup{}, + action: action{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + Annotations: map[string]string{ + machineutils.LastAppliedALTAnnotation: "{\"metadata\":{\"creationTimestamp\":null},\"spec\":{\"taints\":[{\"key\":\"Key1\",\"value\":\"Value1\",\"effect\":\"NoSchedule\"}]}}", + }, + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{}, + }, + }, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label": "test-label", + }, + }, + Spec: machinev1.MachineSpec{ + NodeTemplateSpec: machinev1.NodeTemplateSpec{ + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key2", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + }, + }, + nil, nil, nil, nil, true), + }, + expect: expect{ + node: &corev1.Node{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Node", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: "Key1", + Value: "Value1", + Effect: "NoSchedule", + }, + { + Key: "Key2", + Value: "Value2", + Effect: "NoSchedule", + }, + }, + }, + }, + taintsChanged: true, + }, + }), + ) + + }) +}) diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go new file mode 100644 index 000000000..4dd61c48c --- /dev/null +++ 
b/pkg/util/provider/machinecontroller/machineclass.go
@@ -0,0 +1,182 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+    "time"
+
+    "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/sets"
+    "k8s.io/client-go/tools/cache"
+
+    "k8s.io/klog"
+
+    "github.com/gardener/machine-controller-manager/pkg/apis/machine"
+    "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+)
+
+// machineClassKind is used to identify the kind of a generic machineClass
+const machineClassKind = "MachineClass"
+
+func (c *controller) machineToMachineClassDelete(obj interface{}) {
+    machine, ok := obj.(*v1alpha1.Machine)
+    if machine == nil || !ok {
+        return
+    }
+    if machine.Spec.Class.Kind == machineClassKind {
+        c.machineClassQueue.Add(machine.Spec.Class.Name)
+    }
+}
+
+func (c *controller) machineClassAdd(obj interface{}) {
+    key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
+    if err != nil {
+        klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
+        return
+    }
+    c.machineClassQueue.Add(key)
+}
+
+func (c *controller) machineClassUpdate(oldObj, newObj interface{}) {
+    old, ok := oldObj.(*v1alpha1.MachineClass)
+    if old == nil || !ok {
+        return
+    }
+    new, ok := newObj.(*v1alpha1.MachineClass)
+    if new == nil || !ok {
+        return
+    }
+
+    c.machineClassAdd(newObj)
+}
+
+// reconcileClusterMachineClassKey reconciles a machineClass due to controller resync
+// or an event on the machineClass.
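+// Note: the namespace portion of the queue key is ignored; machineClasses are
+// always looked up in the controller's own configured namespace.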
+func (c *controller) reconcileClusterMachineClassKey(key string) error {
+    _, name, err := cache.SplitMetaNamespaceKey(key)
+    if err != nil {
+        return err
+    }
+
+    class, err := c.machineClassLister.MachineClasses(c.namespace).Get(name)
+    if errors.IsNotFound(err) {
+        klog.Infof("%s %q: Not doing work because it has been deleted", machineClassKind, key)
+        return nil
+    }
+    if err != nil {
+        klog.Infof("%s %q: Unable to retrieve object from store: %v", machineClassKind, key, err)
+        return err
+    }
+
+    return c.reconcileClusterMachineClass(class)
+}
+
+func (c *controller) reconcileClusterMachineClass(class *v1alpha1.MachineClass) error {
+    klog.V(4).Info("Start Reconciling machineclass: ", class.Name)
+    defer func() {
+        c.enqueueMachineClassAfter(class, 10*time.Minute)
+        klog.V(4).Info("Stop Reconciling machineclass: ", class.Name)
+    }()
+
+    internalClass := &machine.MachineClass{}
+    err := c.internalExternalScheme.Convert(class, internalClass, nil)
+    if err != nil {
+        return err
+    }
+
+    machines, err := c.findMachinesForClass(machineClassKind, class.Name)
+    if err != nil {
+        return err
+    }
+
+    // Manipulate finalizers
+    if class.DeletionTimestamp == nil && len(machines) > 0 {
+        err = c.addMachineClassFinalizers(class)
+        if err != nil {
+            return err
+        }
+    } else {
+        if finalizers := sets.NewString(class.Finalizers...); !finalizers.Has(DeleteFinalizerName) {
+            return nil
+        }
+
+        if len(machines) == 0 {
+            return c.deleteMachineClassFinalizers(class)
+        }
+
+        klog.V(3).Infof("Cannot remove finalizer of %s because Machine[s|Sets|Deployments] are still referencing it", class.Name)
+        return nil
+    }
+
+    for _, machine := range machines {
+        c.addMachine(machine)
+    }
+    return nil
+}
+
+/*
+    SECTION
+    Manipulate Finalizers
+*/
+
+func (c *controller) addMachineClassFinalizers(class *v1alpha1.MachineClass) error {
+    clone := class.DeepCopy()
+
+    if finalizers := sets.NewString(clone.Finalizers...); !finalizers.Has(DeleteFinalizerName) {
+        finalizers.Insert(DeleteFinalizerName)
+        return c.updateMachineClassFinalizers(clone, finalizers.List())
+    }
+    return nil
+}
+
+func (c *controller) deleteMachineClassFinalizers(class *v1alpha1.MachineClass) error {
+    clone := class.DeepCopy()
+
+    if finalizers := sets.NewString(clone.Finalizers...); finalizers.Has(DeleteFinalizerName) {
+        finalizers.Delete(DeleteFinalizerName)
+        return c.updateMachineClassFinalizers(clone, finalizers.List())
+    }
+    return nil
+}
+
+func (c *controller) updateMachineClassFinalizers(class *v1alpha1.MachineClass, finalizers []string) error {
+    // Get the latest version of the class so that we can avoid conflicts
+    class, err := c.controlMachineClient.MachineClasses(class.Namespace).Get(class.Name, metav1.GetOptions{})
+    if err != nil {
+        return err
+    }
+
+    clone := class.DeepCopy()
+    clone.Finalizers = finalizers
+    _, err = c.controlMachineClient.MachineClasses(class.Namespace).Update(clone)
+    if err != nil {
+        klog.Warning("Updating machineClass failed, retrying. 
", class.Name, err) + return err + } + klog.V(3).Infof("Successfully added/removed finalizer on the machineclass %q", class.Name) + return err +} + +func (c *controller) enqueueMachineClassAfter(obj interface{}, after time.Duration) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + return + } + c.machineClassQueue.AddAfter(key, after) +} diff --git a/pkg/util/provider/machinecontroller/machineclass_util.go b/pkg/util/provider/machinecontroller/machineclass_util.go new file mode 100644 index 000000000..fdd15ea15 --- /dev/null +++ b/pkg/util/provider/machinecontroller/machineclass_util.go @@ -0,0 +1,68 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package controller is used to provide the core functionalities of machine-controller-manager +package controller + +import ( + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "k8s.io/apimachinery/pkg/labels" +) + +/* +TODO: Move this code to MCM/MachineSet controller as well? +func (c *controller) findMachineDeploymentsForClass(kind, name string) ([]*v1alpha1.MachineDeployment, error) { + machineDeployments, err := c.machineDeploymentLister.List(labels.Everything()) + if err != nil { + return nil, err + } + var filtered []*v1alpha1.MachineDeployment + for _, machineDeployment := range machineDeployments { + if machineDeployment.Spec.Template.Spec.Class.Kind == kind && machineDeployment.Spec.Template.Spec.Class.Name == name { + filtered = append(filtered, machineDeployment) + } + } + return filtered, nil +} + +func (c *controller) findMachineSetsForClass(kind, name string) ([]*v1alpha1.MachineSet, error) { + machineSets, err := c.machineSetLister.List(labels.Everything()) + if err != nil { + return nil, err + } + var filtered []*v1alpha1.MachineSet + for _, machineSet := range machineSets { + if machineSet.Spec.Template.Spec.Class.Kind == kind && machineSet.Spec.Template.Spec.Class.Name == name { + filtered = append(filtered, machineSet) + } + } + return filtered, nil +} +*/ + +func (c *controller) findMachinesForClass(kind, name string) ([]*v1alpha1.Machine, error) { + machines, err := c.machineLister.List(labels.Everything()) + if err != nil { + return nil, err + } + var filtered []*v1alpha1.Machine + for _, machine := range machines { + if machine.Spec.Class.Kind == kind && machine.Spec.Class.Name == name { + filtered = append(filtered, machine) + } + } + return filtered, nil +} diff --git a/pkg/util/provider/machinecontroller/metrics.go b/pkg/util/provider/machinecontroller/metrics.go new file mode 100644 index 000000000..7dfcf29be --- /dev/null +++ b/pkg/util/provider/machinecontroller/metrics.go @@ -0,0 +1,332 @@ +/* +Copyright (c) 2018 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+    "strconv"
+
+    v1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+    "github.com/gardener/machine-controller-manager/pkg/util/provider/metrics"
+    "github.com/prometheus/client_golang/prometheus"
+    v1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/labels"
+)
+
+// Describe is a method required to implement the prometheus.Collector interface.
+func (c *controller) Describe(ch chan<- *prometheus.Desc) {
+    ch <- metrics.MachineCountDesc
+}
+
+/*
+TODO: Fix metric collection
+// CollectMachineDeploymentMetrics is a method to collect machineDeployment related metrics.
+func (c *controller) CollectMachineDeploymentMetrics(ch chan<- prometheus.Metric) {
+    machineDeploymentList, err := c.machineDeploymentLister.MachineDeployments(c.namespace).List(labels.Everything())
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machinedeployment-count"}).Inc()
+        return
+    }
+    metric, err := prometheus.NewConstMetric(metrics.MachineDeploymentCountDesc, prometheus.GaugeValue, float64(len(machineDeploymentList)))
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machinedeployment-count"}).Inc()
+        return
+    }
+    ch <- metric
+
+    for _, machineDeployment := range machineDeploymentList {
+
+        mdMeta := machineDeployment.ObjectMeta
+        mdSpec := machineDeployment.Spec
+
+        metrics.MachineDeploymentInfo.With(prometheus.Labels{
+            "name":               mdMeta.Name,
+            "namespace":          mdMeta.Namespace,
+            "createdAt":          strconv.FormatInt(mdMeta.GetCreationTimestamp().Time.Unix(), 10),
+            "spec_strategy_type": string(mdSpec.Strategy.Type),
+        }).Set(float64(1))
+
+        var paused float64
+        if mdSpec.Paused {
+            paused = 1
+        }
+        metrics.MachineDeploymentInfoSpecPaused.With(prometheus.Labels{
+            "name":      mdMeta.Name,
+            "namespace": mdMeta.Namespace}).Set(paused)
+
+        metrics.MachineDeploymentInfoSpecReplicas.With(prometheus.Labels{
+            "name":      mdMeta.Name,
+            "namespace": mdMeta.Namespace}).Set(float64(mdSpec.Replicas))
+
+        metrics.MachineDeploymentInfoSpecMinReadySeconds.With(prometheus.Labels{
+            "name":      mdMeta.Name,
+            "namespace": mdMeta.Namespace}).Set(float64(mdSpec.MinReadySeconds))
+
+        if mdSpec.Strategy.Type == v1alpha1.RollingUpdateMachineDeploymentStrategyType {
+            metrics.MachineDeploymentInfoSpecRollingUpdateMaxSurge.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace}).Set(float64(mdSpec.Strategy.RollingUpdate.MaxSurge.IntValue()))
+            metrics.MachineDeploymentInfoSpecRollingUpdateMaxUnavailable.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace}).Set(float64(mdSpec.Strategy.RollingUpdate.MaxUnavailable.IntValue()))
+        }
+        if mdSpec.RevisionHistoryLimit != nil {
+            metrics.MachineDeploymentInfoSpecRevisionHistoryLimit.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace}).Set(float64(int64(*mdSpec.RevisionHistoryLimit)))
+        }
+        if mdSpec.ProgressDeadlineSeconds != nil {
+            metrics.MachineDeploymentInfoSpecProgressDeadlineSeconds.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace}).Set(float64(int64(*mdSpec.ProgressDeadlineSeconds)))
+        }
+        if mdSpec.RollbackTo != nil {
+            metrics.MachineDeploymentInfoSpecRollbackToRevision.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace}).Set(float64(mdSpec.RollbackTo.Revision))
+        }
+
+        for _, condition := range machineDeployment.Status.Conditions {
+            var status float64
+            switch condition.Status {
+            case v1alpha1.ConditionTrue:
+                status = 1
+            case v1alpha1.ConditionFalse:
+                status = 0
+            case v1alpha1.ConditionUnknown:
+                status = 2
+            }
+
+            metrics.MachineDeploymentStatusCondition.With(prometheus.Labels{
+                "name":      mdMeta.Name,
+                "namespace": mdMeta.Namespace,
+                "condition": string(condition.Type),
+            }).Set(status)
+        }
+
+        statusLabels := prometheus.Labels{
+            "name":      mdMeta.Name,
+            "namespace": mdMeta.Namespace,
+        }
+        metrics.MachineDeploymentStatusAvailableReplicas.With(statusLabels).Set(float64(machineDeployment.Status.AvailableReplicas))
+        metrics.MachineDeploymentStatusUnavailableReplicas.With(statusLabels).Set(float64(machineDeployment.Status.UnavailableReplicas))
+        metrics.MachineDeploymentStatusReadyReplicas.With(statusLabels).Set(float64(machineDeployment.Status.ReadyReplicas))
+        metrics.MachineDeploymentStatusUpdatedReplicas.With(statusLabels).Set(float64(machineDeployment.Status.UpdatedReplicas))
+        metrics.MachineDeploymentStatusReplicas.With(statusLabels).Set(float64(machineDeployment.Status.Replicas))
+
+        if machineDeployment.Status.CollisionCount != nil {
+            metrics.MachineDeploymentStatusCollisionCount.With(statusLabels).Set(float64(*machineDeployment.Status.CollisionCount))
+        }
+
+        if machineDeployment.Status.FailedMachines != nil {
+            for _, failedMachine := range machineDeployment.Status.FailedMachines {
+                metrics.MachineDeploymentStatusFailedMachines.With(prometheus.Labels{
+                    "name":                        mdMeta.Name,
+                    "namespace":                   mdMeta.Namespace,
+                    "failed_machine_name":         failedMachine.Name,
+                    "failed_machine_provider_id":  failedMachine.ProviderID,
+                    "failed_machine_last_operation_state":                  string(failedMachine.LastOperation.State),
+                    "failed_machine_last_operation_machine_operation_type": string(failedMachine.LastOperation.Type),
+                    "failed_machine_owner_ref": failedMachine.OwnerRef}).Set(float64(1))
+
+            }
+        }
+
+    }
+}
+
+// CollectMachineSetMetrics is a method to collect machineSet related metrics.
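+// Condition statuses are encoded as gauge values: ConditionTrue=1, ConditionFalse=0, ConditionUnknown=2.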
+func (c *controller) CollectMachineSetMetrics(ch chan<- prometheus.Metric) {
+    machineSetList, err := c.machineSetLister.MachineSets(c.namespace).List(labels.Everything())
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machineset-count"}).Inc()
+        return
+    }
+    metric, err := prometheus.NewConstMetric(metrics.MachineSetCountDesc, prometheus.GaugeValue, float64(len(machineSetList)))
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machineset-count"}).Inc()
+        return
+    }
+    ch <- metric
+
+    for _, machineSet := range machineSetList {
+
+        msMeta := machineSet.ObjectMeta
+        msSpec := machineSet.Spec
+
+        metrics.MachineSetInfo.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace,
+            "createdAt": strconv.FormatInt(msMeta.GetCreationTimestamp().Time.Unix(), 10),
+            "spec_machine_class_api_group": msSpec.MachineClass.APIGroup,
+            "spec_machine_class_kind":      msSpec.MachineClass.Kind,
+            "spec_machine_class_name":      msSpec.MachineClass.Name}).Set(float64(1))
+
+        metrics.MachineSetInfoSpecReplicas.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace}).Set(float64(msSpec.Replicas))
+        metrics.MachineSetInfoSpecMinReadySeconds.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace}).Set(float64(msSpec.MinReadySeconds))
+
+        for _, condition := range machineSet.Status.Conditions {
+            var status float64
+            switch condition.Status {
+            case v1alpha1.ConditionTrue:
+                status = 1
+            case v1alpha1.ConditionFalse:
+                status = 0
+            case v1alpha1.ConditionUnknown:
+                status = 2
+            }
+
+            metrics.MachineSetStatusCondition.With(prometheus.Labels{
+                "name":      msMeta.Name,
+                "namespace": msMeta.Namespace,
+                "condition": string(condition.Type),
+            }).Set(status)
+        }
+
+        metrics.MachineSetStatusAvailableReplicas.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace,
+        }).Set(float64(machineSet.Status.AvailableReplicas))
+
+        metrics.MachineSetStatusFullyLabelledReplicas.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace,
+        }).Set(float64(machineSet.Status.FullyLabeledReplicas))
+
+        metrics.MachineSetStatusReadyReplicas.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace,
+        }).Set(float64(machineSet.Status.ReadyReplicas))
+
+        metrics.MachineSetStatusReplicas.With(prometheus.Labels{
+            "name":      msMeta.Name,
+            "namespace": msMeta.Namespace,
+        }).Set(float64(machineSet.Status.Replicas))
+
+        if machineSet.Status.FailedMachines != nil {
+
+            for _, failedMachine := range *machineSet.Status.FailedMachines {
+                metrics.MachineSetStatusFailedMachines.With(prometheus.Labels{
+                    "name":                       msMeta.Name,
+                    "namespace":                  msMeta.Namespace,
+                    "failed_machine_name":        failedMachine.Name,
+                    "failed_machine_provider_id": failedMachine.ProviderID,
+                    "failed_machine_last_operation_state":                  string(failedMachine.LastOperation.State),
+                    "failed_machine_last_operation_machine_operation_type": string(failedMachine.LastOperation.Type),
+                    "failed_machine_owner_ref": failedMachine.OwnerRef}).Set(float64(1))
+            }
+        }
+    }
+}
+*/
+
+// CollectMachineMetrics is a method to collect Machine related metrics.
+func (c *controller) CollectMachineMetrics(ch chan<- prometheus.Metric) {
+    // Collect the count of machines managed by the mcm.
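+    // The list comes from the shared informer cache; on error we only bump the
+    // scrape-failure counter and skip this collection cycle.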
+    machineList, err := c.machineLister.Machines(c.namespace).List(labels.Everything())
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
+        return
+    }
+
+    for _, machine := range machineList {
+        mMeta := machine.ObjectMeta
+        mSpec := machine.Spec
+
+        metrics.MachineInfo.With(prometheus.Labels{
+            "name":      mMeta.Name,
+            "namespace": mMeta.Namespace,
+            "createdAt": strconv.FormatInt(mMeta.GetCreationTimestamp().Time.Unix(), 10),
+            "spec_provider_id":     mSpec.ProviderID,
+            "spec_class_api_group": mSpec.Class.APIGroup,
+            "spec_class_kind":      mSpec.Class.Kind,
+            "spec_class_name":      mSpec.Class.Name}).Set(float64(1))
+
+        for _, condition := range machine.Status.Conditions {
+            var status float64
+            switch condition.Status {
+            case v1.ConditionTrue:
+                status = 1
+            case v1.ConditionFalse:
+                status = 0
+            case v1.ConditionUnknown:
+                status = 2
+            }
+
+            metrics.MachineStatusCondition.With(prometheus.Labels{
+                "name":      mMeta.Name,
+                "namespace": mMeta.Namespace,
+                "condition": string(condition.Type),
+            }).Set(status)
+        }
+
+        var phase float64
+        switch machine.Status.CurrentStatus.Phase {
+        case v1alpha1.MachinePending:
+            phase = -2
+        case v1alpha1.MachineAvailable:
+            phase = -1
+        case v1alpha1.MachineRunning:
+            phase = 0
+        case v1alpha1.MachineTerminating:
+            phase = 1
+        case v1alpha1.MachineUnknown:
+            phase = 2
+        case v1alpha1.MachineFailed:
+            phase = 3
+        }
+        metrics.MachineCSPhase.With(prometheus.Labels{
+            "name":      mMeta.Name,
+            "namespace": mMeta.Namespace,
+        }).Set(phase)
+
+    }
+
+    metric, err := prometheus.NewConstMetric(metrics.MachineCountDesc, prometheus.GaugeValue, float64(len(machineList)))
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
+        return
+    }
+    ch <- metric
+
+}
+
+// CollectMachineControllerFrozenStatus is a method to collect the machine controller's frozen status.
+func (c *controller) CollectMachineControllerFrozenStatus(ch chan<- prometheus.Metric) {
+    var frozenStatus float64
+    if c.safetyOptions.MachineControllerFrozen {
+        frozenStatus = 1
+    }
+    metric, err := prometheus.NewConstMetric(metrics.MachineControllerFrozenDesc, prometheus.GaugeValue, frozenStatus)
+    if err != nil {
+        metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
+        return
+    }
+    ch <- metric
+}
+
+// Collect is a method required to implement the prometheus.Collector interface.
+func (c *controller) Collect(ch chan<- prometheus.Metric) {
+    c.CollectMachineMetrics(ch)
+    //c.CollectMachineSetMetrics(ch)
+    //c.CollectMachineDeploymentMetrics(ch)
+    c.CollectMachineControllerFrozenStatus(ch)
+}
diff --git a/pkg/util/provider/machinecontroller/node.go b/pkg/util/provider/machinecontroller/node.go
new file mode 100644
index 000000000..1815d965c
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/node.go
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +// Package controller is used to provide the core functionalities of machine-controller-manager +package controller + +import ( + "k8s.io/klog" + + "k8s.io/api/core/v1" + "k8s.io/client-go/tools/cache" + + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +func (c *controller) nodeAdd(obj interface{}) { + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + if err != nil { + klog.Errorf("Couldn't get key for object %+v: %v", obj, err) + return + } + c.nodeQueue.Add(key) +} + +func (c *controller) nodeUpdate(oldObj, newObj interface{}) { + c.nodeAdd(newObj) +} + +func (c *controller) nodeDelete(obj interface{}) { + node, ok := obj.(*v1.Node) + if node == nil || !ok { + return + } + +} + +// Not being used at the moment, saving it for a future use case. +func (c *controller) reconcileClusterNodeKey(key string) error { + node, err := c.nodeLister.Get(key) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + klog.Errorf("ClusterNode %q: Unable to retrieve object from store: %v", key, err) + return err + } + + return c.reconcileClusterNode(node) +} + +func (c *controller) reconcileClusterNode(node *v1.Node) error { + return nil +} diff --git a/pkg/util/provider/machinecontroller/secret.go b/pkg/util/provider/machinecontroller/secret.go new file mode 100644 index 000000000..f83213281 --- /dev/null +++ b/pkg/util/provider/machinecontroller/secret.go @@ -0,0 +1,189 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+    "time"
+
+    "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+    corev1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/sets"
+    "k8s.io/client-go/tools/cache"
+    "k8s.io/klog"
+)
+
+// reconcileClusterSecretKey reconciles a secret due to controller resync
+// or an event on the secret
+func (c *controller) reconcileClusterSecretKey(key string) error {
+    namespace, name, err := cache.SplitMetaNamespaceKey(key)
+    if err != nil {
+        return err
+    } else if c.namespace != namespace {
+        // Secret exists outside of controller namespace
+        return nil
+    }
+
+    secret, err := c.secretLister.Secrets(c.namespace).Get(name)
+    if errors.IsNotFound(err) {
+        klog.V(4).Infof("%q: Not doing work because it has been deleted", key)
+        return nil
+    } else if err != nil {
+        klog.V(4).Infof("%q: Unable to retrieve object from store: %v", key, err)
+        return err
+    }
+
+    return c.reconcileClusterSecret(secret)
+}
+
+// reconcileClusterSecret manipulates finalizers based on
+// machineClass references
+func (c *controller) reconcileClusterSecret(secret *corev1.Secret) error {
+    startTime := time.Now()
+
+    klog.V(4).Infof("Start syncing %q", secret.Name)
+    defer func() {
+        c.enqueueSecretAfter(secret, 10*time.Minute)
+        klog.V(4).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime))
+    }()
+
+    // Check if machineClasses are referring to this secret
+    exists, err := c.existsMachineClassForSecret(secret.Name)
+    if err != nil {
+        return err
+    }
+
+    if exists {
+        // If one or more machineClasses refer to this secret, add the finalizer (if it doesn't exist)
+        err = c.addSecretFinalizers(secret)
+        if err != nil {
+            return err
+        }
+    } else {
+        if finalizers := sets.NewString(secret.Finalizers...); !finalizers.Has(DeleteFinalizerName) {
+            // Finalizer doesn't exist, simply return nil
+            return nil
+        }
+        err = c.deleteSecretFinalizers(secret)
+        if err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
+
+/*
+    SECTION
+    Manipulate Finalizers
+*/
+
+func (c *controller) addSecretFinalizers(secret *corev1.Secret) error {
+    clone := secret.DeepCopy()
+
+    if finalizers := sets.NewString(clone.Finalizers...); !finalizers.Has(DeleteFinalizerName) {
+        finalizers.Insert(DeleteFinalizerName)
+        return c.updateSecretFinalizers(clone, finalizers.List())
+    }
+    return nil
+}
+
+func (c *controller) deleteSecretFinalizers(secret *corev1.Secret) error {
+    clone := secret.DeepCopy()
+
+    if finalizers := sets.NewString(clone.Finalizers...); finalizers.Has(DeleteFinalizerName) {
+        finalizers.Delete(DeleteFinalizerName)
+        return c.updateSecretFinalizers(clone, finalizers.List())
+    }
+    return nil
+}
+
+func (c *controller) updateSecretFinalizers(secret *corev1.Secret, finalizers []string) error {
+    // Get the latest version of the secret so that we can avoid conflicts
+    secret, err := c.controlCoreClient.CoreV1().Secrets(secret.Namespace).Get(secret.Name, metav1.GetOptions{})
+    if err != nil {
+        return err
+    }
+
+    clone := secret.DeepCopy()
+    clone.Finalizers = finalizers
+    _, err = c.controlCoreClient.CoreV1().Secrets(clone.Namespace).Update(clone)
+
+    if err != nil {
+        klog.Warningf("Updating finalizers of secret %q failed, retrying: %v", secret.Name, err)
+        return err
+    }
+    klog.V(3).Infof("Successfully added/removed finalizer on the secret %q", secret.Name)
+    return err
+}
+
+/*
+    SECTION
+    Event handlers
+*/
+
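+// secretAdd enqueues the secret's namespace/name key for reconciliation.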
+func (c *controller) secretAdd(obj interface{}) {
+    key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
+    if err != nil {
+        klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
+        return
+    }
+    c.secretQueue.Add(key)
+}
+
+func (c *controller) secretDelete(obj interface{}) {
+    c.secretAdd(obj)
+}
+
+func (c *controller) enqueueSecretAfter(obj interface{}, after time.Duration) {
+    key, err := cache.MetaNamespaceKeyFunc(obj)
+    if err != nil {
+        return
+    }
+    c.secretQueue.AddAfter(key, after)
+}
+
+func (c *controller) machineClassToSecretAdd(obj interface{}) {
+    machineClass, ok := obj.(*v1alpha1.MachineClass)
+    // Guard against machineClasses without a secretRef to avoid a nil dereference
+    if machineClass == nil || !ok || machineClass.SecretRef == nil {
+        return
+    }
+    c.secretQueue.Add(machineClass.SecretRef.Namespace + "/" + machineClass.SecretRef.Name)
+}
+
+func (c *controller) machineClassToSecretUpdate(oldObj interface{}, newObj interface{}) {
+    oldMachineClass, ok := oldObj.(*v1alpha1.MachineClass)
+    if oldMachineClass == nil || !ok || oldMachineClass.SecretRef == nil {
+        return
+    }
+    newMachineClass, ok := newObj.(*v1alpha1.MachineClass)
+    if newMachineClass == nil || !ok || newMachineClass.SecretRef == nil {
+        return
+    }
+
+    if oldMachineClass.SecretRef.Name != newMachineClass.SecretRef.Name ||
+        oldMachineClass.SecretRef.Namespace != newMachineClass.SecretRef.Namespace {
+        c.secretQueue.Add(oldMachineClass.SecretRef.Namespace + "/" + oldMachineClass.SecretRef.Name)
+        c.secretQueue.Add(newMachineClass.SecretRef.Namespace + "/" + newMachineClass.SecretRef.Name)
+    }
+}
+
+func (c *controller) machineClassToSecretDelete(obj interface{}) {
+    c.machineClassToSecretAdd(obj)
+}
diff --git a/pkg/util/provider/machinecontroller/secret_test.go b/pkg/util/provider/machinecontroller/secret_test.go
new file mode 100644
index 000000000..78d86c625
--- /dev/null
+++ b/pkg/util/provider/machinecontroller/secret_test.go
@@ -0,0 +1,146 @@
+/*
+Copyright (c) 2019 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package controller
+
+import (
+    . "github.com/onsi/ginkgo"
+    . "github.com/onsi/gomega"
+
+    corev1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+)
+
+var _ = Describe("secret", func() {
+
+    //TODO: This method has a dependency on the generic machineClass. Implement later.
+    Describe("#reconcileClusterSecret", func() {})
+
+    Describe("#addSecretFinalizers", func() {
+        var (
+            testSecret *corev1.Secret
+        )
+
+        BeforeEach(func() {
+            testSecret = &corev1.Secret{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      "Secret-test",
+                    Namespace: testNamespace,
+                },
+            }
+        })
+
+        // Testcase: It should add finalizer on Secret.
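+        // The fake controller is seeded with the secret through object trackers;
+        // waitForCacheSync ensures the informer caches have observed it before and
+        // after the call under test.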
+ It("should add finalizer on Secret.", func() { + stop := make(chan struct{}) + defer close(stop) + + objects := []runtime.Object{} + objects = append(objects, testSecret) + c, trackers := createController(stop, testNamespace, nil, objects, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + c.addSecretFinalizers(testSecret) + + waitForCacheSync(stop, c) + expectedSecret, _ := c.controlCoreClient.CoreV1().Secrets(testSecret.Namespace).Get(testSecret.Name, metav1.GetOptions{}) + + Expect(expectedSecret.Finalizers).To(HaveLen(1)) + Expect(expectedSecret.Finalizers).To(ContainElement(DeleteFinalizerName)) + }) + }) + + Describe("#deleteSecretFinalizers", func() { + var ( + testSecret *corev1.Secret + finalizers []string + ) + + BeforeEach(func() { + testSecret = &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "Secret-test", + Namespace: testNamespace, + }, + } + finalizers = []string{DeleteFinalizerName} + }) + + // Testcase: It should delete the finalizer from Secret. + It("should delete the finalizer from Secret.", func() { + stop := make(chan struct{}) + defer close(stop) + + objects := []runtime.Object{} + objects = append(objects, testSecret) + c, trackers := createController(stop, testNamespace, nil, objects, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + testSecret, _ := c.controlCoreClient.CoreV1().Secrets(testSecret.Namespace).Get(testSecret.Name, metav1.GetOptions{}) + + testSecret.Finalizers = finalizers + Expect(testSecret.Finalizers).Should(Not(BeEmpty())) + + c.deleteSecretFinalizers(testSecret) + + waitForCacheSync(stop, c) + + expectedSecret, _ := c.controlCoreClient.CoreV1().Secrets(testSecret.Namespace).Get(testSecret.Name, metav1.GetOptions{}) + + Expect(expectedSecret.Finalizers).Should(HaveLen(0)) + }) + }) + + Describe("#updateSecretFinalizers", func() { + var ( + testSecret *corev1.Secret + finalizers []string + ) + + BeforeEach(func() { + testSecret = &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "Secret-test", + Namespace: testNamespace, + }, + } + + finalizers = []string{"finalizer1", "finalizer2"} + }) + + // Testcase: It should update the finalizer on Secret. + It("should update the finalizer on Secret.", func() { + stop := make(chan struct{}) + defer close(stop) + + objects := []runtime.Object{} + objects = append(objects, testSecret) + c, trackers := createController(stop, testNamespace, nil, objects, nil, nil) + defer trackers.Stop() + waitForCacheSync(stop, c) + + c.updateSecretFinalizers(testSecret, finalizers) + + waitForCacheSync(stop, c) + + testSecret, _ := c.controlCoreClient.CoreV1().Secrets(testSecret.Namespace).Get(testSecret.Name, metav1.GetOptions{}) + + Expect(testSecret.Finalizers).To(Equal(finalizers)) + }) + }) +}) diff --git a/pkg/util/provider/machinecontroller/secret_util.go b/pkg/util/provider/machinecontroller/secret_util.go new file mode 100644 index 000000000..8460ae92c --- /dev/null +++ b/pkg/util/provider/machinecontroller/secret_util.go @@ -0,0 +1,55 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package controller is used to provide the core functionalities of machine-controller-manager
+package controller
+
+import (
+    "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
+    "k8s.io/apimachinery/pkg/labels"
+)
+
+// existsMachineClassForSecret checks for any machineClass
+// referring to the passed secret object
+// TODO: Check using finalizers on secrets
+func (c *controller) existsMachineClassForSecret(name string) (bool, error) {
+    machineClasses, err := c.findMachineClassForSecret(name)
+    if err != nil {
+        return false, err
+    }
+
+    if len(machineClasses) == 0 {
+        return false, nil
+    }
+
+    return true, nil
+}
+
+// findMachineClassForSecret returns the set of
+// machineClasses referring to the passed secret
+func (c *controller) findMachineClassForSecret(name string) ([]*v1alpha1.MachineClass, error) {
+    machineClasses, err := c.machineClassLister.List(labels.Everything())
+    if err != nil {
+        return nil, err
+    }
+    var filtered []*v1alpha1.MachineClass
+    for _, machineClass := range machineClasses {
+        // Skip machineClasses without a secretRef to avoid a nil dereference
+        if machineClass.SecretRef != nil && machineClass.SecretRef.Name == name {
+            filtered = append(filtered, machineClass)
+        }
+    }
+    return filtered, nil
+}
diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go
new file mode 100644
index 000000000..2aa6b0434
--- /dev/null
+++ b/pkg/util/provider/machineutils/utils.go
@@ -0,0 +1,50 @@
+/*
+Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package machineutils contains the consts and global variables for machine operation
+package machineutils
+
+const (
+    // GetVMStatus sets machine status to terminating and specifies next step as getting VMs
+    GetVMStatus = "Set machine status to termination. Now, getting VM Status"
+    // InitiateDrain specifies next step as initiate node drain
+    InitiateDrain = "Initiate node drain"
+    // InitiateVMDeletion specifies next step as initiate VM deletion
+    InitiateVMDeletion = "Initiate VM deletion"
+    // InitiateNodeDeletion specifies next step as node object deletion
+    InitiateNodeDeletion = "Initiate node object deletion"
+    // InitiateFinalizerRemoval specifies next step as machine finalizer removal
+    InitiateFinalizerRemoval = "Initiate machine object finalizer removal"
+
+    // LastAppliedALTAnnotation contains the last configuration of annotations, labels & taints applied on the node object
+    LastAppliedALTAnnotation = "node.machine.sapcloud.io/last-applied-anno-labels-taints"
+    // MachinePriority is the annotation used to specify the priority
+    // associated with a machine while deleting it. The lower the priority,
+    // the more likely the machine is to be deleted first.
+    // The default priority for a machine is 3.
+    MachinePriority = "machinepriority.machine.sapcloud.io"
+)
+
+// Retry is a label for retrying an operation
+type Retry bool
+
+// These are the valid values for Retry.
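+// Illustrative usage (not part of this file): a reconcile step typically returns
+// a (Retry, error) pair; RetryOp asks the caller to requeue the key promptly,
+// while DoNotRetryOp defers further work to the regular resync period.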
+const ( + // RetryOp tells the controller to retry + RetryOp Retry = true + // DoNotRetryOp tells the controller to not retry for now. Resync after re-sync period + DoNotRetryOp Retry = false +) diff --git a/pkg/util/provider/metrics/metrics.go b/pkg/util/provider/metrics/metrics.go new file mode 100644 index 000000000..c2a521043 --- /dev/null +++ b/pkg/util/provider/metrics/metrics.go @@ -0,0 +1,344 @@ +/* +Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + namespace = "mcm" + machineSubsystem = "machine" + machinesetSubsystem = "machine_set" + machinedeploymentSubsystem = "machine_deployment" + cloudAPISubsystem = "cloud_api" +) + +var ( + // MachineControllerFrozenDesc is a metric about MachineController's frozen status + MachineControllerFrozenDesc = prometheus.NewDesc("mcm_machine_controller_frozen", "Frozen status of the machine controller manager.", nil, nil) + // MachineCountDesc is a metric about machine count of the mcm manages + MachineCountDesc = prometheus.NewDesc("mcm_machine_items_total", "Count of machines currently managed by the mcm.", nil, nil) + + //MachineCSPhase Current status phase of the Machines currently managed by the mcm. + MachineCSPhase = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machineSubsystem, + Name: "current_status_phase", + Help: "Current status phase of the Machines currently managed by the mcm.", + }, []string{"name", "namespace"}) + + //MachineInfo Information of the Machines currently managed by the mcm. + MachineInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machineSubsystem, + Name: "info", + Help: "Information of the Machines currently managed by the mcm.", + }, []string{"name", "namespace", "createdAt", + "spec_provider_id", "spec_class_api_group", "spec_class_kind", "spec_class_name"}) + + // MachineStatusCondition Information of the mcm managed Machines' status conditions + MachineStatusCondition = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machineSubsystem, + Name: "status_condition", + Help: "Information of the mcm managed Machines' status conditions.", + }, []string{"name", "namespace", "condition"}) + + /* + // MachineSetCountDesc Count of machinesets currently managed by the mcm + MachineSetCountDesc = prometheus.NewDesc("mcm_machine_set_items_total", "Count of machinesets currently managed by the mcm.", nil, nil) + + // MachineSetInfo Information of the Machinesets currently managed by the mcm. 
+ MachineSetInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "info", + Help: "Information of the Machinesets currently managed by the mcm.", + }, []string{"name", "namespace", "createdAt", + "spec_machine_class_api_group", "spec_machine_class_kind", "spec_machine_class_name"}) + + // MachineSetInfoSpecReplicas Count of the Machinesets Spec Replicas. + MachineSetInfoSpecReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "info_spec_replicas", + Help: "Count of the Machinesets Spec Replicas.", + }, []string{"name", "namespace"}) + + // MachineSetInfoSpecMinReadySeconds Information of the Machinesets currently managed by the mcm. + MachineSetInfoSpecMinReadySeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "info_spec_min_ready_seconds", + Help: "Information of the Machinesets currently managed by the mcm.", + }, []string{"name", "namespace"}) + + // MachineSetStatusCondition Information of the mcm managed Machinesets' status conditions. + MachineSetStatusCondition = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "status_condition", + Help: "Information of the mcm managed Machinesets' status conditions.", + }, []string{"name", "namespace", "condition"}) + + // MachineSetStatusFailedMachines Information of the mcm managed Machinesets' failed machines. + MachineSetStatusFailedMachines = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "failed_machines", + Help: "Information of the mcm managed Machinesets' failed machines.", + }, []string{"name", "namespace", "failed_machine_name", "failed_machine_provider_id", "failed_machine_owner_ref", + "failed_machine_last_operation_state", + "failed_machine_last_operation_machine_operation_type"}) + + // MachineSetStatusAvailableReplicas Information of the mcm managed Machinesets' status for available replicas. + MachineSetStatusAvailableReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "status_available_replicas", + Help: "Information of the mcm managed Machinesets' status for available replicas.", + }, []string{"name", "namespace"}) + + // MachineSetStatusFullyLabelledReplicas Information of the mcm managed Machinesets' status for fully labelled replicas. + MachineSetStatusFullyLabelledReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "status_fully_labelled_replicas", + Help: "Information of the mcm managed Machinesets' status for fully labelled replicas.", + }, []string{"name", "namespace"}) + + // MachineSetStatusReadyReplicas Information of the mcm managed Machinesets' status for ready replicas + MachineSetStatusReadyReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: machinesetSubsystem, + Name: "status_ready_replicas", + Help: "Information of the mcm managed Machinesets' status for ready replicas.", + }, []string{"name", "namespace"}) + + // MachineSetStatusReplicas Information of the mcm managed Machinesets' status for replicas. 
+	MachineSetStatusReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinesetSubsystem,
+		Name:      "status_replicas",
+		Help:      "Information of the mcm managed Machinesets' status for replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentCountDesc Count of machinedeployments currently managed by the mcm.
+	MachineDeploymentCountDesc = prometheus.NewDesc("mcm_machine_deployment_items_total", "Count of machinedeployments currently managed by the mcm.", nil, nil)
+
+	// MachineDeploymentInfo Information of the Machinedeployments currently managed by the mcm.
+	MachineDeploymentInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info",
+		Help:      "Information of the Machinedeployments currently managed by the mcm.",
+	}, []string{"name", "namespace", "createdAt", "spec_strategy_type"})
+
+	// MachineDeploymentInfoSpecPaused Information of the Machinedeployments paused status.
+	MachineDeploymentInfoSpecPaused = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_paused",
+		Help:      "Information of the Machinedeployments paused status.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecReplicas Information of the Machinedeployments spec replicas.
+	MachineDeploymentInfoSpecReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_replicas",
+		Help:      "Information of the Machinedeployments spec replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecMinReadySeconds Information of the Machinedeployments spec min ready seconds.
+	MachineDeploymentInfoSpecMinReadySeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_min_ready_seconds",
+		Help:      "Information of the Machinedeployments spec min ready seconds.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecRollingUpdateMaxSurge Information of the Machinedeployments spec rolling update max surge.
+	MachineDeploymentInfoSpecRollingUpdateMaxSurge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_rolling_update_max_surge",
+		Help:      "Information of the Machinedeployments spec rolling update max surge.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecRollingUpdateMaxUnavailable Information of the Machinedeployments spec rolling update max unavailable.
+	MachineDeploymentInfoSpecRollingUpdateMaxUnavailable = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_rolling_update_max_unavailable",
+		Help:      "Information of the Machinedeployments spec rolling update max unavailable.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecRevisionHistoryLimit Information of the Machinedeployments spec revision history limit.
+	MachineDeploymentInfoSpecRevisionHistoryLimit = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_revision_history_limit",
+		Help:      "Information of the Machinedeployments spec revision history limit.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecProgressDeadlineSeconds Information of the Machinedeployments spec progress deadline seconds.
+	MachineDeploymentInfoSpecProgressDeadlineSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_progress_deadline_seconds",
+		Help:      "Information of the Machinedeployments spec progress deadline seconds.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentInfoSpecRollbackToRevision Information of the Machinedeployments spec rollback to revision.
+	MachineDeploymentInfoSpecRollbackToRevision = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "info_spec_rollback_to_revision",
+		Help:      "Information of the Machinedeployments spec rollback to revision.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusCondition Information of the mcm managed Machinedeployments' status conditions.
+	MachineDeploymentStatusCondition = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_condition",
+		Help:      "Information of the mcm managed Machinedeployments' status conditions.",
+	}, []string{"name", "namespace", "condition"})
+
+	// MachineDeploymentStatusAvailableReplicas Count of the mcm managed Machinedeployments available replicas.
+	MachineDeploymentStatusAvailableReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_available_replicas",
+		Help:      "Count of the mcm managed Machinedeployments available replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusUnavailableReplicas Count of the mcm managed Machinedeployments unavailable replicas.
+	MachineDeploymentStatusUnavailableReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_unavailable_replicas",
+		Help:      "Count of the mcm managed Machinedeployments unavailable replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusReadyReplicas Count of the mcm managed Machinedeployments ready replicas.
+	MachineDeploymentStatusReadyReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_ready_replicas",
+		Help:      "Count of the mcm managed Machinedeployments ready replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusUpdatedReplicas Count of the mcm managed Machinedeployments updated replicas.
+	MachineDeploymentStatusUpdatedReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_updated_replicas",
+		Help:      "Count of the mcm managed Machinedeployments updated replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusCollisionCount Collision count of the mcm managed Machinedeployments.
+	MachineDeploymentStatusCollisionCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_collision_count",
+		Help:      "Collision count of the mcm managed Machinedeployments.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusReplicas Count of the mcm managed Machinedeployments replicas.
+	MachineDeploymentStatusReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "status_replicas",
+		Help:      "Count of the mcm managed Machinedeployments replicas.",
+	}, []string{"name", "namespace"})
+
+	// MachineDeploymentStatusFailedMachines Information of the mcm managed Machinedeployments' failed machines.
+	MachineDeploymentStatusFailedMachines = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: machinedeploymentSubsystem,
+		Name:      "failed_machines",
+		Help:      "Information of the mcm managed Machinedeployments' failed machines.",
+	}, []string{"name", "namespace", "failed_machine_name", "failed_machine_provider_id", "failed_machine_owner_ref",
+		"failed_machine_last_operation_state",
+		"failed_machine_last_operation_machine_operation_type"})
+	*/
+
+	// APIRequestCount Number of Cloud Service API requests, partitioned by provider and service.
+	APIRequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: namespace,
+		Subsystem: cloudAPISubsystem,
+		Name:      "requests_total",
+		Help:      "Number of Cloud Service API requests, partitioned by provider and service.",
+	}, []string{"provider", "service"},
+	)
+
+	// APIFailedRequestCount Number of failed Cloud Service API requests, partitioned by provider and service.
+	APIFailedRequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: namespace,
+		Subsystem: cloudAPISubsystem,
+		Name:      "requests_failed_total",
+		Help:      "Number of failed Cloud Service API requests, partitioned by provider and service.",
+	}, []string{"provider", "service"},
+	)
+
+	// ScrapeFailedCounter is a Prometheus metric which counts errors during metrics collection.
+	ScrapeFailedCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: namespace,
+		Name:      "scrape_failure_total",
+		Help:      "Total count of scrape failures.",
+	}, []string{"kind"})
+)
+
+func init() {
+	prometheus.MustRegister(ScrapeFailedCounter)
+	prometheus.MustRegister(MachineInfo)
+	prometheus.MustRegister(MachineStatusCondition)
+	prometheus.MustRegister(MachineCSPhase)
+	/*
+		prometheus.MustRegister(MachineSetInfo)
+		prometheus.MustRegister(MachineSetInfoSpecReplicas)
+		prometheus.MustRegister(MachineSetInfoSpecMinReadySeconds)
+		prometheus.MustRegister(MachineSetStatusAvailableReplicas)
+		prometheus.MustRegister(MachineSetStatusFullyLabelledReplicas)
+		prometheus.MustRegister(MachineSetStatusReadyReplicas)
+		prometheus.MustRegister(MachineSetStatusReplicas)
+		prometheus.MustRegister(MachineSetStatusCondition)
+		prometheus.MustRegister(MachineSetStatusFailedMachines)
+		prometheus.MustRegister(MachineDeploymentInfo)
+		prometheus.MustRegister(MachineDeploymentInfoSpecPaused)
+		prometheus.MustRegister(MachineDeploymentInfoSpecReplicas)
+		prometheus.MustRegister(MachineDeploymentInfoSpecRevisionHistoryLimit)
+		prometheus.MustRegister(MachineDeploymentInfoSpecMinReadySeconds)
+		prometheus.MustRegister(MachineDeploymentInfoSpecRollingUpdateMaxSurge)
+		prometheus.MustRegister(MachineDeploymentInfoSpecRollingUpdateMaxUnavailable)
+		prometheus.MustRegister(MachineDeploymentInfoSpecProgressDeadlineSeconds)
+		prometheus.MustRegister(MachineDeploymentInfoSpecRollbackToRevision)
+		prometheus.MustRegister(MachineDeploymentStatusCondition)
+		prometheus.MustRegister(MachineDeploymentStatusAvailableReplicas)
+		prometheus.MustRegister(MachineDeploymentStatusUnavailableReplicas)
+		prometheus.MustRegister(MachineDeploymentStatusReadyReplicas)
+		prometheus.MustRegister(MachineDeploymentStatusUpdatedReplicas)
+		prometheus.MustRegister(MachineDeploymentStatusCollisionCount)
+		prometheus.MustRegister(MachineDeploymentStatusReplicas)
+		prometheus.MustRegister(MachineDeploymentStatusFailedMachines)
+	*/
+	prometheus.MustRegister(APIRequestCount)
+	prometheus.MustRegister(APIFailedRequestCount)
+}
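The two cloud API counters registered above are meant to be bumped by provider drivers around each cloud SDK call. A minimal sketch of that pattern follows (not part of the patch): the import path for this metrics package is an assumption, and the "sample-provider"/"compute" label values and the createVM helper are illustrative.

package driver

import (
	"errors"

	// Import path assumed for illustration; adjust to where this metrics package lives.
	"github.com/gardener/machine-controller-manager/pkg/util/provider/metrics"
)

// createVM stands in for a real cloud SDK call.
func createVM() error { return errors.New("not implemented") }

// createVMWithMetrics counts every request, and additionally counts the failed ones.
func createVMWithMetrics() error {
	metrics.APIRequestCount.WithLabelValues("sample-provider", "compute").Inc()
	if err := createVM(); err != nil {
		metrics.APIFailedRequestCount.WithLabelValues("sample-provider", "compute").Inc()
		return err
	}
	return nil
}

Keeping a total counter and a failed counter (rather than a success counter) lets a dashboard derive the error ratio directly as requests_failed_total / requests_total.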
diff --git a/pkg/util/provider/options/helpers.go b/pkg/util/provider/options/helpers.go
new file mode 100644
index 000000000..b13bfecc5
--- /dev/null
+++ b/pkg/util/provider/options/helpers.go
@@ -0,0 +1,115 @@
+/*
+Copyright 2016 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This file was copied and modified from the kubernetes/kubernetes project
+https://github.com/kubernetes/kubernetes/blob/release-1.8/pkg/apis/componentconfig/helpers.go
+*/
+
+// Package options is used to specify options to MCM
+package options
+
+import (
+	"encoding/json"
+	"fmt"
+	"net"
+
+	"k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	utilnet "k8s.io/apimachinery/pkg/util/net"
+)
+
+// used for validating command line opts
+// TODO(mikedanese): remove these when we remove command line flags
+
+// IPVar is used to store the IP address as a string
+type IPVar struct {
+	Val *string
+}
+
+// Set validates the given string as an IP address and stores it in IPVar
+func (v IPVar) Set(s string) error {
+	if net.ParseIP(s) == nil {
+		return fmt.Errorf("%q is not a valid IP address", s)
+	}
+	if v.Val == nil {
+		// it's okay to panic here since this is programmer error
+		panic("the string pointer passed into IPVar should not be nil")
+	}
+	*v.Val = s
+	return nil
+}
+
+// String is used to get IPVar in string format
+func (v IPVar) String() string {
+	if v.Val == nil {
+		return ""
+	}
+	return *v.Val
+}
+
+// Type is used to determine the type of IPVar
+func (v IPVar) Type() string {
+	return "ip"
+}
+
+// PortRangeVar is used to store a range of ports
+type PortRangeVar struct {
+	Val *string
+}
+
+// Set validates the given string as a port range and stores it in PortRangeVar
+func (v PortRangeVar) Set(s string) error {
+	if _, err := utilnet.ParsePortRange(s); err != nil {
+		return fmt.Errorf("%q is not a valid port range: %v", s, err)
+	}
+	if v.Val == nil {
+		// it's okay to panic here since this is programmer error
+		panic("the string pointer passed into PortRangeVar should not be nil")
+	}
+	*v.Val = s
+	return nil
+}
+
+// String is used to get PortRangeVar in string format
+func (v PortRangeVar) String() string {
+	if v.Val == nil {
+		return ""
+	}
+	return *v.Val
+}
+
+// Type is used to determine the type of PortRangeVar
+func (v PortRangeVar) Type() string {
+	return "port-range"
+}
+
+// ConvertObjToConfigMap converts an object to a ConfigMap.
+// This is specifically meant for ComponentConfigs.
+func ConvertObjToConfigMap(name string, obj runtime.Object) (*v1.ConfigMap, error) {
+	eJSONBytes, err := json.Marshal(obj)
+	if err != nil {
+		return nil, err
+	}
+	cm := &v1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+		},
+		Data: map[string]string{
+			name: string(eJSONBytes),
+		},
+	}
+	return cm, nil
+}
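IPVar and PortRangeVar implement the Set/String/Type triple that pflag expects of a flag value, so they can back plain string fields while rejecting malformed input at parse time. A minimal sketch of that usage (not part of the patch), reusing the options import path introduced here; the flag set and flag name are illustrative:

package main

import (
	"fmt"

	"github.com/spf13/pflag"

	// Import path taken from this patch.
	"github.com/gardener/machine-controller-manager/pkg/util/provider/options"
)

func main() {
	var address string
	fs := pflag.NewFlagSet("demo", pflag.ContinueOnError)
	// IPVar validates that the supplied value parses as an IP before storing it.
	fs.Var(options.IPVar{Val: &address}, "address", "IP address to serve on")
	if err := fs.Parse([]string{"--address=10.0.0.1"}); err != nil {
		panic(err)
	}
	fmt.Println(address) // prints 10.0.0.1
}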
diff --git a/pkg/util/provider/options/types.go b/pkg/util/provider/options/types.go
new file mode 100644
index 000000000..c2448eef4
--- /dev/null
+++ b/pkg/util/provider/options/types.go
@@ -0,0 +1,152 @@
+/*
+Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package options is used to specify options to MCM
+package options
+
+import (
+	"time"
+
+	mcmoptions "github.com/gardener/machine-controller-manager/pkg/options"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// ClientConnectionConfiguration contains details for constructing a client.
+type ClientConnectionConfiguration struct {
+	// kubeConfigFile is the path to a kubeconfig file.
+	KubeConfigFile string
+	// acceptContentTypes defines the Accept header sent by clients when connecting to a server, overriding the
+	// default value of 'application/json'. This field will control all connections to the server used by a particular
+	// client.
+	AcceptContentTypes string
+	// contentType is the content type used when sending data to the server from this client.
+	ContentType string
+	// qps controls the number of queries per second allowed for this connection.
+	QPS float32
+	// burst allows extra queries to accumulate when a client is exceeding its rate.
+	Burst int
+}
+
+// MachineControllerConfiguration contains machine configurations
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
+type MachineControllerConfiguration struct {
+	metav1.TypeMeta
+
+	// namespace in the seed cluster in which the controller looks for the resources.
+	Namespace string
+
+	// port is the port that the controller-manager's http service runs on.
+	Port int32
+	// address is the IP address to serve on (set to 0.0.0.0 for all interfaces).
+	Address string
+	// CloudProvider is the provider for cloud services.
+	CloudProvider string
+	// ConcurrentNodeSyncs is the number of node objects that are
+	// allowed to sync concurrently. Larger number = more responsive nodes,
+	// but more CPU (and network) load.
+	ConcurrentNodeSyncs int32
+
+	// enableProfiling enables profiling via the web interface host:port/debug/pprof/
+	EnableProfiling bool
+	// enableContentionProfiling enables lock contention profiling, if enableProfiling is true.
+	EnableContentionProfiling bool
+	// contentType is the content type of requests sent to the apiserver.
+	ContentType string
+	// kubeAPIQPS is the QPS to use while talking with the kubernetes apiserver.
+	KubeAPIQPS float32
+	// kubeAPIBurst is the burst to use while talking with the kubernetes apiserver.
+	KubeAPIBurst int32
+	// leaderElection defines the configuration of the leader election client.
+	LeaderElection mcmoptions.LeaderElectionConfiguration
+	// How long to wait between starting controller managers
+	ControllerStartInterval metav1.Duration
+	// minResyncPeriod is the resync period in reflectors; will be random between
+	// minResyncPeriod and 2*minResyncPeriod.
+	MinResyncPeriod metav1.Duration
+
+	// SafetyOptions is the set of options used to ensure the safety of the controller
+	SafetyOptions SafetyOptions
+
+	// NodeConditions is a comma-separated string of known NodeConditions. If any of these
+	// conditions is set on a node for the configured timeout period, the machine will be
+	// declared failed and will be replaced.
+	NodeConditions string
+
+	// BootstrapTokenAuthExtraGroups is a comma-separated string of groups to set the
+	// bootstrap token's "auth-extra-groups" field to.
+	BootstrapTokenAuthExtraGroups string
+}
+
+// SafetyOptions are used to configure the upper and lower limits
+// applied while freezing machineSet objects
+type SafetyOptions struct {
+	// Timeout (as a duration) used during the creation of
+	// a machine before it is declared as failed
+	MachineCreationTimeout metav1.Duration
+	// Timeout (as a duration) used during the health-check of
+	// a machine before it is declared as failed
+	MachineHealthTimeout metav1.Duration
+	// Deprecated. No effect. Timeout (as a duration) used while draining a machine before
+	// deletion, beyond which the machine is forcefully deleted
+	MachineDrainTimeout metav1.Duration
+	// Maximum number of times eviction is attempted on a pod before it is forcibly deleted
+	// during the draining of a machine.
+	MaxEvictRetries int32
+	// Timeout (as a duration) used while waiting for a PV to detach
+	PvDetachTimeout metav1.Duration
+
+	// Timeout (as a duration) for which the APIServer can be down before
+	// the safety controller declares the machine controller frozen
+	MachineSafetyAPIServerStatusCheckTimeout metav1.Duration
+	// Period (as a duration) used by the safety controller
+	// to poll for orphan VMs
+	MachineSafetyOrphanVMsPeriod metav1.Duration
+	// Period (as a duration) used by the safety controller
+	// to poll for the APIServer's health
+	MachineSafetyAPIServerStatusCheckPeriod metav1.Duration
+
+	// APIserverInactiveStartTime keeps track of the
+	// start time from which the APIServers have been unreachable
+	APIserverInactiveStartTime time.Time
+	// MachineControllerFrozen indicates whether the machine controller
+	// is frozen due to unreachable APIServers
+	MachineControllerFrozen bool
+}
+
+// LeaderElectionConfiguration defines the configuration of leader election
+// clients for components that can run with leader election enabled.
+type LeaderElectionConfiguration struct {
+	// leaderElect enables a leader election client to gain leadership
+	// before executing the main loop. Enable this when running replicated
+	// components for high availability.
+	LeaderElect bool
+	// leaseDuration is the duration that non-leader candidates will wait
+	// after observing a leadership renewal until attempting to acquire
+	// leadership of a led but unrenewed leader slot. This is effectively the
+	// maximum duration that a leader can be stopped before it is replaced
+	// by another candidate. This is only applicable if leader election is
+	// enabled.
+	LeaseDuration metav1.Duration
+	// renewDeadline is the interval between attempts by the acting master to
+	// renew a leadership slot before it stops leading. This must be less
+	// than or equal to the lease duration. This is only applicable if leader
+	// election is enabled.
+	RenewDeadline metav1.Duration
+	// retryPeriod is the duration the clients should wait between attempting
+	// acquisition and renewal of a leadership. This is only applicable if
+	// leader election is enabled.
+	RetryPeriod metav1.Duration
+	// resourceLock indicates the resource object type that will be used to lock
+	// during leader election cycles.
+	ResourceLock string
+}
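MachineControllerConfiguration and SafetyOptions are plain data carriers; callers populate them (typically from command-line flags) before handing them to the controller. A minimal sketch with purely illustrative values follows; the patch mandates none of these defaults, and the import path is the one introduced above.

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	// Import path taken from this patch.
	"github.com/gardener/machine-controller-manager/pkg/util/provider/options"
)

func main() {
	// All values below are assumptions chosen for the example.
	cfg := options.MachineControllerConfiguration{
		Namespace:     "default",
		CloudProvider: "sample-provider",
		SafetyOptions: options.SafetyOptions{
			MachineCreationTimeout:       metav1.Duration{Duration: 20 * time.Minute},
			MachineHealthTimeout:         metav1.Duration{Duration: 10 * time.Minute},
			MaxEvictRetries:              10,
			PvDetachTimeout:              metav1.Duration{Duration: 2 * time.Minute},
			MachineSafetyOrphanVMsPeriod: metav1.Duration{Duration: 30 * time.Minute},
		},
	}
	fmt.Printf("machine health timeout: %s\n", cfg.SafetyOptions.MachineHealthTimeout.Duration)
}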