Skip to content

Commit

Permalink
Refactor backup controller with controller-runtime.
Browse files Browse the repository at this point in the history
Signed-off-by: Ming <mqiu@vmware.com>
Signed-off-by: Xun Jiang <blackpiglet@gmail.com>
  • Loading branch information
qiuming-best authored and Xun Jiang committed Mar 1, 2023
1 parent 27bbbec commit 02fa8c3
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 268 deletions.
133 changes: 91 additions & 42 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ import (
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v4/apis/volumesnapshot/v1"
snapshotv1client "github.com/kubernetes-csi/external-snapshotter/client/v4/clientset/versioned"
snapshotv1informers "github.com/kubernetes-csi/external-snapshotter/client/v4/informers/externalversions"
snapshotv1listers "github.com/kubernetes-csi/external-snapshotter/client/v4/listers/volumesnapshot/v1"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
Expand Down Expand Up @@ -63,7 +65,6 @@ import (
velerodiscovery "github.com/vmware-tanzu/velero/pkg/discovery"
"github.com/vmware-tanzu/velero/pkg/features"
clientset "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned"
informers "github.com/vmware-tanzu/velero/pkg/generated/informers/externalversions"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
"github.com/vmware-tanzu/velero/pkg/persistence"
Expand Down Expand Up @@ -233,7 +234,6 @@ type server struct {
discoveryClient discovery.DiscoveryInterface
discoveryHelper velerodiscovery.Helper
dynamicClient dynamic.Interface
sharedInformerFactory informers.SharedInformerFactory
csiSnapshotterSharedInformerFactory *CSIInformerFactoryWrapper
csiSnapshotClient *snapshotv1client.Clientset
ctx context.Context
Expand Down Expand Up @@ -348,7 +348,6 @@ func newServer(f client.Factory, config serverConfig, logger *logrus.Logger) (*s
veleroClient: veleroClient,
discoveryClient: veleroClient.Discovery(),
dynamicClient: dynamicClient,
sharedInformerFactory: informers.NewSharedInformerFactoryWithOptions(veleroClient, 0, informers.WithNamespace(f.Namespace())),
csiSnapshotterSharedInformerFactory: NewCSIInformerFactoryWrapper(csiSnapClient),
csiSnapshotClient: csiSnapClient,
ctx: ctx,
Expand Down Expand Up @@ -479,32 +478,32 @@ func (s *server) veleroResourcesExist() error {
}

// High priorities:
// - Custom Resource Definitions come before Custom Resource so that they can be
// restored with their corresponding CRD.
// - Namespaces go second because all namespaced resources depend on them.
// - Storage Classes are needed to create PVs and PVCs correctly.
// - VolumeSnapshotClasses are needed to provision volumes using volumesnapshots
// - VolumeSnapshotContents are needed as they contain the handle to the volume snapshot in the
// storage provider
// - VolumeSnapshots are needed to create PVCs using the VolumeSnapshot as their data source.
// - PVs go before PVCs because PVCs depend on them.
// - PVCs go before pods or controllers so they can be mounted as volumes.
// - Service accounts go before secrets so service account token secrets can be filled automatically.
// - Secrets and config maps go before pods or controllers so they can be mounted
// as volumes.
// - Limit ranges go before pods or controllers so pods can use them.
// - Pods go before controllers so they can be explicitly restored and potentially
// have pod volume restores run before controllers adopt the pods.
// - Replica sets go before deployments/other controllers so they can be explicitly
// restored and be adopted by controllers.
// - CAPI ClusterClasses go before Clusters.
// - Custom Resource Definitions come before Custom Resource so that they can be
// restored with their corresponding CRD.
// - Namespaces go second because all namespaced resources depend on them.
// - Storage Classes are needed to create PVs and PVCs correctly.
// - VolumeSnapshotClasses are needed to provision volumes using volumesnapshots
// - VolumeSnapshotContents are needed as they contain the handle to the volume snapshot in the
// storage provider
// - VolumeSnapshots are needed to create PVCs using the VolumeSnapshot as their data source.
// - PVs go before PVCs because PVCs depend on them.
// - PVCs go before pods or controllers so they can be mounted as volumes.
// - Service accounts go before secrets so service account token secrets can be filled automatically.
// - Secrets and config maps go before pods or controllers so they can be mounted
// as volumes.
// - Limit ranges go before pods or controllers so pods can use them.
// - Pods go before controllers so they can be explicitly restored and potentially
// have pod volume restores run before controllers adopt the pods.
// - Replica sets go before deployments/other controllers so they can be explicitly
// restored and be adopted by controllers.
// - CAPI ClusterClasses go before Clusters.
//
// Low priorities:
// - Tanzu ClusterBootstraps go last as it can reference any other kind of resources.
// ClusterBootstraps go before CAPI Clusters otherwise a new default ClusterBootstrap object is created for the cluster
// - CAPI Clusters come before ClusterResourceSets because failing to do so means the CAPI controller-manager will panic.
// Both Clusters and ClusterResourceSets need to come before ClusterResourceSetBinding in order to properly restore workload clusters.
// See https://github.com/kubernetes-sigs/cluster-api/issues/4105
// - Tanzu ClusterBootstraps go last as they can reference any other kind of resource.
// ClusterBootstraps go before CAPI Clusters otherwise a new default ClusterBootstrap object is created for the cluster
// - CAPI Clusters come before ClusterResourceSets because failing to do so means the CAPI controller-manager will panic.
// Both Clusters and ClusterResourceSets need to come before ClusterResourceSetBinding in order to properly restore workload clusters.
// See https://github.com/kubernetes-sigs/cluster-api/issues/4105
var defaultRestorePriorities = restore.Priorities{
HighPriorities: []string{
"customresourcedefinitions",
Expand Down Expand Up @@ -564,6 +563,32 @@ func (s *server) initRepoManager() error {
return nil
}

// getCSIVolumeSnapshotListers returns a VolumeSnapshot lister backed by the
// CSI snapshotter shared informer factory. The lister stays nil unless the
// CSI feature flag is enabled and the CSI API group is discoverable; if the
// flag is set but the required CRDs are missing, the server halts.
func (s *server) getCSIVolumeSnapshotListers() snapshotv1listers.VolumeSnapshotLister {
	var lister snapshotv1listers.VolumeSnapshotLister

	// Without the CSI feature flag the lister is intentionally left nil.
	if !features.IsEnabled(velerov1api.CSIFeatureFlag) {
		return lister
	}

	_, err := s.discoveryClient.ServerResourcesForGroupVersion(snapshotv1api.SchemeGroupVersion.String())
	switch {
	case apierrors.IsNotFound(err):
		// The feature flag is on but the CSI CRDs are not installed: halt.
		s.logger.Fatalf("The '%s' feature flag was specified, but CSI API group [%s] was not found.", velerov1api.CSIFeatureFlag, snapshotv1api.SchemeGroupVersion.String())
	case err == nil:
		// The CSI API group exists, so it is safe to reach into the wrapped
		// factory directly; the feature-flag check above makes this safe.
		s.logger.Debug("Creating CSI listers")
		lister = s.csiSnapshotterSharedInformerFactory.factory.Snapshot().V1().VolumeSnapshots().Lister()
	default:
		cmd.CheckError(err)
	}

	return lister
}

func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string) error {
s.logger.Info("Starting controllers")

Expand Down Expand Up @@ -624,19 +649,12 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
}

// start the informers & and wait for the caches to sync
s.sharedInformerFactory.Start(ctx.Done())
s.csiSnapshotterSharedInformerFactory.Start(ctx.Done())
s.logger.Info("Waiting for informer caches to sync")
cacheSyncResults := s.sharedInformerFactory.WaitForCacheSync(ctx.Done())
csiCacheSyncResults := s.csiSnapshotterSharedInformerFactory.WaitForCacheSync(ctx.Done())
s.logger.Info("Done waiting for informer caches to sync")

// Append our CSI informer types into the larger list of caches, so we can check them all at once
for informer, synced := range csiCacheSyncResults {
cacheSyncResults[informer] = synced
}

for informer, synced := range cacheSyncResults {
if !synced {
return errors.Errorf("cache was not synced for informer %v", informer)
}
Expand All @@ -649,18 +667,42 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.discoveryHelper,
client.NewDynamicFactory(s.dynamicClient),
podexec.NewPodCommandExecutor(s.kubeClientConfig, s.kubeClient.CoreV1().RESTClient()),
podvolume.NewBackupperFactory(s.repoLocker, s.repoEnsurer, s.veleroClient, s.kubeClient.CoreV1(),
s.kubeClient.CoreV1(), s.kubeClient.CoreV1(),
s.sharedInformerFactory.Velero().V1().BackupRepositories().Informer().HasSynced, s.logger),
podvolume.NewBackupperFactory(
s.repoLocker,
s.repoEnsurer,
s.veleroClient,
s.kubeClient.CoreV1(),
s.kubeClient.CoreV1(),
s.kubeClient.CoreV1(),
s.logger,
),
s.config.podVolumeOperationTimeout,
s.config.defaultVolumesToFsBackup,
s.config.clientPageSize,
s.config.uploaderType,
)
cmd.CheckError(err)
if err := controller.NewBackupReconciler(s.ctx, s.discoveryHelper, backupper, s.logger, s.logLevel, newPluginManager, backupTracker, s.mgr.GetClient(),
s.config.defaultBackupLocation, s.config.defaultVolumesToFsBackup, s.config.defaultBackupTTL, s.config.defaultCSISnapshotTimeout, defaultVolumeSnapshotLocations,
s.metrics, backupStoreGetter, s.config.formatFlag.Parse(), s.credentialFileStore).SetupWithManager(s.mgr); err != nil {
if err := controller.NewBackupReconciler(
s.ctx,
s.discoveryHelper,
backupper,
s.logger,
s.logLevel,
newPluginManager,
backupTracker,
s.mgr.GetClient(),
s.config.defaultBackupLocation,
s.config.defaultVolumesToFsBackup,
s.config.defaultBackupTTL,
s.config.defaultCSISnapshotTimeout,
defaultVolumeSnapshotLocations,
s.metrics,
backupStoreGetter,
s.config.formatFlag.Parse(),
s.getCSIVolumeSnapshotListers(),
s.csiSnapshotClient,
s.credentialFileStore,
).SetupWithManager(s.mgr); err != nil {
s.logger.Fatal(err, "unable to create controller", "controller", controller.Backup)
}
}
Expand Down Expand Up @@ -766,8 +808,15 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
client.NewDynamicFactory(s.dynamicClient),
s.config.restoreResourcePriorities,
s.kubeClient.CoreV1().Namespaces(),
podvolume.NewRestorerFactory(s.repoLocker, s.repoEnsurer, s.veleroClient, s.kubeClient.CoreV1(),
s.kubeClient.CoreV1(), s.kubeClient, s.sharedInformerFactory.Velero().V1().BackupRepositories().Informer().HasSynced, s.logger),
podvolume.NewRestorerFactory(
s.repoLocker,
s.repoEnsurer,
s.veleroClient,
s.kubeClient.CoreV1(),
s.kubeClient.CoreV1(),
s.kubeClient,
s.logger,
),
s.config.podVolumeOperationTimeout,
s.config.resourceTerminatingTimeout,
s.logger,
Expand Down
Loading

0 comments on commit 02fa8c3

Please sign in to comment.