From f1215cebe5c51ef44c4816c1cf6fe79e76bef21d Mon Sep 17 00:00:00 2001 From: Clinton Knight Date: Thu, 4 Apr 2024 10:14:16 -0400 Subject: [PATCH] Reduced time ANF driver waits for some operations --- storage_drivers/azure/api/azure.go | 13 ++- storage_drivers/azure/azure_anf.go | 60 +++++----- storage_drivers/azure/azure_anf_subvolume.go | 2 +- .../azure/azure_anf_subvolume_test.go | 4 +- storage_drivers/azure/azure_anf_test.go | 103 ++++++++++++------ 5 files changed, 114 insertions(+), 68 deletions(-) diff --git a/storage_drivers/azure/api/azure.go b/storage_drivers/azure/api/azure.go index 596a86b6d..6b8b8995d 100644 --- a/storage_drivers/azure/api/azure.go +++ b/storage_drivers/azure/api/azure.go @@ -30,7 +30,8 @@ import ( ) const ( - VolumeCreateTimeout = 10 * time.Second + VolumeCreateTimeout = 2 * time.Second + SubvolumeCreateTimeout = 10 * time.Second SnapshotTimeout = 240 * time.Second // Snapshotter sidecar has a timeout of 5 minutes. Stay under that! DefaultTimeout = 120 * time.Second MaxLabelLength = 256 @@ -1008,6 +1009,9 @@ func (c Client) WaitForVolumeState( volumeState = StateDeleted return nil } + if errors.Is(err, context.Canceled) { + return backoff.Permanent(err) + } volumeState = "" return fmt.Errorf("could not get volume status; %v", err) @@ -1349,7 +1353,7 @@ func (c Client) DeleteVolume(ctx context.Context, filesystem *FileSystem) error return err } - Logc(ctx).WithFields(logFields).Debug("Volume deleted.") + Logc(ctx).WithFields(logFields).Debug("Volume deletion started.") return nil } @@ -1488,6 +1492,9 @@ func (c Client) WaitForSnapshotState( Logc(ctx).Debugf("Implied deletion for snapshot %s.", snapshot.Name) return nil } + if errors.Is(err, context.Canceled) { + return backoff.Permanent(err) + } return fmt.Errorf("could not get snapshot status; %v", err) } @@ -1634,7 +1641,7 @@ func (c Client) DeleteSnapshot(ctx context.Context, filesystem *FileSystem, snap return err } - Logc(ctx).WithFields(logFields).Debug("Snapshot deleted.") + Logc(ctx).WithFields(logFields).Debug("Snapshot deletion started.") return nil } diff --git a/storage_drivers/azure/azure_anf.go b/storage_drivers/azure/azure_anf.go index 5780de5db..0d394bbdd 100644 --- a/storage_drivers/azure/azure_anf.go +++ b/storage_drivers/azure/azure_anf.go @@ -582,10 +582,11 @@ func (d *NASStorageDriver) initializeAzureSDKClient( // Azure workload identity // If cloud identity is provided and cloud provider is set to 'Azure' during the installation, - // we can use AZURE_CLIENT_ID,AZURE_TENANT_ID,AZURE_FEDERATED_TOKEN_FILE and AZURE_AUTHORITY_HOST environment variables - // injected by workload identity webhook for initialization of ANF driver. + // we can use AZURE_CLIENT_ID,AZURE_TENANT_ID,AZURE_FEDERATED_TOKEN_FILE and AZURE_AUTHORITY_HOST + // environment variables injected by workload identity webhook for initialization of ANF driver. - if os.Getenv("AZURE_CLIENT_ID") != "" && os.Getenv("AZURE_TENANT_ID") != "" && os.Getenv("AZURE_FEDERATED_TOKEN_FILE") != "" && os.Getenv("AZURE_AUTHORITY_HOST") != "" { + if os.Getenv("AZURE_CLIENT_ID") != "" && os.Getenv("AZURE_TENANT_ID") != "" && + os.Getenv("AZURE_FEDERATED_TOKEN_FILE") != "" && os.Getenv("AZURE_AUTHORITY_HOST") != "" { Logc(ctx).Info("Using Azure workload identity.") } else { // Azure managed identity @@ -1336,7 +1337,8 @@ func (d *NASStorageDriver) Import(ctx context.Context, volConfig *storage.Volume } } - if err = d.SDK.ModifyVolume(ctx, volume, labels, &unixPermissions, &snapshotDirAccess, &modifiedExportRule); err != nil { + err = d.SDK.ModifyVolume(ctx, volume, labels, &unixPermissions, &snapshotDirAccess, &modifiedExportRule) + if err != nil { Logc(ctx).WithField("originalName", originalName).WithError(err).Error( "Could not import volume, volume modify failed.") return fmt.Errorf("could not import volume %s, volume modify failed; %v", originalName, err) @@ -1424,22 +1426,18 @@ func (d *NASStorageDriver) waitForVolumeCreate(ctx context.Context, volume *api. return errors.VolumeCreatingError(err.Error()) case api.StateDeleting: - // Wait for deletion to complete - _, errDelete := d.SDK.WaitForVolumeState( - ctx, volume, api.StateDeleted, []string{api.StateError}, d.defaultTimeout(), operation) - if errDelete != nil { - Logc(ctx).WithFields(logFields).WithError(errDelete).Error( - "Volume could not be cleaned up and must be manually deleted.") - } + // Don't wait if volume is already being deleted + Logc(ctx).WithFields(logFields).WithError(err).Error( + "Volume is being cleaned up and should be recreated later.") case api.StateError: // Delete a failed volume - errDelete := d.SDK.DeleteVolume(ctx, volume) - if errDelete != nil { + if errDelete := d.SDK.DeleteVolume(ctx, volume); errDelete != nil { Logc(ctx).WithFields(logFields).WithError(errDelete).Error( "Volume could not be cleaned up and must be manually deleted.") + return multierr.Combine(err, errDelete) } else { - Logc(ctx).WithField("volume", volume.Name).Info("Volume deleted.") + Logc(ctx).WithField("volume", volume.Name).Info("Cleanup of failed volume started.") } Logc(ctx).WithFields(logFields).Debugf("Volume is in %s state.", state) @@ -1453,7 +1451,7 @@ func (d *NASStorageDriver) waitForVolumeCreate(ctx context.Context, volume *api. } } - return nil + return err } // Destroy deletes a volume. @@ -1482,10 +1480,13 @@ func (d *NASStorageDriver) Destroy(ctx context.Context, volConfig *storage.Volum Logc(ctx).WithField("volume", name).Warn("Volume already deleted.") return nil } else if extantVolume.ProvisioningState == api.StateDeleting { - // This is a retry, so give it more time before giving up again. - _, err = d.SDK.WaitForVolumeState( - ctx, extantVolume, api.StateDeleted, []string{api.StateError}, d.volumeCreateTimeout, api.Delete) - return err + // This is a retry, so give Docker more time before giving up again. Don't wait in other contexts. + if d.Config.DriverContext == tridentconfig.ContextDocker { + _, err = d.SDK.WaitForVolumeState( + ctx, extantVolume, api.StateDeleted, []string{api.StateError}, d.defaultTimeout(), api.Delete) + return err + } + return nil } // Delete the volume @@ -1495,9 +1496,13 @@ func (d *NASStorageDriver) Destroy(ctx context.Context, volConfig *storage.Volum Logc(ctx).WithField("volume", extantVolume.Name).Info("Volume deleted.") - // Wait for deletion to complete - _, err = d.SDK.WaitForVolumeState(ctx, extantVolume, api.StateDeleted, []string{api.StateError}, d.defaultTimeout(), api.Delete) - return err + // If Docker, wait for deletion to complete. Don't wait in other contexts. + if d.Config.DriverContext == tridentconfig.ContextDocker { + _, err = d.SDK.WaitForVolumeState( + ctx, extantVolume, api.StateDeleted, []string{api.StateError}, d.defaultTimeout(), api.Delete) + return err + } + return nil } // Publish the volume to the host specified in publishInfo. This method may or may not be running on the host @@ -1834,10 +1839,13 @@ func (d *NASStorageDriver) DeleteSnapshot( return err } - // Wait for snapshot deletion to complete - return d.SDK.WaitForSnapshotState( - ctx, snapshot, extantVolume, api.StateDeleted, []string{api.StateError}, api.SnapshotTimeout, - ) + // If Docker, wait for deletion to complete. Don't wait in other contexts. + if d.Config.DriverContext == tridentconfig.ContextDocker { + return d.SDK.WaitForSnapshotState( + ctx, snapshot, extantVolume, api.StateDeleted, []string{api.StateError}, api.SnapshotTimeout, + ) + } + return nil } // List returns the list of volumes associated with this backend. diff --git a/storage_drivers/azure/azure_anf_subvolume.go b/storage_drivers/azure/azure_anf_subvolume.go index 61fa52981..9ee7785c5 100644 --- a/storage_drivers/azure/azure_anf_subvolume.go +++ b/storage_drivers/azure/azure_anf_subvolume.go @@ -210,7 +210,7 @@ func (d *NASBlockStorageDriver) defaultCreateTimeout() time.Duration { case tridentconfig.ContextDocker: return tridentconfig.DockerCreateTimeout default: - return api.VolumeCreateTimeout + return api.SubvolumeCreateTimeout } } diff --git a/storage_drivers/azure/azure_anf_subvolume_test.go b/storage_drivers/azure/azure_anf_subvolume_test.go index b8d8f7749..01b659700 100644 --- a/storage_drivers/azure/azure_anf_subvolume_test.go +++ b/storage_drivers/azure/azure_anf_subvolume_test.go @@ -279,8 +279,8 @@ func TestSubvolumeDefaultCreateTimeout(t *testing.T) { Expected time.Duration }{ {tridentconfig.ContextDocker, tridentconfig.DockerCreateTimeout}, - {tridentconfig.ContextCSI, api.VolumeCreateTimeout}, - {"", api.VolumeCreateTimeout}, + {tridentconfig.ContextCSI, api.SubvolumeCreateTimeout}, + {"", api.SubvolumeCreateTimeout}, } for _, test := range tests { t.Run(string(test.Context), func(t *testing.T) { diff --git a/storage_drivers/azure/azure_anf_test.go b/storage_drivers/azure/azure_anf_test.go index f8b06cee8..1f2267adc 100644 --- a/storage_drivers/azure/azure_anf_test.go +++ b/storage_drivers/azure/azure_anf_test.go @@ -5043,7 +5043,7 @@ func TestWaitForVolumeCreate_Creating(t *testing.T) { } } -func TestWaitForVolumeCreate_DeletingDeleteFinished(t *testing.T) { +func TestWaitForVolumeCreate_DeletingDelete(t *testing.T) { mockAPI, driver := newMockANFDriver(t) filesystem := &api.FileSystem{ @@ -5054,31 +5054,10 @@ func TestWaitForVolumeCreate_DeletingDeleteFinished(t *testing.T) { mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateAvailable, []string{api.StateError}, driver.volumeCreateTimeout, api.Delete).Return(api.StateDeleting, errFailed).Times(1) - mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateDeleted, []string{api.StateError}, - driver.defaultTimeout(), api.Delete).Return(api.StateDeleted, nil).Times(1) - - result := driver.waitForVolumeCreate(ctx, filesystem, api.Delete) - - assert.Nil(t, result, "not nil") -} - -func TestWaitForVolumeCreate_DeletingDeleteNotFinished(t *testing.T) { - mockAPI, driver := newMockANFDriver(t) - - filesystem := &api.FileSystem{ - Name: "testvol1", - CreationToken: "netapp-testvol1", - ProvisioningState: api.StateCreating, - } - - mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateAvailable, []string{api.StateError}, - driver.volumeCreateTimeout, api.Delete).Return(api.StateDeleting, errFailed).Times(1) - mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateDeleted, []string{api.StateError}, - driver.defaultTimeout(), api.Delete).Return(api.StateDeleting, errFailed).Times(1) result := driver.waitForVolumeCreate(ctx, filesystem, api.Delete) - assert.Nil(t, result, "not nil") + assert.NotNil(t, result, "not nil") } func TestWaitForVolumeCreate_ErrorDelete(t *testing.T) { @@ -5096,7 +5075,7 @@ func TestWaitForVolumeCreate_ErrorDelete(t *testing.T) { result := driver.waitForVolumeCreate(ctx, filesystem, api.Create) - assert.NotNil(t, result, "error is nil") + assert.NotNil(t, result, "nil error") } func TestWaitForVolumeCreate_ErrorDeleteFailed(t *testing.T) { @@ -5115,7 +5094,7 @@ func TestWaitForVolumeCreate_ErrorDeleteFailed(t *testing.T) { result := driver.waitForVolumeCreate(ctx, filesystem, api.Delete) - assert.NotNil(t, result, "error is nil") + assert.NotNil(t, result, "nil error") } func TestWaitForVolumeCreate_OtherStates(t *testing.T) { @@ -5134,7 +5113,7 @@ func TestWaitForVolumeCreate_OtherStates(t *testing.T) { result := driver.waitForVolumeCreate(ctx, filesystem, api.Create) - assert.Nil(t, result, "not nil") + assert.NotNil(t, result, "nil error") } } @@ -5233,9 +5212,10 @@ func getStructsForDestroySMBVolume( return volConfig, filesystem } -func TestDestroy_NFSVolume(t *testing.T) { +func TestDestroy_NFSVolume_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) @@ -5250,6 +5230,22 @@ func TestDestroy_NFSVolume(t *testing.T) { assert.Nil(t, result, "not nil") } +func TestDestroy_NFSVolume_CSI(t *testing.T) { + mockAPI, driver := newMockANFDriver(t) + driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextCSI + + volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) + + mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) + mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) + mockAPI.EXPECT().DeleteVolume(ctx, filesystem).Return(nil).Times(1) + + result := driver.Destroy(ctx, volConfig) + + assert.Nil(t, result, "not nil") +} + func TestDestroy_DiscoveryFailed(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) @@ -5291,9 +5287,10 @@ func TestDestroy_AlreadyDeleted(t *testing.T) { assert.Nil(t, result, "not nil") } -func TestDestroy_StillDeletingDeleted(t *testing.T) { +func TestDestroy_StillDeletingDeleted_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) filesystem.ProvisioningState = api.StateDeleting @@ -5301,16 +5298,17 @@ func TestDestroy_StillDeletingDeleted(t *testing.T) { mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateDeleted, []string{api.StateError}, - driver.volumeCreateTimeout, api.Delete).Return(api.StateDeleted, nil).Times(1) + tridentconfig.DockerDefaultTimeout, api.Delete).Return(api.StateDeleted, nil).Times(1) result := driver.Destroy(ctx, volConfig) assert.Nil(t, result, "not nil") } -func TestDestroy_StillDeleting(t *testing.T) { +func TestDestroy_StillDeleting_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) filesystem.ProvisioningState = api.StateDeleting @@ -5318,13 +5316,28 @@ func TestDestroy_StillDeleting(t *testing.T) { mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateDeleted, []string{api.StateError}, - driver.volumeCreateTimeout, api.Delete).Return(api.StateDeleting, errFailed).Times(1) + tridentconfig.DockerDefaultTimeout, api.Delete).Return(api.StateDeleting, errFailed).Times(1) result := driver.Destroy(ctx, volConfig) assert.NotNil(t, result, "expected error") } +func TestDestroy_StillDeleting_CSI(t *testing.T) { + mockAPI, driver := newMockANFDriver(t) + driver.initializeTelemetry(ctx, BackendUUID) + + volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) + filesystem.ProvisioningState = api.StateDeleting + + mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) + mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) + + result := driver.Destroy(ctx, volConfig) + + assert.Nil(t, result, "not nil") +} + func TestDestroy_DeleteFailed(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) @@ -5340,9 +5353,10 @@ func TestDestroy_DeleteFailed(t *testing.T) { assert.NotNil(t, result, "expected error") } -func TestDestroy_VolumeWaitFailed(t *testing.T) { +func TestDestroy_VolumeWaitFailed_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker volConfig, filesystem := getStructsForDestroyNFSVolume(ctx, driver) @@ -5366,8 +5380,6 @@ func TestDestroy_SMBVolume(t *testing.T) { mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) mockAPI.EXPECT().DeleteVolume(ctx, filesystem).Return(nil).Times(1) - mockAPI.EXPECT().WaitForVolumeState(ctx, filesystem, api.StateDeleted, []string{api.StateError}, - driver.defaultTimeout(), api.Delete).Return(api.StateDeleted, nil).Times(1) result := driver.Destroy(ctx, volConfig) @@ -6277,9 +6289,10 @@ func TestRestoreSnapshot_VolumeWaitFailed(t *testing.T) { assert.NotNil(t, result, "expected error") } -func TestDeleteSnapshot(t *testing.T) { +func TestDeleteSnapshot_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker snapTime := time.Now() volConfig, filesystem, snapConfig, snapshot := getStructsForCreateSnapshot(ctx, driver, snapTime) @@ -6296,6 +6309,23 @@ func TestDeleteSnapshot(t *testing.T) { assert.Nil(t, result, "not nil") } +func TestDeleteSnapshot_CSI(t *testing.T) { + mockAPI, driver := newMockANFDriver(t) + driver.initializeTelemetry(ctx, BackendUUID) + + snapTime := time.Now() + volConfig, filesystem, snapConfig, snapshot := getStructsForCreateSnapshot(ctx, driver, snapTime) + + mockAPI.EXPECT().RefreshAzureResources(ctx).Return(nil).Times(1) + mockAPI.EXPECT().VolumeExists(ctx, volConfig).Return(true, filesystem, nil).Times(1) + mockAPI.EXPECT().SnapshotForVolume(ctx, filesystem, snapConfig.InternalName).Return(snapshot, nil).Times(1) + mockAPI.EXPECT().DeleteSnapshot(ctx, filesystem, snapshot).Return(nil).Times(1) + + result := driver.DeleteSnapshot(ctx, snapConfig, volConfig) + + assert.Nil(t, result, "not nil") +} + func TestDeleteSnapshot_DiscoveryFailed(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) @@ -6389,9 +6419,10 @@ func TestDeleteSnapshot_SnapshotDeleteFailed(t *testing.T) { assert.NotNil(t, result, "expected error") } -func TestDeleteSnapshot_SnapshotWaitFailed(t *testing.T) { +func TestDeleteSnapshot_SnapshotWaitFailed_Docker(t *testing.T) { mockAPI, driver := newMockANFDriver(t) driver.initializeTelemetry(ctx, BackendUUID) + driver.Config.DriverContext = tridentconfig.ContextDocker snapTime := time.Now() volConfig, filesystem, snapConfig, snapshot := getStructsForCreateSnapshot(ctx, driver, snapTime)