Skip to content

Commit

Permalink
remote publication checksum stats (#15957)
Browse files Browse the repository at this point in the history
* Remote publication checksum stats

Signed-off-by: Himshikha Gupta <himshikh@amazon.com>
Co-authored-by: Himshikha Gupta <himshikh@amazon.com>
  • Loading branch information
himshikha and Himshikha Gupta authored Sep 17, 2024
1 parent e3bbc74 commit 81288b1
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_PUBLICATION_SETTING;
import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_PUBLICATION_SETTING_KEY;
import static org.opensearch.gateway.remote.RemoteClusterStateUtils.DELIMITER;
import static org.opensearch.gateway.remote.RemoteDownloadStats.CHECKSUM_VALIDATION_FAILED_COUNT;
import static org.opensearch.gateway.remote.model.RemoteClusterBlocks.CLUSTER_BLOCKS;
import static org.opensearch.gateway.remote.model.RemoteCoordinationMetadata.COORDINATION_METADATA;
import static org.opensearch.gateway.remote.model.RemoteCustomMetadata.CUSTOM_METADATA;
Expand Down Expand Up @@ -405,10 +406,28 @@ private void assertDataNodeDownloadStats(NodesStatsResponse nodesStatsResponse)
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getSuccessCount() > 0);
assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getFailedCount());
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getTotalTimeInMillis() > 0);
assertEquals(
0,
dataNodeDiscoveryStats.getClusterStateStats()
.getPersistenceStats()
.get(0)
.getExtendedFields()
.get(CHECKSUM_VALIDATION_FAILED_COUNT)
.get()
);

assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getSuccessCount() > 0);
assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getFailedCount());
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getTotalTimeInMillis() > 0);
assertEquals(
0,
dataNodeDiscoveryStats.getClusterStateStats()
.getPersistenceStats()
.get(1)
.getExtendedFields()
.get(CHECKSUM_VALIDATION_FAILED_COUNT)
.get()
);
}

private Map<String, Integer> getMetadataFiles(BlobStoreRepository repository, String subDirectory) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,12 @@ void validateClusterStateFromChecksum(
failedValidation
)
);
if (isFullStateDownload) {
remoteStateStats.stateFullDownloadValidationFailed();
} else {
remoteStateStats.stateDiffDownloadValidationFailed();
}

if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.gateway.remote;

import org.opensearch.cluster.coordination.PersistedStateStats;

import java.util.concurrent.atomic.AtomicLong;

/**
 * Download stats for remote state.
 *
 * <p>Extends {@link PersistedStateStats} with a counter of checksum validation failures. The
 * counter is registered as an extended field under {@link #CHECKSUM_VALIDATION_FAILED_COUNT}
 * when the instance is constructed.
 *
 * @opensearch.internal
 */
public class RemoteDownloadStats extends PersistedStateStats {
    /** Extended-field key under which the checksum validation failure counter is registered. */
    static final String CHECKSUM_VALIDATION_FAILED_COUNT = "checksum_validation_failed_count";

    // final: this exact AtomicLong instance is handed to addToExtendedFields in the constructor,
    // so the reference must never be rebound or the getter and the extended field would diverge.
    private final AtomicLong checksumValidationFailedCount = new AtomicLong(0);

    /**
     * Creates download stats with a zeroed checksum validation failure counter.
     *
     * @param statsName name passed through to the {@link PersistedStateStats} constructor
     */
    public RemoteDownloadStats(String statsName) {
        super(statsName);
        addToExtendedFields(CHECKSUM_VALIDATION_FAILED_COUNT, checksumValidationFailedCount);
    }

    /**
     * Records one checksum validation failure by atomically incrementing the counter.
     * Despite the noun-like name, this method mutates state and returns nothing.
     */
    public void checksumValidationFailedCount() {
        checksumValidationFailedCount.incrementAndGet();
    }

    /**
     * Returns the current value of the checksum validation failure counter.
     *
     * @return number of failures recorded so far via {@link #checksumValidationFailedCount()}
     */
    public long getChecksumValidationFailedCount() {
        return checksumValidationFailedCount.get();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@
public class RemotePersistenceStats {

RemoteUploadStats remoteUploadStats;
PersistedStateStats remoteDiffDownloadStats;
PersistedStateStats remoteFullDownloadStats;
RemoteDownloadStats remoteDiffDownloadStats;
RemoteDownloadStats remoteFullDownloadStats;

final String FULL_DOWNLOAD_STATS = "remote_full_download";
final String DIFF_DOWNLOAD_STATS = "remote_diff_download";

/**
 * Creates the upload stats holder and the diff/full download stats holders, naming the
 * download holders with DIFF_DOWNLOAD_STATS and FULL_DOWNLOAD_STATS respectively.
 */
public RemotePersistenceStats() {
remoteUploadStats = new RemoteUploadStats();
// NOTE(review): the two PersistedStateStats assignments below are immediately overwritten by
// the RemoteDownloadStats assignments that follow; this looks like the removed and added sides
// of a diff shown together — confirm against the actual file.
remoteDiffDownloadStats = new PersistedStateStats(DIFF_DOWNLOAD_STATS);
remoteFullDownloadStats = new PersistedStateStats(FULL_DOWNLOAD_STATS);
remoteDiffDownloadStats = new RemoteDownloadStats(DIFF_DOWNLOAD_STATS);
remoteFullDownloadStats = new RemoteDownloadStats(FULL_DOWNLOAD_STATS);
}

public void cleanUpAttemptFailed() {
Expand Down Expand Up @@ -90,6 +90,22 @@ public void stateDiffDownloadFailed() {
remoteDiffDownloadStats.stateFailed();
}

/**
 * Records one checksum validation failure against the diff download stats.
 */
public void stateDiffDownloadValidationFailed() {
    final RemoteDownloadStats diffStats = this.remoteDiffDownloadStats;
    diffStats.checksumValidationFailedCount();
}

/**
 * Records one checksum validation failure against the full download stats.
 */
public void stateFullDownloadValidationFailed() {
    final RemoteDownloadStats fullStats = this.remoteFullDownloadStats;
    fullStats.checksumValidationFailedCount();
}

/**
 * Returns the checksum validation failure count held by the diff download stats.
 *
 * @return current value of the diff download checksum validation failure counter
 */
public long getStateDiffDownloadValidationFailed() {
    final long diffValidationFailures = this.remoteDiffDownloadStats.getChecksumValidationFailedCount();
    return diffValidationFailures;
}

/**
 * Returns the checksum validation failure count held by the full download stats.
 *
 * @return current value of the full download checksum validation failure counter
 */
public long getStateFullDownloadValidationFailed() {
    final long fullValidationFailures = this.remoteFullDownloadStats.getChecksumValidationFailedCount();
    return fullValidationFailures;
}

/**
 * Exposes the upload stats holder through its {@link PersistedStateStats} base type.
 *
 * @return the upload stats instance owned by this object
 */
public PersistedStateStats getUploadStats() {
    final PersistedStateStats uploadStats = this.remoteUploadStats;
    return uploadStats;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3342,6 +3342,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabledWithNullC
anyString(),
anyBoolean()
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationEnabled() throws IOException {
Expand Down Expand Up @@ -3374,6 +3375,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabled() throws
);
mockService.getClusterStateForManifest(ClusterName.DEFAULT.value(), manifest, NODE_ID, true);
verify(mockService, times(1)).validateClusterStateFromChecksum(manifest, clusterState, ClusterName.DEFAULT.value(), NODE_ID, true);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationModeNone() throws IOException {
Expand Down Expand Up @@ -3406,6 +3408,7 @@ public void testGetClusterStateForManifestWithChecksumValidationModeNone() throw
);
mockService.getClusterStateForManifest(ClusterName.DEFAULT.value(), manifest, NODE_ID, true);
verify(mockService, times(0)).validateClusterStateFromChecksum(any(), any(), anyString(), anyString(), anyBoolean());
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationEnabledWithMismatch() throws IOException {
Expand Down Expand Up @@ -3448,6 +3451,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabledWithMisma
NODE_ID,
true
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationDebugWithMismatch() throws IOException {
Expand Down Expand Up @@ -3494,6 +3498,7 @@ public void testGetClusterStateForManifestWithChecksumValidationDebugWithMismatc
NODE_ID,
true
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksum() throws IOException {
Expand Down Expand Up @@ -3535,6 +3540,7 @@ public void testGetClusterStateUsingDiffWithChecksum() throws IOException {
eq(NODE_ID),
eq(false)
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeNone() throws IOException {
Expand Down Expand Up @@ -3576,6 +3582,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeNone() throws IOExceptio
eq(NODE_ID),
eq(false)
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeDebugMismatch() throws IOException {
Expand Down Expand Up @@ -3616,6 +3623,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeDebugMismatch() throws I
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeTraceMismatch() throws IOException {
Expand Down Expand Up @@ -3677,6 +3685,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeTraceMismatch() throws I
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumMismatch() throws IOException {
Expand Down Expand Up @@ -3738,6 +3747,7 @@ public void testGetClusterStateUsingDiffWithChecksumMismatch() throws IOExceptio
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

private void mockObjectsForGettingPreviousClusterUUID(Map<String, String> clusterUUIDsPointers) throws IOException {
Expand Down

0 comments on commit 81288b1

Please sign in to comment.