Skip to content

Commit

Permalink
[controller] Fail hybrid store conversion if a pushjob is running (#1143)
Browse files Browse the repository at this point in the history

If a batch store is converted to hybrid while a push is already running, the conversion might take effect while the push is still running in the target colo. After the target colo push finishes, the state of the store is mismatched between the target colo and the child colo. The subsequent push to the other colo, which does not support hybrid store ingestion and might not have the RT topic created, would then hang forever. This PR blocks conversion to a hybrid store while a push is actively running.

---------

Co-authored-by: Sourav Maji <tester@linkedin.com>
  • Loading branch information
majisourav99 and Sourav Maji authored Sep 4, 2024
1 parent 4fdaac5 commit ba6ead3
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import static com.linkedin.venice.ConfigKeys.ENABLE_ACTIVE_ACTIVE_REPLICATION_AS_DEFAULT_FOR_HYBRID_STORE;
import static com.linkedin.venice.utils.TestUtils.assertCommand;
import static com.linkedin.venice.utils.TestUtils.waitForNonDeterministicPushCompletion;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
Expand All @@ -12,6 +13,7 @@
import com.linkedin.venice.integration.utils.VeniceMultiRegionClusterCreateOptions;
import com.linkedin.venice.integration.utils.VeniceTwoLayerMultiRegionMultiClusterWrapper;
import com.linkedin.venice.meta.StoreInfo;
import com.linkedin.venice.meta.Version;
import com.linkedin.venice.utils.TestUtils;
import com.linkedin.venice.utils.Time;
import com.linkedin.venice.utils.Utils;
Expand Down Expand Up @@ -72,6 +74,12 @@ public void testClusterLevelActiveActiveReplicationConfigForNewHybridStores() th
// Check store level Active/Active is enabled or not
assertFalse(store.isActiveActiveReplicationEnabled());

waitForNonDeterministicPushCompletion(
Version.composeKafkaTopic(storeName, 1),
parentControllerClient,
30,
TimeUnit.SECONDS);

// Convert to hybrid store
assertCommand(
parentControllerClient.updateStore(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,22 @@
import static com.linkedin.venice.ConfigKeys.NATIVE_REPLICATION_SOURCE_FABRIC_AS_DEFAULT_FOR_BATCH_ONLY_STORES;
import static com.linkedin.venice.ConfigKeys.NATIVE_REPLICATION_SOURCE_FABRIC_AS_DEFAULT_FOR_HYBRID_STORES;
import static com.linkedin.venice.utils.TestUtils.assertCommand;
import static com.linkedin.venice.utils.TestUtils.waitForNonDeterministicPushCompletion;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;

import com.linkedin.venice.controllerapi.ControllerClient;
import com.linkedin.venice.controllerapi.ControllerResponse;
import com.linkedin.venice.controllerapi.UpdateStoreQueryParams;
import com.linkedin.venice.integration.utils.ServiceFactory;
import com.linkedin.venice.integration.utils.VeniceMultiRegionClusterCreateOptions;
import com.linkedin.venice.integration.utils.VeniceTwoLayerMultiRegionMultiClusterWrapper;
import com.linkedin.venice.meta.StoreInfo;
import com.linkedin.venice.meta.Version;
import com.linkedin.venice.utils.TestUtils;
import com.linkedin.venice.utils.Time;
import com.linkedin.venice.utils.Utils;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.testng.Assert;
Expand Down Expand Up @@ -60,7 +64,11 @@ public void testClusterLevelNativeReplicationConfigForNewStores() {
String pushJobId1 = "test-push-job-id-1";
parentControllerClient.createNewStore(storeName, "test-owner", "\"string\"", "\"string\"");
parentControllerClient.emptyPush(storeName, pushJobId1, 1);

waitForNonDeterministicPushCompletion(
Version.composeKafkaTopic(storeName, 1),
parentControllerClient,
30,
TimeUnit.SECONDS);
// Version 1 should exist.
StoreInfo store = assertCommand(parentControllerClient.getStore(storeName)).getStore();
assertEquals(store.getVersions().size(), 1);
Expand Down Expand Up @@ -97,4 +105,31 @@ public void testClusterLevelNativeReplicationConfigForNewStores() {
"dc-hybrid");
});
}

/**
 * Verifies that an attempt to set hybrid configs on a batch-only store is rejected while a push for
 * that store is still in flight.
 *
 * <p>The test creates a store, requests a version topic for a batch push (without ever completing
 * the push), then tries to set the hybrid rewind time and offset lag threshold through the parent
 * controller. The update is expected to come back as an error whose message mentions the running
 * push job. The dangling push is always killed at the end, even when an assertion fails, so it does
 * not leak into other tests that use the same cluster.
 */
@Test(timeOut = TEST_TIMEOUT)
public void testConvertHybridDuringPushjob() {
  String storeName = Utils.getUniqueString("test-store");
  // Fail fast on setup problems so they are not misreported as a failure of the hybrid-conversion check.
  assertCommand(parentControllerClient.createNewStore(storeName, "test-owner", "\"string\"", "\"string\""));
  // Requesting a topic for writes begins a batch push for version 1; nothing in this test completes it.
  assertCommand(
      parentControllerClient.requestTopicForWrites(
          storeName,
          1000,
          Version.PushType.BATCH,
          Version.numberBasedDummyPushId(1),
          true,
          true,
          false,
          Optional.empty(),
          Optional.empty(),
          Optional.of("dc-1"),
          false,
          -1));

  try {
    ControllerResponse response = parentControllerClient.updateStore(
        storeName,
        new UpdateStoreQueryParams().setHybridRewindSeconds(1L).setHybridOffsetLagThreshold(1L));
    Assert.assertTrue(response.isError(), "Hybrid conversion should be rejected while a push job is running");
    Assert.assertTrue(
        response.getError().contains("Cannot convert to hybrid as there is already a pushjob running"),
        "Unexpected error message: " + response.getError());
  } finally {
    // Always clean up the in-flight push, regardless of the assertion outcome.
    parentControllerClient.killOfflinePushJob(Version.composeKafkaTopic(storeName, 1));
  }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -2281,6 +2281,16 @@ public void updateStore(String clusterName, String storeName, UpdateStoreQueryPa
setStore.storeName = storeName;
setStore.owner = owner.map(addToUpdatedConfigList(updatedConfigsList, OWNER)).orElseGet(currStore::getOwner);

// Guard against converting a batch-only store to hybrid while a push is in flight: this branch is taken only when
// the store is not currently hybrid and the request carries a hybrid rewind time or a hybrid offset lag threshold.
if (!currStore.isHybrid() && (hybridRewindSeconds.isPresent() || hybridOffsetLagThreshold.isPresent())) {
// Today the target colo push job cannot handle hybrid stores, so if a batch push is running, fail the request.
// NOTE(review): the meaning of the two boolean arguments is not visible from here - confirm against
// getTopicForCurrentPushJob.
Optional<String> currentPushTopic = getTopicForCurrentPushJob(clusterName, storeName, false, false);
if (currentPushTopic.isPresent()) {
String errorMessage =
"Cannot convert to hybrid as there is already a pushjob running with topic " + currentPushTopic.get();
LOGGER.error(errorMessage);
// Reject the update as a client error (HTTP 400) and include the topic of the running push in the message.
throw new VeniceHttpException(HttpStatus.SC_BAD_REQUEST, errorMessage, ErrorType.BAD_REQUEST);
}
}
// Invalid config update on hybrid will not be populated to admin channel so subsequent updates on the store won't
// be blocked by retry mechanism.
if (currStore.isHybrid() && (partitionerClass.isPresent() || partitionerParams.isPresent())) {
Expand Down

0 comments on commit ba6ead3

Please sign in to comment.