diff --git a/examples/local/backups/restart_tablets.sh b/examples/local/backups/restart_tablets.sh
index 2c6aa19aa43..038fafc5dbf 100755
--- a/examples/local/backups/restart_tablets.sh
+++ b/examples/local/backups/restart_tablets.sh
@@ -23,19 +23,45 @@ for i in 100 101 102; do
 	CELL=zone1 TABLET_UID=$i ./scripts/mysqlctl-up.sh
 	CELL=zone1 KEYSPACE=commerce TABLET_UID=$i ./scripts/vttablet-up.sh
 done
-sleep 5
-vtctldclient InitShardPrimary --force commerce/0 zone1-100
 
 for i in 200 201 202; do
 	CELL=zone1 TABLET_UID=$i ./scripts/mysqlctl-up.sh
 	SHARD=-80 CELL=zone1 KEYSPACE=customer TABLET_UID=$i ./scripts/vttablet-up.sh
 done
-sleep 5
-vtctldclient InitShardPrimary --force customer/-80 zone1-200
 
 for i in 300 301 302; do
 	CELL=zone1 TABLET_UID=$i ./scripts/mysqlctl-up.sh
 	SHARD=80- CELL=zone1 KEYSPACE=customer TABLET_UID=$i ./scripts/vttablet-up.sh
 done
 sleep 5
-vtctldclient InitShardPrimary --force customer/80- zone1-300
\ No newline at end of file
+
+# Wait for the replica tablets to reach the serving state before running
+# InitShardPrimary. The RESTORE phase must be complete first; otherwise the
+# tablet type is read as RESTORE and semi-sync is not set, which leads to the
+# primary hanging on writes.
+# totalTime is a polling budget shared by all three tablets: each unsuccessful
+# poll consumes one unit and is followed by a 0.1s sleep.
+totalTime=300
+for i in 101 201 301; do
+	while [ $totalTime -gt 0 ]; do
+		# -s silences curl's progress meter; a failed request yields an empty
+		# status, which simply counts as "not serving yet".
+		status=$(curl -s "http://$hostname:15$i/debug/status_details")
+		echo "$status" | grep -q "REPLICA: Serving" && break
+		totalTime=$((totalTime-1))
+		sleep 0.1
+	done
+done
+
+# Check that all the replica tablets have reached REPLICA: Serving state
+for i in 101 201 301; do
+	status=$(curl -s "http://$hostname:15$i/debug/status_details")
+	echo "$status" | grep -q "REPLICA: Serving" && continue
+	echo "tablet-$i did not reach REPLICA: Serving state. Exiting due to failure." >&2
+	exit 1
+done
+
+vtctldclient InitShardPrimary --force commerce/0 zone1-100
+vtctldclient InitShardPrimary --force customer/-80 zone1-200
+vtctldclient InitShardPrimary --force customer/80- zone1-300