From aa327feabede5f2e44797bc0f1cdf0794fb0a488 Mon Sep 17 00:00:00 2001
From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Date: Mon, 19 Dec 2022 14:33:15 +0530
Subject: [PATCH] Timeout Fixes and VTOrc Improvement (#11881) (#1435)

* refactor: move tests out of newfeaturestest so that they run on upgrade-downgrade tests too

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: add failing ers test for handling multiple vttablet failures with default values of flags

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: add a new lock-timeout flag and use that instead of remote-operation-timeout

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: augment DownPrimary test to reproduce the issue of VTOrc not handling multiple failures

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: remove LockShardTimeout configuration from VTOrc and add parallelism to refresh of tablets

Signed-off-by: Manan Gupta <manan@planetscale.com>

* log: add more logging lines around ers in vtorc

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: get the test to work

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: fix usage of wait for replicas timeout

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: fix flags expected output

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: fix race in test now that the function is called in parallel multiple times

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: fix default of onCloseTimeout to 1 second

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: add failing unit test to refreshTabletsInKeyspaceShard

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: fix vtorc to not forget a tablet which has been deleted

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: fix backward compatibility, add tests and release notes docs

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: fix flags output

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: use disable-replication-manager instead of disable-active-reparents to allow vttablets to setup replication when restarted

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: fix flaky test by not checking for an error

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: handle the case of empty hostname in tablet initialization

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: update onclose timeout to 10 seconds

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: fix unit test

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: address review comments

Signed-off-by: Manan Gupta <manan@planetscale.com>

* docs: add comments explaining the test functions

Signed-off-by: Manan Gupta <manan@planetscale.com>

* feat: add summary docs for 'lock-shard-timeout' deprecation

Signed-off-by: Manan Gupta <manan@planetscale.com>

Signed-off-by: Manan Gupta <manan@planetscale.com>

Signed-off-by: Manan Gupta <manan@planetscale.com>
---
 go/flags/endtoend/mysqlctl.txt                |   2 +-
 go/flags/endtoend/mysqlctld.txt               |   2 +-
 go/flags/endtoend/vtbackup.txt                |   3 +-
 go/flags/endtoend/vtctld.txt                  |   7 +-
 go/flags/endtoend/vtgate.txt                  |   5 +-
 go/flags/endtoend/vtgr.txt                    |   3 +-
 go/flags/endtoend/vtorc.txt                   |   6 +-
 go/flags/endtoend/vttablet.txt                |   5 +-
 go/flags/endtoend/vttestserver.txt            |   2 +-
 go/internal/flag/flag.go                      |  11 ++
 .../reparent/newfeaturetest/reparent_test.go  | 152 +++---------------
 .../reparent/plannedreparent/reparent_test.go | 144 +++++++++++++++++
 go/test/endtoend/reparent/utils/utils.go      |   2 +-
 .../replication_manager/tablet_test.go        |   3 +-
 .../primaryfailure/primary_failure_test.go    |  14 +-
 go/vt/servenv/servenv.go                      |   2 +-
 go/vt/servenv/servenv_test.go                 |   4 +-
 go/vt/topo/locks.go                           |  35 ++--
 go/vt/topo/locks_test.go                      |  85 ++++++++++
 .../reparentutil/emergency_reparenter.go      |   2 +-
 go/vt/vtctl/reparentutil/replication.go       |   4 +-
 go/vt/vtctl/reparentutil/replication_test.go  |  22 ++-
 go/vt/vtorc/config/config.go                  |   9 +-
 go/vt/vtorc/config/config_test.go             |  15 --
 go/vt/vtorc/logic/tablet_discovery.go         |  32 ++--
 go/vt/vtorc/logic/tablet_discovery_test.go    |  31 +++-
 go/vt/vtorc/logic/topology_recovery.go        |   5 +
 go/vt/vttablet/tabletmanager/tm_init.go       |   4 +
 28 files changed, 389 insertions(+), 222 deletions(-)
 create mode 100644 go/vt/topo/locks_test.go

diff --git a/go/flags/endtoend/mysqlctl.txt b/go/flags/endtoend/mysqlctl.txt
index 6444c8f4575..c9f500d7b74 100644
--- a/go/flags/endtoend/mysqlctl.txt
+++ b/go/flags/endtoend/mysqlctl.txt
@@ -67,7 +67,7 @@ Global flags:
       --mysqlctl_client_protocol string                 the protocol to use to talk to the mysqlctl server (default "grpc")
       --mysqlctl_mycnf_template string                  template file to use for generating the my.cnf file during server init
       --mysqlctl_socket string                          socket file to use for remote mysqlctl actions (empty for local actions)
-      --onclose_timeout duration                        wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                        wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                         wait no more than this for OnTermSync handlers before stopping (default 10s)
       --pid_file string                                 If set, the process will write its pid to the named file, and delete it on graceful shutdown.
       --pool_hostname_resolve_interval duration         if set force an update to all hostnames and reconnect if changed, defaults to 0 (disabled)
diff --git a/go/flags/endtoend/mysqlctld.txt b/go/flags/endtoend/mysqlctld.txt
index f34697b39ef..f0a00822a1e 100644
--- a/go/flags/endtoend/mysqlctld.txt
+++ b/go/flags/endtoend/mysqlctld.txt
@@ -71,7 +71,7 @@ Usage of mysqlctld:
       --mysql_socket string                                              Path to the mysqld socket file
       --mysqlctl_mycnf_template string                                   template file to use for generating the my.cnf file during server init
       --mysqlctl_socket string                                           socket file to use for remote mysqlctl actions (empty for local actions)
-      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                                          wait no more than this for OnTermSync handlers before stopping (default 10s)
       --pid_file string                                                  If set, the process will write its pid to the named file, and delete it on graceful shutdown.
       --pool_hostname_resolve_interval duration                          if set force an update to all hostnames and reconnect if changed, defaults to 0 (disabled)
diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt
index 8b3d298a394..6d81a6de7a9 100644
--- a/go/flags/endtoend/vtbackup.txt
+++ b/go/flags/endtoend/vtbackup.txt
@@ -85,6 +85,7 @@ Usage of vtbackup:
       --keep-alive-timeout duration                     Wait until timeout elapses after a successful backup before shutting down.
       --keep_logs duration                              keep logs for this long (using ctime) (zero to keep forever)
       --keep_logs_by_mtime duration                     keep logs for this long (using mtime) (zero to keep forever)
+      --lock-timeout duration                           Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --log_backtrace_at traceLocation                  when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                                  If non-empty, write log files in this directory
       --log_err_stacks                                  log stack traces for errors
@@ -124,7 +125,7 @@ Usage of vtbackup:
       --psdb.gcs_backup.creds_path string               Credentials JSON for service account to use.
       --psdb.gcs_backup.key_uri string                  GCP KMS Keyring URI to use.
       --purge_logs_interval duration                    how often try to remove old logs (default 1h0m0s)
-      --remote_operation_timeout duration               time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration               time to wait for a remote operation (default 15s)
       --restart_before_backup                           Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs.
       --s3_backup_aws_endpoint string                   endpoint of the S3 backend (region must be provided).
       --s3_backup_aws_region string                     AWS region to use. (default "us-east-1")
diff --git a/go/flags/endtoend/vtctld.txt b/go/flags/endtoend/vtctld.txt
index 4a01c4e2115..b3852ee066a 100644
--- a/go/flags/endtoend/vtctld.txt
+++ b/go/flags/endtoend/vtctld.txt
@@ -1,5 +1,5 @@
 Usage of vtctld:
-      --action_timeout duration                                          time to wait for an action before resorting to force (default 2m0s)
+      --action_timeout duration                                          time to wait for an action before resorting to force (default 1m0s)
       --alsologtostderr                                                  log to standard error as well as files
       --azblob_backup_account_key_file string                            Path to a file containing the Azure Storage account key; if this flag is unset, the environment variable VT_AZBLOB_ACCOUNT_KEY will be used as the key itself (NOT a file path).
       --azblob_backup_account_name string                                Azure Storage Account name for backups; if this flag is unset, the environment variable VT_AZBLOB_ACCOUNT_NAME will be used.
@@ -61,12 +61,13 @@ Usage of vtctld:
       --keep_logs duration                                               keep logs for this long (using ctime) (zero to keep forever)
       --keep_logs_by_mtime duration                                      keep logs for this long (using mtime) (zero to keep forever)
       --lameduck-period duration                                         keep running at least this long after SIGTERM before stopping (default 50ms)
+      --lock-timeout duration                                            Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --log_backtrace_at traceLocation                                   when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                                                   If non-empty, write log files in this directory
       --log_err_stacks                                                   log stack traces for errors
       --log_rotate_max_size uint                                         size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800)
       --logtostderr                                                      log to standard error instead of files
-      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                                          wait no more than this for OnTermSync handlers before stopping (default 10s)
       --opentsdb_uri string                                              URI of opentsdb /api/put method
       --pid_file string                                                  If set, the process will write its pid to the named file, and delete it on graceful shutdown.
@@ -80,7 +81,7 @@ Usage of vtctld:
       --psdb.gcs_backup.creds_path string                                Credentials JSON for service account to use.
       --psdb.gcs_backup.key_uri string                                   GCP KMS Keyring URI to use.
       --purge_logs_interval duration                                     how often try to remove old logs (default 1h0m0s)
-      --remote_operation_timeout duration                                time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration                                time to wait for a remote operation (default 15s)
       --s3_backup_aws_endpoint string                                    endpoint of the S3 backend (region must be provided).
       --s3_backup_aws_region string                                      AWS region to use. (default "us-east-1")
       --s3_backup_aws_retries int                                        AWS request retries. (default -1)
diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt
index 6c5bc1a2d96..74e32ddbf9f 100644
--- a/go/flags/endtoend/vtgate.txt
+++ b/go/flags/endtoend/vtgate.txt
@@ -86,6 +86,7 @@ Usage of vtgate:
       --keyspaces_to_watch strings                                       Specifies which keyspaces this vtgate should have access to while routing queries or accessing the vschema.
       --lameduck-period duration                                         keep running at least this long after SIGTERM before stopping (default 50ms)
       --legacy_replication_lag_algorithm                                 Use the legacy algorithm when selecting vttablets for serving. (default true)
+      --lock-timeout duration                                            Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --lock_heartbeat_time duration                                     If there is lock function used. This will keep the lock connection active by using this heartbeat (default 5s)
       --log_backtrace_at traceLocation                                   when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                                                   If non-empty, write log files in this directory
@@ -136,7 +137,7 @@ Usage of vtgate:
       --mysql_tcp_version string                                         Select tcp, tcp4, or tcp6 to control the socket type. (default "tcp")
       --no_scatter                                                       when set to true, the planner will fail instead of producing a plan that includes scatter queries
       --normalize_queries                                                Rewrite queries with bind vars. Turn this off if the app itself sends normalized queries with bind vars. (default true)
-      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                                          wait no more than this for OnTermSync handlers before stopping (default 10s)
       --opentsdb_uri string                                              URI of opentsdb /api/put method
       --pid_file string                                                  If set, the process will write its pid to the named file, and delete it on graceful shutdown.
@@ -150,7 +151,7 @@ Usage of vtgate:
       --querylog-format string                                           format for query logs ("text" or "json") (default "text")
       --querylog-row-threshold uint                                      Number of rows a query has to return or affect before being logged; not useful for streaming queries. 0 means all queries will be logged.
       --redact-debug-ui-queries                                          redact full queries and bind variables from debug UI
-      --remote_operation_timeout duration                                time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration                                time to wait for a remote operation (default 15s)
       --retry-count int                                                  retry count (default 2)
       --schema_change_signal                                             Enable the schema tracker; requires queryserver-config-schema-change-signal to be enabled on the underlying vttablets for this to work (default true)
       --schema_change_signal_user string                                 User to be used to send down query to vttablet to retrieve schema changes
diff --git a/go/flags/endtoend/vtgr.txt b/go/flags/endtoend/vtgr.txt
index 75e7b0a0fc4..9e0798f9fca 100644
--- a/go/flags/endtoend/vtgr.txt
+++ b/go/flags/endtoend/vtgr.txt
@@ -22,6 +22,7 @@ Usage of vtgr:
   -h, --help                                       display usage and exit
       --keep_logs duration                         keep logs for this long (using ctime) (zero to keep forever)
       --keep_logs_by_mtime duration                keep logs for this long (using mtime) (zero to keep forever)
+      --lock-timeout duration                      Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --log_backtrace_at traceLocation             when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                             If non-empty, write log files in this directory
       --log_err_stacks                             log stack traces for errors
@@ -31,7 +32,7 @@ Usage of vtgr:
       --pprof strings                              enable profiling
       --purge_logs_interval duration               how often try to remove old logs (default 1h0m0s)
       --refresh_interval duration                  Refresh interval to load tablets. (default 10s)
-      --remote_operation_timeout duration          time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration          time to wait for a remote operation (default 15s)
       --scan_interval duration                     Scan interval to diagnose and repair. (default 3s)
       --scan_repair_timeout duration               Time to wait for a Diagnose and repair operation. (default 3s)
       --security_policy string                     the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only)
diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt
index 74ab84c90d8..c445e5f61bf 100644
--- a/go/flags/endtoend/vtorc.txt
+++ b/go/flags/endtoend/vtorc.txt
@@ -23,13 +23,13 @@ Usage of vtorc:
       --keep_logs duration                           keep logs for this long (using ctime) (zero to keep forever)
       --keep_logs_by_mtime duration                  keep logs for this long (using mtime) (zero to keep forever)
       --lameduck-period duration                     keep running at least this long after SIGTERM before stopping (default 50ms)
-      --lock-shard-timeout duration                  Duration for which a shard lock is held when running a recovery (default 30s)
+      --lock-timeout duration                        Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --log_backtrace_at traceLocation               when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                               If non-empty, write log files in this directory
       --log_err_stacks                               log stack traces for errors
       --log_rotate_max_size uint                     size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800)
       --logtostderr                                  log to standard error instead of files
-      --onclose_timeout duration                     wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                     wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                      wait no more than this for OnTermSync handlers before stopping (default 10s)
       --pid_file string                              If set, the process will write its pid to the named file, and delete it on graceful shutdown.
       --port int                                     port for the server
@@ -39,7 +39,7 @@ Usage of vtorc:
       --reasonable-replication-lag duration          Maximum replication lag on replicas which is deemed to be acceptable (default 10s)
       --recovery-period-block-duration duration      Duration for which a new recovery is blocked on an instance after running a recovery (default 30s)
       --recovery-poll-duration duration              Timer duration on which VTOrc polls its database to run a recovery (default 1s)
-      --remote_operation_timeout duration            time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration            time to wait for a remote operation (default 15s)
       --security_policy string                       the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only)
       --shutdown_wait_time duration                  Maximum time to wait for VTOrc to release all the locks that it is holding before shutting down on SIGTERM (default 30s)
       --snapshot-topology-interval duration          Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours
diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt
index f885523a32e..f4214af29f0 100644
--- a/go/flags/endtoend/vttablet.txt
+++ b/go/flags/endtoend/vttablet.txt
@@ -160,6 +160,7 @@ Usage of vttablet:
       --keep_logs duration                                               keep logs for this long (using ctime) (zero to keep forever)
       --keep_logs_by_mtime duration                                      keep logs for this long (using mtime) (zero to keep forever)
       --lameduck-period duration                                         keep running at least this long after SIGTERM before stopping (default 50ms)
+      --lock-timeout duration                                            Maximum time for which a shard/keyspace lock can be acquired for (default 45s)
       --lock_tables_timeout duration                                     How long to keep the table locked before timing out (default 1m0s)
       --log_backtrace_at traceLocation                                   when logging hits line file:N, emit a stack trace (default :0)
       --log_dir string                                                   If non-empty, write log files in this directory
@@ -191,7 +192,7 @@ Usage of vttablet:
       --mysql_server_version string                                      MySQL server version to advertise.
       --mysqlctl_mycnf_template string                                   template file to use for generating the my.cnf file during server init
       --mysqlctl_socket string                                           socket file to use for remote mysqlctl actions (empty for local actions)
-      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                                          wait no more than this for OnTermSync handlers before stopping (default 10s)
       --opentsdb_uri string                                              URI of opentsdb /api/put method
       --pid_file string                                                  If set, the process will write its pid to the named file, and delete it on graceful shutdown.
@@ -246,7 +247,7 @@ Usage of vttablet:
       --redact-debug-ui-queries                                          redact full queries and bind variables from debug UI
       --relay_log_max_items int                                          Maximum number of rows for VReplication target buffering. (default 5000)
       --relay_log_max_size int                                           Maximum buffer size (in bytes) for VReplication target buffering. If single rows are larger than this, a single row is buffered at a time. (default 250000)
-      --remote_operation_timeout duration                                time to wait for a remote operation (default 30s)
+      --remote_operation_timeout duration                                time to wait for a remote operation (default 15s)
       --replication_connect_retry duration                               how long to wait in between replica reconnect attempts. Only precise to the second. (default 10s)
       --restore_concurrency int                                          (init restore parameter) how many concurrent files to restore at once (default 4)
       --restore_from_backup                                              (init restore parameter) will check BackupStorage for a recent backup at startup and start there
diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt
index d30ab355abe..de7edcc522a 100644
--- a/go/flags/endtoend/vttestserver.txt
+++ b/go/flags/endtoend/vttestserver.txt
@@ -75,7 +75,7 @@ Usage of vttestserver:
       --mysqlctl_socket string                                           socket file to use for remote mysqlctl actions (empty for local actions)
       --null_probability float                                           The probability to initialize a field with 'NULL'  if --initialize_with_random_data is true. Only applies to fields that can contain NULL values. (default 0.1)
       --num_shards strings                                               Comma separated shard count (one per keyspace) (default [2])
-      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 1ns)
+      --onclose_timeout duration                                         wait no more than this for OnClose handlers before stopping (default 10s)
       --onterm_timeout duration                                          wait no more than this for OnTermSync handlers before stopping (default 10s)
       --persistent_mode                                                  If this flag is set, the MySQL data directory is not cleaned up when LocalCluster.TearDown() is called. This is useful for running vttestserver as a database container in local developer environments. Note that db migration files (--schema_dir option) and seeding of random data (--initialize_with_random_data option) will only run during cluster startup if the data directory does not already exist. vschema migrations are run every time the cluster starts, since persistence for the topology server has not been implemented yet
       --pid_file string                                                  If set, the process will write its pid to the named file, and delete it on graceful shutdown.
diff --git a/go/internal/flag/flag.go b/go/internal/flag/flag.go
index 1f55da7b4ee..e126f3c45f5 100644
--- a/go/internal/flag/flag.go
+++ b/go/internal/flag/flag.go
@@ -70,6 +70,17 @@ func Parse(fs *flag.FlagSet) {
 	flag.Parse()
 }
 
+// IsFlagProvided returns if the given flag has been provided by the user explicitly or not
+func IsFlagProvided(name string) bool {
+	found := false
+	flag.Visit(func(f *flag.Flag) {
+		if f.Name == name {
+			found = true
+		}
+	})
+	return found
+}
+
 // TrickGlog tricks glog into understanding that flags have been parsed.
 //
 // N.B. Do not delete this function. `glog` is a persnickity package and wants
diff --git a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go
index b450fb44420..fb4026703a4 100644
--- a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go
+++ b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go
@@ -17,155 +17,41 @@ limitations under the License.
 package newfeaturetest
 
 import (
-	"strconv"
 	"testing"
 
-	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
-	"google.golang.org/protobuf/encoding/protojson"
-
-	"vitess.io/vitess/go/mysql"
 	"vitess.io/vitess/go/test/endtoend/cluster"
 	"vitess.io/vitess/go/test/endtoend/reparent/utils"
-	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
 )
 
-// TestCrossCellDurability tests 2 things -
-// 1. When PRS is run with the cross_cell durability policy setup, then the semi-sync settings on all the tablets are as expected
-// 2. Bringing up a new vttablet should have its replication and semi-sync setup correctly without any external interference
-func TestCrossCellDurability(t *testing.T) {
-	defer cluster.PanicHandler(t)
-	clusterInstance := utils.SetupReparentCluster(t, "cross_cell")
-	defer utils.TeardownCluster(clusterInstance)
-	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
-
-	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
-
-	// When tablets[0] is the primary, the only tablet in a different cell is tablets[3].
-	// So the other two should have semi-sync turned off
-	utils.CheckSemiSyncSetupCorrectly(t, tablets[0], "ON")
-	utils.CheckSemiSyncSetupCorrectly(t, tablets[3], "ON")
-	utils.CheckSemiSyncSetupCorrectly(t, tablets[1], "OFF")
-	utils.CheckSemiSyncSetupCorrectly(t, tablets[2], "OFF")
-
-	// Run forced reparent operation, this should proceed unimpeded.
-	out, err := utils.Prs(t, clusterInstance, tablets[3])
-	require.NoError(t, err, out)
-
-	utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2]})
-
-	// All the tablets will have semi-sync setup since tablets[3] is in Cell2 and all
-	// others are in Cell1, so all of them are eligible to send semi-sync ACKs
-	for _, tablet := range tablets {
-		utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
-	}
-
-	for i, supportsBackup := range []bool{false, true} {
-		// Bring up a new replica tablet
-		// In this new tablet, we do not disable active reparents, otherwise replication will not be started.
-		newReplica := utils.StartNewVTTablet(t, clusterInstance, 300+i, supportsBackup)
-		// Add the tablet to the list of tablets in this shard
-		clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, newReplica)
-		// Check that we can replicate to it and semi-sync is setup correctly on it
-		utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2], newReplica})
-		utils.CheckSemiSyncSetupCorrectly(t, newReplica, "ON")
-	}
-}
-
-// TestFullStatus tests that the RPC FullStatus works as intended.
-func TestFullStatus(t *testing.T) {
+// TestRecoverWithMultipleVttabletFailures tests that ERS succeeds with the default values
+// even when there are multiple vttablet failures. In this test we use the semi_sync policy
+// to allow multiple failures to happen and still be recoverable.
+// The test takes down the vttablets of the primary and a rdonly tablet and runs ERS with the
+// default values of remote_operation_timeout, lock-timeout flags and wait_replicas_timeout subflag.
+func TestRecoverWithMultipleVttabletFailures(t *testing.T) {
 	defer cluster.PanicHandler(t)
 	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
 	defer utils.TeardownCluster(clusterInstance)
 	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
 	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
 
-	// Check that full status gives the correct result for a primary tablet
-	primaryStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[0].Alias)
+	// make tablets[1] a rdonly tablet.
+	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
 	require.NoError(t, err)
-	primaryStatus := &replicationdatapb.FullStatus{}
-	err = protojson.Unmarshal([]byte(primaryStatusString), primaryStatus)
-	require.NoError(t, err)
-	assert.NotEmpty(t, primaryStatus.ServerUuid)
-	assert.NotEmpty(t, primaryStatus.ServerId)
-	// For a primary tablet there is no replication status
-	assert.Nil(t, primaryStatus.ReplicationStatus)
-	assert.Contains(t, primaryStatus.PrimaryStatus.String(), "vt-0000000101-bin")
-	assert.Equal(t, primaryStatus.GtidPurged, "MySQL56/")
-	assert.False(t, primaryStatus.ReadOnly)
-	assert.True(t, primaryStatus.SemiSyncPrimaryEnabled)
-	assert.True(t, primaryStatus.SemiSyncReplicaEnabled)
-	assert.True(t, primaryStatus.SemiSyncPrimaryStatus)
-	assert.False(t, primaryStatus.SemiSyncReplicaStatus)
-	assert.EqualValues(t, 3, primaryStatus.SemiSyncPrimaryClients)
-	assert.EqualValues(t, 1000000000000000000, primaryStatus.SemiSyncPrimaryTimeout)
-	assert.EqualValues(t, 1, primaryStatus.SemiSyncWaitForReplicaCount)
-	assert.Equal(t, "ROW", primaryStatus.BinlogFormat)
-	assert.Equal(t, "FULL", primaryStatus.BinlogRowImage)
-	assert.Equal(t, "ON", primaryStatus.GtidMode)
-	assert.True(t, primaryStatus.LogReplicaUpdates)
-	assert.True(t, primaryStatus.LogBinEnabled)
-	assert.Regexp(t, `[58]\.[07].*`, primaryStatus.Version)
-	assert.NotEmpty(t, primaryStatus.VersionComment)
 
-	// Check that full status gives the correct result for a replica tablet
-	replicaStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[1].Alias)
-	require.NoError(t, err)
-	replicaStatus := &replicationdatapb.FullStatus{}
-	err = protojson.Unmarshal([]byte(replicaStatusString), replicaStatus)
-	require.NoError(t, err)
-	assert.NotEmpty(t, replicaStatus.ServerUuid)
-	assert.NotEmpty(t, replicaStatus.ServerId)
-	assert.Contains(t, replicaStatus.ReplicationStatus.Position, "MySQL56/"+replicaStatus.ReplicationStatus.SourceUuid)
-	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.IoState)
-	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.SqlState)
-	assert.Equal(t, fileNameFromPosition(replicaStatus.ReplicationStatus.FilePosition), fileNameFromPosition(primaryStatus.PrimaryStatus.FilePosition))
-	assert.LessOrEqual(t, rowNumberFromPosition(replicaStatus.ReplicationStatus.FilePosition), rowNumberFromPosition(primaryStatus.PrimaryStatus.FilePosition))
-	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition, primaryStatus.PrimaryStatus.FilePosition)
-	assert.Contains(t, replicaStatus.ReplicationStatus.RelayLogFilePosition, "vt-0000000102-relay")
-	assert.Equal(t, replicaStatus.ReplicationStatus.Position, primaryStatus.PrimaryStatus.Position)
-	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogPosition, primaryStatus.PrimaryStatus.Position)
-	assert.Empty(t, replicaStatus.ReplicationStatus.LastIoError)
-	assert.Empty(t, replicaStatus.ReplicationStatus.LastSqlError)
-	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUuid, primaryStatus.ServerUuid)
-	assert.LessOrEqual(t, int(replicaStatus.ReplicationStatus.ReplicationLagSeconds), 1)
-	assert.False(t, replicaStatus.ReplicationStatus.ReplicationLagUnknown)
-	assert.EqualValues(t, 0, replicaStatus.ReplicationStatus.SqlDelay)
-	assert.False(t, replicaStatus.ReplicationStatus.SslAllowed)
-	assert.False(t, replicaStatus.ReplicationStatus.HasReplicationFilters)
-	assert.False(t, replicaStatus.ReplicationStatus.UsingGtid)
-	assert.True(t, replicaStatus.ReplicationStatus.AutoPosition)
-	assert.Equal(t, replicaStatus.ReplicationStatus.SourceHost, utils.Hostname)
-	assert.EqualValues(t, replicaStatus.ReplicationStatus.SourcePort, tablets[0].MySQLPort)
-	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUser, "vt_repl")
-	assert.Contains(t, replicaStatus.PrimaryStatus.String(), "vt-0000000102-bin")
-	assert.Equal(t, replicaStatus.GtidPurged, "MySQL56/")
-	assert.True(t, replicaStatus.ReadOnly)
-	assert.False(t, replicaStatus.SemiSyncPrimaryEnabled)
-	assert.True(t, replicaStatus.SemiSyncReplicaEnabled)
-	assert.False(t, replicaStatus.SemiSyncPrimaryStatus)
-	assert.True(t, replicaStatus.SemiSyncReplicaStatus)
-	assert.EqualValues(t, 0, replicaStatus.SemiSyncPrimaryClients)
-	assert.EqualValues(t, 1000000000000000000, replicaStatus.SemiSyncPrimaryTimeout)
-	assert.EqualValues(t, 1, replicaStatus.SemiSyncWaitForReplicaCount)
-	assert.Equal(t, "ROW", replicaStatus.BinlogFormat)
-	assert.Equal(t, "FULL", replicaStatus.BinlogRowImage)
-	assert.Equal(t, "ON", replicaStatus.GtidMode)
-	assert.True(t, replicaStatus.LogReplicaUpdates)
-	assert.True(t, replicaStatus.LogBinEnabled)
-	assert.Regexp(t, `[58]\.[07].*`, replicaStatus.Version)
-	assert.NotEmpty(t, replicaStatus.VersionComment)
-}
+	// Confirm that replication is still working as intended
+	utils.ConfirmReplication(t, tablets[0], tablets[1:])
 
-// fileNameFromPosition gets the file name from the position
-func fileNameFromPosition(pos string) string {
-	return pos[0 : len(pos)-4]
-}
+	// Make the rdonly and primary tablets and databases unavailable.
+	utils.StopTablet(t, tablets[1], true)
+	utils.StopTablet(t, tablets[0], true)
+
+	// We expect this to succeed since we only have 1 primary eligible tablet which is down
+	out, err := utils.Ers(clusterInstance, nil, "", "")
+	require.NoError(t, err, out)
 
-// rowNumberFromPosition gets the row number from the position
-func rowNumberFromPosition(pos string) int {
-	rowNumStr := pos[len(pos)-4:]
-	rowNum, _ := strconv.Atoi(rowNumStr)
-	return rowNum
+	newPrimary := utils.GetNewPrimary(t, clusterInstance)
+	utils.ConfirmReplication(t, newPrimary, []*cluster.Vttablet{tablets[2], tablets[3]})
 }
diff --git a/go/test/endtoend/reparent/plannedreparent/reparent_test.go b/go/test/endtoend/reparent/plannedreparent/reparent_test.go
index 66db2908380..72f560a9bc0 100644
--- a/go/test/endtoend/reparent/plannedreparent/reparent_test.go
+++ b/go/test/endtoend/reparent/plannedreparent/reparent_test.go
@@ -19,15 +19,20 @@ package plannedreparent
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"google.golang.org/protobuf/encoding/protojson"
+
+	"vitess.io/vitess/go/mysql"
 	"vitess.io/vitess/go/test/endtoend/cluster"
 	"vitess.io/vitess/go/test/endtoend/reparent/utils"
 	"vitess.io/vitess/go/vt/log"
+	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
 )
 
 func TestPrimaryToSpareStateChangeImpossible(t *testing.T) {
@@ -376,3 +381,142 @@ func TestReparentDoesntHangIfPrimaryFails(t *testing.T) {
 	require.Error(t, err)
 	assert.Contains(t, out, "primary failed to PopulateReparentJournal")
 }
+
+// TestCrossCellDurability tests 2 things -
+// 1. When PRS is run with the cross_cell durability policy setup, then the semi-sync settings on all the tablets are as expected
+// 2. Bringing up a new vttablet should have its replication and semi-sync setup correctly without any manual intervention
+func TestCrossCellDurability(t *testing.T) {
+	defer cluster.PanicHandler(t)
+	clusterInstance := utils.SetupReparentCluster(t, "cross_cell")
+	defer utils.TeardownCluster(clusterInstance)
+	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
+
+	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
+
+	// When tablets[0] is the primary, the only tablet in a different cell is tablets[3].
+	// So the other two should have semi-sync turned off
+	utils.CheckSemiSyncSetupCorrectly(t, tablets[0], "ON")
+	utils.CheckSemiSyncSetupCorrectly(t, tablets[3], "ON")
+	utils.CheckSemiSyncSetupCorrectly(t, tablets[1], "OFF")
+	utils.CheckSemiSyncSetupCorrectly(t, tablets[2], "OFF")
+
+	// Run forced reparent operation, this should proceed unimpeded.
+	out, err := utils.Prs(t, clusterInstance, tablets[3])
+	require.NoError(t, err, out)
+
+	utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2]})
+
+	// All the tablets will have semi-sync setup since tablets[3] is in Cell2 and all
+	// others are in Cell1, so all of them are eligible to send semi-sync ACKs
+	for _, tablet := range tablets {
+		utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
+	}
+
+	for i, supportsBackup := range []bool{false, true} {
+		// Bring up a new replica tablet
+		// In this new tablet, we do not disable active reparents, otherwise replication will not be started.
+		newReplica := utils.StartNewVTTablet(t, clusterInstance, 300+i, supportsBackup)
+		// Add the tablet to the list of tablets in this shard
+		clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, newReplica)
+		// Check that we can replicate to it and semi-sync is setup correctly on it
+		utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2], newReplica})
+		utils.CheckSemiSyncSetupCorrectly(t, newReplica, "ON")
+	}
+}
+
+// TestFullStatus tests that the RPC FullStatus works as intended.
+func TestFullStatus(t *testing.T) {
+	defer cluster.PanicHandler(t)
+	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
+	defer utils.TeardownCluster(clusterInstance)
+	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
+	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
+
+	// Check that full status gives the correct result for a primary tablet
+	primaryStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[0].Alias)
+	require.NoError(t, err)
+	primaryStatus := &replicationdatapb.FullStatus{}
+	err = protojson.Unmarshal([]byte(primaryStatusString), primaryStatus)
+	require.NoError(t, err)
+	assert.NotEmpty(t, primaryStatus.ServerUuid)
+	assert.NotEmpty(t, primaryStatus.ServerId)
+	// For a primary tablet there is no replication status
+	assert.Nil(t, primaryStatus.ReplicationStatus)
+	assert.Contains(t, primaryStatus.PrimaryStatus.String(), "vt-0000000101-bin")
+	assert.Equal(t, primaryStatus.GtidPurged, "MySQL56/")
+	assert.False(t, primaryStatus.ReadOnly)
+	assert.True(t, primaryStatus.SemiSyncPrimaryEnabled)
+	assert.True(t, primaryStatus.SemiSyncReplicaEnabled)
+	assert.True(t, primaryStatus.SemiSyncPrimaryStatus)
+	assert.False(t, primaryStatus.SemiSyncReplicaStatus)
+	assert.EqualValues(t, 3, primaryStatus.SemiSyncPrimaryClients)
+	assert.EqualValues(t, 1000000000000000000, primaryStatus.SemiSyncPrimaryTimeout)
+	assert.EqualValues(t, 1, primaryStatus.SemiSyncWaitForReplicaCount)
+	assert.Equal(t, "ROW", primaryStatus.BinlogFormat)
+	assert.Equal(t, "FULL", primaryStatus.BinlogRowImage)
+	assert.Equal(t, "ON", primaryStatus.GtidMode)
+	assert.True(t, primaryStatus.LogReplicaUpdates)
+	assert.True(t, primaryStatus.LogBinEnabled)
+	assert.Regexp(t, `[58]\.[07].*`, primaryStatus.Version)
+	assert.NotEmpty(t, primaryStatus.VersionComment)
+
+	// Check that full status gives the correct result for a replica tablet
+	replicaStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[1].Alias)
+	require.NoError(t, err)
+	replicaStatus := &replicationdatapb.FullStatus{}
+	err = protojson.Unmarshal([]byte(replicaStatusString), replicaStatus)
+	require.NoError(t, err)
+	assert.NotEmpty(t, replicaStatus.ServerUuid)
+	assert.NotEmpty(t, replicaStatus.ServerId)
+	assert.Contains(t, replicaStatus.ReplicationStatus.Position, "MySQL56/"+replicaStatus.ReplicationStatus.SourceUuid)
+	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.IoState)
+	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.SqlState)
+	assert.Equal(t, fileNameFromPosition(replicaStatus.ReplicationStatus.FilePosition), fileNameFromPosition(primaryStatus.PrimaryStatus.FilePosition))
+	assert.LessOrEqual(t, rowNumberFromPosition(replicaStatus.ReplicationStatus.FilePosition), rowNumberFromPosition(primaryStatus.PrimaryStatus.FilePosition))
+	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition, primaryStatus.PrimaryStatus.FilePosition)
+	assert.Contains(t, replicaStatus.ReplicationStatus.RelayLogFilePosition, "vt-0000000102-relay")
+	assert.Equal(t, replicaStatus.ReplicationStatus.Position, primaryStatus.PrimaryStatus.Position)
+	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogPosition, primaryStatus.PrimaryStatus.Position)
+	assert.Empty(t, replicaStatus.ReplicationStatus.LastIoError)
+	assert.Empty(t, replicaStatus.ReplicationStatus.LastSqlError)
+	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUuid, primaryStatus.ServerUuid)
+	assert.LessOrEqual(t, int(replicaStatus.ReplicationStatus.ReplicationLagSeconds), 1)
+	assert.False(t, replicaStatus.ReplicationStatus.ReplicationLagUnknown)
+	assert.EqualValues(t, 0, replicaStatus.ReplicationStatus.SqlDelay)
+	assert.False(t, replicaStatus.ReplicationStatus.SslAllowed)
+	assert.False(t, replicaStatus.ReplicationStatus.HasReplicationFilters)
+	assert.False(t, replicaStatus.ReplicationStatus.UsingGtid)
+	assert.True(t, replicaStatus.ReplicationStatus.AutoPosition)
+	assert.Equal(t, replicaStatus.ReplicationStatus.SourceHost, utils.Hostname)
+	assert.EqualValues(t, replicaStatus.ReplicationStatus.SourcePort, tablets[0].MySQLPort)
+	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUser, "vt_repl")
+	assert.Contains(t, replicaStatus.PrimaryStatus.String(), "vt-0000000102-bin")
+	assert.Equal(t, replicaStatus.GtidPurged, "MySQL56/")
+	assert.True(t, replicaStatus.ReadOnly)
+	assert.False(t, replicaStatus.SemiSyncPrimaryEnabled)
+	assert.True(t, replicaStatus.SemiSyncReplicaEnabled)
+	assert.False(t, replicaStatus.SemiSyncPrimaryStatus)
+	assert.True(t, replicaStatus.SemiSyncReplicaStatus)
+	assert.EqualValues(t, 0, replicaStatus.SemiSyncPrimaryClients)
+	assert.EqualValues(t, 1000000000000000000, replicaStatus.SemiSyncPrimaryTimeout)
+	assert.EqualValues(t, 1, replicaStatus.SemiSyncWaitForReplicaCount)
+	assert.Equal(t, "ROW", replicaStatus.BinlogFormat)
+	assert.Equal(t, "FULL", replicaStatus.BinlogRowImage)
+	assert.Equal(t, "ON", replicaStatus.GtidMode)
+	assert.True(t, replicaStatus.LogReplicaUpdates)
+	assert.True(t, replicaStatus.LogBinEnabled)
+	assert.Regexp(t, `[58]\.[07].*`, replicaStatus.Version)
+	assert.NotEmpty(t, replicaStatus.VersionComment)
+}
+
+// fileNameFromPosition gets the file name from the position
+func fileNameFromPosition(pos string) string {
+	return pos[0 : len(pos)-4]
+}
+
+// rowNumberFromPosition gets the row number from the position
+func rowNumberFromPosition(pos string) int {
+	rowNumStr := pos[len(pos)-4:]
+	rowNum, _ := strconv.Atoi(rowNumStr)
+	return rowNum
+}
diff --git a/go/test/endtoend/reparent/utils/utils.go b/go/test/endtoend/reparent/utils/utils.go
index a3359d172d5..744a3c44c2b 100644
--- a/go/test/endtoend/reparent/utils/utils.go
+++ b/go/test/endtoend/reparent/utils/utils.go
@@ -129,7 +129,7 @@ func setupCluster(ctx context.Context, t *testing.T, shardName string, cells []s
 		// the replication manager to silently fix the replication in case ERS or PRS mess up. All the
 		// tests in this test suite should work irrespective of this flag. Each run of ERS, PRS should be
 		// setting up the replication correctly.
-		"--disable_active_reparents")
+		"--disable-replication-manager")
 
 	// Initialize Cluster
 	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
diff --git a/go/test/endtoend/tabletmanager/replication_manager/tablet_test.go b/go/test/endtoend/tabletmanager/replication_manager/tablet_test.go
index 3284ab65d49..d9a24b3b444 100644
--- a/go/test/endtoend/tabletmanager/replication_manager/tablet_test.go
+++ b/go/test/endtoend/tabletmanager/replication_manager/tablet_test.go
@@ -159,8 +159,7 @@ func waitForSourcePort(ctx context.Context, t *testing.T, tablet cluster.Vttable
 	for time.Now().Before(timeout) {
 		// Check that initially replication is setup correctly on the replica tablet
 		replicaStatus, err := tmcGetReplicationStatus(ctx, tablet.GrpcPort)
-		require.NoError(t, err)
-		if replicaStatus.SourcePort == expectedPort {
+		if err == nil && replicaStatus.SourcePort == expectedPort {
 			return nil
 		}
 		time.Sleep(300 * time.Millisecond)
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index 4dcbe0a92ed..8e91028926c 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -33,7 +33,7 @@ import (
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
 	defer cluster.PanicHandler(t)
-	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
+	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1, "semi_sync")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
@@ -64,14 +64,18 @@ func TestDownPrimary(t *testing.T) {
 	// check that the replication is setup correctly before we failover
 	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)
 
-	// Make the rdonly tablet unavailable
-	err := rdonly.MysqlctlProcess.Stop()
+	// Make the rdonly vttablet unavailable
+	err := rdonly.VttabletProcess.TearDown()
+	require.NoError(t, err)
+	err = rdonly.MysqlctlProcess.Stop()
+	require.NoError(t, err)
+	// Make the current primary vttablet unavailable.
+	err = curPrimary.VttabletProcess.TearDown()
 	require.NoError(t, err)
-	// Make the current primary database unavailable.
 	err = curPrimary.MysqlctlProcess.Stop()
 	require.NoError(t, err)
 	defer func() {
-		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
+		// we remove the tablet from our global list
 		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
 		utils.PermanentlyRemoveVttablet(clusterInfo, rdonly)
 	}()
diff --git a/go/vt/servenv/servenv.go b/go/vt/servenv/servenv.go
index f53d2f24da8..d705b79b3b7 100644
--- a/go/vt/servenv/servenv.go
+++ b/go/vt/servenv/servenv.go
@@ -79,7 +79,7 @@ var (
 var (
 	lameduckPeriod = 50 * time.Millisecond
 	onTermTimeout  = 10 * time.Second
-	onCloseTimeout = time.Nanosecond
+	onCloseTimeout = 10 * time.Second
 	catchSigpipe   bool
 	maxStackSize   = 64 * 1024 * 1024
 )
diff --git a/go/vt/servenv/servenv_test.go b/go/vt/servenv/servenv_test.go
index 3d835fcea1a..b7bd874989a 100644
--- a/go/vt/servenv/servenv_test.go
+++ b/go/vt/servenv/servenv_test.go
@@ -65,9 +65,7 @@ func TestFireOnCloseHooksTimeout(t *testing.T) {
 		time.Sleep(1 * time.Second)
 	})
 
-	// we deliberatly test the flag to make sure it's not accidently set to a
-	// high value.
-	if finished, want := fireOnCloseHooks(onCloseTimeout), false; finished != want {
+	if finished, want := fireOnCloseHooks(1*time.Nanosecond), false; finished != want {
 		t.Errorf("finished = %v, want %v", finished, want)
 	}
 }
diff --git a/go/vt/topo/locks.go b/go/vt/topo/locks.go
index 5ee60c2ea2f..a9e71a6e7d6 100644
--- a/go/vt/topo/locks.go
+++ b/go/vt/topo/locks.go
@@ -27,6 +27,7 @@ import (
 
 	"github.com/spf13/pflag"
 
+	_flag "vitess.io/vitess/go/internal/flag"
 	"vitess.io/vitess/go/trace"
 	"vitess.io/vitess/go/vt/log"
 	"vitess.io/vitess/go/vt/proto/vtrpc"
@@ -38,15 +39,14 @@ import (
 // keyspaces and shards.
 
 var (
-	// DefaultLockTimeout is a good value to use as a default for
-	// locking a shard / keyspace.
-	// Now used only for unlock operations
-	defaultLockTimeout = 30 * time.Second
+	// LockTimeout is the maximum duration for which a
+	// shard / keyspace lock can be acquired for.
+	LockTimeout = 45 * time.Second
 
 	// RemoteOperationTimeout is used for operations where we have to
 	// call out to another process.
 	// Used for RPC calls (including topo server calls)
-	RemoteOperationTimeout = 30 * time.Second
+	RemoteOperationTimeout = 15 * time.Second
 )
 
 // Lock describes a long-running lock on a keyspace or a shard.
@@ -70,6 +70,7 @@ func init() {
 
 func registerTopoLockFlags(fs *pflag.FlagSet) {
 	fs.DurationVar(&RemoteOperationTimeout, "remote_operation_timeout", RemoteOperationTimeout, "time to wait for a remote operation")
+	fs.DurationVar(&LockTimeout, "lock-timeout", LockTimeout, "Maximum time for which a shard/keyspace lock can be acquired for")
 }
 
 // newLock creates a new Lock.
@@ -244,7 +245,7 @@ func CheckKeyspaceLockedAndRenew(ctx context.Context, keyspace string) error {
 func (l *Lock) lockKeyspace(ctx context.Context, ts *Server, keyspace string) (LockDescriptor, error) {
 	log.Infof("Locking keyspace %v for action %v", keyspace, l.Action)
 
-	ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout)
+	ctx, cancel := context.WithTimeout(ctx, getLockTimeout())
 	defer cancel()
 
 	span, ctx := trace.NewSpan(ctx, "TopoServer.LockKeyspaceForAction")
@@ -265,10 +266,8 @@ func (l *Lock) unlockKeyspace(ctx context.Context, ts *Server, keyspace string,
 	// Detach from the parent timeout, but copy the trace span.
 	// We need to still release the lock even if the parent
 	// context timed out.
-	// Note that we are not using the user provided RemoteOperationTimeout
-	// here because it is possible that that timeout is too short.
 	ctx = trace.CopySpan(context.TODO(), ctx)
-	ctx, cancel := context.WithTimeout(ctx, defaultLockTimeout)
+	ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout)
 	defer cancel()
 
 	span, ctx := trace.NewSpan(ctx, "TopoServer.UnlockKeyspaceForAction")
@@ -385,7 +384,7 @@ func CheckShardLocked(ctx context.Context, keyspace, shard string) error {
 func (l *Lock) lockShard(ctx context.Context, ts *Server, keyspace, shard string) (LockDescriptor, error) {
 	log.Infof("Locking shard %v/%v for action %v", keyspace, shard, l.Action)
 
-	ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout)
+	ctx, cancel := context.WithTimeout(ctx, getLockTimeout())
 	defer cancel()
 
 	span, ctx := trace.NewSpan(ctx, "TopoServer.LockShardForAction")
@@ -406,10 +405,8 @@ func (l *Lock) lockShard(ctx context.Context, ts *Server, keyspace, shard string
 func (l *Lock) unlockShard(ctx context.Context, ts *Server, keyspace, shard string, lockDescriptor LockDescriptor, actionError error) error {
 	// Detach from the parent timeout, but copy the trace span.
 	// We need to still release the lock even if the parent context timed out.
-	// Note that we are not using the user provided RemoteOperationTimeout
-	// here because it is possible that that timeout is too short.
 	ctx = trace.CopySpan(context.TODO(), ctx)
-	ctx, cancel := context.WithTimeout(ctx, defaultLockTimeout)
+	ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout)
 	defer cancel()
 
 	span, ctx := trace.NewSpan(ctx, "TopoServer.UnlockShardForAction")
@@ -428,3 +425,15 @@ func (l *Lock) unlockShard(ctx context.Context, ts *Server, keyspace, shard stri
 	}
 	return lockDescriptor.Unlock(ctx)
 }
+
+// getLockTimeout is shim code used for backward compatibility with v15
+// This code can be removed in v17+ and LockTimeout can be used directly
+func getLockTimeout() time.Duration {
+	if _flag.IsFlagProvided("lock-timeout") {
+		return LockTimeout
+	}
+	if _flag.IsFlagProvided("remote_operation_timeout") {
+		return RemoteOperationTimeout
+	}
+	return LockTimeout
+}
diff --git a/go/vt/topo/locks_test.go b/go/vt/topo/locks_test.go
new file mode 100644
index 00000000000..da4f179f83c
--- /dev/null
+++ b/go/vt/topo/locks_test.go
@@ -0,0 +1,85 @@
+/*
+Copyright 2022 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package topo
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/spf13/pflag"
+	"github.com/stretchr/testify/require"
+
+	"vitess.io/vitess/go/internal/flag"
+)
+
+// TestGetLockTimeout tests the behaviour of
+// getLockTimeout function in different situations where
+// the two flags `remote_operation_timeout` and `lock-timeout` are
+// provided or not.
+func TestGetLockTimeout(t *testing.T) {
+	tests := []struct {
+		description                 string
+		lockTimeoutValue            string
+		remoteOperationTimeoutValue string
+		expectedLockTimeout         time.Duration
+	}{
+		{
+			description:                 "no flags specified",
+			lockTimeoutValue:            "",
+			remoteOperationTimeoutValue: "",
+			expectedLockTimeout:         45 * time.Second,
+		}, {
+			description:                 "lock-timeout flag specified",
+			lockTimeoutValue:            "33s",
+			remoteOperationTimeoutValue: "",
+			expectedLockTimeout:         33 * time.Second,
+		}, {
+			description:                 "remote operation timeout flag specified",
+			lockTimeoutValue:            "",
+			remoteOperationTimeoutValue: "33s",
+			expectedLockTimeout:         33 * time.Second,
+		}, {
+			description:                 "both flags specified",
+			lockTimeoutValue:            "33s",
+			remoteOperationTimeoutValue: "22s",
+			expectedLockTimeout:         33 * time.Second,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.description, func(t *testing.T) {
+			var args []string
+			if tt.lockTimeoutValue != "" {
+				args = append(args, "--lock-timeout", tt.lockTimeoutValue)
+			}
+			if tt.remoteOperationTimeoutValue != "" {
+				args = append(args, "--remote_operation_timeout", tt.remoteOperationTimeoutValue)
+			}
+			os.Args = os.Args[0:1]
+			os.Args = append(os.Args, args...)
+
+			fs := pflag.NewFlagSet("test", pflag.ExitOnError)
+			registerTopoLockFlags(fs)
+			flag.Parse(fs)
+
+			val := getLockTimeout()
+			require.Equal(t, tt.expectedLockTimeout, val)
+		})
+	}
+
+}
diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go
index ba846ebc147..9e4ac550a8f 100644
--- a/go/vt/vtctl/reparentutil/emergency_reparenter.go
+++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -192,7 +192,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
 	}
 
 	// Stop replication on all the tablets and build their status map
-	stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, opts.WaitReplicasTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger)
+	stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, topo.RemoteOperationTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger)
 	if err != nil {
 		return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err)
 	}
diff --git a/go/vt/vtctl/reparentutil/replication.go b/go/vt/vtctl/reparentutil/replication.go
index b1510ffaf09..512b3a60221 100644
--- a/go/vt/vtctl/reparentutil/replication.go
+++ b/go/vt/vtctl/reparentutil/replication.go
@@ -214,7 +214,7 @@ func stopReplicationAndBuildStatusMaps(
 	tmc tmclient.TabletManagerClient,
 	ev *events.Reparent,
 	tabletMap map[string]*topo.TabletInfo,
-	waitReplicasTimeout time.Duration,
+	stopReplicationTimeout time.Duration,
 	ignoredTablets sets.String,
 	tabletToWaitFor *topodatapb.TabletAlias,
 	durability Durabler,
@@ -233,7 +233,7 @@ func stopReplicationAndBuildStatusMaps(
 		}
 	)
 
-	groupCtx, groupCancel := context.WithTimeout(ctx, waitReplicasTimeout)
+	groupCtx, groupCancel := context.WithTimeout(ctx, stopReplicationTimeout)
 	defer groupCancel()
 
 	fillStatus := func(alias string, tabletInfo *topo.TabletInfo, mustWaitForTablet bool) {
diff --git a/go/vt/vtctl/reparentutil/replication_test.go b/go/vt/vtctl/reparentutil/replication_test.go
index 42b01cac770..01f043ac827 100644
--- a/go/vt/vtctl/reparentutil/replication_test.go
+++ b/go/vt/vtctl/reparentutil/replication_test.go
@@ -18,27 +18,33 @@ package reparentutil
 
 import (
 	"context"
+	"os"
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
-	"vitess.io/vitess/go/vt/vterrors"
-
-	"github.com/stretchr/testify/assert"
 	"k8s.io/apimachinery/pkg/util/sets"
 
+	_flag "vitess.io/vitess/go/internal/flag"
 	"vitess.io/vitess/go/mysql"
 	"vitess.io/vitess/go/vt/logutil"
 	"vitess.io/vitess/go/vt/topo"
 	"vitess.io/vitess/go/vt/topo/topoproto"
 	"vitess.io/vitess/go/vt/topotools/events"
+	"vitess.io/vitess/go/vt/vterrors"
 	"vitess.io/vitess/go/vt/vttablet/tmclient"
 
 	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
 	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
 )
 
+func TestMain(m *testing.M) {
+	_flag.ParseFlagsForTest()
+	os.Exit(m.Run())
+}
+
 func TestFindValidEmergencyReparentCandidates(t *testing.T) {
 	t.Parallel()
 
@@ -278,7 +284,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
 		durability               string
 		tmc                      *stopReplicationAndBuildStatusMapsTestTMClient
 		tabletMap                map[string]*topo.TabletInfo
-		waitReplicasTimeout      time.Duration
+		stopReplicasTimeout      time.Duration
 		ignoredTablets           sets.String
 		tabletToWaitFor          *topodatapb.TabletAlias
 		expectedStatusMap        map[string]*replicationdatapb.StopReplicationStatus
@@ -796,7 +802,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
 			shouldErr:                true, // we get multiple errors, so we fail
 		},
 		{
-			name:       "waitReplicasTimeout exceeded",
+			name:       "stopReplicasTimeout exceeded",
 			durability: "none",
 			tmc: &stopReplicationAndBuildStatusMapsTestTMClient{
 				stopReplicationAndGetStatusDelays: map[string]time.Duration{
@@ -840,7 +846,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
 					},
 				},
 			},
-			waitReplicasTimeout: time.Millisecond * 5,
+			stopReplicasTimeout: time.Millisecond * 5,
 			ignoredTablets:      sets.NewString(),
 			expectedStatusMap: map[string]*replicationdatapb.StopReplicationStatus{
 				"zone1-0000000101": {
@@ -1098,7 +1104,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
 					Uid:  102,
 				},
 			}},
-			waitReplicasTimeout:      time.Minute,
+			stopReplicasTimeout:      time.Minute,
 			expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
 			shouldErr:                false,
 		},
@@ -1110,7 +1116,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			durability, err := GetDurabilityPolicy(tt.durability)
 			require.NoError(t, err)
-			res, err := stopReplicationAndBuildStatusMaps(ctx, tt.tmc, &events.Reparent{}, tt.tabletMap, tt.waitReplicasTimeout, tt.ignoredTablets, tt.tabletToWaitFor, durability, logger)
+			res, err := stopReplicationAndBuildStatusMaps(ctx, tt.tmc, &events.Reparent{}, tt.tabletMap, tt.stopReplicasTimeout, tt.ignoredTablets, tt.tabletToWaitFor, durability, logger)
 			if tt.shouldErr {
 				assert.Error(t, err)
 				return
diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go
index 0c5cadd2431..fd54e9ed582 100644
--- a/go/vt/vtorc/config/config.go
+++ b/go/vt/vtorc/config/config.go
@@ -64,7 +64,6 @@ var (
 	auditPurgeDuration             = 7 * 24 * time.Hour // Equivalent of 7 days
 	recoveryPeriodBlockDuration    = 30 * time.Second
 	preventCrossCellFailover       = false
-	lockShardTimeout               = 30 * time.Second
 	waitReplicasTimeout            = 30 * time.Second
 	topoInformationRefreshDuration = 15 * time.Second
 	recoveryPollDuration           = 1 * time.Second
@@ -82,7 +81,8 @@ func RegisterFlags(fs *pflag.FlagSet) {
 	fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days")
 	fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery")
 	fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover")
-	fs.DurationVar(&lockShardTimeout, "lock-shard-timeout", lockShardTimeout, "Duration for which a shard lock is held when running a recovery")
+	fs.Duration("lock-shard-timeout", 30*time.Second, "Duration for which a shard lock is held when running a recovery")
+	_ = fs.MarkDeprecated("lock-shard-timeout", "Please use lock-timeout instead.")
 	fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
 	fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server")
 	fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery")
@@ -103,8 +103,7 @@ type Configuration struct {
 	AuditPurgeDays                        uint   // Days after which audit entries are purged from the database
 	RecoveryPeriodBlockSeconds            int    // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping
 	PreventCrossDataCenterPrimaryFailover bool   // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all.
-	LockShardTimeoutSeconds               int    // Timeout on context used to lock shard. Should be a small value because we should fail-fast
-	WaitReplicasTimeoutSeconds            int    // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockShardTimeoutSeconds since that is the total time we use for an ERS.
+	WaitReplicasTimeoutSeconds            int    // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS.
 	TopoInformationRefreshSeconds         int    // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server.
 	RecoveryPollSeconds                   int    // Timer duration on which VTOrc recovery analysis runs
 }
@@ -133,7 +132,6 @@ func UpdateConfigValuesFromFlags() {
 	Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24))
 	Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second)
 	Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover
-	Config.LockShardTimeoutSeconds = int(lockShardTimeout / time.Second)
 	Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second)
 	Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second)
 	Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
@@ -157,7 +155,6 @@ func newConfiguration() *Configuration {
 		AuditPurgeDays:                        7,
 		RecoveryPeriodBlockSeconds:            30,
 		PreventCrossDataCenterPrimaryFailover: false,
-		LockShardTimeoutSeconds:               30,
 		WaitReplicasTimeoutSeconds:            30,
 		TopoInformationRefreshSeconds:         15,
 		RecoveryPollSeconds:                   1,
diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go
index 90e78d56623..2009b476f1d 100644
--- a/go/vt/vtorc/config/config_test.go
+++ b/go/vt/vtorc/config/config_test.go
@@ -187,21 +187,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) {
 		require.Equal(t, testConfig, Config)
 	})
 
-	t.Run("override lockShardTimeout", func(t *testing.T) {
-		oldLockShardTimeout := lockShardTimeout
-		lockShardTimeout = 3 * time.Hour
-		// Restore the changes we make
-		defer func() {
-			Config = newConfiguration()
-			lockShardTimeout = oldLockShardTimeout
-		}()
-
-		testConfig := newConfiguration()
-		testConfig.LockShardTimeoutSeconds = 10800
-		UpdateConfigValuesFromFlags()
-		require.Equal(t, testConfig, Config)
-	})
-
 	t.Run("override waitReplicasTimeout", func(t *testing.T) {
 		oldWaitReplicasTimeout := waitReplicasTimeout
 		waitReplicasTimeout = 3*time.Minute + 4*time.Second
diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go
index 85ccddc2210..cf4d78ed1f9 100644
--- a/go/vt/vtorc/logic/tablet_discovery.go
+++ b/go/vt/vtorc/logic/tablet_discovery.go
@@ -202,20 +202,21 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an
 	// Discover new tablets.
 	// TODO(sougou): enhance this to work with multi-schema,
 	// where each instanceKey can have multiple tablets.
-	latestInstances := make(map[inst.InstanceKey]bool)
+	latestInstances := make(map[string]bool)
+	var wg sync.WaitGroup
 	for _, tabletInfo := range tablets {
 		tablet := tabletInfo.Tablet
-		if tablet.MysqlHostname == "" {
+		if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) {
 			continue
 		}
-		if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) {
+		latestInstances[topoproto.TabletAliasString(tablet.Alias)] = true
+		if tablet.MysqlHostname == "" {
 			continue
 		}
 		instanceKey := inst.InstanceKey{
 			Hostname: tablet.MysqlHostname,
 			Port:     int(tablet.MysqlPort),
 		}
-		latestInstances[instanceKey] = true
 		old, err := inst.ReadTablet(instanceKey)
 		if err != nil && err != inst.ErrTabletAliasNil {
 			log.Error(err)
@@ -228,9 +229,14 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an
 			log.Error(err)
 			continue
 		}
-		loader(&instanceKey)
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			loader(&instanceKey)
+		}()
 		log.Infof("Discovered: %v", tablet)
 	}
+	wg.Wait()
 
 	// Forget tablets that were removed.
 	toForget := make(map[inst.InstanceKey]*topodatapb.Tablet)
@@ -239,12 +245,12 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an
 			Hostname: row.GetString("hostname"),
 			Port:     row.GetInt("port"),
 		}
-		if !latestInstances[curKey] {
-			tablet := &topodatapb.Tablet{}
-			if err := prototext.Unmarshal([]byte(row.GetString("info")), tablet); err != nil {
-				log.Error(err)
-				return nil
-			}
+		tablet := &topodatapb.Tablet{}
+		if err := prototext.Unmarshal([]byte(row.GetString("info")), tablet); err != nil {
+			log.Error(err)
+			return nil
+		}
+		if !latestInstances[topoproto.TabletAliasString(tablet.Alias)] {
 			toForget[curKey] = tablet
 		}
 		return nil
@@ -285,18 +291,16 @@ func LockShard(ctx context.Context, instanceKey inst.InstanceKey) (context.Conte
 	if err != nil {
 		return nil, nil, err
 	}
-	ctx, cancel := context.WithTimeout(ctx, time.Duration(config.Config.LockShardTimeoutSeconds)*time.Second)
+
 	atomic.AddInt32(&shardsLockCounter, 1)
 	ctx, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery")
 	if err != nil {
-		cancel()
 		atomic.AddInt32(&shardsLockCounter, -1)
 		return nil, nil, err
 	}
 	return ctx, func(e *error) {
 		defer atomic.AddInt32(&shardsLockCounter, -1)
 		unlock(e)
-		cancel()
 	}, nil
 }
 
diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go
index 64262eff250..6024aac7fe8 100644
--- a/go/vt/vtorc/logic/tablet_discovery_test.go
+++ b/go/vt/vtorc/logic/tablet_discovery_test.go
@@ -18,6 +18,7 @@ package logic
 
 import (
 	"context"
+	"sync/atomic"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -137,10 +138,33 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) {
 		verifyRefreshTabletsInKeyspaceShard(t, true, 3, tablets)
 	})
 
+	t.Run("tablet shutdown removes mysql hostname and port. We shouldn't forget the tablet", func(t *testing.T) {
+		defer func() {
+			_, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error {
+				tablet.MysqlHostname = hostname
+				tablet.MysqlPort = 100
+				return nil
+			})
+		}()
+		// Let's assume tab100 shutdown. This would clear its tablet hostname and port
+		_, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error {
+			tablet.MysqlHostname = ""
+			tablet.MysqlPort = 0
+			return nil
+		})
+		require.NoError(t, err)
+		// We expect no tablets to be refreshed. Also, tab100 shouldn't be forgotten
+		verifyRefreshTabletsInKeyspaceShard(t, false, 0, tablets)
+	})
+
 	t.Run("change a tablet and call refreshTabletsInKeyspaceShard again", func(t *testing.T) {
 		startTimeInitially := tab100.PrimaryTermStartTime.Seconds
 		defer func() {
 			tab100.PrimaryTermStartTime.Seconds = startTimeInitially
+			_, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error {
+				tablet.PrimaryTermStartTime.Seconds = startTimeInitially
+				return nil
+			})
 		}()
 		tab100.PrimaryTermStartTime.Seconds = 1000
 		_, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error {
@@ -224,17 +248,18 @@ func TestShardPrimary(t *testing.T) {
 // verifyRefreshTabletsInKeyspaceShard calls refreshTabletsInKeyspaceShard with the forceRefresh parameter provided and verifies that
 // the number of instances refreshed matches the parameter and all the tablets match the ones provided
 func verifyRefreshTabletsInKeyspaceShard(t *testing.T, forceRefresh bool, instanceRefreshRequired int, tablets []*topodatapb.Tablet) {
-	instancesRefreshed := 0
+	var instancesRefreshed int32
+	atomic.StoreInt32(&instancesRefreshed, 0)
 	// call refreshTabletsInKeyspaceShard while counting all the instances that are refreshed
 	refreshTabletsInKeyspaceShard(context.Background(), keyspace, shard, func(instanceKey *inst.InstanceKey) {
-		instancesRefreshed++
+		atomic.AddInt32(&instancesRefreshed, 1)
 	}, forceRefresh)
 	// Verify that all the tablets are present in the database
 	for _, tablet := range tablets {
 		verifyTabletInfo(t, tablet, "")
 	}
 	// Verify that refresh as many tablets as expected
-	assert.EqualValues(t, instanceRefreshRequired, instancesRefreshed)
+	assert.EqualValues(t, instanceRefreshRequired, atomic.LoadInt32(&instancesRefreshed))
 }
 
 // verifyTabletInfo verifies that the tablet information read from the vtorc database
diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go
index af12587177c..808f6d3e21b 100644
--- a/go/vt/vtorc/logic/topology_recovery.go
+++ b/go/vt/vtorc/logic/topology_recovery.go
@@ -289,6 +289,8 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly
 			log.Warningf("ERS - %s", value)
 		case logutilpb.Level_ERROR:
 			log.Errorf("ERS - %s", value)
+		default:
+			log.Infof("ERS - %s", value)
 		}
 		_ = AuditTopologyRecovery(topologyRecovery, value)
 	})).ReparentShard(ctx,
@@ -301,6 +303,9 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly
 			PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover,
 		},
 	)
+	if err != nil {
+		log.Errorf("Error running ERS - %v", err)
+	}
 
 	if ev != nil && ev.NewPrimary != nil {
 		promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{
diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go
index 0783db6c501..5ef29646523 100644
--- a/go/vt/vttablet/tabletmanager/tm_init.go
+++ b/go/vt/vttablet/tabletmanager/tm_init.go
@@ -917,6 +917,10 @@ func (tm *TabletManager) initializeReplication(ctx context.Context, tabletType t
 	}
 
 	// Set primary and start replication.
+	if currentPrimary.Tablet.MysqlHostname == "" {
+		log.Warningf("primary tablet in the shard record does not have mysql hostname specified, possibly because that tablet has been shut down.")
+		return nil, nil
+	}
 	if err := tm.MysqlDaemon.SetReplicationSource(ctx, currentPrimary.Tablet.MysqlHostname, int(currentPrimary.Tablet.MysqlPort), false /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil {
 		return nil, vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed")
 	}