@@ -276,10 +276,12 @@ func NewServerWithInterceptor(
276276
277277 s := grpc .NewServer (opts ... )
278278 RegisterHeartbeatServer (s , & HeartbeatService {
279- clock : ctx .LocalClock ,
280- remoteClockMonitor : ctx .RemoteClocks ,
281- clusterID : & ctx .ClusterID ,
282- version : ctx .version ,
279+ clock : ctx .LocalClock ,
280+ remoteClockMonitor : ctx .RemoteClocks ,
281+ clusterID : & ctx .ClusterID ,
282+ nodeID : & ctx .NodeID ,
283+ version : ctx .version ,
284+ testingAllowNamedRPCToAnonymousServer : ctx .TestingAllowNamedRPCToAnonymousServer ,
283285 })
284286 return s
285287}
@@ -298,14 +300,20 @@ type Connection struct {
298300 initialHeartbeatDone chan struct {} // closed after first heartbeat
299301 stopper * stop.Stopper
300302
303+ // remoteNodeID implies checking the remote node ID. 0 when unknown,
304+ // non-zero to check with remote node. This is constant throughout
305+ // the lifetime of a Connection object.
306+ remoteNodeID roachpb.NodeID
307+
301308 initOnce sync.Once
302309 validatedOnce sync.Once
303310}
304311
305- func newConnection (stopper * stop.Stopper ) * Connection {
312+ func newConnectionToNodeID (stopper * stop.Stopper , remoteNodeID roachpb. NodeID ) * Connection {
306313 c := & Connection {
307314 initialHeartbeatDone : make (chan struct {}),
308315 stopper : stopper ,
316+ remoteNodeID : remoteNodeID ,
309317 }
310318 c .heartbeatResult .Store (heartbeatResult {err : ErrNotHeartbeated })
311319 return c
@@ -372,11 +380,23 @@ type Context struct {
372380 stats StatsHandler
373381
374382 ClusterID base.ClusterIDContainer
383+ NodeID base.NodeIDContainer
375384 version * cluster.ExposedClusterVersion
376385
377386 // For unittesting.
378387 BreakerFactory func () * circuit.Breaker
379388 testingDialOpts []grpc.DialOption
389+
390+ // For testing. See the comment on the same field in HeartbeatService.
391+ TestingAllowNamedRPCToAnonymousServer bool
392+ }
393+
394+ // connKey is used as key in the Context.conns map. Different remote
395+ // node IDs get different *Connection objects, to ensure that we don't
396+ // mis-route RPC requests.
397+ type connKey struct {
398+ targetAddr string
399+ nodeID roachpb.NodeID
380400}
381401
382402// NewContext creates an rpc Context with the supplied values.
@@ -422,7 +442,7 @@ func NewContext(
422442 conn .dialErr = & roachpb.NodeUnavailableError {}
423443 }
424444 })
425- ctx .removeConn (k .(string ), conn )
445+ ctx .removeConn (k .(connKey ), conn )
426446 return true
427447 })
428448 })
@@ -439,8 +459,10 @@ func (ctx *Context) GetStatsMap() *syncmap.Map {
439459
440460// GetLocalInternalClientForAddr returns the context's internal batch client
441461// for target, if it exists.
442- func (ctx * Context ) GetLocalInternalClientForAddr (target string ) roachpb.InternalClient {
443- if target == ctx .AdvertiseAddr {
462+ func (ctx * Context ) GetLocalInternalClientForAddr (
463+ target string , nodeID roachpb.NodeID ,
464+ ) roachpb.InternalClient {
465+ if target == ctx .AdvertiseAddr && nodeID == ctx .NodeID .Get () {
444466 return ctx .localInternalClient
445467 }
446468 return nil
@@ -544,15 +566,15 @@ func (ctx *Context) SetLocalInternalServer(internalServer roachpb.InternalServer
544566 ctx .localInternalClient = internalClientAdapter {internalServer }
545567}
546568
547- func (ctx * Context ) removeConn (key string , conn * Connection ) {
569+ func (ctx * Context ) removeConn (key connKey , conn * Connection ) {
548570 ctx .conns .Delete (key )
549571 if log .V (1 ) {
550- log .Infof (ctx .masterCtx , "closing %s " , key )
572+ log .Infof (ctx .masterCtx , "closing %+v " , key )
551573 }
552574 if grpcConn := conn .grpcConn ; grpcConn != nil {
553575 if err := grpcConn .Close (); err != nil && ! grpcutil .IsClosedConnection (err ) {
554576 if log .V (1 ) {
555- log .Errorf (ctx .masterCtx , "failed to close client connection: %s " , err )
577+ log .Errorf (ctx .masterCtx , "failed to close client connection: %v " , err )
556578 }
557579 }
558580 }
@@ -675,11 +697,43 @@ func (ctx *Context) GRPCDialRaw(target string) (*grpc.ClientConn, <-chan struct{
675697 return conn , dialer .redialChan , err
676698}
677699
678- // GRPCDial calls grpc.Dial with options appropriate for the context.
679- func (ctx * Context ) GRPCDial (target string ) * Connection {
680- value , ok := ctx .conns .Load (target )
700+ // GRPCUnvalidatedDial uses GRPCDialNode and disables validation of the
701+ // node ID between client and server. This function should only be
702+ // used with the gossip client and CLI commands which can talk to any
703+ // node.
704+ func (ctx * Context ) GRPCUnvalidatedDial (target string ) * Connection {
705+ return ctx .grpcDialNodeInternal (target , 0 )
706+ }
707+
708+ // GRPCDialNode calls grpc.Dial with options appropriate for the context.
709+ //
710+ // The remoteNodeID becomes a constraint on the expected node ID of
711+ // the remote node; this is checked during heartbeats. The caller is
712+ // responsible for ensuring the remote node ID is known prior to using
713+ // this function.
714+ func (ctx * Context ) GRPCDialNode (target string , remoteNodeID roachpb.NodeID ) * Connection {
715+ if remoteNodeID == 0 && ! ctx .TestingAllowNamedRPCToAnonymousServer {
716+ log .Fatalf (context .TODO (), "invalid node ID 0 in GRPCDialNode()" )
717+ }
718+ return ctx .grpcDialNodeInternal (target , remoteNodeID )
719+ }
720+
721+ func (ctx * Context ) grpcDialNodeInternal (target string , remoteNodeID roachpb.NodeID ) * Connection {
722+ thisConnKey := connKey {target , remoteNodeID }
723+ value , ok := ctx .conns .Load (thisConnKey )
681724 if ! ok {
682- value , _ = ctx .conns .LoadOrStore (target , newConnection (ctx .Stopper ))
725+ value , _ = ctx .conns .LoadOrStore (thisConnKey , newConnectionToNodeID (ctx .Stopper , remoteNodeID ))
726+ if remoteNodeID != 0 {
727+ // If the first connection established at a target address is
728+ // for a specific node ID, then we want to reuse that connection
729+ // also for other dials (eg for gossip) which don't require a
730+ // specific node ID. (We do this as an optimization to reduce
731+ // the number of TCP connections alive between nodes. This is
732+ // not strictly required for correctness.) This LoadOrStore will
733+ // ensure we're registering the connection we just created for
734+ // future use by these other dials.
735+ _ , _ = ctx .conns .LoadOrStore (connKey {target , 0 }, value )
736+ }
683737 }
684738
685739 conn := value .(* Connection )
@@ -694,11 +748,11 @@ func (ctx *Context) GRPCDial(target string) *Connection {
694748 if err != nil && ! grpcutil .IsClosedConnection (err ) {
695749 log .Errorf (masterCtx , "removing connection to %s due to error: %s" , target , err )
696750 }
697- ctx .removeConn (target , conn )
751+ ctx .removeConn (thisConnKey , conn )
698752 })
699753 }); err != nil {
700754 conn .dialErr = err
701- ctx .removeConn (target , conn )
755+ ctx .removeConn (thisConnKey , conn )
702756 }
703757 }
704758 })
@@ -720,7 +774,7 @@ func (ctx *Context) NewBreaker(name string) *circuit.Breaker {
720774// the first heartbeat.
721775var ErrNotHeartbeated = errors .New ("not yet heartbeated" )
722776
723- // ConnHealth returns nil if we have an open connection to the given
777+ // TestingConnHealth returns nil if we have an open connection to the given
724778// target that succeeded on its most recent heartbeat. Otherwise, it
725779// kicks off a connection attempt (unless one is already in progress
726780// or we are in a backoff state) and returns an error (typically
@@ -729,27 +783,26 @@ var ErrNotHeartbeated = errors.New("not yet heartbeated")
729783// error will be returned. This method should therefore be used to
730784// prioritize among a list of candidate nodes, but not to filter out
731785// "unhealthy" nodes.
732- func (ctx * Context ) ConnHealth (target string ) error {
733- if ctx .GetLocalInternalClientForAddr (target ) != nil {
786+ //
787+ // This is used in tests only; in clusters use (*Dialer).ConnHealth()
788+ // instead which automates the address resolution.
789+ //
790+ // TODO(knz): remove this altogether. Use the dialer in all cases.
791+ func (ctx * Context ) TestingConnHealth (target string , nodeID roachpb.NodeID ) error {
792+ if ctx .GetLocalInternalClientForAddr (target , nodeID ) != nil {
734793 // The local server is always considered healthy.
735794 return nil
736795 }
737- conn := ctx .GRPCDial (target )
796+ conn := ctx .GRPCDialNode (target , nodeID )
738797 return conn .Health ()
739798}
740799
741800func (ctx * Context ) runHeartbeat (
742801 conn * Connection , target string , redialChan <- chan struct {},
743802) error {
744803 maxOffset := ctx .LocalClock .MaxOffset ()
745- clusterID := ctx . ClusterID . Get ()
804+ maxOffsetNanos := maxOffset . Nanoseconds ()
746805
747- request := PingRequest {
748- Addr : ctx .Addr ,
749- MaxOffsetNanos : maxOffset .Nanoseconds (),
750- ClusterID : & clusterID ,
751- ServerVersion : ctx .version .ServerVersion ,
752- }
753806 heartbeatClient := NewHeartbeatClient (conn .grpcConn )
754807
755808 var heartbeatTimer timeutil.Timer
@@ -768,14 +821,24 @@ func (ctx *Context) runHeartbeat(
768821 heartbeatTimer .Read = true
769822 }
770823
824+ // We re-mint the PingRequest to pick up any asynchronous update to clusterID.
825+ clusterID := ctx .ClusterID .Get ()
826+ request := & PingRequest {
827+ Addr : ctx .Addr ,
828+ MaxOffsetNanos : maxOffsetNanos ,
829+ ClusterID : & clusterID ,
830+ NodeID : conn .remoteNodeID ,
831+ ServerVersion : ctx .version .ServerVersion ,
832+ }
833+
771834 var response * PingResponse
772835 sendTime := ctx .LocalClock .PhysicalTime ()
773836 err := contextutil .RunWithTimeout (ctx .masterCtx , "rpc heartbeat" , ctx .heartbeatTimeout ,
774837 func (goCtx context.Context ) error {
775838 // NB: We want the request to fail-fast (the default), otherwise we won't
776839 // be notified of transport failures.
777840 var err error
778- response , err = heartbeatClient .Ping (goCtx , & request )
841+ response , err = heartbeatClient .Ping (goCtx , request )
779842 return err
780843 })
781844
0 commit comments