2121import static org .apache .hadoop .fs .CommonConfigurationKeysPublic .IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY ;
2222import static org .apache .hadoop .fs .CommonConfigurationKeysPublic .IPC_CLIENT_CONNECT_TIMEOUT_KEY ;
2323
24+ import java .io .EOFException ;
2425import java .io .FileNotFoundException ;
2526import java .io .IOException ;
2627import java .lang .reflect .Constructor ;
@@ -436,8 +437,7 @@ private Object invokeMethod(
436437 this .rpcMonitor .proxyOpFailureStandby ();
437438 }
438439 failover = true ;
439- } else if (ioe instanceof ConnectException ||
440- ioe instanceof ConnectTimeoutException ) {
440+ } else if (isUnavailableException (ioe )) {
441441 if (this .rpcMonitor != null ) {
442442 this .rpcMonitor .proxyOpFailureCommunicate ();
443443 }
@@ -503,8 +503,7 @@ private Object invokeMethod(
503503 if (ioe instanceof StandbyException ) {
504504 LOG .error ("{} at {} is in Standby: {}" ,
505505 nnKey , addr , ioe .getMessage ());
506- } else if (ioe instanceof ConnectException ||
507- ioe instanceof ConnectTimeoutException ) {
506+ } else if (isUnavailableException (ioe )) {
508507 exConnect ++;
509508 LOG .error ("{} at {} cannot be reached: {}" ,
510509 nnKey , addr , ioe .getMessage ());
@@ -563,8 +562,7 @@ private Object invoke(String nsId, int retryCount, final Method method,
563562 // failover, invoker looks for standby exceptions for failover.
564563 if (ioe instanceof StandbyException ) {
565564 throw ioe ;
566- } else if (ioe instanceof ConnectException ||
567- ioe instanceof ConnectTimeoutException ) {
565+ } else if (isUnavailableException (ioe )) {
568566 throw ioe ;
569567 } else {
570568 throw new StandbyException (ioe .getMessage ());
@@ -578,6 +576,27 @@ private Object invoke(String nsId, int retryCount, final Method method,
578576 }
579577 }
580578
579+ /**
580+ * Check if the exception comes from an unavailable subcluster.
581+ * @param ioe IOException to check.
582+ * @return If the exception comes from an unavailable subcluster.
583+ */
584+ public static boolean isUnavailableException (IOException ioe ) {
585+ if (ioe instanceof ConnectException ||
586+ ioe instanceof ConnectTimeoutException ||
587+ ioe instanceof EOFException ||
588+ ioe instanceof StandbyException ) {
589+ return true ;
590+ }
591+ if (ioe instanceof RetriableException ) {
592+ Throwable cause = ioe .getCause ();
593+ if (cause instanceof NoNamenodesAvailableException ) {
594+ return true ;
595+ }
596+ }
597+ return false ;
598+ }
599+
581600 /**
582601 * Check if the cluster of given nameservice id is available.
583602 * @param nsId nameservice ID.
@@ -833,8 +852,7 @@ public <T> T invokeSequential(
833852
834853 final UserGroupInformation ugi = RouterRpcServer .getRemoteUser ();
835854 final Method m = remoteMethod .getMethod ();
836- IOException firstThrownException = null ;
837- IOException lastThrownException = null ;
855+ List <IOException > thrownExceptions = new ArrayList <>();
838856 Object firstResult = null ;
839857 // Invoke in priority order
840858 for (final RemoteLocationContext loc : locations ) {
@@ -862,29 +880,33 @@ public <T> T invokeSequential(
862880 ioe = processException (ioe , loc );
863881
864882 // Record it and move on
865- lastThrownException = ioe ;
866- if (firstThrownException == null ) {
867- firstThrownException = lastThrownException ;
868- }
883+ thrownExceptions .add (ioe );
869884 } catch (Exception e ) {
870885 // Unusual error, ClientProtocol calls always use IOException (or
871886 // RemoteException). Re-wrap in IOException for compatibility with
872887 // ClientProtocol.
873888 LOG .error ("Unexpected exception {} proxying {} to {}" ,
874889 e .getClass (), m .getName (), ns , e );
875- lastThrownException = new IOException (
890+ IOException ioe = new IOException (
876891 "Unexpected exception proxying API " + e .getMessage (), e );
877- if (firstThrownException == null ) {
878- firstThrownException = lastThrownException ;
879- }
892+ thrownExceptions .add (ioe );
880893 }
881894 }
882895
883- if (firstThrownException != null ) {
884- // re-throw the last exception thrown for compatibility
885- throw firstThrownException ;
896+ if (!thrownExceptions .isEmpty ()) {
897+ // An unavailable subcluster may be the actual cause
898+ // We cannot surface other exceptions (e.g., FileNotFoundException)
899+ for (int i = 0 ; i < thrownExceptions .size (); i ++) {
900+ IOException ioe = thrownExceptions .get (i );
901+ if (isUnavailableException (ioe )) {
902+ throw ioe ;
903+ }
904+ }
905+
906+ // re-throw the first exception thrown for compatibility
907+ throw thrownExceptions .get (0 );
886908 }
887- // Return the last result, whether it is the value we are looking for or a
909+ // Return the first result, whether it is the value or not
888910 @ SuppressWarnings ("unchecked" )
889911 T ret = (T )firstResult ;
890912 return ret ;
0 commit comments