2626import static org .apache .hadoop .hdfs .server .federation .router .RBFConfigKeys .DFS_ROUTER_READER_COUNT_KEY ;
2727import static org .apache .hadoop .hdfs .server .federation .router .RBFConfigKeys .DFS_ROUTER_READER_QUEUE_SIZE_DEFAULT ;
2828import static org .apache .hadoop .hdfs .server .federation .router .RBFConfigKeys .DFS_ROUTER_READER_QUEUE_SIZE_KEY ;
29+ import static org .apache .hadoop .hdfs .server .federation .router .RBFConfigKeys .DN_REPORT_CACHE_EXPIRE ;
30+ import static org .apache .hadoop .hdfs .server .federation .router .RBFConfigKeys .DN_REPORT_CACHE_EXPIRE_MS_DEFAULT ;
2931
3032import java .io .FileNotFoundException ;
3133import java .io .IOException ;
4143import java .util .Map ;
4244import java .util .Map .Entry ;
4345import java .util .Set ;
44-
46+ import java .util .concurrent .Callable ;
47+ import java .util .concurrent .ExecutionException ;
48+ import java .util .concurrent .Executors ;
49+ import java .util .concurrent .ThreadFactory ;
50+ import java .util .concurrent .TimeUnit ;
51+
52+ import com .google .common .cache .CacheBuilder ;
53+ import com .google .common .cache .CacheLoader ;
54+ import com .google .common .cache .LoadingCache ;
55+ import com .google .common .util .concurrent .ListenableFuture ;
56+ import com .google .common .util .concurrent .ListeningExecutorService ;
57+ import com .google .common .util .concurrent .MoreExecutors ;
58+ import com .google .common .util .concurrent .ThreadFactoryBuilder ;
4559import org .apache .hadoop .conf .Configuration ;
4660import org .apache .hadoop .crypto .CryptoProtocolVersion ;
4761import org .apache .hadoop .fs .BatchedRemoteIterator .BatchedEntries ;
@@ -219,6 +233,9 @@ public class RouterRpcServer extends AbstractService implements ClientProtocol,
219233 private static final ThreadLocal <UserGroupInformation > CUR_USER =
220234 new ThreadLocal <>();
221235
236+ /** DN type -> full DN report. */
237+ private final LoadingCache <DatanodeReportType , DatanodeInfo []> dnCache ;
238+
222239 /**
223240 * Construct a router RPC server.
224241 *
@@ -361,6 +378,23 @@ public RouterRpcServer(Configuration configuration, Router router,
361378 this .nnProto = new RouterNamenodeProtocol (this );
362379 this .clientProto = new RouterClientProtocol (conf , this );
363380 this .routerProto = new RouterUserProtocol (this );
381+
382+ long dnCacheExpire = conf .getTimeDuration (
383+ DN_REPORT_CACHE_EXPIRE ,
384+ DN_REPORT_CACHE_EXPIRE_MS_DEFAULT , TimeUnit .MILLISECONDS );
385+ this .dnCache = CacheBuilder .newBuilder ()
386+ .build (new DatanodeReportCacheLoader ());
387+
388+ // Actively refresh the dn cache in a configured interval
389+ Executors
390+ .newSingleThreadScheduledExecutor ()
391+ .scheduleWithFixedDelay (() -> this .dnCache
392+ .asMap ()
393+ .keySet ()
394+ .parallelStream ()
395+ .forEach ((key ) -> this .dnCache .refresh (key )),
396+ 0 ,
397+ dnCacheExpire , TimeUnit .MILLISECONDS );
364398 }
365399
366400 @ Override
@@ -868,6 +902,50 @@ public DatanodeInfo[] getDatanodeReport(DatanodeReportType type)
868902 return clientProto .getDatanodeReport (type );
869903 }
870904
905+ /**
906+ * Get the datanode report from cache.
907+ *
908+ * @param type Type of the datanode.
909+ * @return List of datanodes.
910+ * @throws IOException If it cannot get the report.
911+ */
912+ DatanodeInfo [] getCachedDatanodeReport (DatanodeReportType type )
913+ throws IOException {
914+ try {
915+ DatanodeInfo [] dns = this .dnCache .get (type );
916+ if (dns == null ) {
917+ LOG .debug ("Get null DN report from cache" );
918+ dns = getCachedDatanodeReportImpl (type );
919+ this .dnCache .put (type , dns );
920+ }
921+ return dns ;
922+ } catch (ExecutionException e ) {
923+ LOG .error ("Cannot get the DN report for {}" , type , e );
924+ Throwable cause = e .getCause ();
925+ if (cause instanceof IOException ) {
926+ throw (IOException ) cause ;
927+ } else {
928+ throw new IOException (cause );
929+ }
930+ }
931+ }
932+
933+ private DatanodeInfo [] getCachedDatanodeReportImpl (
934+ final DatanodeReportType type ) throws IOException {
935+ // We need to get the DNs as a privileged user
936+ UserGroupInformation loginUser = UserGroupInformation .getLoginUser ();
937+ RouterRpcServer .setCurrentUser (loginUser );
938+
939+ try {
940+ DatanodeInfo [] dns = clientProto .getDatanodeReport (type );
941+ LOG .debug ("Refresh cached DN report with {} datanodes" , dns .length );
942+ return dns ;
943+ } finally {
944+ // Reset ugi to remote user for remaining operations.
945+ RouterRpcServer .resetCurrentUser ();
946+ }
947+ }
948+
871949 /**
872950 * Get the datanode report with a timeout.
873951 * @param type Type of the datanode.
@@ -1748,4 +1826,45 @@ public void refreshSuperUserGroupsConfiguration() throws IOException {
17481826 public String [] getGroupsForUser (String user ) throws IOException {
17491827 return routerProto .getGroupsForUser (user );
17501828 }
1751- }
1829+
1830+ /**
1831+ * Deals with loading datanode report into the cache and refresh.
1832+ */
1833+ private class DatanodeReportCacheLoader
1834+ extends CacheLoader <DatanodeReportType , DatanodeInfo []> {
1835+
1836+ private ListeningExecutorService executorService ;
1837+
1838+ DatanodeReportCacheLoader () {
1839+ ThreadFactory threadFactory = new ThreadFactoryBuilder ()
1840+ .setNameFormat ("DatanodeReport-Cache-Reload" )
1841+ .setDaemon (true )
1842+ .build ();
1843+
1844+ executorService = MoreExecutors .listeningDecorator (
1845+ Executors .newSingleThreadExecutor (threadFactory ));
1846+ }
1847+
1848+ @ Override
1849+ public DatanodeInfo [] load (DatanodeReportType type ) throws Exception {
1850+ return getCachedDatanodeReportImpl (type );
1851+ }
1852+
1853+ /**
1854+ * Override the reload method to provide an asynchronous implementation,
1855+ * so that the query will not be slowed down by the cache refresh. It
1856+ * will return the old cache value and schedule a background refresh.
1857+ */
1858+ @ Override
1859+ public ListenableFuture <DatanodeInfo []> reload (
1860+ final DatanodeReportType type , DatanodeInfo [] oldValue )
1861+ throws Exception {
1862+ return executorService .submit (new Callable <DatanodeInfo []>() {
1863+ @ Override
1864+ public DatanodeInfo [] call () throws Exception {
1865+ return load (type );
1866+ }
1867+ });
1868+ }
1869+ }
1870+ }
0 commit comments