@@ -35,7 +35,7 @@ import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils}
 import org.apache.spark.sql.delta.sources._
 import org.apache.spark.sql.delta.storage.LogStoreProvider
 import org.apache.spark.sql.delta.util.FileNames
-import com.google.common.cache.{CacheBuilder, RemovalNotification}
+import com.google.common.cache.{Cache, CacheBuilder, RemovalNotification}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
@@ -615,21 +615,42 @@ object DeltaLog extends DeltaLogging {
    * We create only a single [[DeltaLog]] for any given `DeltaLogCacheKey` to avoid wasted work
    * in reconstructing the log.
    */
-  private val deltaLogCache = {
-    val builder = CacheBuilder.newBuilder()
-      .expireAfterAccess(60, TimeUnit.MINUTES)
-      .removalListener((removalNotification: RemovalNotification[DeltaLogCacheKey, DeltaLog]) => {
-        val log = removalNotification.getValue
-        // TODO: We should use ref-counting to uncache snapshots instead of a manual timed op
-        try log.unsafeVolatileSnapshot.uncache() catch {
-          case _: java.lang.NullPointerException =>
-            // Various layers will throw null pointer if the RDD is already gone.
-        }
-      })
-    sys.props.get("delta.log.cacheSize")
-      .flatMap(v => Try(v.toLong).toOption)
-      .foreach(builder.maximumSize)
-    builder.build[DeltaLogCacheKey, DeltaLog]()
+  type CacheKey = (Path, Map[String, String])
+  private[delta] def getOrCreateCache(conf: SQLConf):
+      Cache[CacheKey, DeltaLog] = synchronized {
+    deltaLogCache match {
+      case Some(c) => c
+      case None =>
+        val builder = createCacheBuilder(conf)
+          .removalListener(
+            (removalNotification: RemovalNotification[DeltaLogCacheKey, DeltaLog]) => {
+              val log = removalNotification.getValue
+              // TODO: We should use ref-counting to uncache snapshots instead of a manual timed op
+              try log.unsafeVolatileSnapshot.uncache() catch {
+                case _: java.lang.NullPointerException =>
+                  // Various layers will throw null pointer if the RDD is already gone.
+              }
+            })
+        deltaLogCache = Some(builder.build[CacheKey, DeltaLog]())
+        deltaLogCache.get
+    }
+  }
+
+  private var deltaLogCache: Option[Cache[CacheKey, DeltaLog]] = None
+
+  /**
+   * Helper to create delta log caches
+   */
+  private def createCacheBuilder(conf: SQLConf): CacheBuilder[AnyRef, AnyRef] = {
+    val cacheRetention = conf.getConf(DeltaSQLConf.DELTA_LOG_CACHE_RETENTION_MINUTES)
+    val cacheSize = conf
+      .getConf(DeltaSQLConf.DELTA_LOG_CACHE_SIZE)
+      .max(sys.props.get("delta.log.cacheSize").map(_.toLong).getOrElse(0L))
+
+    CacheBuilder
+      .newBuilder()
+      .expireAfterAccess(cacheRetention, TimeUnit.MINUTES)
+      .maximumSize(cacheSize)
   }

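For context on the pattern this hunk introduces: the cache now lives in an Option and is built on first access under `synchronized`, so the retention and size settings are read from SQLConf lazily and a single shared instance is reused afterwards. Below is a minimal, self-contained sketch of the same lazy-initialization pattern; it uses hypothetical String keys and values in place of DeltaLog, and the object and member names (LazyCacheSketch, cacheOpt) are illustrative, not part of the Delta codebase.

import java.util.concurrent.TimeUnit
import com.google.common.cache.{Cache, CacheBuilder, RemovalListener, RemovalNotification}

object LazyCacheSketch {
  private var cacheOpt: Option[Cache[String, String]] = None

  // Build the cache on first call, then reuse the single instance;
  // `synchronized` guards the check-then-set on the Option.
  def getOrCreateCache(retentionMinutes: Long, maxSize: Long): Cache[String, String] =
    synchronized {
      cacheOpt match {
        case Some(c) => c
        case None =>
          val built = CacheBuilder.newBuilder()
            .expireAfterAccess(retentionMinutes, TimeUnit.MINUTES)
            .maximumSize(maxSize)
            .removalListener(new RemovalListener[String, String] {
              // Clean-up hook; the real code uncaches the evicted snapshot here.
              override def onRemoval(n: RemovalNotification[String, String]): Unit =
                println(s"evicted ${n.getKey}")
            })
            .build[String, String]()
          cacheOpt = Some(built)
          built
      }
    }
}

Holding the instance in a var Option rather than a lazy val is what makes the unsetCache() helper later in this diff possible: dropping the instance lets the next access rebuild the cache under new configuration.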
@@ -787,7 +808,8 @@ object DeltaLog extends DeltaLogging {
     // - Different `authority` (e.g., different user tokens in the path)
     // - Different mount point.
     try {
-      deltaLogCache.get(path -> fileSystemOptions, () => {
+      getOrCreateCache(spark.sessionState.conf)
+        .get(path -> fileSystemOptions, () => {
         createDeltaLog()
       }
      )
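The call-site change above relies on Guava's Cache.get(key, loader) semantics: the Callable runs only on a cache miss, which is what guarantees at most one DeltaLog is constructed per key while its entry stays cached. A standalone illustration under those semantics (demoCache and expensiveLoad are made-up names):

import java.util.concurrent.Callable
import com.google.common.cache.{Cache, CacheBuilder}

object LoaderSketch {
  val demoCache: Cache[String, String] =
    CacheBuilder.newBuilder().build[String, String]()

  def expensiveLoad(key: String): String = {
    println(s"loading $key") // runs at most once while the entry stays cached
    key.toUpperCase
  }

  def main(args: Array[String]): Unit = {
    val loader = new Callable[String] { def call(): String = expensiveLoad("path") }
    demoCache.get("path", loader) // miss: invokes the loader and caches the result
    demoCache.get("path", loader) // hit: returns the cached value, loader not invoked
  }
}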
@@ -801,7 +823,7 @@ object DeltaLog extends DeltaLogging {
     if (Option(deltaLog.sparkContext.get).map(_.isStopped).getOrElse(true)) {
       // Invalidate the cached `DeltaLog` and create a new one because the `SparkContext` of the
       // cached `DeltaLog` has been stopped.
-      deltaLogCache.invalidate(path -> fileSystemOptions)
+      getOrCreateCache(spark.sessionState.conf).invalidate(path -> fileSystemOptions)
       getDeltaLogFromCache()
     } else {
       deltaLog
@@ -819,6 +841,7 @@ object DeltaLog extends DeltaLogging {
     // scalastyle:on deltahadoopconfiguration
     val path = fs.makeQualified(rawPath)

+    val deltaLogCache = getOrCreateCache(spark.sessionState.conf)
     if (spark.sessionState.conf.getConf(
         DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS)) {
       // We rely on the fact that accessing the key set doesn't modify the entry access time. See
@@ -841,12 +864,19 @@ object DeltaLog extends DeltaLogging {
   }

   def clearCache(): Unit = {
-    deltaLogCache.invalidateAll()
+    deltaLogCache.foreach(_.invalidateAll())
+  }
+
+  /** Unset the caches. Exposing for testing */
+  private[delta] def unsetCache(): Unit = {
+    synchronized {
+      deltaLogCache = None
+    }
   }

   /** Return the number of cached `DeltaLog`s. Exposing for testing */
   private[delta] def cacheSize: Long = {
-    deltaLogCache.size()
+    deltaLogCache.map(_.size()).getOrElse(0L)
   }

   /**
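Taken together, clearCache, unsetCache, and cacheSize give the cache a two-level lifecycle: entries can be dropped while the instance survives, or the instance itself can be discarded so the next access rebuilds it. A self-contained sketch of that lifecycle on a plain Guava cache, with all names illustrative:

import com.google.common.cache.{Cache, CacheBuilder}

object LifecycleSketch {
  private var cacheOpt: Option[Cache[String, String]] =
    Some(CacheBuilder.newBuilder().maximumSize(4).build[String, String]())

  def clearCache(): Unit = cacheOpt.foreach(_.invalidateAll()) // drop entries, keep instance
  def unsetCache(): Unit = synchronized { cacheOpt = None }    // drop the instance itself
  def cacheSize: Long = cacheOpt.map(_.size()).getOrElse(0L)   // 0 if never built or unset

  def main(args: Array[String]): Unit = {
    cacheOpt.foreach(_.put("k", "v"))
    println(cacheSize) // 1
    clearCache()
    println(cacheSize) // 0; the cache object still exists
    unsetCache()
    println(cacheSize) // 0; a getOrCreate-style accessor would now rebuild it
  }
}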