Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f5c717e
HIVE-29016: rebasing;
henrib Jun 30, 2025
563c31d
HIVE-29016: fixing cache handling for REST catalog;
henrib Jun 20, 2025
364a257
HIVE-29016: changing default to disable catalog caching;
henrib Jun 21, 2025
dc4c029
HIVE-29016: rebasing;
henrib Jun 28, 2025
d691ff1
HIVE-29016: fixing rebasing;
henrib Jun 30, 2025
7a37552
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Jun 30, 2025
6f5bd6c
HIVE-29016: fixing dependency;
henrib Jun 30, 2025
58f38c1
HIVE-29016: revert to simpler cache but check that cached table locat…
henrib Jul 3, 2025
3db7b2f
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Jul 7, 2025
54d51ce
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Jul 7, 2025
7d611d3
HIVE-29016: clean up;
henrib Jul 8, 2025
1a7cf16
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Aug 1, 2025
2203dab
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Aug 3, 2025
4e9a52f
- Improved loadTable cache handling
henrib Aug 4, 2025
f8eb5e4
Merge branch 'apache:master' into HIVE-29016-henrib
henrib Aug 6, 2025
8c7de03
Update standalone-metastore/metastore-rest-catalog/src/main/java/org/…
henrib Aug 6, 2025
235771d
HIVE-29035: removed extensibility from HMSCatalogFactory;
henrib Aug 7, 2025
210a2cc
HIVE-29035: fixing table metadata location check mistake;
henrib Aug 12, 2025
717766c
Merge branch 'apache:master' into HIVE-29035
henrib Aug 12, 2025
308892f
HIVE-29035 : renaming, addressing review comments;
henrib Aug 14, 2025
99b6d44
Merge branch 'apache:master' into HIVE-29035
henrib Aug 15, 2025
dc6fc63
Merge branch 'apache:master' into HIVE-29035
henrib Aug 27, 2025
4e5249f
Merge branch 'apache:master' into HIVE-29035
henrib Sep 3, 2025
ce5fb8a
HIVE-29016: rebasing;
henrib Jun 30, 2025
e51cbd2
HIVE-29016: fixing cache handling for REST catalog;
henrib Jun 20, 2025
71ecdaf
HIVE-29016: changing default to disable catalog caching;
henrib Jun 21, 2025
7c79f15
HIVE-29016: rebasing;
henrib Jun 28, 2025
7d80bc2
HIVE-29016: fixing rebasing;
henrib Jun 30, 2025
2c307df
HIVE-29016: fixing dependency;
henrib Jun 30, 2025
4ca34b4
HIVE-29016: revert to simpler cache but check that cached table locat…
henrib Jul 3, 2025
ff34452
HIVE-29016: clean up;
henrib Jul 8, 2025
9e1c76b
- Improved loadTable cache handling
henrib Aug 4, 2025
6314755
Update standalone-metastore/metastore-rest-catalog/src/main/java/org/…
henrib Aug 6, 2025
6f22e78
HIVE-29035: removed extensibility from HMSCatalogFactory;
henrib Aug 7, 2025
eef7525
HIVE-29035: fixing table metadata location check mistake;
henrib Aug 12, 2025
17b159c
HIVE-29035 : renaming, addressing review comments;
henrib Aug 14, 2025
8a818c5
HIVE-29035 : nitpicks;
henrib Sep 15, 2025
58d94ee
Merge branch 'HIVE-29035' of https://github.com/henrib/hive into HIVE…
henrib Sep 15, 2025
3087edb
Merge branch 'apache:master' into HIVE-29035
henrib Sep 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configurable;
Expand All @@ -31,6 +32,7 @@
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.AlreadyExistsException;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.GetTableRequest;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.PrincipalType;
Expand Down Expand Up @@ -408,23 +410,46 @@ private void validateTableIsIcebergTableOrView(
*/
@Override
public boolean tableExists(TableIdentifier identifier) {
return Objects.nonNull(fetchTable(identifier));
}

/**
* Check whether table exists and return its current metadata location.
*
* <p>Note: If a hive table with the same identifier exists in catalog, this method will return
* {@code null}.
*
* @param identifier a table identifier
* @return the location of the table if it exists, null otherwise
*/
public String getTableMetadataLocation(TableIdentifier identifier) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please do not change anything in iceberg-catalog or submit an iceberg PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Iceberg PR: apache/iceberg#13800

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks!

Table table = fetchTable(identifier);
if (table == null) {
return null;
}
return table.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP);
}

private Table fetchTable(TableIdentifier identifier) {
TableIdentifier baseTableIdentifier = identifier;
if (!isValidIdentifier(identifier)) {
if (!isValidMetadataIdentifier(identifier)) {
return false;
return null;
} else {
baseTableIdentifier = TableIdentifier.of(identifier.namespace().levels());
}
}

String database = baseTableIdentifier.namespace().level(0);
String tableName = baseTableIdentifier.name();
try {
Table table = clients.run(client -> client.getTable(database, tableName));
GetTableRequest request = new GetTableRequest();
request.setDbName(database);
request.setTblName(tableName);
Table table = clients.run(client -> client.getTable(request));
HiveOperationsBase.validateTableIsIceberg(table, fullTableName(name, baseTableIdentifier));
return true;
return table;
} catch (NoSuchTableException | NoSuchObjectException e) {
return false;
return null;
} catch (TException e) {
throw new RuntimeException("Failed to check table existence of " + baseTableIdentifier, e);
} catch (InterruptedException e) {
Expand All @@ -434,6 +459,7 @@ public boolean tableExists(TableIdentifier identifier) {
}
}


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cache in HIVE-29035 is limited to serving loadTable() for REST and resides server-side; the Table objects it serves are marshaled to a client so there is no 'external' instance sharing. It is dependent upon HMS being the actual catalog implementation to acquire the latest known metadata location for a given table. This makes this PR pretty much tied to Hive; there is no need to involve Iceberg.
If/when the Iceberg community accepts the TableMetadata caching you propose, we can then assess what it means for this cache implementation.

Copy link
Contributor

@okumin okumin Aug 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without using a REST catalog, a client retrieves table metadata through XYZCatalog -> TableMetadataParser -> S3/HDFS/etc. With a REST catalog, a client does it through RESTCatalog, where REST API(in our case, the servlet) serves metadata via XYZCatalog(in our case, HiveCatalog or HMSCachingCatalog) -> TableMetadataParser -> S3/HDFS/etc. So, TableMetadataParser might be a better place to maintain. It can support our use case, and we can remove HMSCachingCatalog, which uses CachingCatalog introduced for client-side caching and utilized only in SparkCatalog and FlinkCatalog.
If my idea has some defects, I probably won't send the patch

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without using a REST catalog, a client retrieves table metadata through XYZCatalog -> TableMetadataParser -> S3/HDFS/etc. With a REST catalog, a client does it through RESTCatalog, where REST API(in our case, the servlet) serves metadata via XYZCatalog(in our case, HiveCatalog or HMSCachingCatalog) -> TableMetadataParser -> S3/HDFS/etc. So, TableMetadataParser might be a better place to maintain. It can support our use case, and we can remove HMSCachingCatalog, which uses CachingCatalog introduced for client-side caching and utilized only in SparkCatalog and FlinkCatalog. If my idea has some defects, I probably won't send the patch

@okumin are you saying TableMetadataParser is the common place for both direct and REST catalog invocations?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I am.

Using HiveCatalog directly, everything works on the client side as follows.
image

Using REST, some steps will be delegated to the server-side. In either case, TableMetadataParser would interact with an object storage.
image

Copy link
Member

@deniskuzZ deniskuzZ Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some notes from iceberg dev lists:

Caching Metadata on the Client Side: Reloading table metadata for a
particular snapshot could leverage the ETag mechanism to reduce the amount
of network traffic.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked the Iceberg community with the sample PR.
apache/iceberg#14137

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@okumin, do you know if apache/iceberg@72d5fd6 solves the same problem?

@Override
public boolean viewExists(TableIdentifier viewIdentifier) {
if (!isValidIdentifier(viewIdentifier)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1881,7 +1881,7 @@ public enum ConfVars {
"HMS Iceberg Catalog servlet path component of URL endpoint."
),
ICEBERG_CATALOG_CACHE_EXPIRY("metastore.iceberg.catalog.cache.expiry",
"hive.metastore.iceberg.catalog.cache.expiry", -1,
"hive.metastore.iceberg.catalog.cache.expiry", 600_000L,
"HMS Iceberg Catalog cache expiry."
),
HTTPSERVER_THREADPOOL_MIN("hive.metastore.httpserver.threadpool.min",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,19 @@

package org.apache.iceberg.rest;

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Ticker;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.BaseMetadataTable;
import org.apache.iceberg.CachingCatalog;
import org.apache.iceberg.HasTableOperations;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.SupportsNamespaces;
Expand All @@ -35,63 +42,261 @@
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.view.View;
import org.apache.iceberg.view.ViewBuilder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Class that wraps an Iceberg Catalog to cache tables.
*/
public class HMSCachingCatalog extends CachingCatalog implements SupportsNamespaces, ViewCatalog {
private final HiveCatalog hiveCatalog;
protected static final Logger LOG = LoggerFactory.getLogger(HMSCachingCatalog.class);
protected final HiveCatalog hiveCatalog;

public HMSCachingCatalog(HiveCatalog catalog, long expiration) {
super(catalog, true, expiration, Ticker.systemTicker());
super(catalog, false, expiration, Ticker.systemTicker());
this.hiveCatalog = catalog;
}

@Override
public Catalog.TableBuilder buildTable(TableIdentifier identifier, Schema schema) {
return hiveCatalog.buildTable(identifier, schema);
public void createNamespace(Namespace namespace, Map<String, String> map) {
hiveCatalog.createNamespace(namespace, map);
}

@Override
public void createNamespace(Namespace nmspc, Map<String, String> map) {
hiveCatalog.createNamespace(nmspc, map);
public List<Namespace> listNamespaces(Namespace namespace) throws NoSuchNamespaceException {
return hiveCatalog.listNamespaces(namespace);
}

/**
* Callback when cache invalidates the entry for a given table identifier.
*
* @param tid the table identifier to invalidate
*/
protected void onCacheInvalidate(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache loads a table for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheLoad(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache hit for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheHit(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache miss occurs for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheMiss(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}
/**
* Callback when cache loads a metadata table for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheMetaLoad(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

@Override
public List<Namespace> listNamespaces(Namespace nmspc) throws NoSuchNamespaceException {
return hiveCatalog.listNamespaces(nmspc);
public Table loadTable(final TableIdentifier identifier) {
final TableIdentifier canonicalized = identifier.toLowerCase();
final Table cachedTable = tableCache.getIfPresent(canonicalized);
if (cachedTable != null) {
final String location = hiveCatalog.getTableMetadataLocation(canonicalized);
if (location == null) {
LOG.debug("Table {} has no location, returning cached table without location", canonicalized);
} else {
String cachedLocation = cachedTable instanceof HasTableOperations tableOps
? tableOps.operations().current().metadataFileLocation()
: null;
if (!location.equals(cachedLocation)) {
LOG.debug("Invalidate table {}, cached {} != actual {}", canonicalized, cachedLocation, location);
// Invalidate the cached table if the location is different
invalidateTable(canonicalized);
onCacheInvalidate(canonicalized);
} else {
LOG.debug("Returning cached table: {}", canonicalized);
onCacheHit(canonicalized);
return cachedTable;
}
}
} else {
LOG.debug("Cache miss for table: {}", canonicalized);
onCacheMiss(canonicalized);
}
final Table table = tableCache.get(canonicalized, hiveCatalog::loadTable);
if (table instanceof BaseMetadataTable) {
// Cache underlying table
TableIdentifier originTableIdentifier =
TableIdentifier.of(canonicalized.namespace().levels());
Table originTable = tableCache.get(originTableIdentifier, hiveCatalog::loadTable);
// Share TableOperations instance of origin table for all metadata tables, so that metadata
// table instances are refreshed as well when origin table instance is refreshed.
if (originTable instanceof HasTableOperations tableOps) {
TableOperations ops = tableOps.operations();
MetadataTableType type = MetadataTableType.from(canonicalized.name());
Table metadataTable =
MetadataTableUtils.createMetadataTableInstance(
ops, hiveCatalog.name(), originTableIdentifier, canonicalized, type);
tableCache.put(canonicalized, metadataTable);
onCacheMetaLoad(canonicalized);
LOG.debug("Loaded metadata table: {} for origin table: {}", canonicalized, originTableIdentifier);
// Return the metadata table instead of the original table
return metadataTable;
}
}
onCacheLoad(canonicalized);
LOG.debug("Loaded table: {} ", canonicalized);
return table;
}

/**
* Callback when cache invalidates the entry for a given table identifier.
*
* @param tid the table identifier to invalidate
*/
protected void onCacheInvalidate(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache loads a table for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheLoad(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache hit for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheHit(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

/**
* Callback when cache miss occurs for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheMiss(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}
/**
* Callback when cache loads a metadata table for a given table identifier.
*
* @param tid the table identifier
*/
protected void onCacheMetaLoad(TableIdentifier tid) {
// This method is intentionally left empty. It can be overridden in subclasses if needed.
}

@Override
public Table loadTable(final TableIdentifier identifier) {
final Cache<TableIdentifier, Table> cache = this.tableCache;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the point in those local vars?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Usual (old?) pattern of using locals instead of dereferencing members (marginally faster).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tableCache field is final so there is no point in defining local vars

final HiveCatalog catalog = this.hiveCatalog;
final TableIdentifier canonicalized = identifier.toLowerCase();
Table cachedTable = cache.getIfPresent(canonicalized);
if (cachedTable != null) {
final String location = catalog.getTableMetadataLocation(canonicalized);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't you use catalog.loadTable(canonicalized) and get the location instead of adding new method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this imply the cache is rematerializing a table object to verify it, which it might have avoided?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, I might be missing something. does the catalog.loadTable(canonicalized) use the cache?

if (location == null) {
LOG.debug("Table {} has no location, returning cached table without location", canonicalized);
} else {
String cachedLocation = cachedTable instanceof HasTableOperations tableOps
? tableOps.operations().current().metadataFileLocation()
: null;
if (!location.equals(cachedLocation)) {
LOG.debug("Invalidate table {}, cached {} != actual {}", canonicalized, cachedLocation, location);
// Invalidate the cached table if the location is different
invalidateTable(canonicalized);
onCacheInvalidate(canonicalized);
} else {
LOG.debug("Returning cached table: {}", canonicalized);
onCacheHit(canonicalized);
return cachedTable;
}
}
} else {
LOG.debug("Cache miss for table: {}", canonicalized);
onCacheMiss(canonicalized);
}
Table table = cache.get(canonicalized, catalog::loadTable);
if (table instanceof BaseMetadataTable) {
// Cache underlying table
TableIdentifier originTableIdentifier =
TableIdentifier.of(canonicalized.namespace().levels());
Table originTable = cache.get(originTableIdentifier, catalog::loadTable);
// Share TableOperations instance of origin table for all metadata tables, so that metadata
// table instances are refreshed as well when origin table instance is refreshed.
if (originTable instanceof HasTableOperations tableOps) {
TableOperations ops = tableOps.operations();
MetadataTableType type = MetadataTableType.from(canonicalized.name());
Table metadataTable =
MetadataTableUtils.createMetadataTableInstance(
ops, catalog.name(), originTableIdentifier, canonicalized, type);
cache.put(canonicalized, metadataTable);
onCacheMetaLoad(canonicalized);
LOG.debug("Loaded metadata table: {} for origin table: {}", canonicalized, originTableIdentifier);
// Return the metadata table instead of the original table
return metadataTable;
}
}
onCacheLoad(canonicalized);
LOG.debug("Loaded table: {} ", canonicalized);
return table;
}

@Override
public Map<String, String> loadNamespaceMetadata(Namespace nmspc) throws NoSuchNamespaceException {
return hiveCatalog.loadNamespaceMetadata(nmspc);
public Map<String, String> loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException {
return hiveCatalog.loadNamespaceMetadata(namespace);
}

@Override
public boolean dropNamespace(Namespace nmspc) throws NamespaceNotEmptyException {
List<TableIdentifier> tables = listTables(nmspc);
public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyException {
List<TableIdentifier> tables = listTables(namespace);
for (TableIdentifier ident : tables) {
invalidateTable(ident);
}
return hiveCatalog.dropNamespace(nmspc);
return hiveCatalog.dropNamespace(namespace);
}

@Override
public boolean setProperties(Namespace nmspc, Map<String, String> map) throws NoSuchNamespaceException {
return hiveCatalog.setProperties(nmspc, map);
public boolean setProperties(Namespace namespace, Map<String, String> map) throws NoSuchNamespaceException {
return hiveCatalog.setProperties(namespace, map);
}

@Override
public boolean removeProperties(Namespace nmspc, Set<String> set) throws NoSuchNamespaceException {
return hiveCatalog.removeProperties(nmspc, set);
public boolean removeProperties(Namespace namespace, Set<String> set) throws NoSuchNamespaceException {
return hiveCatalog.removeProperties(namespace, set);
}

@Override
public boolean namespaceExists(Namespace namespace) {
return hiveCatalog.namespaceExists(namespace);
}

@Override
public Catalog.TableBuilder buildTable(TableIdentifier identifier, Schema schema) {
return hiveCatalog.buildTable(identifier, schema);
}

@Override
public List<TableIdentifier> listViews(Namespace namespace) {
return hiveCatalog.listViews(namespace);
Expand Down
Loading
Loading