Skip to content

Commit

Permalink
Propagate stats for BigLake Managed tables (GoogleCloudDataproc#1156)
Browse files Browse the repository at this point in the history
* Propagate stats for Blmts

* Refactor: Use a variable to represent tableDefinition

* populate row_count for BigLake Managed tables

* Refactor: Moved util methods from SparkBigQueryUtil -> BigQueryUtil

* Add PR description in release notes
  • Loading branch information
vteja11 authored Feb 26, 2024
1 parent 1aa740c commit 6cce9f1
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Release Notes

## Next
* PR #1156: Propagate stats for BigLake Managed tables

* PR #1181: Add caching during protobuf generation

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,4 +716,33 @@ public static String sanitizeLabelValue(String value) {
}
return buf.toString();
}

/**
 * Tells whether the given table is a BigLake Managed table.
 *
 * <p>There is no dedicated table type for BigLake Managed tables. They are reported with type
 * {@link TableDefinition.Type#TABLE} and a {@link StandardTableDefinition} whose {@link
 * StandardTableDefinition#getBigLakeConfiguration()} is non-null.
 *
 * @param table the table to check
 * @return true only if the table's definition has type TABLE, is a StandardTableDefinition and
 *     carries a BigLake configuration
 */
public static boolean isBigLakeManagedTable(TableInfo table) {
  TableDefinition definition = table.getDefinition();
  if (definition.getType() != TableDefinition.Type.TABLE) {
    return false;
  }
  if (!(definition instanceof StandardTableDefinition)) {
    return false;
  }
  StandardTableDefinition standardDefinition = (StandardTableDefinition) definition;
  return standardDefinition.getBigLakeConfiguration() != null;
}

/**
 * Tells whether the given table is a BigQuery native table.
 *
 * <p>A definition of type {@link TableDefinition.Type#TABLE} can describe either a BigQuery
 * native table or a BigLake Managed table. The table is treated as native when its type is TABLE
 * and {@link #isBigLakeManagedTable(TableInfo)} reports false, i.e. no BigLake configuration is
 * present (see {@link StandardTableDefinition#getBigLakeConfiguration()}).
 *
 * @param table the table to check
 * @return true if the table has type TABLE and is not a BigLake Managed table
 */
public static boolean isBigQueryNativeTable(TableInfo table) {
  if (table.getDefinition().getType() != TableDefinition.Type.TABLE) {
    return false;
  }
  return !isBigLakeManagedTable(table);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
package com.google.cloud.bigquery.connector.common;

import static com.google.common.truth.Truth.assertThat;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import com.google.cloud.bigquery.BigLakeConfiguration;
import com.google.cloud.bigquery.BigQueryError;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.Clustering;
Expand Down Expand Up @@ -667,4 +670,84 @@ public void testSanitizeLabelValue() {
"1234567890123456789012345678901234567890123456789012345678901234567890"))
.isEqualTo("123456789012345678901234567890123456789012345678901234567890123");
}

@Test
public void testIsBigLakeManagedTable_with_BigLakeManagedTable() {
  // A standard table definition that carries a BigLake configuration.
  BigLakeConfiguration icebergConfig =
      BigLakeConfiguration.newBuilder()
          .setTableFormat("ICEBERG")
          .setConnectionId("us-connection")
          .setFileFormat("PARQUET")
          .setStorageUri("gs://bigquery/blmt/nations.parquet")
          .build();
  StandardTableDefinition definition =
      StandardTableDefinition.newBuilder().setBigLakeConfiguration(icebergConfig).build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "biglakemanagedtable"), definition);

  boolean detected = BigQueryUtil.isBigLakeManagedTable(tableInfo);

  assertTrue(detected);
}

@Test
public void testIsBigLakeManagedTable_with_BigQueryExternalTable() {
  // An external table definition must not be reported as BigLake Managed.
  ExternalTableDefinition definition =
      ExternalTableDefinition.newBuilder("gs://bigquery/nations.parquet", FormatOptions.avro())
          .build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "bigqueryexternaltable"), definition);

  boolean detected = BigQueryUtil.isBigLakeManagedTable(tableInfo);

  assertFalse(detected);
}

@Test
public void testIsBigLakeManagedTable_with_BigQueryNativeTable() {
  // A standard table definition without a BigLake configuration.
  StandardTableDefinition definition =
      StandardTableDefinition.newBuilder().setLocation("us-east-1").build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "bigquerynativetable"), definition);

  boolean detected = BigQueryUtil.isBigLakeManagedTable(tableInfo);

  assertFalse(detected);
}

@Test
public void testIsBigQueryNativeTable_with_BigLakeManagedTable() {
  // A standard table definition with a BigLake configuration must not count as native.
  BigLakeConfiguration icebergConfig =
      BigLakeConfiguration.newBuilder()
          .setTableFormat("ICEBERG")
          .setConnectionId("us-connection")
          .setFileFormat("PARQUET")
          .setStorageUri("gs://bigquery/blmt/nations.parquet")
          .build();
  StandardTableDefinition definition =
      StandardTableDefinition.newBuilder().setBigLakeConfiguration(icebergConfig).build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "biglakemanagedtable"), definition);

  boolean isNative = BigQueryUtil.isBigQueryNativeTable(tableInfo);

  assertFalse(isNative);
}

@Test
public void testIsBigQueryNativeTable_with_BigQueryExternalTable() {
  // An external table definition must not count as native.
  ExternalTableDefinition definition =
      ExternalTableDefinition.newBuilder("gs://bigquery/nations.parquet", FormatOptions.avro())
          .build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "bigqueryexternaltable"), definition);

  boolean isNative = BigQueryUtil.isBigQueryNativeTable(tableInfo);

  assertFalse(isNative);
}

@Test
public void testIsBigQueryNativeTable_with_BigQueryNativeTable() {
  // A standard table definition without a BigLake configuration is a native table.
  StandardTableDefinition definition =
      StandardTableDefinition.newBuilder().setLocation("us-east-1").build();
  TableInfo tableInfo = TableInfo.of(TableId.of("dataset", "bigquerynativetable"), definition);

  boolean isNative = BigQueryUtil.isBigQueryNativeTable(tableInfo);

  assertTrue(isNative);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,8 @@ public void pruneColumns(StructType requiredSchema) {
}

public StatisticsContext estimateStatistics() {
if (table.getDefinition().getType() == TableDefinition.Type.TABLE) {
boolean isBigLakeManagedTable = BigQueryUtil.isBigLakeManagedTable(table);
if (BigQueryUtil.isBigQueryNativeTable(table)) {
// Create StatisticsContext with information from read session response.
final long tableSizeInBytes;
final long numRowsInTable;
Expand Down Expand Up @@ -455,11 +456,13 @@ public OptionalLong numRows() {
};

return tableStatisticsContext;
} else if (table.getDefinition().getType() == TableDefinition.Type.EXTERNAL) {
} else if (table.getDefinition().getType() == TableDefinition.Type.EXTERNAL
|| isBigLakeManagedTable) {
ReadSession readSession = readSessionResponse.get().getReadSession();
// Physical file size for BigLake tables is the size of the files post file pruning and
// includes all fields.
final long tablePhysicalSizeInBytes = readSession.getEstimatedTotalPhysicalFileSize();
final long tableRowCount = readSession.getEstimatedRowCount();
final long originalRowSize = getRowSize(fields.values());
final long projectedRowSize =
schema.map(schema -> getRowSize(Arrays.asList(schema.fields()))).orElse(originalRowSize);
Expand All @@ -479,7 +482,8 @@ public OptionalLong sizeInBytes() {

@Override
public OptionalLong numRows() {
return OptionalLong.empty();
// The number of rows is only available for BigLake Managed tables.
return isBigLakeManagedTable ? OptionalLong.of(tableRowCount) : OptionalLong.empty();
}
};
return tableStatisticsContext;
Expand Down

0 comments on commit 6cce9f1

Please sign in to comment.