Snowflake Connector - Reduce computing resources used for metadata queries #43452
base: master
Changes from 62 commits
@@ -5,7 +5,8 @@ plugins {
 airbyteJavaConnector {
     cdkVersionRequired = '0.44.14'
     features = ['db-destinations', 's3-destinations', 'typing-deduping']
-    useLocalCdk = false
+    //TODO: Change to false before merging to master
+    useLocalCdk = true
 }

 java {

@@ -44,4 +45,5 @@ integrationTestJava {
 dependencies {
     implementation 'net.snowflake:snowflake-jdbc:3.14.1'
     implementation 'org.apache.commons:commons-text:1.10.0'
+    implementation 'org.json:json:20210307'
 }

Review comment: We don't need this. We already have a Jackson dependency in our dependency chain.
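For context on that comment: the data_type field returned by SHOW COLUMNS is a small JSON blob, and the PR parses it with org.json. Below is a minimal sketch of doing the same with Jackson, which the reviewer notes is already on the dependency chain; the helper name and sample payload are illustrative only, not code from the PR.

import com.fasterxml.jackson.databind.ObjectMapper

// Hypothetical helper: parse the JSON blob in SHOW COLUMNS' "data_type" column with
// Jackson instead of org.json's JSONObject, avoiding the extra dependency.
private val mapper = ObjectMapper()

fun extractSnowflakeType(dataTypeJson: String): String {
    // Example payload shape per the Snowflake SHOW COLUMNS docs:
    // {"type":"FIXED","precision":38,"scale":0,"nullable":true}
    return mapper.readTree(dataTypeJson).get("type").asText()
}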
@@ -5,7 +5,7 @@ data:
   connectorSubtype: database
   connectorType: destination
   definitionId: 424892c4-daac-4491-b35d-c6688ba547ba
-  dockerImageTag: 3.11.9
+  dockerImageTag: 3.11.11
   dockerRepository: airbyte/destination-snowflake
   documentationUrl: https://docs.airbyte.com/integrations/destinations/snowflake
   githubIssueLabel: destination-snowflake

Review comment: why the change? If unneeded, I'd rather keep this outside of the current PR.

@@ -148,4 +148,4 @@ data:
   secretStore:
     type: GSM
     alias: airbyte-connector-testing-secret-store
-  metadataSpecVersion: "1.0"
+  metadataSpecVersion: "1.0"
@@ -293,4 +293,8 @@ object SnowflakeDatabaseUtils {
             AirbyteProtocolType.UNKNOWN -> "VARIANT"
         }
     }
+
+    fun fromIsNullableSnowflakeString(isNullable: String?): Boolean {
+        return "true".equals(isNullable, ignoreCase = true)
+    }
 }

Review comment: remove and use String.toBoolean()
Reply: Removed the function and used String.toBoolean.
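As a quick sketch of that suggestion (assuming Kotlin 1.4+, where the nullable-receiver toBoolean() extension exists), the stdlib call behaves the same way as the helper above: case-insensitive "true", and false for null or anything else.

fun main() {
    val fromShowColumns: String? = null

    // Custom helper behaviour: "true".equals(value, ignoreCase = true)
    // Stdlib equivalent: value.toBoolean()
    println("true".toBoolean())          // true
    println("TRUE".toBoolean())          // true
    println("false".toBoolean())         // false
    println(fromShowColumns.toBoolean()) // false (null receiver)
}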
@@ -26,17 +26,18 @@ import io.airbyte.integrations.base.destination.typing_deduping.Struct
 import io.airbyte.integrations.base.destination.typing_deduping.Union
 import io.airbyte.integrations.base.destination.typing_deduping.UnsupportedOneOf
 import io.airbyte.integrations.destination.snowflake.SnowflakeDatabaseUtils
+import io.airbyte.integrations.destination.snowflake.SnowflakeDatabaseUtils.fromIsNullableSnowflakeString
 import io.airbyte.integrations.destination.snowflake.migrations.SnowflakeState
 import io.airbyte.integrations.destination.snowflake.typing_deduping.SnowflakeSqlGenerator.Companion.QUOTE
 import java.sql.Connection
-import java.sql.DatabaseMetaData
 import java.sql.ResultSet
 import java.sql.SQLException
 import java.time.Instant
 import java.util.*
 import java.util.stream.Collectors
 import net.snowflake.client.jdbc.SnowflakeSQLException
 import org.apache.commons.text.StringSubstitutor
+import org.json.JSONObject
 import org.jooq.SQLDialect
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
@@ -70,75 +71,83 @@ class SnowflakeDestinationHandler(
     private fun getFinalTableRowCount(
         streamIds: List<StreamId>
     ): LinkedHashMap<String, LinkedHashMap<String, Int>> {
-        val tableRowCounts = LinkedHashMap<String, LinkedHashMap<String, Int>>()
-        // convert list stream to array
-        val namespaces = streamIds.map { it.finalNamespace }.toTypedArray()
-        val names = streamIds.map { it.finalName }.toTypedArray()
-        val query =
-            """
-            |SELECT table_schema, table_name, row_count
-            |FROM information_schema.tables
-            |WHERE table_catalog = ?
-            |AND table_schema IN (${IntRange(1, streamIds.size).joinToString { "?" }})
-            |AND table_name IN (${IntRange(1, streamIds.size).joinToString { "?" }})
-            |""".trimMargin()
-        val bindValues = arrayOf(databaseName) + namespaces + names
-        val results: List<JsonNode> = database.queryJsons(query, *bindValues)
-        for (result in results) {
-            val tableSchema = result["TABLE_SCHEMA"].asText()
-            val tableName = result["TABLE_NAME"].asText()
-            val rowCount = result["ROW_COUNT"].asInt()
-            tableRowCounts
-                .computeIfAbsent(tableSchema) { _: String? -> LinkedHashMap() }[tableName] =
-                rowCount
-        }
+        val tableRowCountsFromShowQuery = LinkedHashMap<String, LinkedHashMap<String, Int>>()
+        var showColumnsResult: List<JsonNode> = listOf()

Review comment: remove this and set the val inside your try block

+        try {
+            for (stream in streamIds) {
+                val showColumnsQuery =
+                    String.format(
+                        """

Review comment: use kotlin templates instead of

+                        SHOW TABLES LIKE '%s' IN "%s"."%s";
+                        """.trimIndent(),
+                        stream.finalName,
+                        databaseName,
+                        stream.finalNamespace,
+                    )
+                showColumnsResult = database.queryJsons(
+                    showColumnsQuery,
+                )
+                for (result in showColumnsResult) {
+                    val tableSchema = result["schema_name"].asText()
+                    val tableName = result["name"].asText()
+                    val rowCount = result["rows"].asText()
+
+                    tableRowCountsFromShowQuery

Review comment: The indentation is super confusing here (probably enforced by our format command). Any way to change that, or is our formatter going to bark at you?

+                        .computeIfAbsent(tableSchema) { _: String? -> LinkedHashMap() }[tableName] =
+                        rowCount.toInt()
+                }
+            }
+        } catch (e: SQLException) {
+            showColumnsResult.stream().close()

Review comment: not sure why we need to close the stream here

+            //Not re-throwing the exception since the SQLException occurs when the table does not exist
+            //throw e

Review comment: remove the commented

+        }
-        return tableRowCounts
+        return tableRowCountsFromShowQuery
     }
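A sketch of what the two suggestions above could look like together: the SHOW TABLES statement built with a Kotlin string template instead of String.format, and the query result held in a val scoped to the try block. It reuses the handler's database, databaseName, streamIds and tableRowCountsFromShowQuery from the PR, so it is a fragment for illustration rather than a drop-in replacement.

// Sketch only: assumes the surrounding handler's fields from the PR.
try {
    for (stream in streamIds) {
        // Kotlin string template instead of String.format
        val showTablesQuery =
            """
            SHOW TABLES LIKE '${stream.finalName}' IN "$databaseName"."${stream.finalNamespace}";
            """.trimIndent()
        // val declared inside the try block, as suggested in the review
        val showTablesResult: List<JsonNode> = database.queryJsons(showTablesQuery)
        for (result in showTablesResult) {
            val tableSchema = result["schema_name"].asText()
            val tableName = result["name"].asText()
            tableRowCountsFromShowQuery
                .computeIfAbsent(tableSchema) { _ -> LinkedHashMap() }[tableName] =
                result["rows"].asInt()
        }
    }
} catch (e: SQLException) {
    // SHOW TABLES raises SQLException when the schema does not exist; treated as "no tables"
}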
     @Throws(Exception::class)
     private fun getInitialRawTableState(
         id: StreamId,
         suffix: String,
     ): InitialRawTableStatus {

         val rawTableName = id.rawName + suffix
-        val tableExists =
-            database.executeMetadataQuery { databaseMetaData: DatabaseMetaData ->
-                LOGGER.info(
-                    "Retrieving table from Db metadata: {} {}",
-                    id.rawNamespace,
-                    rawTableName
-                )
-                try {
-                    val rs =
-                        databaseMetaData.getTables(
-                            databaseName,
-                            id.rawNamespace,
-                            rawTableName,
-                            null
-                        )
-                    // When QUOTED_IDENTIFIERS_IGNORE_CASE is set to true, the raw table is
-                    // interpreted as uppercase
-                    // in db metadata calls. check for both
-                    val rsUppercase =
-                        databaseMetaData.getTables(
-                            databaseName,
-                            id.rawNamespace.uppercase(),
-                            rawTableName.uppercase(),
-                            null
-                        )
-                    rs.next() || rsUppercase.next()
-                } catch (e: SQLException) {
-                    LOGGER.error("Failed to retrieve table metadata", e)
-                    throw RuntimeException(e)
-                }
-            }
+        var tableExists = false
+        var showTablesResult: List<JsonNode> = listOf()
+
+        try {
+            val showTablesQuery =
+                String.format(
+                    """
+                    SHOW TABLES LIKE '%s' IN "%s"."%s";
+                    """.trimIndent(),
+                    rawTableName,
+                    databaseName,
+                    id.rawNamespace,
+                )

Review comment: use kotlin templates instead of
Review comment: does that work when the QUOTED_IDENTIFIERS_IGNORE_CASE is set to true? We have a test class that sets that for the testDatabase

+            showTablesResult = database.queryJsons(
+                showTablesQuery,
+            )
+            if(showTablesResult.size > 0) {
+                tableExists = true
+            }
+        } catch (e: SQLException) {
+            showTablesResult.stream().close()
+            //Not re-throwing the exception since the SQLException occurs when the table does not exist
+            //throw e
+        }

         if (!tableExists) {
             return InitialRawTableStatus(
                 rawTableExists = false,
                 hasUnprocessedRecords = false,
                 maxProcessedTimestamp = Optional.empty()
             )
         }

         // Snowflake timestamps have nanosecond precision, so decrement by 1ns
         // And use two explicit queries because COALESCE doesn't short-circuit.
         // This first query tries to find the oldest raw record with loaded_at = NULL
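A sketch of the same existence check with a Kotlin template and tableExists kept as a val. Whether SHOW TABLES LIKE still matches when QUOTED_IDENTIFIERS_IGNORE_CASE is enabled is the open question raised above and would need the existing test class to confirm; again this is a fragment that assumes the handler's database and databaseName.

// Sketch only: assumes the handler's `database`, `databaseName`, `id` and `suffix`.
val rawTableName = id.rawName + suffix
val tableExists =
    try {
        val showTablesQuery =
            """
            SHOW TABLES LIKE '$rawTableName' IN "$databaseName"."${id.rawNamespace}";
            """.trimIndent()
        database.queryJsons(showTablesQuery).isNotEmpty()
    } catch (e: SQLException) {
        // SHOW TABLES fails when the schema is missing; treat that as "table does not exist"
        false
    }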
@@ -560,6 +569,7 @@ class SnowflakeDestinationHandler(
     }

     companion object {

         private val LOGGER: Logger =
             LoggerFactory.getLogger(SnowflakeDestinationHandler::class.java)
         const val EXCEPTION_COMMON_PREFIX: String =
@@ -573,39 +583,63 @@
         databaseName: String,
         streamIds: List<StreamId>
     ): LinkedHashMap<String, LinkedHashMap<String, TableDefinition>> {
-        val existingTables = LinkedHashMap<String, LinkedHashMap<String, TableDefinition>>()
-        // convert list stream to array
-        val namespaces = streamIds.map { it.finalNamespace }.toTypedArray()
-        val names = streamIds.map { it.finalName }.toTypedArray()
-        val query =
-            """
-            |SELECT table_schema, table_name, column_name, data_type, is_nullable
-            |FROM information_schema.columns
-            |WHERE table_catalog = ?
-            |AND table_schema IN (${IntRange(1, streamIds.size).joinToString { "?" }})
-            |AND table_name IN (${IntRange(1, streamIds.size).joinToString { "?" }})
-            |ORDER BY table_schema, table_name, ordinal_position;
-            |""".trimMargin()
-
-        val bindValues =
-            arrayOf(databaseName.uppercase(Locale.getDefault())) + namespaces + names
-        val results: List<JsonNode> = database.queryJsons(query, *bindValues)
-        for (result in results) {
-            val tableSchema = result["TABLE_SCHEMA"].asText()
-            val tableName = result["TABLE_NAME"].asText()
-            val columnName = result["COLUMN_NAME"].asText()
-            val dataType = result["DATA_TYPE"].asText()
-            val isNullable = result["IS_NULLABLE"].asText()
-            val tableDefinition =
-                existingTables
-                    .computeIfAbsent(tableSchema) { _: String? -> LinkedHashMap() }
-                    .computeIfAbsent(tableName) { _: String? ->
-                        TableDefinition(LinkedHashMap())
-                    }
-            tableDefinition.columns[columnName] =
-                ColumnDefinition(columnName, dataType, 0, fromIsNullableIsoString(isNullable))
-        }
-        return existingTables
+        val existingTablesFromShowQuery =
+            LinkedHashMap<String, LinkedHashMap<String, TableDefinition>>()
+        var showColumnsResult: List<JsonNode> = listOf()
+
+        try {
+            for (stream in streamIds) {
+                val showColumnsQuery =
+                    String.format(
+                        """
+                        SHOW COLUMNS IN TABLE "%s"."%s"."%s";
+                        """.trimIndent(),
+                        databaseName,
+                        stream.finalNamespace,
+                        stream.finalName,
+                    )
+                showColumnsResult = database.queryJsons(
+                    showColumnsQuery,
+                )
+
+                for (result in showColumnsResult) {
+                    val tableSchema = result["schema_name"].asText()
+                    val tableName = result["table_name"].asText()
+                    val columnName = result["column_name"].asText()
+                    var dataType = JSONObject(result["data_type"].asText()).getString("type")
+
+                    //TODO: Need to check if there are other datatype differences
+                    // between the original approach and the new approach with SHOW queries

Review comment: that should probably be checked before merging this PR. Do we have any automated test that checks this?
Reply: per the doc, it seems those are the only differences. I'd rather have a comment that points to https://docs.snowflake.com/en/sql-reference/sql/show-columns#output rather than a

+                    if(dataType.equals("FIXED")) {
+                        dataType = "NUMBER"
+                    } else if(dataType.equals("REAL")) {
+                        dataType = "FLOAT"
+                    }
+
+                    val isNullable = result["null?"].asText()
+                    val tableDefinition =
+                        existingTablesFromShowQuery
+                            .computeIfAbsent(tableSchema) { _: String? -> LinkedHashMap() }
+                            .computeIfAbsent(tableName) { _: String? ->
+                                TableDefinition(LinkedHashMap())
+                            }

Review comment: let's create the table definition outside of the loop on

+                    tableDefinition.columns[columnName] =
+                        ColumnDefinition(
+                            columnName,
+                            dataType,
+                            0,
+                            fromIsNullableSnowflakeString(isNullable),
+                        )
+                }
+            }
+        } catch (e: SQLException) {
+            showColumnsResult.stream().close()
+            //Not re-throwing the exception since the SQLException occurs when the table does not exist
+            //throw e
+        }
+        return existingTablesFromShowQuery
     }
 }
}
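Below is a sketch of two of the follow-ups discussed above: the FIXED/REAL translation pulled into a small helper with a pointer to the SHOW COLUMNS output documentation instead of a TODO, and the TableDefinition built once per stream rather than once per column row. The helper and function names are made up for illustration; database, databaseName and the CDK's TableDefinition/ColumnDefinition types are assumed from the PR.

// Per https://docs.snowflake.com/en/sql-reference/sql/show-columns#output, SHOW COLUMNS
// reports FIXED and REAL where information_schema reports NUMBER and FLOAT.
private fun showColumnsTypeToInfoSchemaType(showType: String): String =
    when (showType) {
        "FIXED" -> "NUMBER"
        "REAL" -> "FLOAT"
        else -> showType
    }

private fun findExistingTablesSketch(
    databaseName: String,
    streamIds: List<StreamId>
): LinkedHashMap<String, LinkedHashMap<String, TableDefinition>> {
    val existingTables = LinkedHashMap<String, LinkedHashMap<String, TableDefinition>>()
    for (stream in streamIds) {
        val showColumnsResult =
            try {
                database.queryJsons(
                    """SHOW COLUMNS IN TABLE "$databaseName"."${stream.finalNamespace}"."${stream.finalName}";"""
                )
            } catch (e: SQLException) {
                // SHOW COLUMNS throws when the table does not exist; skip this stream
                emptyList()
            }
        if (showColumnsResult.isEmpty()) {
            continue
        }
        // TableDefinition created once per stream, before the per-column loop. Note the map
        // keys here are the stream's names; the PR keys on the schema_name/table_name values
        // SHOW COLUMNS returns, which may differ in case.
        val tableDefinition =
            existingTables
                .computeIfAbsent(stream.finalNamespace) { _ -> LinkedHashMap() }
                .computeIfAbsent(stream.finalName) { _ -> TableDefinition(LinkedHashMap()) }
        for (result in showColumnsResult) {
            val columnName = result["column_name"].asText()
            val dataType =
                showColumnsTypeToInfoSchemaType(
                    JSONObject(result["data_type"].asText()).getString("type")
                )
            tableDefinition.columns[columnName] =
                ColumnDefinition(columnName, dataType, 0, result["null?"].asText().toBoolean())
        }
    }
    return existingTables
}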
Review comment: connection.use. Also, keep connection a val instead of a var.
Reply: Sure, I have changed the connection to a val. The code won't be changed to connection.use as we discussed, since the connection needs to be open when the result set is returned.
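A small illustration of the trade-off in that exchange, assuming a method that hands a still-open ResultSet back to its caller (the DataSource and queries here are generic JDBC, not the connector's actual helpers): connection.use closes the connection when the block exits, which would invalidate the returned ResultSet, so the PR keeps the connection open as a val and lets the caller close it.

import java.sql.Connection
import java.sql.ResultSet
import javax.sql.DataSource

// With `use`, the connection closes as soon as the block returns, so the ResultSet
// handed back to the caller can no longer be read.
fun brokenQuery(dataSource: DataSource, sql: String): ResultSet =
    dataSource.connection.use { connection ->
        connection.createStatement().executeQuery(sql)
        // connection (and with it the ResultSet) is closed here, before the caller reads it
    }

// Keeping the connection as a val and returning it alongside the ResultSet leaves it open;
// the caller is then responsible for closing both once the rows have been consumed.
fun openQuery(dataSource: DataSource, sql: String): Pair<Connection, ResultSet> {
    val connection = dataSource.connection
    val resultSet = connection.createStatement().executeQuery(sql)
    return connection to resultSet
}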