-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Destination BigQuery: Accept Dataset ID field prefixed by Project ID #8383
Changes from 11 commits
7f7b220
69b510d
5d99d33
ddd1425
d3d9cda
f4ced62
a3838eb
af80798
4c78cb5
7171871
85ab152
c505823
686e2c7
d4522f4
6c26a63
25bf817
5eefebe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
package io.airbyte.integrations.destination.bigquery; | ||
|
||
import static io.airbyte.integrations.destination.bigquery.helpers.LoggerHelper.getJobErrorMessage; | ||
import static java.util.Objects.isNull; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.node.ObjectNode; | ||
|
@@ -37,6 +38,8 @@ | |
import java.util.List; | ||
import java.util.Set; | ||
import java.util.UUID; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
import org.apache.commons.lang3.tuple.ImmutablePair; | ||
import org.joda.time.DateTime; | ||
import org.slf4j.Logger; | ||
|
@@ -46,6 +49,7 @@ public class BigQueryUtils { | |
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryUtils.class); | ||
private static final String BIG_QUERY_DATETIME_FORMAT = "yyyy-MM-dd HH:mm:ss.SSSSSS"; | ||
private static final Pattern DATASET_ID_PATTERN = Pattern.compile("^(([a-z]([a-z0-9\\-]*[a-z0-9])?):)?([a-zA-Z0-9_]+)$"); | ||
|
||
public static ImmutablePair<Job, String> executeQuery(final BigQuery bigquery, final QueryJobConfiguration queryConfig) { | ||
final JobId jobId = JobId.of(UUID.randomUUID().toString()); | ||
|
@@ -162,6 +166,27 @@ public static JsonNode getGcsAvroJsonNodeConfig(final JsonNode config) { | |
return gcsJsonNode; | ||
} | ||
|
||
public static String getDatasetId(final JsonNode config) { | ||
String datasetId = config.get(BigQueryConsts.CONFIG_DATASET_ID).asText(); | ||
Matcher matcher = DATASET_ID_PATTERN.matcher(datasetId); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the regex makes it a little hard to follow the logic; string manipulation feels more clear: int colonIndex = datasetId.indexOf(":");
if (colonIndex != -1) {
String projectId = datasetId.substring(0, colonIndex);
// check equality against config.get(PROJECT_ID)
// ...
}
// if colonIndex is -1, then this returns the entire string
// otherwise it returns everything after the colon
return datasetId.substring(colonIndex + 1); This means we would lose the validation aspect (e.g. if someone enters There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, I agree with your suggestion. We don't need to check here what BigQuery checks. |
||
|
||
if (matcher.matches()) { | ||
if (!isNull(matcher.group(1))) { | ||
final String projectId = config.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); | ||
if (!(projectId.equals(matcher.group(2)))) { | ||
throw new IllegalArgumentException(String.format( | ||
"Project ID included in Dataset ID must match Project ID field's value: Project ID is %s, but you specified %s in Dataset ID", | ||
projectId, | ||
matcher.group(2))); | ||
} | ||
} | ||
return matcher.group(4); | ||
} | ||
throw new IllegalArgumentException(String.format( | ||
"BigQuery Dataset ID format must match '[project-id:]dataset_id': %s", | ||
datasetId)); | ||
} | ||
|
||
public static String getDatasetLocation(final JsonNode config) { | ||
if (config.has(BigQueryConsts.CONFIG_DATASET_LOCATION)) { | ||
return config.get(BigQueryConsts.CONFIG_DATASET_LOCATION).asText(); | ||
|
@@ -214,7 +239,7 @@ public static void transformJsonDateTimeToBigDataFormat(List<String> dateTimeFie | |
} | ||
|
||
public static String getSchema(final JsonNode config, final ConfiguredAirbyteStream stream) { | ||
final String defaultSchema = config.get(BigQueryConsts.CONFIG_DATASET_ID).asText(); | ||
final String defaultSchema = getDatasetId(config); | ||
final String srcNamespace = stream.getStream().getNamespace(); | ||
if (srcNamespace == null) { | ||
return defaultSchema; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,13 +55,18 @@ | |
import java.time.Instant; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.function.Consumer; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
import java.util.stream.StreamSupport; | ||
import org.apache.commons.lang3.tuple.ImmutablePair; | ||
import org.junit.jupiter.api.AfterEach; | ||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.api.TestInfo; | ||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.Arguments; | ||
import org.junit.jupiter.params.provider.MethodSource; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
@@ -197,16 +202,20 @@ void testSpec() throws Exception { | |
assertEquals(expected, actual); | ||
} | ||
|
||
@Test | ||
void testCheckSuccess() { | ||
@ParameterizedTest | ||
@MethodSource("datasetIdResetterProvider") | ||
void testCheckSuccess(DatasetIdResetter resetDatasetId) { | ||
resetDatasetId.accept(config); | ||
final AirbyteConnectionStatus actual = new BigQueryDestination().check(config); | ||
final AirbyteConnectionStatus expected = new AirbyteConnectionStatus().withStatus(Status.SUCCEEDED); | ||
assertEquals(expected, actual); | ||
} | ||
|
||
@Test | ||
void testCheckFailure() { | ||
@ParameterizedTest | ||
@MethodSource("datasetIdResetterProvider") | ||
void testCheckFailure(DatasetIdResetter resetDatasetId) { | ||
((ObjectNode) config).put(BigQueryConsts.CONFIG_PROJECT_ID, "fake"); | ||
resetDatasetId.accept(config); | ||
final AirbyteConnectionStatus actual = new BigQueryDestination().check(config); | ||
final String actualMessage = actual.getMessage(); | ||
LOGGER.info("Checking expected failure message:" + actualMessage); | ||
|
@@ -215,8 +224,10 @@ void testCheckFailure() { | |
assertEquals(expected, actual.withMessage("")); | ||
} | ||
|
||
@Test | ||
void testWriteSuccess() throws Exception { | ||
@ParameterizedTest | ||
@MethodSource("datasetIdResetterProvider") | ||
void testWriteSuccess(DatasetIdResetter resetDatasetId) throws Exception { | ||
resetDatasetId.accept(config); | ||
final BigQueryDestination destination = new BigQueryDestination(); | ||
final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); | ||
|
||
|
@@ -244,8 +255,10 @@ void testWriteSuccess() throws Exception { | |
.collect(Collectors.toList())); | ||
} | ||
|
||
@Test | ||
void testWriteFailure() throws Exception { | ||
@ParameterizedTest | ||
@MethodSource("datasetIdResetterProvider") | ||
void testWriteFailure(DatasetIdResetter resetDatasetId) throws Exception { | ||
resetDatasetId.accept(config); | ||
// hack to force an exception to be thrown from within the consumer. | ||
final AirbyteMessage spiedMessage = spy(MESSAGE_USERS1); | ||
doThrow(new RuntimeException()).when(spiedMessage).getRecord(); | ||
|
@@ -305,8 +318,10 @@ private List<JsonNode> retrieveRecords(final String tableName) throws Exception | |
.collect(Collectors.toList()); | ||
} | ||
|
||
@Test | ||
void testWritePartitionOverUnpartitioned() throws Exception { | ||
@ParameterizedTest | ||
@MethodSource("datasetIdResetterProvider") | ||
void testWritePartitionOverUnpartitioned(DatasetIdResetter resetDatasetId) throws Exception { | ||
resetDatasetId.accept(config); | ||
final String raw_table_name = String.format("_airbyte_raw_%s", USERS_STREAM_NAME); | ||
createUnpartitionedTable(bigquery, dataset, raw_table_name); | ||
assertFalse(isTablePartitioned(bigquery, dataset, raw_table_name)); | ||
|
@@ -369,4 +384,29 @@ private boolean isTablePartitioned(final BigQuery bigquery, final Dataset datase | |
return false; | ||
} | ||
|
||
private static class DatasetIdResetter { | ||
private Consumer<JsonNode> consumer; | ||
|
||
DatasetIdResetter(Consumer<JsonNode> consumer) { | ||
this.consumer = consumer; | ||
} | ||
|
||
public void accept(JsonNode config) { | ||
consumer.accept(config); | ||
} | ||
} | ||
|
||
private static Stream<Arguments> datasetIdResetterProvider() { | ||
return Stream.of( | ||
Arguments.arguments(new DatasetIdResetter(config -> {})), | ||
Arguments.arguments(new DatasetIdResetter( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's add a comment to explain what this is testing |
||
config -> { | ||
String projectId = ((ObjectNode) config).get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); | ||
String datasetId = ((ObjectNode) config).get(BigQueryConsts.CONFIG_DATASET_ID).asText(); | ||
((ObjectNode) config).put(BigQueryConsts.CONFIG_DATASET_ID, | ||
String.format("%s:%s", projectId, datasetId)); | ||
} | ||
)) | ||
); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Copyright (c) 2021 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.integrations.destination.bigquery; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertThrows; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.google.common.collect.ImmutableMap; | ||
import io.airbyte.commons.json.Jsons; | ||
import java.util.stream.Stream; | ||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.Arguments; | ||
import org.junit.jupiter.params.provider.MethodSource; | ||
|
||
public class BigQueryUtilsTest { | ||
|
||
private ImmutableMap.Builder<Object, Object> configMapBuilder; | ||
|
||
@BeforeEach | ||
public void init() { | ||
configMapBuilder = ImmutableMap.builder() | ||
.put(BigQueryConsts.CONFIG_CREDS, "test_secret") | ||
.put(BigQueryConsts.CONFIG_DATASET_LOCATION, "US"); | ||
} | ||
|
||
@ParameterizedTest | ||
@MethodSource("validBigQueryIdProvider") | ||
public void testGetDatasetIdSuccess(String projectId, String datasetId, String expected) throws Exception { | ||
JsonNode config = Jsons.jsonNode(configMapBuilder | ||
.put(BigQueryConsts.CONFIG_PROJECT_ID, projectId) | ||
.put(BigQueryConsts.CONFIG_DATASET_ID, datasetId) | ||
.build()); | ||
|
||
String actual = BigQueryUtils.getDatasetId(config); | ||
|
||
assertEquals(expected, actual); | ||
} | ||
|
||
@ParameterizedTest | ||
@MethodSource("invalidBigQueryIdProvider") | ||
public void testGetDatasetIdFail(String projectId, String datasetId, String expected) throws Exception { | ||
JsonNode config = Jsons.jsonNode(configMapBuilder | ||
.put(BigQueryConsts.CONFIG_PROJECT_ID, projectId) | ||
.put(BigQueryConsts.CONFIG_DATASET_ID, datasetId) | ||
.build()); | ||
|
||
Exception exception = assertThrows(IllegalArgumentException.class, () -> BigQueryUtils.getDatasetId(config)); | ||
|
||
assertEquals(expected, exception.getMessage()); | ||
} | ||
|
||
private static Stream<Arguments> validBigQueryIdProvider() { | ||
return Stream.of( | ||
Arguments.arguments("my-project", "my_dataset", "my_dataset"), | ||
Arguments.arguments("my-project", "my-project:my_dataset", "my_dataset")); | ||
} | ||
|
||
private static Stream<Arguments> invalidBigQueryIdProvider() { | ||
return Stream.of( | ||
Arguments.arguments("my-project", ":my_dataset", "BigQuery Dataset ID format must match '[project-id:]dataset_id': :my_dataset"), | ||
Arguments.arguments("my-project", "my-project:my-project:my_dataset", "BigQuery Dataset ID format must match '[project-id:]dataset_id': my-project:my-project:my_dataset"), | ||
Arguments.arguments("my-project", "my-project-:my_dataset", "BigQuery Dataset ID format must match '[project-id:]dataset_id': my-project-:my_dataset"), | ||
Arguments.arguments("my-project", "my-project:", "BigQuery Dataset ID format must match '[project-id:]dataset_id': my-project:"), | ||
Arguments.arguments("my-project", "your-project:my_dataset", | ||
"Project ID included in Dataset ID must match Project ID field's value: Project ID is my-project, but you specified your-project in Dataset ID")); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BigQueryUtils.getDatasetId(config) is used in 2 cases.
Could you please consider adding an integration test to BigQueryDestinationTest to check that: