diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile b/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile
index 78885b5710cb8..078057b09805b 100644
--- a/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile
+++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile
@@ -4,7 +4,9 @@
 WORKDIR /airbyte
 ENV APPLICATION destination-bigquery-denormalized
 
-ADD build/distributions/${APPLICATION}*.tar /airbyte
+COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar
 
-LABEL io.airbyte.version=0.1.11
+RUN tar xf ${APPLICATION}.tar --strip-components=1
+
+LABEL io.airbyte.version=0.2.0
 LABEL io.airbyte.name=airbyte/destination-bigquery-denormalized
diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/build.gradle b/airbyte-integrations/connectors/destination-bigquery-denormalized/build.gradle
index 11b47deb667d0..5b0c1bd57006c 100644
--- a/airbyte-integrations/connectors/destination-bigquery-denormalized/build.gradle
+++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/build.gradle
@@ -17,11 +17,11 @@ dependencies {
     implementation project(':airbyte-integrations:bases:base-java')
     implementation project(':airbyte-integrations:connectors:destination-bigquery')
     implementation project(':airbyte-protocol:models')
+    implementation project(':airbyte-integrations:connectors:destination-s3')
+    implementation project(':airbyte-integrations:connectors:destination-gcs')
 
     integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-destination-test')
-    integrationTestJavaImplementation project(':airbyte-integrations:connectors:destination-bigquery')
     integrationTestJavaImplementation project(':airbyte-integrations:connectors:destination-bigquery-denormalized')
-
     implementation files(project(':airbyte-integrations:bases:base-java').airbyteDocker.outputs)
 }
diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestination.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestination.java
index aa467b7b271cc..dc0c30b563162 100644
--- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestination.java
+++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestination.java
@@ -5,32 +5,13 @@
 package io.airbyte.integrations.destination.bigquery;
 
 import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-import com.google.cloud.bigquery.BigQuery;
-import com.google.cloud.bigquery.Field;
-import com.google.cloud.bigquery.Field.Builder;
-import com.google.cloud.bigquery.Field.Mode;
-import com.google.cloud.bigquery.FieldList;
-import com.google.cloud.bigquery.Schema;
-import com.google.cloud.bigquery.StandardSQLTypeName;
-import com.google.common.base.Preconditions;
-import io.airbyte.commons.json.Jsons;
-import io.airbyte.commons.util.MoreIterators;
-import io.airbyte.integrations.base.AirbyteMessageConsumer;
-import io.airbyte.integrations.base.AirbyteStreamNameNamespacePair;
 import io.airbyte.integrations.base.Destination;
 import io.airbyte.integrations.base.IntegrationRunner;
-import
io.airbyte.integrations.base.JavaBaseConstants; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.formatter.DefaultBigQueryDenormalizedRecordFormatter; +import io.airbyte.integrations.destination.bigquery.formatter.GcsBigQueryDenormalizedRecordFormatter; +import io.airbyte.integrations.destination.bigquery.uploader.UploaderType; import java.util.Map; -import java.util.Set; -import java.util.function.Consumer; -import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,14 +19,6 @@ public class BigQueryDenormalizedDestination extends BigQueryDestination { private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDenormalizedDestination.class); - public static final String NESTED_ARRAY_FIELD = "value"; - protected static final String PROPERTIES_FIELD = "properties"; - private static final String TYPE_FIELD = "type"; - private static final String FORMAT_FIELD = "format"; - private static final String REF_DEFINITION_KEY = "$ref"; - - private final Set fieldsContainRefDefinitionValue = new HashSet<>(); - @Override protected String getTargetTableName(final String streamName) { // This BigQuery destination does not write to a staging "raw" table but directly to a normalized @@ -53,156 +26,22 @@ protected String getTargetTableName(final String streamName) { return getNamingResolver().getIdentifier(streamName); } - protected AirbyteMessageConsumer getRecordConsumer(final BigQuery bigquery, - final Map writeConfigs, - final ConfiguredAirbyteCatalog catalog, - final Consumer outputRecordCollector, - final boolean isGcsUploadingMode, - final boolean isKeepFilesInGcs) { - return new BigQueryDenormalizedRecordConsumer(bigquery, writeConfigs, catalog, outputRecordCollector, getNamingResolver(), - fieldsContainRefDefinitionValue); - } - @Override - protected Schema getBigQuerySchema(final JsonNode jsonSchema) { - final List fieldList = getSchemaFields(getNamingResolver(), jsonSchema); - if (fieldList.stream().noneMatch(f -> f.getName().equals(JavaBaseConstants.COLUMN_NAME_AB_ID))) { - fieldList.add(Field.of(JavaBaseConstants.COLUMN_NAME_AB_ID, StandardSQLTypeName.STRING)); - } - if (fieldList.stream().noneMatch(f -> f.getName().equals(JavaBaseConstants.COLUMN_NAME_EMITTED_AT))) { - fieldList.add(Field.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, StandardSQLTypeName.TIMESTAMP)); - } - return com.google.cloud.bigquery.Schema.of(fieldList); - } - - private List getSchemaFields(final BigQuerySQLNameTransformer namingResolver, final JsonNode jsonSchema) { - Preconditions.checkArgument(jsonSchema.isObject() && jsonSchema.has(PROPERTIES_FIELD)); - final ObjectNode properties = (ObjectNode) jsonSchema.get(PROPERTIES_FIELD); - List tmpFields = Jsons.keys(properties).stream() - .peek(addToRefList(properties)) - .map(key -> getField(namingResolver, key, properties.get(key)) - .build()) - .collect(Collectors.toList()); - if (!fieldsContainRefDefinitionValue.isEmpty()) { - LOGGER.warn("Next fields contain \"$ref\" as Definition: {}. 
They are going to be saved as String Type column", - fieldsContainRefDefinitionValue); - } - return tmpFields; + protected Map getFormatterMap(JsonNode jsonSchema) { + return Map.of(UploaderType.STANDARD, new DefaultBigQueryDenormalizedRecordFormatter(jsonSchema, getNamingResolver()), + UploaderType.AVRO, new GcsBigQueryDenormalizedRecordFormatter(jsonSchema, getNamingResolver())); } /** - * @param properties - JSON schema with properties - * - * The method is responsible for population of fieldsContainRefDefinitionValue set with keys - * contain $ref definition - * - * Currently, AirByte doesn't support parsing value by $ref key definition. The issue to - * track this 7725 + * BigQuery might have different structure of the Temporary table. + * If this method returns TRUE, temporary table will have only three common Airbyte attributes. + * In case of FALSE, temporary table structure will be in line with Airbyte message JsonSchema. + * @return use default AirbyteSchema or build using JsonSchema */ - private Consumer addToRefList(ObjectNode properties) { - return key -> { - if (properties.get(key).has(REF_DEFINITION_KEY)) { - fieldsContainRefDefinitionValue.add(key); - } - }; - } - - private static Builder getField(final BigQuerySQLNameTransformer namingResolver, final String key, final JsonNode fieldDefinition) { - - final String fieldName = namingResolver.getIdentifier(key); - final Builder builder = Field.newBuilder(fieldName, StandardSQLTypeName.STRING); - final List fieldTypes = getTypes(fieldName, fieldDefinition.get(TYPE_FIELD)); - for (int i = 0; i < fieldTypes.size(); i++) { - final JsonSchemaType fieldType = fieldTypes.get(i); - if (fieldType == JsonSchemaType.NULL) { - builder.setMode(Mode.NULLABLE); - } - if (i == 0) { - // Treat the first type in the list with the widest scope as the primary type - final JsonSchemaType primaryType = fieldTypes.get(i); - switch (primaryType) { - case NULL -> { - builder.setType(StandardSQLTypeName.STRING); - } - case STRING, NUMBER, INTEGER, BOOLEAN -> { - builder.setType(primaryType.getBigQueryType()); - } - case ARRAY -> { - final JsonNode items; - if (fieldDefinition.has("items")) { - items = fieldDefinition.get("items"); - } else { - LOGGER.warn("Source connector provided schema for ARRAY with missed \"items\", will assume that it's a String type"); - // this is handler for case when we get "array" without "items" - // (https://github.com/airbytehq/airbyte/issues/5486) - items = getTypeStringSchema(); - } - final Builder subField = getField(namingResolver, fieldName, items).setMode(Mode.REPEATED); - // "Array of Array of" (nested arrays) are not permitted by BigQuery ("Array of Record of Array of" - // is) - // Turn all "Array of" into "Array of Record of" instead - return builder.setType(StandardSQLTypeName.STRUCT, subField.setName(NESTED_ARRAY_FIELD).build()); - } - case OBJECT -> { - final JsonNode properties; - if (fieldDefinition.has(PROPERTIES_FIELD)) { - properties = fieldDefinition.get(PROPERTIES_FIELD); - } else { - properties = fieldDefinition; - } - final FieldList fieldList = FieldList.of(Jsons.keys(properties) - .stream() - .map(f -> getField(namingResolver, f, properties.get(f)).build()) - .collect(Collectors.toList())); - if (fieldList.size() > 0) { - builder.setType(StandardSQLTypeName.STRUCT, fieldList); - } else { - builder.setType(StandardSQLTypeName.STRING); - } - } - default -> { - throw new IllegalStateException( - String.format("Unexpected type for field %s: %s", fieldName, primaryType)); - } - } - } - } - - // If a 
specific format is defined, use their specific type instead of the JSON's one - final JsonNode fieldFormat = fieldDefinition.get(FORMAT_FIELD); - if (fieldFormat != null) { - final JsonSchemaFormat schemaFormat = JsonSchemaFormat.fromJsonSchemaFormat(fieldFormat.asText()); - if (schemaFormat != null) { - builder.setType(schemaFormat.getBigQueryType()); - } - } - - return builder; - } - - private static JsonNode getTypeStringSchema() { - return Jsons.deserialize("{\n" - + " \"type\": [\n" - + " \"string\"\n" - + " ]\n" - + " }"); - } - - private static List getTypes(final String fieldName, final JsonNode type) { - if (type == null) { - LOGGER.warn("Field {} has no type defined, defaulting to STRING", fieldName); - return List.of(JsonSchemaType.STRING); - } else if (type.isArray()) { - return MoreIterators.toList(type.elements()).stream() - .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) - // re-order depending to make sure wider scope types are first - .sorted(Comparator.comparingInt(JsonSchemaType::getOrder)) - .collect(Collectors.toList()); - } else if (type.isTextual()) { - return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(type.asText())); - } else { - throw new IllegalStateException("Unexpected type: " + type); - } + @Override + protected boolean isDefaultAirbyteTmpTableSchema() { + // Build temporary table structure based on incoming JsonSchema + return false; } public static void main(final String[] args) throws Exception { diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedRecordConsumer.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedRecordConsumer.java deleted file mode 100644 index a6093b93c7c7b..0000000000000 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedRecordConsumer.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
- */ - -package io.airbyte.integrations.destination.bigquery; - -import com.google.cloud.bigquery.BigQuery; -import io.airbyte.integrations.base.AirbyteStreamNameNamespacePair; -import io.airbyte.integrations.destination.StandardNameTransformer; -import io.airbyte.integrations.destination.bigquery.BigQueryDestination.UploadingMethod; -import io.airbyte.integrations.destination.bigquery.strategy.BigQueryDenormalizedUploadStandardStrategy; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.function.Consumer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class BigQueryDenormalizedRecordConsumer extends BigQueryRecordConsumer { - - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDenormalizedRecordConsumer.class); - - private final Set invalidKeys; - private final Set fieldsWithRefDefinition; - - public BigQueryDenormalizedRecordConsumer(final BigQuery bigquery, - final Map writeConfigs, - final ConfiguredAirbyteCatalog catalog, - final Consumer outputRecordCollector, - final StandardNameTransformer namingResolver, - final Set fieldsWithRefDefinition) { - super(bigquery, writeConfigs, catalog, outputRecordCollector, false, false); - this.fieldsWithRefDefinition = fieldsWithRefDefinition; - invalidKeys = new HashSet<>(); - bigQueryUploadStrategyMap.put(UploadingMethod.STANDARD, - new BigQueryDenormalizedUploadStandardStrategy(bigquery, catalog, outputRecordCollector, namingResolver, invalidKeys, - Set.copyOf(fieldsWithRefDefinition))); - } - - @Override - public void close(final boolean hasFailed) { - fieldsWithRefDefinition.clear(); - super.close(hasFailed); - } - -} diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/JsonSchemaFormat.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/JsonSchemaFormat.java index 0c157091dcafd..27420a0b7d699 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/JsonSchemaFormat.java +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/JsonSchemaFormat.java @@ -13,7 +13,8 @@ public enum JsonSchemaFormat { DATE("date", StandardSQLTypeName.DATE), DATETIME("date-time", StandardSQLTypeName.DATETIME), - TIME("time", StandardSQLTypeName.TIME); + TIME("time", StandardSQLTypeName.TIME), + TIMESTAMP("timestamp-micros", StandardSQLTypeName.TIMESTAMP); private final String jsonSchemaFormat; private final StandardSQLTypeName bigQueryType; diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryDenormalizedRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryDenormalizedRecordFormatter.java new file mode 100644 index 0000000000000..7632680638470 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryDenormalizedRecordFormatter.java @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.formatter; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.Field.Builder; +import com.google.cloud.bigquery.Field.Mode; +import com.google.cloud.bigquery.FieldList; +import com.google.cloud.bigquery.QueryParameterValue; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardSQLTypeName; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.util.MoreIterators; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; +import io.airbyte.integrations.destination.bigquery.JsonSchemaFormat; +import io.airbyte.integrations.destination.bigquery.JsonSchemaType; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DefaultBigQueryDenormalizedRecordFormatter extends DefaultBigQueryRecordFormatter { + + private static final Logger LOGGER = LoggerFactory.getLogger(DefaultBigQueryDenormalizedRecordFormatter.class); + + private final Set invalidKeys = new HashSet<>(); + + public static final String NESTED_ARRAY_FIELD = "big_query_array"; + protected static final String PROPERTIES_FIELD = "properties"; + private static final String TYPE_FIELD = "type"; + private static final String ARRAY_ITEMS_FIELD = "items"; + private static final String FORMAT_FIELD = "format"; + private static final String REF_DEFINITION_KEY = "$ref"; + + private final Set fieldsContainRefDefinitionValue = new HashSet<>(); + + public DefaultBigQueryDenormalizedRecordFormatter(JsonNode jsonSchema, StandardNameTransformer namingResolver) { + super(jsonSchema, namingResolver); + } + + @Override + protected JsonNode formatJsonSchema(JsonNode jsonSchema) { + populateEmptyArrays(jsonSchema); + surroundArraysByObjects(jsonSchema); + return jsonSchema; + } + + private List findArrays(JsonNode node) { + if (node != null) { + return node.findParents(TYPE_FIELD).stream() + .filter( + jsonNode -> { + JsonNode type = jsonNode.get(TYPE_FIELD); + if (type.isArray()) { + ArrayNode typeNode = (ArrayNode) type; + for (JsonNode arrayTypeNode : typeNode) { + if (arrayTypeNode.isTextual() && arrayTypeNode.textValue().equals("array")) + return true; + } + } else if (type.isTextual()) { + return jsonNode.asText().equals("array"); + } + return false; + }) + .collect(Collectors.toList()); + } else { + return Collections.emptyList(); + } + } + + private void populateEmptyArrays(JsonNode node) { + findArrays(node).forEach(jsonNode -> { + if (!jsonNode.has(ARRAY_ITEMS_FIELD)) { + ObjectNode nodeToChange = (ObjectNode) jsonNode; + nodeToChange.putObject(ARRAY_ITEMS_FIELD).putArray(TYPE_FIELD).add("string"); + } + }); + } + + private void surroundArraysByObjects(JsonNode node) { + findArrays(node).forEach( + jsonNode -> { + JsonNode arrayNode = jsonNode.deepCopy(); + + ObjectNode newNode = (ObjectNode) jsonNode; + newNode.removeAll(); + 
newNode.putArray(TYPE_FIELD).add("object"); + newNode.putObject(PROPERTIES_FIELD).set(NESTED_ARRAY_FIELD, arrayNode); + + surroundArraysByObjects(arrayNode.get(ARRAY_ITEMS_FIELD)); + }); + } + + @Override + public JsonNode formatRecord(final AirbyteRecordMessage recordMessage) { + // Bigquery represents TIMESTAMP to the microsecond precision, so we convert to microseconds then + // use BQ helpers to string-format correctly. + Preconditions.checkArgument(recordMessage.getData().isObject()); + final ObjectNode data = (ObjectNode) formatData(getBigQuerySchema().getFields(), recordMessage.getData()); + // replace ObjectNode with TextNode for fields with $ref definition key + // Do not need to iterate through all JSON Object nodes, only first nesting object. + if (!fieldsContainRefDefinitionValue.isEmpty()) { + fieldsContainRefDefinitionValue.forEach(key -> { + if (data.get(key) != null && !data.get(key).isNull()) { + data.put(key, data.get(key).toString()); + } + }); + } + addAirbyteColumns(data, recordMessage); + + return data; + } + + protected void addAirbyteColumns(ObjectNode data, final AirbyteRecordMessage recordMessage) { + final long emittedAtMicroseconds = recordMessage.getEmittedAt(); + final String formattedEmittedAt = QueryParameterValue.timestamp(emittedAtMicroseconds).getValue(); + + data.put(JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString()); + data.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, formattedEmittedAt); + } + + protected JsonNode formatData(final FieldList fields, final JsonNode root) { + // handles empty objects and arrays + if (fields == null) { + return root; + } + formatDateTimeFields(fields, root); + if (root.isObject()) { + return getObjectNode(fields, root); + } else if (root.isArray()) { + return getArrayNode(fields, root); + } else { + return root; + } + } + + protected void formatDateTimeFields(final FieldList fields, final JsonNode root) { + final List dateTimeFields = BigQueryUtils.getDateTimeFieldsFromSchema(fields); + if (!dateTimeFields.isEmpty() && !root.isNull()) { + BigQueryUtils.transformJsonDateTimeToBigDataFormat(dateTimeFields, (ObjectNode) root); + } + } + + private JsonNode getArrayNode(FieldList fields, JsonNode root) { + // Arrays can have only one field + final Field arrayField = fields.get(0); + // If an array of records, we should use subfields + final FieldList subFields; + if (arrayField.getSubFields() == null || arrayField.getSubFields().isEmpty()) { + subFields = fields; + } else { + subFields = arrayField.getSubFields(); + } + final JsonNode items = Jsons.jsonNode(MoreIterators.toList(root.elements()).stream() + .map(p -> formatData(subFields, p)) + .collect(Collectors.toList())); + + return Jsons.jsonNode(ImmutableMap.of(NESTED_ARRAY_FIELD, items)); + } + + private JsonNode getObjectNode(FieldList fields, JsonNode root) { + final List fieldNames = fields.stream().map(Field::getName).collect(Collectors.toList()); + return Jsons.jsonNode(Jsons.keys(root).stream() + .filter(key -> { + final boolean validKey = fieldNames.contains(namingResolver.getIdentifier(key)); + if (!validKey && !invalidKeys.contains(key)) { + logFieldFail("Ignoring field as it is not defined in catalog", key); + invalidKeys.add(key); + } + return validKey; + }) + .collect(Collectors.toMap(namingResolver::getIdentifier, + key -> formatData(fields.get(namingResolver.getIdentifier(key)).getSubFields(), root.get(key))))); + } + + @Override + public Schema getBigQuerySchema(final JsonNode jsonSchema) { + final List fieldList = 
getSchemaFields(namingResolver, jsonSchema); + if (fieldList.stream().noneMatch(f -> f.getName().equals(JavaBaseConstants.COLUMN_NAME_AB_ID))) { + fieldList.add(Field.of(JavaBaseConstants.COLUMN_NAME_AB_ID, StandardSQLTypeName.STRING)); + } + if (fieldList.stream().noneMatch(f -> f.getName().equals(JavaBaseConstants.COLUMN_NAME_EMITTED_AT))) { + fieldList.add(Field.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, StandardSQLTypeName.TIMESTAMP)); + } + LOGGER.info("Airbyte Schema is transformed from {} to {}.", jsonSchema, fieldList); + return com.google.cloud.bigquery.Schema.of(fieldList); + } + + private List getSchemaFields(final StandardNameTransformer namingResolver, final JsonNode jsonSchema) { + LOGGER.info("getSchemaFields : " + jsonSchema + " namingResolver " + namingResolver); + Preconditions.checkArgument(jsonSchema.isObject() && jsonSchema.has(PROPERTIES_FIELD)); + final ObjectNode properties = (ObjectNode) jsonSchema.get(PROPERTIES_FIELD); + List tmpFields = Jsons.keys(properties).stream() + .peek(addToRefList(properties)) + .map(key -> getField(namingResolver, key, properties.get(key)) + .build()) + .collect(Collectors.toList()); + if (!fieldsContainRefDefinitionValue.isEmpty()) { + LOGGER.warn("Next fields contain \"$ref\" as Definition: {}. They are going to be saved as String Type column", + fieldsContainRefDefinitionValue); + } + return tmpFields; + } + + /** + * @param properties - JSON schema with properties + * + * The method is responsible for population of fieldsContainRefDefinitionValue set with keys + * contain $ref definition + * + * Currently, AirByte doesn't support parsing value by $ref key definition. The issue to + * track this 7725 + */ + private Consumer addToRefList(ObjectNode properties) { + return key -> { + if (properties.get(key).has(REF_DEFINITION_KEY)) { + fieldsContainRefDefinitionValue.add(key); + } + }; + } + + private static Builder getField(final StandardNameTransformer namingResolver, final String key, final JsonNode fieldDefinition) { + final String fieldName = namingResolver.getIdentifier(key); + final Builder builder = Field.newBuilder(fieldName, StandardSQLTypeName.STRING); + final List fieldTypes = getTypes(fieldName, fieldDefinition.get(TYPE_FIELD)); + for (int i = 0; i < fieldTypes.size(); i++) { + final JsonSchemaType fieldType = fieldTypes.get(i); + if (fieldType == JsonSchemaType.NULL) { + builder.setMode(Mode.NULLABLE); + } + if (i == 0) { + // Treat the first type in the list with the widest scope as the primary type + final JsonSchemaType primaryType = fieldTypes.get(i); + switch (primaryType) { + case NULL -> { + builder.setType(StandardSQLTypeName.STRING); + } + case STRING, NUMBER, INTEGER, BOOLEAN -> { + builder.setType(primaryType.getBigQueryType()); + } + case ARRAY -> { + final JsonNode items; + if (fieldDefinition.has("items")) { + items = fieldDefinition.get("items"); + } else { + LOGGER.warn("Source connector provided schema for ARRAY with missed \"items\", will assume that it's a String type"); + // this is handler for case when we get "array" without "items" + // (https://github.com/airbytehq/airbyte/issues/5486) + items = getTypeStringSchema(); + } + return getField(namingResolver, fieldName, items).setMode(Mode.REPEATED); + } + case OBJECT -> { + final JsonNode properties; + if (fieldDefinition.has(PROPERTIES_FIELD)) { + properties = fieldDefinition.get(PROPERTIES_FIELD); + } else { + properties = fieldDefinition; + } + final FieldList fieldList = FieldList.of(Jsons.keys(properties) + .stream() + .map(f -> 
getField(namingResolver, f, properties.get(f)).build()) + .collect(Collectors.toList())); + if (!fieldList.isEmpty()) { + builder.setType(StandardSQLTypeName.STRUCT, fieldList); + } else { + builder.setType(StandardSQLTypeName.STRING); + } + } + default -> { + throw new IllegalStateException( + String.format("Unexpected type for field %s: %s", fieldName, primaryType)); + } + } + } + } + + // If a specific format is defined, use their specific type instead of the JSON's one + final JsonNode fieldFormat = fieldDefinition.get(FORMAT_FIELD); + if (fieldFormat != null) { + final JsonSchemaFormat schemaFormat = JsonSchemaFormat.fromJsonSchemaFormat(fieldFormat.asText()); + if (schemaFormat != null) { + builder.setType(schemaFormat.getBigQueryType()); + } + } + + return builder; + } + + private static JsonNode getTypeStringSchema() { + return Jsons.deserialize("{\n" + + " \"type\": [\n" + + " \"string\"\n" + + " ]\n" + + " }"); + } + + private static List getTypes(final String fieldName, final JsonNode type) { + if (type == null) { + LOGGER.warn("Field {} has no type defined, defaulting to STRING", fieldName); + return List.of(JsonSchemaType.STRING); + } else if (type.isArray()) { + return MoreIterators.toList(type.elements()).stream() + .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) + // re-order depending to make sure wider scope types are first + .sorted(Comparator.comparingInt(JsonSchemaType::getOrder)) + .collect(Collectors.toList()); + } else if (type.isTextual()) { + return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(type.asText())); + } else { + throw new IllegalStateException("Unexpected type: " + type); + } + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsBigQueryDenormalizedRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsBigQueryDenormalizedRecordFormatter.java new file mode 100644 index 0000000000000..123e8488b9dfb --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsBigQueryDenormalizedRecordFormatter.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */
+
+package io.airbyte.integrations.destination.bigquery.formatter;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import io.airbyte.commons.json.Jsons;
+import io.airbyte.integrations.base.JavaBaseConstants;
+import io.airbyte.integrations.destination.StandardNameTransformer;
+import io.airbyte.protocol.models.AirbyteRecordMessage;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+public class GcsBigQueryDenormalizedRecordFormatter extends DefaultBigQueryDenormalizedRecordFormatter {
+
+  public GcsBigQueryDenormalizedRecordFormatter(
+      JsonNode jsonSchema,
+      StandardNameTransformer namingResolver) {
+    super(jsonSchema, namingResolver);
+  }
+
+  @Override
+  protected JsonNode formatJsonSchema(JsonNode jsonSchema) {
+    var textJson = Jsons.serialize(jsonSchema);
+    /*
+     * The BigQuery Avro file loader doesn't support the date-time logical type
+     * (https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types),
+     * so replace date-time with timestamp-micros.
+     */
+    textJson = textJson.replace("\"format\":\"date-time\"", "\"format\":\"timestamp-micros\"");
+    // Add a string type for $ref fields.
+    // The Avro header converter requires types for all fields.
+    textJson = textJson.replace("{\"$ref\":\"", "{\"type\":[\"string\"], \"$ref\":\"");
+    return super.formatJsonSchema(Jsons.deserialize(textJson));
+  }
+
+  @Override
+  protected void addAirbyteColumns(ObjectNode data, AirbyteRecordMessage recordMessage) {
+    final long emittedAtMicroseconds = TimeUnit.MILLISECONDS.convert(recordMessage.getEmittedAt(), TimeUnit.MILLISECONDS);
+
+    data.put(JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString());
+    data.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, emittedAtMicroseconds);
+  }
+
+}
diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryDenormalizedUploadStandardStrategy.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryDenormalizedUploadStandardStrategy.java
deleted file mode 100644
index fdc5f5ad569e1..0000000000000
--- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryDenormalizedUploadStandardStrategy.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2021 Airbyte, Inc., all rights reserved.
- */ - -package io.airbyte.integrations.destination.bigquery.strategy; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.FieldList; -import com.google.cloud.bigquery.QueryParameterValue; -import com.google.cloud.bigquery.Schema; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import io.airbyte.commons.json.Jsons; -import io.airbyte.commons.util.MoreIterators; -import io.airbyte.integrations.base.JavaBaseConstants; -import io.airbyte.integrations.destination.StandardNameTransformer; -import io.airbyte.integrations.destination.bigquery.BigQueryDenormalizedDestination; -import io.airbyte.integrations.destination.bigquery.BigQueryUtils; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.AirbyteRecordMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.util.List; -import java.util.Set; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class BigQueryDenormalizedUploadStandardStrategy extends BigQueryUploadStandardStrategy { - - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDenormalizedUploadStandardStrategy.class); - - private final StandardNameTransformer namingResolver; - private final Set invalidKeys; - private final Set fieldsWithRefDefinition; - - public BigQueryDenormalizedUploadStandardStrategy(BigQuery bigquery, - ConfiguredAirbyteCatalog catalog, - Consumer outputRecordCollector, - StandardNameTransformer namingResolver, - Set invalidKeys, - Set fieldsWithRefDefinition) { - super(bigquery, catalog, outputRecordCollector); - this.namingResolver = namingResolver; - this.invalidKeys = invalidKeys; - this.fieldsWithRefDefinition = fieldsWithRefDefinition; - } - - @Override - protected JsonNode formatRecord(final Schema schema, final AirbyteRecordMessage recordMessage) { - // Bigquery represents TIMESTAMP to the microsecond precision, so we convert to microseconds then - // use BQ helpers to string-format correctly. - final long emittedAtMicroseconds = TimeUnit.MICROSECONDS.convert(recordMessage.getEmittedAt(), TimeUnit.MILLISECONDS); - final String formattedEmittedAt = QueryParameterValue.timestamp(emittedAtMicroseconds).getValue(); - Preconditions.checkArgument(recordMessage.getData().isObject()); - final ObjectNode data = (ObjectNode) formatData(schema.getFields(), recordMessage.getData()); - // replace ObjectNode with TextNode for fields with $ref definition key - // Do not need to iterate through all JSON Object nodes, only first nesting object. 
- if (!fieldsWithRefDefinition.isEmpty()) { - fieldsWithRefDefinition.forEach(key -> { - if (data.get(key) != null && !data.get(key).isNull()) { - data.put(key, data.get(key).toString()); - } - }); - } - data.put(JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString()); - data.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, formattedEmittedAt); - - return data; - } - - protected JsonNode formatData(final FieldList fields, final JsonNode root) { - // handles empty objects and arrays - if (fields == null) { - return root; - } - final List dateTimeFields = BigQueryUtils.getDateTimeFieldsFromSchema(fields); - if (!dateTimeFields.isEmpty()) { - BigQueryUtils.transformJsonDateTimeToBigDataFormat(dateTimeFields, (ObjectNode) root); - } - if (root.isObject()) { - return getObjectNode(fields, root); - } else if (root.isArray()) { - return getArrayNode(fields, root); - } else { - return root; - } - } - - private JsonNode getArrayNode(FieldList fields, JsonNode root) { - // Arrays can have only one field - final Field arrayField = fields.get(0); - // If an array of records, we should use subfields - final FieldList subFields; - if (arrayField.getSubFields() == null || arrayField.getSubFields().isEmpty()) { - subFields = fields; - } else { - subFields = arrayField.getSubFields(); - } - final JsonNode items = Jsons.jsonNode(MoreIterators.toList(root.elements()).stream() - .map(p -> formatData(subFields, p)) - .collect(Collectors.toList())); - - // "Array of Array of" (nested arrays) are not permitted by BigQuery ("Array of Record of Array of" - // is). Turn all "Array of" into "Array of Record of" instead - return Jsons.jsonNode(ImmutableMap.of(BigQueryDenormalizedDestination.NESTED_ARRAY_FIELD, items)); - } - - private JsonNode getObjectNode(FieldList fields, JsonNode root) { - final List fieldNames = fields.stream().map(Field::getName).collect(Collectors.toList()); - return Jsons.jsonNode(Jsons.keys(root).stream() - .filter(key -> { - final boolean validKey = fieldNames.contains(namingResolver.getIdentifier(key)); - if (!validKey && !invalidKeys.contains(key)) { - LOGGER.warn("Ignoring field {} as it is not defined in catalog", key); - invalidKeys.add(key); - } - return validKey; - }) - .collect(Collectors.toMap(namingResolver::getIdentifier, - key -> formatData(fields.get(namingResolver.getIdentifier(key)).getSubFields(), root.get(key))))); - } - -} diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/resources/spec.json index 7e9b1a8f23930..5fcd574905900 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/main/resources/spec.json @@ -11,6 +11,15 @@ "required": ["project_id", "dataset_id"], "additionalProperties": true, "properties": { + "big_query_client_buffer_size_mb": { + "title": "Google BigQuery client chunk size", + "description": "Google BigQuery client's chunk (buffer) size (MIN = 1, MAX = 15) for each table. It defaults to 15MiB. Smaller chunk size means less memory consumption, and is recommended for big data sets. 
For more details refer to the documentation here", + "type": "integer", + "minimum": 1, + "maximum": 15, + "default": 15, + "examples": ["15"] + }, "project_id": { "type": "string", "description": "The GCP project ID for the project containing the target BigQuery dataset.", @@ -65,6 +74,97 @@ "description": "The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.", "title": "Credentials JSON", "airbyte_secret": true + }, + "loading_method": { + "type": "object", + "title": "Loading Method", + "description": "Select the way that data will be uploaded to BigQuery.", + "oneOf": [ + { + "title": "Standard Inserts", + "additionalProperties": false, + "description": "Direct uploading using streams.", + "required": ["method"], + "properties": { + "method": { + "type": "string", + "const": "Standard" + } + } + }, + { + "title": "GCS Staging", + "additionalProperties": false, + "description": "Writes large batches of records to a file, uploads the file to GCS, then uses
a BigQuery load job
to upload the file. Recommended for large production workloads for better speed and scalability.", + "required": [ + "method", + "gcs_bucket_name", + "gcs_bucket_path", + "credential" + ], + "properties": { + "method": { + "type": "string", + "const": "GCS Staging" + }, + "gcs_bucket_name": { + "title": "GCS Bucket Name", + "type": "string", + "description": "The name of the GCS bucket.", + "examples": ["airbyte_sync"] + }, + "gcs_bucket_path": { + "description": "Directory under the GCS bucket where data will be written.", + "type": "string", + "examples": ["data_sync/test"] + }, + "keep_files_in_gcs-bucket": { + "type": "string", + "description": "This upload method is supposed to temporary store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished?", + "title": "GCS tmp files afterward processing", + "default": "Delete all tmp files from GCS", + "enum": [ + "Delete all tmp files from GCS", + "Keep all tmp files in GCS" + ] + }, + "credential": { + "title": "Credential", + "type": "object", + "oneOf": [ + { + "title": "HMAC key", + "required": [ + "credential_type", + "hmac_key_access_id", + "hmac_key_secret" + ], + "properties": { + "credential_type": { + "type": "string", + "const": "HMAC_KEY" + }, + "hmac_key_access_id": { + "type": "string", + "description": "HMAC key access ID. When linked to a service account, this ID is 61 characters long; when linked to a user account, it is 24 characters long.", + "title": "HMAC Key Access ID", + "airbyte_secret": true, + "examples": ["1234567890abcdefghij1234"] + }, + "hmac_key_secret": { + "type": "string", + "description": "The corresponding secret for the access ID. It is a 40-character base-64 encoded string.", + "title": "HMAC Key Secret", + "airbyte_secret": true, + "examples": ["1234567890abcdefghij1234567890ABCDEFGHIJ"] + } + } + } + ] + } + } + } + ] } } } diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationAcceptanceTest.java index f7f07feef80af..484fd902a9ad4 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationAcceptanceTest.java @@ -183,7 +183,7 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception { final String credentialsJsonString = new String(Files.readAllBytes(CREDENTIALS_PATH)); - final JsonNode credentialsJson = Jsons.deserialize(credentialsJsonString); + final JsonNode credentialsJson = Jsons.deserialize(credentialsJsonString).get(BigQueryConsts.BIGQUERY_BASIC_CONFIG); final String projectId = credentialsJson.get(CONFIG_PROJECT_ID).asText(); final String datasetLocation = "US"; @@ -191,7 +191,7 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception { config = Jsons.jsonNode(ImmutableMap.builder() .put(CONFIG_PROJECT_ID, projectId) - .put(CONFIG_CREDS, credentialsJsonString) + .put(CONFIG_CREDS, credentialsJson.toString()) .put(CONFIG_DATASET_ID, datasetId) .put(CONFIG_DATASET_LOCATION, datasetLocation) 
.build()); diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationTest.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationTest.java index 794d12d540f17..77d6b936457a0 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedDestinationTest.java @@ -4,6 +4,7 @@ package io.airbyte.integrations.destination.bigquery; +import static io.airbyte.integrations.destination.bigquery.formatter.DefaultBigQueryDenormalizedRecordFormatter.NESTED_ARRAY_FIELD; import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.*; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.params.provider.Arguments.arguments; @@ -72,6 +73,7 @@ class BigQueryDenormalizedDestinationTest { private static final AirbyteMessage MESSAGE_USERS4 = createRecordMessage(USERS_STREAM_NAME, getDataWithJSONDateTimeFormats()); private static final AirbyteMessage MESSAGE_USERS5 = createRecordMessage(USERS_STREAM_NAME, getDataWithJSONWithReference()); private static final AirbyteMessage MESSAGE_USERS6 = createRecordMessage(USERS_STREAM_NAME, Jsons.deserialize("{\"users\":null}")); + private static final AirbyteMessage MESSAGE_USERS7 = createRecordMessage(USERS_STREAM_NAME, getDataWithNestedDatetimeInsideNullObject()); private static final AirbyteMessage EMPTY_MESSAGE = createRecordMessage(USERS_STREAM_NAME, Jsons.deserialize("{}")); private JsonNode config; @@ -91,13 +93,15 @@ void setup(final TestInfo info) throws IOException { if (!Files.exists(CREDENTIALS_PATH)) { throw new IllegalStateException( - "Must provide path to a big query credentials file. By default {module-root}/config/credentials.json. Override by setting setting path with the CREDENTIALS_PATH constant."); + "Must provide path to a big query credentials file. By default {module-root}/" + CREDENTIALS_PATH + + ". 
Override by setting setting path with the CREDENTIALS_PATH constant."); } final String credentialsJsonString = new String(Files.readAllBytes(CREDENTIALS_PATH)); - final JsonNode credentialsJson = Jsons.deserialize(credentialsJsonString); + final JsonNode credentialsJson = Jsons.deserialize(credentialsJsonString).get(BigQueryConsts.BIGQUERY_BASIC_CONFIG); final String projectId = credentialsJson.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); - final ServiceAccountCredentials credentials = ServiceAccountCredentials.fromStream(new ByteArrayInputStream(credentialsJsonString.getBytes())); + final ServiceAccountCredentials credentials = + ServiceAccountCredentials.fromStream(new ByteArrayInputStream(credentialsJson.toString().getBytes())); bigquery = BigQueryOptions.newBuilder() .setProjectId(projectId) .setCredentials(credentials) @@ -112,6 +116,7 @@ void setup(final TestInfo info) throws IOException { MESSAGE_USERS4.getRecord().setNamespace(datasetId); MESSAGE_USERS5.getRecord().setNamespace(datasetId); MESSAGE_USERS6.getRecord().setNamespace(datasetId); + MESSAGE_USERS7.getRecord().setNamespace(datasetId); EMPTY_MESSAGE.getRecord().setNamespace(datasetId); final DatasetInfo datasetInfo = DatasetInfo.newBuilder(datasetId).setLocation(datasetLocation).build(); @@ -119,7 +124,7 @@ void setup(final TestInfo info) throws IOException { config = Jsons.jsonNode(ImmutableMap.builder() .put(BigQueryConsts.CONFIG_PROJECT_ID, projectId) - .put(BigQueryConsts.CONFIG_CREDS, credentialsJsonString) + .put(BigQueryConsts.CONFIG_CREDS, credentialsJson.toString()) .put(BigQueryConsts.CONFIG_DATASET_ID, datasetId) .put(BigQueryConsts.CONFIG_DATASET_LOCATION, datasetLocation) .put(BIG_QUERY_CLIENT_CHUNK_SIZE, 10) @@ -182,6 +187,27 @@ void testNestedWrite(final JsonNode schema, final AirbyteMessage message) throws assertEquals(extractJsonValues(resultJson, "domain"), extractJsonValues(expectedUsersJson, "domain")); } + @Test + void testNestedDataTimeInsideNullObject() throws Exception { + catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() + .withStream( + new AirbyteStream().withName(USERS_STREAM_NAME).withNamespace(datasetId).withJsonSchema(getSchemaWithNestedDatetimeInsideNullObject())) + .withSyncMode(SyncMode.FULL_REFRESH).withDestinationSyncMode(DestinationSyncMode.OVERWRITE))); + + final BigQueryDestination destination = new BigQueryDenormalizedDestination(); + final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); + + consumer.accept(MESSAGE_USERS7); + consumer.close(); + + final List usersActual = retrieveRecordsAsJson(USERS_STREAM_NAME); + final JsonNode expectedUsersJson = MESSAGE_USERS7.getRecord().getData(); + assertEquals(usersActual.size(), 1); + final JsonNode resultJson = usersActual.get(0); + assertEquals(extractJsonValues(resultJson, "name"), extractJsonValues(expectedUsersJson, "name")); + assertEquals(extractJsonValues(resultJson, "appointment"), extractJsonValues(expectedUsersJson, "appointment")); + } + @Test void testWriteWithFormat() throws Exception { catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() @@ -272,7 +298,7 @@ private Set extractJsonValues(final JsonNode node, final String attribut if (jsonNode.isArray()) { jsonNode.forEach(arrayNodeValue -> resultSet.add(arrayNodeValue.textValue())); } else if (jsonNode.isObject()) { - resultSet.addAll(extractJsonValues(jsonNode, "value")); + 
resultSet.addAll(extractJsonValues(jsonNode, NESTED_ARRAY_FIELD)); } else { resultSet.add(jsonNode.textValue()); } diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationAcceptanceTest.java new file mode 100644 index 0000000000000..34261da17837d --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationAcceptanceTest.java @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryOptions; +import com.google.cloud.bigquery.Dataset; +import com.google.cloud.bigquery.DatasetInfo; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.FieldList; +import com.google.cloud.bigquery.FieldValue; +import com.google.cloud.bigquery.FieldValueList; +import com.google.cloud.bigquery.Job; +import com.google.cloud.bigquery.JobId; +import com.google.cloud.bigquery.JobInfo; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.TableResult; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.standardtest.destination.DataArgumentsProvider; +import io.airbyte.integrations.standardtest.destination.DestinationAcceptanceTest; +import io.airbyte.protocol.models.AirbyteCatalog; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.airbyte.protocol.models.CatalogHelpers; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import java.io.ByteArrayInputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ArgumentsSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BigQueryDenormalizedGscDestinationAcceptanceTest extends DestinationAcceptanceTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDenormalizedGscDestinationAcceptanceTest.class); + + private static final Path CREDENTIALS_PATH = Path.of("secrets/credentials.json"); + + private static final String CONFIG_DATASET_ID = "dataset_id"; + private static final String CONFIG_PROJECT_ID = "project_id"; + private static final String CONFIG_DATASET_LOCATION = "dataset_location"; + private static final List AIRBYTE_COLUMNS = 
List.of(JavaBaseConstants.COLUMN_NAME_AB_ID, JavaBaseConstants.COLUMN_NAME_EMITTED_AT); + + private BigQuery bigquery; + private Dataset dataset; + private boolean tornDown; + private JsonNode config; + private final StandardNameTransformer namingResolver = new StandardNameTransformer(); + + @Override + protected String getImageName() { + return "airbyte/destination-bigquery-denormalized:dev"; + } + + @Override + protected JsonNode getConfig() { + return config; + } + + @Override + protected JsonNode getFailCheckConfig() throws Exception { + ((ObjectNode) config).put(CONFIG_PROJECT_ID, "fake"); + return config; + } + + @Override + protected boolean supportsDBT() { + return true; + } + + @Override + protected boolean implementsNamespaces() { + return true; + } + + @Override + protected String getDefaultSchema(final JsonNode config) { + return config.get(CONFIG_DATASET_ID).asText(); + } + + @Override + protected List retrieveNormalizedRecords(final TestDestinationEnv testEnv, final String streamName, final String namespace) + throws Exception { + final String tableName = namingResolver.getIdentifier(streamName); + final String schema = namingResolver.getIdentifier(namespace); + return retrieveRecordsFromTable(tableName, schema); + } + + @Override + protected List retrieveRecords(final TestDestinationEnv env, + final String streamName, + final String namespace, + final JsonNode streamSchema) + throws Exception { + return new ArrayList<>(retrieveRecordsFromTable(namingResolver.getIdentifier(streamName), namingResolver.getIdentifier(namespace))); + } + + @Override + protected List resolveIdentifier(final String identifier) { + final List result = new ArrayList<>(); + result.add(identifier); + result.add(namingResolver.getIdentifier(identifier)); + return result; + } + + private List retrieveRecordsFromTable(final String tableName, final String schema) throws InterruptedException { + final QueryJobConfiguration queryConfig = + QueryJobConfiguration + .newBuilder( + String.format("SELECT * FROM `%s`.`%s` order by %s asc;", schema, tableName, + JavaBaseConstants.COLUMN_NAME_EMITTED_AT)) + .setUseLegacySql(false).build(); + + final TableResult queryResults = executeQuery(bigquery, queryConfig).getLeft().getQueryResults(); + final FieldList fields = queryResults.getSchema().getFields(); + + return StreamSupport + .stream(queryResults.iterateAll().spliterator(), false) + .map(row -> { + final Map jsonMap = Maps.newHashMap(); + for (final Field field : fields) { + final Object value = getTypedFieldValue(row, field); + if (!isAirbyteColumn(field.getName()) && value != null) { + jsonMap.put(field.getName(), value); + } + } + return jsonMap; + }) + .map(Jsons::jsonNode) + .collect(Collectors.toList()); + } + + private boolean isAirbyteColumn(final String name) { + if (AIRBYTE_COLUMNS.contains(name)) { + return true; + } + return name.startsWith("_airbyte") && name.endsWith("_hashid"); + } + + private Object getTypedFieldValue(final FieldValueList row, final Field field) { + final FieldValue fieldValue = row.get(field.getName()); + if (fieldValue.getValue() != null) { + return switch (field.getType().getStandardType()) { + case FLOAT64, NUMERIC -> fieldValue.getDoubleValue(); + case INT64 -> fieldValue.getNumericValue().intValue(); + case STRING -> fieldValue.getStringValue(); + case BOOL -> fieldValue.getBooleanValue(); + case STRUCT -> fieldValue.getRecordValue().toString(); + default -> fieldValue.getValue(); + }; + } else { + return null; + } + } + + @Override + protected void setup(final 
TestDestinationEnv testEnv) throws Exception { + if (!Files.exists(CREDENTIALS_PATH)) { + throw new IllegalStateException( + "Must provide path to a big query credentials file. By default {module-root}/" + CREDENTIALS_PATH + + ". Override by setting setting path with the CREDENTIALS_PATH constant."); + } + + final String fullConfigFromSecretFileAsString = new String(Files.readAllBytes(CREDENTIALS_PATH)); + + final JsonNode fullConfigFromSecretFileJson = Jsons.deserialize(fullConfigFromSecretFileAsString); + final JsonNode bigqueryConfigFromSecretFile = fullConfigFromSecretFileJson.get(BigQueryConsts.BIGQUERY_BASIC_CONFIG); + final JsonNode gcsConfigFromSecretFile = fullConfigFromSecretFileJson.get(BigQueryConsts.GCS_CONFIG); + + final String projectId = bigqueryConfigFromSecretFile.get(CONFIG_PROJECT_ID).asText(); + final String datasetLocation = "US"; + + final String datasetId = Strings.addRandomSuffix("airbyte_tests", "_", 8); + + final JsonNode gcsCredentialFromSecretFile = gcsConfigFromSecretFile.get(BigQueryConsts.CREDENTIAL); + final JsonNode credential = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.CREDENTIAL_TYPE, gcsCredentialFromSecretFile.get(BigQueryConsts.CREDENTIAL_TYPE)) + .put(BigQueryConsts.HMAC_KEY_ACCESS_ID, gcsCredentialFromSecretFile.get(BigQueryConsts.HMAC_KEY_ACCESS_ID)) + .put(BigQueryConsts.HMAC_KEY_ACCESS_SECRET, gcsCredentialFromSecretFile.get(BigQueryConsts.HMAC_KEY_ACCESS_SECRET)) + .build()); + + final JsonNode loadingMethod = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.METHOD, BigQueryConsts.GCS_STAGING) + .put(BigQueryConsts.GCS_BUCKET_NAME, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_NAME)) + .put(BigQueryConsts.GCS_BUCKET_PATH, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_PATH).asText() + System.currentTimeMillis()) + .put(BigQueryConsts.CREDENTIAL, credential) + .build()); + + config = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.CONFIG_PROJECT_ID, projectId) + .put(BigQueryConsts.CONFIG_CREDS, bigqueryConfigFromSecretFile.toString()) + .put(BigQueryConsts.CONFIG_DATASET_ID, datasetId) + .put(BigQueryConsts.CONFIG_DATASET_LOCATION, datasetLocation) + .put(BigQueryConsts.LOADING_METHOD, loadingMethod) + .build()); + + final ServiceAccountCredentials credentials = ServiceAccountCredentials + .fromStream(new ByteArrayInputStream(bigqueryConfigFromSecretFile.toString().getBytes())); + + bigquery = BigQueryOptions.newBuilder() + .setProjectId(config.get(CONFIG_PROJECT_ID).asText()) + .setCredentials(credentials) + .build() + .getService(); + + final DatasetInfo datasetInfo = + DatasetInfo.newBuilder(config.get(CONFIG_DATASET_ID).asText()).setLocation(config.get(CONFIG_DATASET_LOCATION).asText()).build(); + dataset = bigquery.create(datasetInfo); + + tornDown = false; + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + if (!tornDown) { + tearDownBigQuery(); + } + })); + } + + @Override + protected void tearDown(final TestDestinationEnv testEnv) { + // gcs tmp files are supposed to be removed automatically by consumer + tearDownBigQuery(); + } + + private void tearDownBigQuery() { + // allows deletion of a dataset that has contents + final BigQuery.DatasetDeleteOption option = BigQuery.DatasetDeleteOption.deleteContents(); + + final boolean success = bigquery.delete(dataset.getDatasetId(), option); + if (success) { + LOGGER.info("BQ Dataset " + dataset + " deleted..."); + } else { + LOGGER.info("BQ Dataset cleanup for " + dataset + " failed!"); + } + + tornDown = true; + } + + // 
todo (cgardens) - figure out how to share these helpers. they are currently copied from + // BigQueryDestination. + private static ImmutablePair executeQuery(final BigQuery bigquery, final QueryJobConfiguration queryConfig) { + final JobId jobId = JobId.of(UUID.randomUUID().toString()); + final Job queryJob = bigquery.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build()); + return executeQuery(queryJob); + } + + private static ImmutablePair executeQuery(final Job queryJob) { + final Job completedJob = waitForQuery(queryJob); + if (completedJob == null) { + throw new RuntimeException("Job no longer exists"); + } else if (completedJob.getStatus().getError() != null) { + // You can also look at queryJob.getStatus().getExecutionErrors() for all + // errors, not just the latest one. + return ImmutablePair.of(null, (completedJob.getStatus().getError().toString())); + } + + return ImmutablePair.of(completedJob, null); + } + + private static Job waitForQuery(final Job queryJob) { + try { + return queryJob.waitFor(); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Verify that the integration successfully writes normalized records (without actually + * running the normalization module). Tests a wide variety of messages and schemas (aspirationally, + * anyway). + */ + @ParameterizedTest + @ArgumentsSource(DataArgumentsProvider.class) + public void testSyncNormalizedWithoutNormalization(final String messagesFilename, final String catalogFilename) throws Exception { + final AirbyteCatalog catalog = Jsons.deserialize(MoreResources.readResource(catalogFilename), AirbyteCatalog.class); + final ConfiguredAirbyteCatalog configuredCatalog = CatalogHelpers.toDefaultConfiguredCatalog(catalog); + final List messages = MoreResources.readResource(messagesFilename).lines() + .map(record -> Jsons.deserialize(record, AirbyteMessage.class)).collect(Collectors.toList()); + + final JsonNode config = getConfig(); + // don't run normalization though + runSyncAndVerifyStateOutput(config, messages, configuredCatalog, false); + + final String defaultSchema = getDefaultSchema(config); + final List actualMessages = retrieveNormalizedRecords(catalog, defaultSchema); + assertSameMessages(messages, actualMessages, true); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationTest.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationTest.java new file mode 100644 index 0000000000000..696e0cfffd3f8 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDenormalizedGscDestinationTest.java @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery; + +import static io.airbyte.integrations.destination.bigquery.formatter.DefaultBigQueryDenormalizedRecordFormatter.NESTED_ARRAY_FIELD; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getData; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getDataWithEmptyObjectAndArray; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getDataWithFormats; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getDataWithJSONDateTimeFormats; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getDataWithJSONWithReference; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getSchema; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getSchemaWithDateTime; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getSchemaWithFormats; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getSchemaWithInvalidArrayType; +import static io.airbyte.integrations.destination.bigquery.util.BigQueryDenormalizedTestDataUtils.getSchemaWithReferenceDefinition; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.DeleteObjectsRequest; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryOptions; +import com.google.cloud.bigquery.Dataset; +import com.google.cloud.bigquery.DatasetInfo; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardSQLTypeName; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.Destination; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.GcsS3Helper; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.airbyte.protocol.models.AirbyteStream; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.DestinationSyncMode; +import io.airbyte.protocol.models.SyncMode; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import org.assertj.core.util.Sets; +import 
org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class BigQueryDenormalizedGscDestinationTest { + + private static final Path CREDENTIALS_PATH = Path.of("secrets/credentials.json"); + private static final Set AIRBYTE_METADATA_FIELDS = Set.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, JavaBaseConstants.COLUMN_NAME_AB_ID); + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDenormalizedGscDestinationTest.class); + + private static final String BIG_QUERY_CLIENT_CHUNK_SIZE = "big_query_client_buffer_size_mb"; + private static final Instant NOW = Instant.now(); + private static final String USERS_STREAM_NAME = "users"; + private static final AirbyteMessage MESSAGE_USERS1 = createRecordMessage(USERS_STREAM_NAME, getData()); + private static final AirbyteMessage MESSAGE_USERS2 = createRecordMessage(USERS_STREAM_NAME, getDataWithEmptyObjectAndArray()); + private static final AirbyteMessage MESSAGE_USERS3 = createRecordMessage(USERS_STREAM_NAME, getDataWithFormats()); + private static final AirbyteMessage MESSAGE_USERS4 = createRecordMessage(USERS_STREAM_NAME, getDataWithJSONDateTimeFormats()); + private static final AirbyteMessage MESSAGE_USERS5 = createRecordMessage(USERS_STREAM_NAME, getDataWithJSONWithReference()); + private static final AirbyteMessage MESSAGE_USERS6 = createRecordMessage(USERS_STREAM_NAME, Jsons.deserialize("{\"users\":null}")); + private static final AirbyteMessage EMPTY_MESSAGE = createRecordMessage(USERS_STREAM_NAME, Jsons.deserialize("{}")); + + private JsonNode config; + private AmazonS3 s3Client; + + private BigQuery bigquery; + private Dataset dataset; + private ConfiguredAirbyteCatalog catalog; + private String datasetId; + + private boolean tornDown = true; + + @BeforeEach + void setup(final TestInfo info) throws IOException { + if (info.getDisplayName().equals("testSpec()")) { + return; + } + + if (!Files.exists(CREDENTIALS_PATH)) { + throw new IllegalStateException( + "Must provide path to a big query credentials file. By default {module-root}/" + CREDENTIALS_PATH + + ". 
Override by setting setting path with the CREDENTIALS_PATH constant."); + } + final String credentialsJsonString = new String(Files.readAllBytes(CREDENTIALS_PATH)); + final JsonNode credentialsJson = Jsons.deserialize(credentialsJsonString).get(BigQueryConsts.BIGQUERY_BASIC_CONFIG); + final JsonNode credentialsGcsJson = Jsons.deserialize(credentialsJsonString).get(BigQueryConsts.GCS_CONFIG); + + final String projectId = credentialsJson.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); + final ServiceAccountCredentials credentials = + ServiceAccountCredentials.fromStream(new ByteArrayInputStream(credentialsJson.toString().getBytes())); + bigquery = BigQueryOptions.newBuilder() + .setProjectId(projectId) + .setCredentials(credentials) + .build() + .getService(); + + datasetId = Strings.addRandomSuffix("airbyte_tests", "_", 8); + final String datasetLocation = "EU"; + MESSAGE_USERS1.getRecord().setNamespace(datasetId); + MESSAGE_USERS2.getRecord().setNamespace(datasetId); + MESSAGE_USERS3.getRecord().setNamespace(datasetId); + MESSAGE_USERS4.getRecord().setNamespace(datasetId); + MESSAGE_USERS5.getRecord().setNamespace(datasetId); + MESSAGE_USERS6.getRecord().setNamespace(datasetId); + EMPTY_MESSAGE.getRecord().setNamespace(datasetId); + + final DatasetInfo datasetInfo = DatasetInfo.newBuilder(datasetId).setLocation(datasetLocation).build(); + dataset = bigquery.create(datasetInfo); + + final JsonNode credentialFromSecretFile = credentialsGcsJson.get(BigQueryConsts.CREDENTIAL); + final JsonNode credential = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.CREDENTIAL_TYPE, credentialFromSecretFile.get(BigQueryConsts.CREDENTIAL_TYPE)) + .put(BigQueryConsts.HMAC_KEY_ACCESS_ID, credentialFromSecretFile.get(BigQueryConsts.HMAC_KEY_ACCESS_ID)) + .put(BigQueryConsts.HMAC_KEY_ACCESS_SECRET, credentialFromSecretFile.get(BigQueryConsts.HMAC_KEY_ACCESS_SECRET)) + .build()); + + final JsonNode loadingMethod = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.METHOD, BigQueryConsts.GCS_STAGING) + .put(BigQueryConsts.KEEP_GCS_FILES, BigQueryConsts.KEEP_GCS_FILES_VAL) + .put(BigQueryConsts.GCS_BUCKET_NAME, credentialsGcsJson.get(BigQueryConsts.GCS_BUCKET_NAME)) + .put(BigQueryConsts.GCS_BUCKET_PATH, credentialsGcsJson.get(BigQueryConsts.GCS_BUCKET_PATH).asText() + System.currentTimeMillis()) + .put(BigQueryConsts.CREDENTIAL, credential) + .build()); + + config = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.CONFIG_PROJECT_ID, projectId) + .put(BigQueryConsts.CONFIG_CREDS, credentialsJson.toString()) + .put(BigQueryConsts.CONFIG_DATASET_ID, datasetId) + .put(BigQueryConsts.CONFIG_DATASET_LOCATION, datasetLocation) + .put(BigQueryConsts.LOADING_METHOD, loadingMethod) + .put(BIG_QUERY_CLIENT_CHUNK_SIZE, 10) + .build()); + + final GcsDestinationConfig gcsDestinationConfig = GcsDestinationConfig + .getGcsDestinationConfig(BigQueryUtils.getGcsJsonNodeConfig(config)); + this.s3Client = GcsS3Helper.getGcsS3Client(gcsDestinationConfig); + + tornDown = false; + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + if (!tornDown) { + tearDownBigQuery(); + } + })); + + } + + @AfterEach + void tearDown(final TestInfo info) { + if (info.getDisplayName().equals("testSpec()")) { + return; + } + tearDownGcs(); + tearDownBigQuery(); + } + + /** + * Remove all the GCS output from the tests. 
+ */ + protected void tearDownGcs() { + final JsonNode properties = config.get(BigQueryConsts.LOADING_METHOD); + final String gcsBucketName = properties.get(BigQueryConsts.GCS_BUCKET_NAME).asText(); + final String gcs_bucket_path = properties.get(BigQueryConsts.GCS_BUCKET_PATH).asText(); + + final List keysToDelete = new LinkedList<>(); + final List objects = s3Client + .listObjects(gcsBucketName, gcs_bucket_path) + .getObjectSummaries(); + for (final S3ObjectSummary object : objects) { + keysToDelete.add(new DeleteObjectsRequest.KeyVersion(object.getKey())); + } + + if (keysToDelete.size() > 0) { + LOGGER.info("Tearing down test bucket path: {}/{}", gcsBucketName, gcs_bucket_path); + // Google Cloud Storage doesn't accept request to delete multiple objects + for (final DeleteObjectsRequest.KeyVersion keyToDelete : keysToDelete) { + s3Client.deleteObject(gcsBucketName, keyToDelete.getKey()); + } + LOGGER.info("Deleted {} file(s).", keysToDelete.size()); + } + } + + private void tearDownBigQuery() { + // allows deletion of a dataset that has contents + final BigQuery.DatasetDeleteOption option = BigQuery.DatasetDeleteOption.deleteContents(); + + final boolean success = bigquery.delete(dataset.getDatasetId(), option); + if (success) { + LOGGER.info("BQ Dataset " + dataset + " deleted..."); + } else { + LOGGER.info("BQ Dataset cleanup for " + dataset + " failed!"); + } + + tornDown = true; + } + + @ParameterizedTest + @MethodSource("schemaAndDataProvider") + void testNestedWrite(final JsonNode schema, final AirbyteMessage message) throws Exception { + catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() + .withStream(new AirbyteStream().withName(USERS_STREAM_NAME).withNamespace(datasetId).withJsonSchema(schema)) + .withSyncMode(SyncMode.FULL_REFRESH).withDestinationSyncMode(DestinationSyncMode.OVERWRITE))); + + final BigQueryDestination destination = new BigQueryDenormalizedDestination(); + final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); + + consumer.accept(message); + consumer.close(); + + final List usersActual = retrieveRecordsAsJson(USERS_STREAM_NAME); + final JsonNode expectedUsersJson = message.getRecord().getData(); + assertEquals(usersActual.size(), 1); + final JsonNode resultJson = usersActual.get(0); + assertEquals(extractJsonValues(resultJson, "name"), extractJsonValues(expectedUsersJson, "name")); + assertEquals(extractJsonValues(resultJson, "grants"), extractJsonValues(expectedUsersJson, "grants")); + assertEquals(extractJsonValues(resultJson, "domain"), extractJsonValues(expectedUsersJson, "domain")); + } + + @Test + void testWriteWithFormat() throws Exception { + catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() + .withStream(new AirbyteStream().withName(USERS_STREAM_NAME).withNamespace(datasetId).withJsonSchema(getSchemaWithFormats())) + .withSyncMode(SyncMode.FULL_REFRESH).withDestinationSyncMode(DestinationSyncMode.OVERWRITE))); + + final BigQueryDestination destination = new BigQueryDenormalizedDestination(); + final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); + + consumer.accept(MESSAGE_USERS3); + consumer.close(); + + final List usersActual = retrieveRecordsAsJson(USERS_STREAM_NAME); + final JsonNode expectedUsersJson = MESSAGE_USERS3.getRecord().getData(); + assertEquals(usersActual.size(), 1); + final JsonNode resultJson 
= usersActual.get(0); + assertEquals(extractJsonValues(resultJson, "name"), extractJsonValues(expectedUsersJson, "name")); + assertEquals(extractJsonValues(resultJson, "date_of_birth"), extractJsonValues(expectedUsersJson, "date_of_birth")); + + // Bigquery's datetime type accepts multiple input format but always outputs the same, so we can't + // expect to receive the value we sent. + assertEquals(extractJsonValues(resultJson, "updated_at"), Set.of("2021-10-11T06:36:53Z")); + + final Schema expectedSchema = Schema.of( + Field.of("name", StandardSQLTypeName.STRING), + Field.of("date_of_birth", StandardSQLTypeName.DATE), + Field.of("updated_at", StandardSQLTypeName.TIMESTAMP), + Field.of(JavaBaseConstants.COLUMN_NAME_AB_ID, StandardSQLTypeName.STRING), + Field.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, StandardSQLTypeName.TIMESTAMP)); + + assertEquals(BigQueryUtils.getTableDefinition(bigquery, dataset.getDatasetId().getDataset(), USERS_STREAM_NAME).getSchema(), expectedSchema); + } + + @Test + void testIfJSONDateTimeWasConvertedToBigQueryFormat() throws Exception { + catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() + .withStream(new AirbyteStream().withName(USERS_STREAM_NAME).withNamespace(datasetId).withJsonSchema(getSchemaWithDateTime())) + .withSyncMode(SyncMode.FULL_REFRESH).withDestinationSyncMode(DestinationSyncMode.OVERWRITE))); + + final BigQueryDestination destination = new BigQueryDenormalizedDestination(); + final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); + + consumer.accept(MESSAGE_USERS4); + consumer.close(); + + final List usersActual = retrieveRecordsAsJson(USERS_STREAM_NAME); + assertEquals(usersActual.size(), 1); + final JsonNode resultJson = usersActual.get(0); + + // BigQuery Accepts "YYYY-MM-DD HH:MM:SS[.SSSSSS]" format + // returns "yyyy-MM-dd'T'HH:mm:ss" format + assertEquals(Set.of("2021-10-11T06:36:53Z"), extractJsonValues(resultJson, "updated_at")); + // check nested datetime + assertEquals(Set.of("2021-11-11T06:36:53Z"), + extractJsonValues(resultJson.get("items"), "nested_datetime")); + } + + @Test + void testJsonReferenceDefinition() throws Exception { + catalog = new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList(new ConfiguredAirbyteStream() + .withStream(new AirbyteStream().withName(USERS_STREAM_NAME).withNamespace(datasetId).withJsonSchema(getSchemaWithReferenceDefinition())) + .withSyncMode(SyncMode.FULL_REFRESH).withDestinationSyncMode(DestinationSyncMode.OVERWRITE))); + + final BigQueryDestination destination = new BigQueryDenormalizedDestination(); + final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector); + + consumer.accept(MESSAGE_USERS5); + consumer.accept(MESSAGE_USERS6); + consumer.accept(EMPTY_MESSAGE); + consumer.close(); + + final Set actual = + retrieveRecordsAsJson(USERS_STREAM_NAME).stream().flatMap(x -> extractJsonValues(x, "users").stream()).collect(Collectors.toSet()); + + final Set expected = Sets.set( + "{\"name\":\"John\",\"surname\":\"Adams\"}", + null // we expect one record to have not had the users field set + ); + + assertEquals(2, actual.size()); + assertEquals(expected, actual); + } + + private Set extractJsonValues(final JsonNode node, final String attributeName) { + final List valuesNode = node.findValues(attributeName); + final Set resultSet = new HashSet<>(); + valuesNode.forEach(jsonNode -> { + if (jsonNode.isArray()) 
{ + jsonNode.forEach(arrayNodeValue -> resultSet.add(arrayNodeValue.textValue())); + } else if (jsonNode.isObject()) { + resultSet.addAll(extractJsonValues(jsonNode, NESTED_ARRAY_FIELD)); + } else { + resultSet.add(jsonNode.textValue()); + } + }); + + return resultSet; + } + + private JsonNode removeAirbyteMetadataFields(final JsonNode record) { + for (final String airbyteMetadataField : AIRBYTE_METADATA_FIELDS) { + ((ObjectNode) record).remove(airbyteMetadataField); + } + return record; + } + + private List retrieveRecordsAsJson(final String tableName) throws Exception { + final QueryJobConfiguration queryConfig = + QueryJobConfiguration + .newBuilder( + String.format("select TO_JSON_STRING(t) as jsonValue from %s.%s t;", dataset.getDatasetId().getDataset(), tableName.toLowerCase())) + .setUseLegacySql(false).build(); + BigQueryUtils.executeQuery(bigquery, queryConfig); + + return StreamSupport + .stream(BigQueryUtils.executeQuery(bigquery, queryConfig).getLeft().getQueryResults().iterateAll().spliterator(), false) + .map(v -> v.get("jsonValue").getStringValue()) + .map(Jsons::deserialize) + .map(this::removeAirbyteMetadataFields) + .collect(Collectors.toList()); + } + + private static Stream schemaAndDataProvider() { + return Stream.of( + arguments(getSchema(), MESSAGE_USERS1), + arguments(getSchemaWithInvalidArrayType(), MESSAGE_USERS1), + arguments(getSchema(), MESSAGE_USERS2)); + } + + private static AirbyteMessage createRecordMessage(final String stream, final JsonNode data) { + return new AirbyteMessage().withType(AirbyteMessage.Type.RECORD) + .withRecord(new AirbyteRecordMessage().withStream(stream) + .withData(data) + .withEmittedAt(NOW.toEpochMilli())); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/util/BigQueryDenormalizedTestDataUtils.java b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/util/BigQueryDenormalizedTestDataUtils.java index 782c5cc897590..5d8e044854595 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/util/BigQueryDenormalizedTestDataUtils.java +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/src/test-integration/java/io/airbyte/integrations/destination/bigquery/util/BigQueryDenormalizedTestDataUtils.java @@ -225,6 +225,43 @@ public static JsonNode getSchemaWithReferenceDefinition() { + " "); } + public static JsonNode getSchemaWithNestedDatetimeInsideNullObject() { + return Jsons.deserialize("{\n" + + " \"type\": [\n" + + " \"object\"\n" + + " ],\n" + + " \"properties\": {\n" + + " \"name\": {\n" + + " \"type\": [\n" + + " \"null\",\n" + + " \"string\"\n" + + " ]\n" + + " },\n" + + " \"appointment\": {\n" + + " \"type\": [\n" + + " \"null\",\n" + + " \"object\"\n" + + " ],\n" + + " \"properties\": {\n" + + " \"street\": {\n" + + " \"type\": [\n" + + " \"null\",\n" + + " \"string\"\n" + + " ]\n" + + " },\n" + + " \"expTime\": {\n" + + " \"type\": [\n" + + " \"null\",\n" + + " \"string\"\n" + + " ],\n" + + " \"format\": \"date-time\"\n" + + " }\n" + + " }\n" + + " }\n" + + " }\n" + + "}"); + } + public static JsonNode getDataWithEmptyObjectAndArray() { return Jsons.deserialize( "{\n" @@ -249,4 +286,12 @@ public static JsonNode getDataWithEmptyObjectAndArray() { + "}"); } + public static JsonNode getDataWithNestedDatetimeInsideNullObject() { + 
return Jsons.deserialize("{\n" + + " \"name\": \"Alice in Wonderland\",\n" + + " \"appointment\": null\n" + + "}"); + + } + } diff --git a/airbyte-integrations/connectors/destination-bigquery/Dockerfile b/airbyte-integrations/connectors/destination-bigquery/Dockerfile index 2340242c9d750..7e59e7c0f2e5c 100644 --- a/airbyte-integrations/connectors/destination-bigquery/Dockerfile +++ b/airbyte-integrations/connectors/destination-bigquery/Dockerfile @@ -4,7 +4,9 @@ WORKDIR /airbyte ENV APPLICATION destination-bigquery -ADD build/distributions/${APPLICATION}*.tar /airbyte +COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar -LABEL io.airbyte.version=0.5.1 +RUN tar xf ${APPLICATION}.tar --strip-components=1 + +LABEL io.airbyte.version=0.6.0-rc1 LABEL io.airbyte.name=airbyte/destination-bigquery diff --git a/airbyte-integrations/connectors/destination-bigquery/build.gradle b/airbyte-integrations/connectors/destination-bigquery/build.gradle index c30f258a5b59d..ed1bc14b1a761 100644 --- a/airbyte-integrations/connectors/destination-bigquery/build.gradle +++ b/airbyte-integrations/connectors/destination-bigquery/build.gradle @@ -21,6 +21,11 @@ dependencies { implementation project(':airbyte-protocol:models') implementation project(':airbyte-integrations:connectors:destination-s3') implementation project(':airbyte-integrations:connectors:destination-gcs') + implementation('tech.allegro.schema.json2avro:converter') { + version { + branch = 'master' + } + } integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-destination-test') integrationTestJavaImplementation files(project(':airbyte-integrations:bases:base-normalization').airbyteDocker.outputs) diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java index 99a3568ec5f2d..d44662410bd4e 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java @@ -6,23 +6,12 @@ import static java.util.Objects.isNull; -import com.amazonaws.services.s3.AmazonS3; import com.fasterxml.jackson.databind.JsonNode; import com.google.auth.oauth2.ServiceAccountCredentials; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryOptions; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.FormatOptions; import com.google.cloud.bigquery.Job; -import com.google.cloud.bigquery.JobId; -import com.google.cloud.bigquery.JobInfo.CreateDisposition; -import com.google.cloud.bigquery.JobInfo.WriteDisposition; import com.google.cloud.bigquery.QueryJobConfiguration; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.StandardSQLTypeName; -import com.google.cloud.bigquery.TableDataWriteChannel; -import com.google.cloud.bigquery.TableId; -import com.google.cloud.bigquery.WriteChannelConfiguration; import com.google.common.base.Charsets; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.BaseConnector; @@ -30,25 +19,25 @@ import io.airbyte.integrations.base.AirbyteStreamNameNamespacePair; import io.airbyte.integrations.base.Destination; import io.airbyte.integrations.base.IntegrationRunner; -import 
io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.formatter.DefaultBigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.formatter.GcsAvroBigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.formatter.GcsCsvBigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.uploader.AbstractBigQueryUploader; +import io.airbyte.integrations.destination.bigquery.uploader.BigQueryUploaderFactory; +import io.airbyte.integrations.destination.bigquery.uploader.UploaderType; +import io.airbyte.integrations.destination.bigquery.uploader.config.UploaderConfig; import io.airbyte.integrations.destination.gcs.GcsDestination; -import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; -import io.airbyte.integrations.destination.gcs.GcsS3Helper; -import io.airbyte.integrations.destination.gcs.csv.GcsCsvWriter; import io.airbyte.protocol.models.AirbyteConnectionStatus; import io.airbyte.protocol.models.AirbyteConnectionStatus.Status; import io.airbyte.protocol.models.AirbyteMessage; import io.airbyte.protocol.models.AirbyteStream; import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; import io.airbyte.protocol.models.ConfiguredAirbyteStream; -import io.airbyte.protocol.models.DestinationSyncMode; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.sql.Timestamp; import java.util.HashMap; -import java.util.HashSet; import java.util.Map; -import java.util.Set; import java.util.function.Consumer; import org.apache.commons.lang3.tuple.ImmutablePair; import org.slf4j.Logger; @@ -58,11 +47,6 @@ public class BigQueryDestination extends BaseConnector implements Destination { private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDestination.class); - private static final com.google.cloud.bigquery.Schema SCHEMA = com.google.cloud.bigquery.Schema.of( - Field.of(JavaBaseConstants.COLUMN_NAME_AB_ID, StandardSQLTypeName.STRING), - Field.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, StandardSQLTypeName.TIMESTAMP), - Field.of(JavaBaseConstants.COLUMN_NAME_DATA, StandardSQLTypeName.STRING)); - private final BigQuerySQLNameTransformer namingResolver; public BigQueryDestination() { @@ -75,7 +59,7 @@ public AirbyteConnectionStatus check(final JsonNode config) { final String datasetId = config.get(BigQueryConsts.CONFIG_DATASET_ID).asText(); final String datasetLocation = BigQueryUtils.getDatasetLocation(config); final BigQuery bigquery = getBigQuery(config); - final UploadingMethod uploadingMethod = getLoadingMethod(config); + final UploadingMethod uploadingMethod = BigQueryUtils.getLoadingMethod(config); BigQueryUtils.createSchemaTable(bigquery, datasetId, datasetLocation); final QueryJobConfiguration queryConfig = QueryJobConfiguration @@ -109,27 +93,13 @@ protected BigQuerySQLNameTransformer getNamingResolver() { return namingResolver; } - // https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html - private Integer getBigQueryClientChunkSize(final JsonNode config) { - Integer chunkSizeFromConfig = null; - if (config.has(BigQueryConsts.BIG_QUERY_CLIENT_CHUNK_SIZE)) { - chunkSizeFromConfig = config.get(BigQueryConsts.BIG_QUERY_CLIENT_CHUNK_SIZE).asInt(); - if (chunkSizeFromConfig <= 0) { - LOGGER.error("BigQuery client Chunk (buffer) size must be a positive number (MB), but was:" + chunkSizeFromConfig); - throw new 
IllegalArgumentException("BigQuery client Chunk (buffer) size must be a positive number (MB)"); - } - chunkSizeFromConfig = chunkSizeFromConfig * BigQueryConsts.MiB; - } - return chunkSizeFromConfig; - } - - private BigQuery getBigQuery(final JsonNode config) { + protected BigQuery getBigQuery(final JsonNode config) { final String projectId = config.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); try { final BigQueryOptions.Builder bigQueryBuilder = BigQueryOptions.newBuilder(); ServiceAccountCredentials credentials = null; - if (isUsingJsonCredentials(config)) { + if (BigQueryUtils.isUsingJsonCredentials(config)) { // handle the credentials json being passed as a json object or a json object already serialized as // a string. final String credentialsString = @@ -148,10 +118,6 @@ private BigQuery getBigQuery(final JsonNode config) { } } - public static boolean isUsingJsonCredentials(final JsonNode config) { - return config.has(BigQueryConsts.CONFIG_CREDS) && !config.get(BigQueryConsts.CONFIG_CREDS).asText().isEmpty(); - } - /** * Strategy: *
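Editor's note: the hunk below replaces the hand-rolled TableDataWriteChannel / GCS CSV plumbing with per-stream uploaders. The following is a minimal sketch of how the refactored pieces compose from a caller's point of view, assuming only the signatures visible in this diff; the sketch class itself is hypothetical and not part of the patch.

```java
import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.integrations.base.AirbyteMessageConsumer;
import io.airbyte.integrations.base.Destination;
import io.airbyte.integrations.destination.bigquery.BigQueryDestination;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
import java.util.List;

// Hypothetical illustration, not part of the patch.
class BigQueryUploadFlowSketch {

  static void sync(final JsonNode config, final ConfiguredAirbyteCatalog catalog, final List<AirbyteMessage> messages) throws Exception {
    final Destination destination = new BigQueryDestination();
    // getConsumer() builds one AbstractBigQueryUploader per configured stream via getUploaderMap()
    // (direct standard upload, or GCS CSV/Avro staging depending on the loading method and formatter
    // map) and wraps the map in a BigQueryRecordConsumer.
    final AirbyteMessageConsumer consumer =
        destination.getConsumer(config, catalog, Destination::defaultOutputRecordCollector);
    for (final AirbyteMessage message : messages) {
      consumer.accept(message); // RECORD messages are routed to the uploader for their stream name/namespace pair
    }
    consumer.close(); // each uploader flushes, loads/copies into its target table, and the last STATE is emitted
  }
}
```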

@@ -179,154 +145,59 @@ public AirbyteMessageConsumer getConsumer(final JsonNode config, final ConfiguredAirbyteCatalog catalog, final Consumer outputRecordCollector) throws IOException { + return getRecordConsumer(getUploaderMap(config, catalog), outputRecordCollector); + } + + protected Map> getUploaderMap(final JsonNode config, + final ConfiguredAirbyteCatalog catalog) + throws IOException { final BigQuery bigquery = getBigQuery(config); - final Map writeConfigs = new HashMap<>(); - final Set existingSchemas = new HashSet<>(); - final boolean isGcsUploadingMode = UploadingMethod.GCS.equals(getLoadingMethod(config)); - final boolean isKeepFilesInGcs = isKeepFilesInGcs(config); - // create tmp tables if not exist + final Map> uploaderMap = new HashMap<>(); for (final ConfiguredAirbyteStream configStream : catalog.getStreams()) { final AirbyteStream stream = configStream.getStream(); final String streamName = stream.getName(); - final String schemaName = getSchema(config, configStream); - final String tableName = getTargetTableName(streamName); - final String tmpTableName = namingResolver.getTmpTableName(streamName); - final String datasetLocation = BigQueryUtils.getDatasetLocation(config); - BigQueryUtils.createSchemaAndTableIfNeeded(bigquery, existingSchemas, schemaName, tmpTableName, - datasetLocation, getBigQuerySchema(stream.getJsonSchema())); - final Schema schema = getBigQuerySchema(stream.getJsonSchema()); - // https://cloud.google.com/bigquery/docs/loading-data-local#loading_data_from_a_local_data_source - final WriteChannelConfiguration writeChannelConfiguration = WriteChannelConfiguration - .newBuilder(TableId.of(schemaName, tmpTableName)) - .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) - .setSchema(schema) - .setFormatOptions(FormatOptions.json()).build(); // new-line delimited json. - - final JobId job = JobId.newBuilder() - .setRandomJob() - .setLocation(datasetLocation) - .setProject(bigquery.getOptions().getProjectId()) + UploaderConfig uploaderConfig = UploaderConfig + .builder() + .bigQuery(bigquery) + .configStream(configStream) + .config(config) + .formatterMap(getFormatterMap(stream.getJsonSchema())) + .tmpTableName(namingResolver.getTmpTableName(streamName)) + .targetTableName(getTargetTableName(streamName)) + .isDefaultAirbyteTmpSchema(isDefaultAirbyteTmpTableSchema()) .build(); - final TableDataWriteChannel writer = bigquery.writer(job, writeChannelConfiguration); - - // this this optional value. 
If not set - use default client's value (15MiG) - final Integer bigQueryClientChunkSizeFomConfig = getBigQueryClientChunkSize(config); - if (bigQueryClientChunkSizeFomConfig != null) { - writer.setChunkSize(bigQueryClientChunkSizeFomConfig); - } - final WriteDisposition syncMode = getWriteDisposition(configStream.getDestinationSyncMode()); - - if (isGcsUploadingMode) { - final GcsDestinationConfig gcsDestinationConfig = GcsDestinationConfig - .getGcsDestinationConfig(BigQueryUtils.getGcsJsonNodeConfig(config)); - final GcsCsvWriter gcsCsvWriter = initGcsWriter(gcsDestinationConfig, configStream); - gcsCsvWriter.initialize(); - - writeConfigs.put(AirbyteStreamNameNamespacePair.fromAirbyteSteam(stream), - new BigQueryWriteConfig(TableId.of(schemaName, tableName), TableId.of(schemaName, tmpTableName), - writer, syncMode, schema, gcsCsvWriter, gcsDestinationConfig)); - } else { - writeConfigs.put(AirbyteStreamNameNamespacePair.fromAirbyteSteam(stream), - new BigQueryWriteConfig(TableId.of(schemaName, tableName), TableId.of(schemaName, tmpTableName), - writer, syncMode, schema, null, null)); - } - + uploaderMap.put( + AirbyteStreamNameNamespacePair.fromAirbyteSteam(stream), + BigQueryUploaderFactory.getUploader(uploaderConfig)); } - // write to tmp tables - // if success copy delete main table if exists. rename tmp tables to real tables. - return getRecordConsumer(bigquery, writeConfigs, catalog, outputRecordCollector, isGcsUploadingMode, isKeepFilesInGcs); + return uploaderMap; } /** - * Despite the fact that uploading to going to be done to GCS, you may see the S3 client - * initialization. The S3 client appears to be compatible with GCS and widely used in - * destination-gcs connector. Since the destination-gcs connector is partially re-used here - we - * also need to init S3 client. - * - * @param gcsDestinationConfig - * @param configuredStream - * @return GcsCsvWriter - * @throws IOException + * BigQuery might have different structure of the Temporary table. + * If this method returns TRUE, temporary table will have only three common Airbyte attributes. + * In case of FALSE, temporary table structure will be in line with Airbyte message JsonSchema. 
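For illustration only: the flag documented above is intended to be overridden by subclasses. A hedged sketch follows (the subclass name is hypothetical) of a destination that opts into a JsonSchema-shaped temporary table.

```java
// Hypothetical subclass, not part of the patch; it only illustrates the hook described above.
public class JsonSchemaTmpTableDestination extends BigQueryDestination {

  @Override
  protected boolean isDefaultAirbyteTmpTableSchema() {
    // false: build the temporary table from the Airbyte message JsonSchema
    // instead of the three default Airbyte columns.
    return false;
  }

}
```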
+ * @return use default AirbyteSchema or build using JsonSchema */ - private GcsCsvWriter initGcsWriter(final GcsDestinationConfig gcsDestinationConfig, - final ConfiguredAirbyteStream configuredStream) - throws IOException { - final Timestamp uploadTimestamp = new Timestamp(System.currentTimeMillis()); + protected boolean isDefaultAirbyteTmpTableSchema() { + return true; + } - final AmazonS3 s3Client = GcsS3Helper.getGcsS3Client(gcsDestinationConfig); - return new GcsCsvWriter(gcsDestinationConfig, s3Client, configuredStream, uploadTimestamp); + protected Map getFormatterMap(JsonNode jsonSchema) { + return Map.of(UploaderType.STANDARD, new DefaultBigQueryRecordFormatter(jsonSchema, getNamingResolver()), + UploaderType.CSV, new GcsCsvBigQueryRecordFormatter(jsonSchema, getNamingResolver()), + UploaderType.AVRO, new GcsAvroBigQueryRecordFormatter(jsonSchema, getNamingResolver())); } protected String getTargetTableName(final String streamName) { return namingResolver.getRawTableName(streamName); } - protected AirbyteMessageConsumer getRecordConsumer(final BigQuery bigquery, - final Map writeConfigs, - final ConfiguredAirbyteCatalog catalog, - final Consumer outputRecordCollector, - final boolean isGcsUploadingMode, - final boolean isKeepFilesInGcs) { - return new BigQueryRecordConsumer(bigquery, writeConfigs, catalog, outputRecordCollector, isGcsUploadingMode, isKeepFilesInGcs); - } - - protected Schema getBigQuerySchema(final JsonNode jsonSchema) { - return SCHEMA; - } - - private static String getSchema(final JsonNode config, final ConfiguredAirbyteStream stream) { - final String defaultSchema = config.get(BigQueryConsts.CONFIG_DATASET_ID).asText(); - final String srcNamespace = stream.getStream().getNamespace(); - if (srcNamespace == null) { - return defaultSchema; - } - return srcNamespace; - } - - private static WriteDisposition getWriteDisposition(final DestinationSyncMode syncMode) { - if (syncMode == null) { - throw new IllegalStateException("Undefined destination sync mode"); - } - switch (syncMode) { - case OVERWRITE -> { - return WriteDisposition.WRITE_TRUNCATE; - } - case APPEND, APPEND_DEDUP -> { - return WriteDisposition.WRITE_APPEND; - } - default -> throw new IllegalStateException("Unrecognized destination sync mode: " + syncMode); - } - } - - private UploadingMethod getLoadingMethod(final JsonNode config) { - final JsonNode loadingMethod = config.get(BigQueryConsts.LOADING_METHOD); - if (loadingMethod != null && BigQueryConsts.GCS_STAGING.equals(loadingMethod.get(BigQueryConsts.METHOD).asText())) { - LOGGER.info("Selected loading method is set to: " + UploadingMethod.GCS); - return UploadingMethod.GCS; - } else { - LOGGER.info("Selected loading method is set to: " + UploadingMethod.STANDARD); - return UploadingMethod.STANDARD; - } - } - - private boolean isKeepFilesInGcs(final JsonNode config) { - final JsonNode loadingMethod = config.get(BigQueryConsts.LOADING_METHOD); - if (loadingMethod != null && loadingMethod.get(BigQueryConsts.KEEP_GCS_FILES) != null - && BigQueryConsts.KEEP_GCS_FILES_VAL - .equals(loadingMethod.get(BigQueryConsts.KEEP_GCS_FILES).asText())) { - LOGGER.info("All tmp files GCS will be kept in bucket when replication is finished"); - return true; - } else { - LOGGER.info("All tmp files will be removed from GCS when replication is finished"); - return false; - } - } - - public enum UploadingMethod { - STANDARD, - GCS + protected AirbyteMessageConsumer getRecordConsumer(final Map> writeConfigs, + final Consumer outputRecordCollector) { + return new 
BigQueryRecordConsumer(writeConfigs, outputRecordCollector); } public static void main(final String[] args) throws Exception { diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryRecordConsumer.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryRecordConsumer.java index 8005b9fde2f66..dac23621492b6 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryRecordConsumer.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryRecordConsumer.java @@ -4,30 +4,14 @@ package io.airbyte.integrations.destination.bigquery; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.DeleteObjectsRequest.KeyVersion; -import com.amazonaws.services.s3.model.S3ObjectSummary; -import com.google.cloud.bigquery.BigQuery; import io.airbyte.integrations.base.AirbyteMessageConsumer; import io.airbyte.integrations.base.AirbyteStreamNameNamespacePair; import io.airbyte.integrations.base.FailureTrackingAirbyteMessageConsumer; -import io.airbyte.integrations.destination.bigquery.BigQueryDestination.UploadingMethod; -import io.airbyte.integrations.destination.bigquery.strategy.BigQueryUploadGCSStrategy; -import io.airbyte.integrations.destination.bigquery.strategy.BigQueryUploadStandardStrategy; -import io.airbyte.integrations.destination.bigquery.strategy.BigQueryUploadStrategy; -import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; -import io.airbyte.integrations.destination.gcs.GcsS3Helper; +import io.airbyte.integrations.destination.bigquery.uploader.AbstractBigQueryUploader; import io.airbyte.protocol.models.AirbyteMessage; import io.airbyte.protocol.models.AirbyteMessage.Type; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.io.IOException; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; import java.util.function.Consumer; -import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,32 +19,14 @@ public class BigQueryRecordConsumer extends FailureTrackingAirbyteMessageConsume private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryRecordConsumer.class); - private final BigQuery bigquery; - private final Map writeConfigs; + private final Map> uploaderMap; private final Consumer outputRecordCollector; - private final boolean isGcsUploadingMode; - private final boolean isKeepFilesInGcs; - - private final ConfiguredAirbyteCatalog catalog; - private AirbyteMessage lastStateMessage = null; - protected final Map bigQueryUploadStrategyMap = new ConcurrentHashMap<>(); - - public BigQueryRecordConsumer(final BigQuery bigquery, - final Map writeConfigs, - final ConfiguredAirbyteCatalog catalog, - final Consumer outputRecordCollector, - final boolean isGcsUploadingMode, - final boolean isKeepFilesInGcs) { - this.bigquery = bigquery; - this.writeConfigs = writeConfigs; - this.catalog = catalog; + public BigQueryRecordConsumer(final Map> uploaderMap, + final Consumer outputRecordCollector) { + this.uploaderMap = uploaderMap; this.outputRecordCollector = outputRecordCollector; - this.isGcsUploadingMode = isGcsUploadingMode; - this.isKeepFilesInGcs = isKeepFilesInGcs; - 
bigQueryUploadStrategyMap.put(UploadingMethod.STANDARD, new BigQueryUploadStandardStrategy(bigquery, catalog, outputRecordCollector)); - bigQueryUploadStrategyMap.put(UploadingMethod.GCS, new BigQueryUploadGCSStrategy(bigquery)); } @Override @@ -69,7 +35,7 @@ protected void startTracked() { } @Override - public void acceptTracked(final AirbyteMessage message) throws IOException { + public void acceptTracked(final AirbyteMessage message) { if (message.getType() == Type.STATE) { lastStateMessage = message; } else if (message.getType() == Type.RECORD) { @@ -81,58 +47,13 @@ public void acceptTracked(final AirbyteMessage message) throws IOException { private void processRecord(AirbyteMessage message) { final var pair = AirbyteStreamNameNamespacePair.fromRecordMessage(message.getRecord()); - final var writer = writeConfigs.get(pair); - if (isGcsUploadingMode) { - bigQueryUploadStrategyMap.get(UploadingMethod.GCS).upload(writer, message, catalog); - } else { - bigQueryUploadStrategyMap.get(UploadingMethod.STANDARD).upload(writer, message, catalog); - } + uploaderMap.get(pair).upload(message); } @Override public void close(final boolean hasFailed) { LOGGER.info("Started closing all connections"); - // process gcs streams - if (isGcsUploadingMode) { - final List gcsWritersList = writeConfigs.values().parallelStream() - .filter(el -> el.getGcsCsvWriter() != null) - .collect(Collectors.toList()); - bigQueryUploadStrategyMap.get(UploadingMethod.GCS).close(gcsWritersList, hasFailed, lastStateMessage); - } - - bigQueryUploadStrategyMap.get(UploadingMethod.STANDARD).close(new ArrayList<>(writeConfigs.values()), hasFailed, lastStateMessage); - - if (isGcsUploadingMode && !isKeepFilesInGcs) { - deleteDataFromGcsBucket(); - } - } - - private void deleteDataFromGcsBucket() { - writeConfigs.values().forEach(writeConfig -> { - final GcsDestinationConfig gcsDestinationConfig = writeConfig.getGcsDestinationConfig(); - final AmazonS3 s3Client = GcsS3Helper.getGcsS3Client(gcsDestinationConfig); - - final String gcsBucketName = gcsDestinationConfig.getBucketName(); - final String gcs_bucket_path = gcsDestinationConfig.getBucketPath(); - - final List keysToDelete = new LinkedList<>(); - final List objects = s3Client - .listObjects(gcsBucketName, gcs_bucket_path) - .getObjectSummaries(); - for (final S3ObjectSummary object : objects) { - keysToDelete.add(new KeyVersion(object.getKey())); - } - - if (!keysToDelete.isEmpty()) { - LOGGER.info("Tearing down test bucket path: {}/{}", gcsBucketName, gcs_bucket_path); - // Google Cloud Storage doesn't accept request to delete multiple objects - for (final KeyVersion keyToDelete : keysToDelete) { - s3Client.deleteObject(gcsBucketName, keyToDelete.getKey()); - } - LOGGER.info("Deleted {} file(s).", keysToDelete.size()); - } - s3Client.shutdown(); - }); + uploaderMap.values().parallelStream().forEach(uploader -> uploader.close(hasFailed, outputRecordCollector, lastStateMessage)); } } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryUtils.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryUtils.java index b7ea7ef007e97..c4dbde0a84e97 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryUtils.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryUtils.java @@ -4,6 
+4,8 @@ package io.airbyte.integrations.destination.bigquery; +import static io.airbyte.integrations.destination.bigquery.helpers.LoggerHelper.getJobErrorMessage; + import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.cloud.bigquery.BigQuery; @@ -29,6 +31,8 @@ import com.google.common.collect.ImmutableMap; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.DestinationSyncMode; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ -72,12 +76,12 @@ static Job waitForQuery(final Job queryJob) { } } - static void createSchemaAndTableIfNeeded(final BigQuery bigquery, - final Set existingSchemas, - final String schemaName, - final String tmpTableName, - final String datasetLocation, - final Schema schema) { + public static void createSchemaAndTableIfNeeded(final BigQuery bigquery, + final Set existingSchemas, + final String schemaName, + final String tmpTableName, + final String datasetLocation, + final Schema schema) { if (!existingSchemas.contains(schemaName)) { createSchemaTable(bigquery, schemaName, datasetLocation); existingSchemas.add(schemaName); @@ -139,6 +143,23 @@ public static JsonNode getGcsJsonNodeConfig(final JsonNode config) { return gcsJsonNode; } + public static JsonNode getGcsAvroJsonNodeConfig(final JsonNode config) { + final JsonNode loadingMethod = config.get(BigQueryConsts.LOADING_METHOD); + final JsonNode gcsJsonNode = Jsons.jsonNode(ImmutableMap.builder() + .put(BigQueryConsts.GCS_BUCKET_NAME, loadingMethod.get(BigQueryConsts.GCS_BUCKET_NAME)) + .put(BigQueryConsts.GCS_BUCKET_PATH, loadingMethod.get(BigQueryConsts.GCS_BUCKET_PATH)) + .put(BigQueryConsts.GCS_BUCKET_REGION, getDatasetLocation(config)) + .put(BigQueryConsts.CREDENTIAL, loadingMethod.get(BigQueryConsts.CREDENTIAL)) + .put(BigQueryConsts.FORMAT, Jsons.deserialize("{\n" + + " \"format_type\": \"AVRO\",\n" + + " \"flattening\": \"No flattening\"\n" + + "}")) + .build()); + + LOGGER.debug("Composed GCS config is: \n" + gcsJsonNode.toPrettyString()); + return gcsJsonNode; + } + public static String getDatasetLocation(final JsonNode config) { if (config.has(BigQueryConsts.CONFIG_DATASET_LOCATION)) { return config.get(BigQueryConsts.CONFIG_DATASET_LOCATION).asText(); @@ -190,4 +211,84 @@ public static void transformJsonDateTimeToBigDataFormat(List dateTimeFie }); } + public static String getSchema(final JsonNode config, final ConfiguredAirbyteStream stream) { + final String defaultSchema = config.get(BigQueryConsts.CONFIG_DATASET_ID).asText(); + final String srcNamespace = stream.getStream().getNamespace(); + if (srcNamespace == null) { + return defaultSchema; + } + return srcNamespace; + } + + public static JobInfo.WriteDisposition getWriteDisposition(final DestinationSyncMode syncMode) { + if (syncMode == null) { + throw new IllegalStateException("Undefined destination sync mode"); + } + switch (syncMode) { + case OVERWRITE -> { + return JobInfo.WriteDisposition.WRITE_TRUNCATE; + } + case APPEND, APPEND_DEDUP -> { + return JobInfo.WriteDisposition.WRITE_APPEND; + } + default -> throw new IllegalStateException("Unrecognized destination sync mode: " + syncMode); + } + } + + public static boolean isUsingJsonCredentials(final JsonNode config) { + return config.has(BigQueryConsts.CONFIG_CREDS) && !config.get(BigQueryConsts.CONFIG_CREDS).asText().isEmpty(); + } + + // 
https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html + public static Integer getBigQueryClientChunkSize(final JsonNode config) { + Integer chunkSizeFromConfig = null; + if (config.has(BigQueryConsts.BIG_QUERY_CLIENT_CHUNK_SIZE)) { + chunkSizeFromConfig = config.get(BigQueryConsts.BIG_QUERY_CLIENT_CHUNK_SIZE).asInt(); + if (chunkSizeFromConfig <= 0) { + LOGGER.error("BigQuery client Chunk (buffer) size must be a positive number (MB), but was:" + chunkSizeFromConfig); + throw new IllegalArgumentException("BigQuery client Chunk (buffer) size must be a positive number (MB)"); + } + chunkSizeFromConfig = chunkSizeFromConfig * BigQueryConsts.MiB; + } + return chunkSizeFromConfig; + } + + public static UploadingMethod getLoadingMethod(final JsonNode config) { + final JsonNode loadingMethod = config.get(BigQueryConsts.LOADING_METHOD); + if (loadingMethod != null && BigQueryConsts.GCS_STAGING.equals(loadingMethod.get(BigQueryConsts.METHOD).asText())) { + LOGGER.info("Selected loading method is set to: " + UploadingMethod.GCS); + return UploadingMethod.GCS; + } else { + LOGGER.info("Selected loading method is set to: " + UploadingMethod.STANDARD); + return UploadingMethod.STANDARD; + } + } + + public static boolean isKeepFilesInGcs(final JsonNode config) { + final JsonNode loadingMethod = config.get(BigQueryConsts.LOADING_METHOD); + if (loadingMethod != null && loadingMethod.get(BigQueryConsts.KEEP_GCS_FILES) != null + && BigQueryConsts.KEEP_GCS_FILES_VAL + .equals(loadingMethod.get(BigQueryConsts.KEEP_GCS_FILES).asText())) { + LOGGER.info("All tmp files GCS will be kept in bucket when replication is finished"); + return true; + } else { + LOGGER.info("All tmp files will be removed from GCS when replication is finished"); + return false; + } + } + + public static void waitForJobFinish(Job job) throws InterruptedException { + if (job != null) { + try { + LOGGER.info("Waiting for job finish {}. Status: {}", job, job.getStatus()); + job.waitFor(); + LOGGER.info("Job finish {} with status {}", job, job.getStatus()); + } catch (final BigQueryException e) { + String errorMessage = getJobErrorMessage(e.getErrors(), job); + LOGGER.error(errorMessage); + throw new BigQueryException(e.getCode(), errorMessage, e); + } + } + } + } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryWriteConfig.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryWriteConfig.java deleted file mode 100644 index 2553e5b2c1446..0000000000000 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryWriteConfig.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
- */ - -package io.airbyte.integrations.destination.bigquery; - -import com.google.cloud.bigquery.JobInfo.WriteDisposition; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.TableDataWriteChannel; -import com.google.cloud.bigquery.TableId; -import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; -import io.airbyte.integrations.destination.gcs.csv.GcsCsvWriter; - -public class BigQueryWriteConfig { - - private final TableId table; - private final TableId tmpTable; - private final TableDataWriteChannel writer; - private final WriteDisposition syncMode; - private final Schema schema; - private final GcsCsvWriter gcsCsvWriter; - private final GcsDestinationConfig gcsDestinationConfig; - - BigQueryWriteConfig(final TableId table, - final TableId tmpTable, - final TableDataWriteChannel writer, - final WriteDisposition syncMode, - final Schema schema, - final GcsCsvWriter gcsCsvWriter, - final GcsDestinationConfig gcsDestinationConfig) { - this.table = table; - this.tmpTable = tmpTable; - this.writer = writer; - this.syncMode = syncMode; - this.schema = schema; - this.gcsCsvWriter = gcsCsvWriter; - this.gcsDestinationConfig = gcsDestinationConfig; - } - - public TableId getTable() { - return table; - } - - public TableId getTmpTable() { - return tmpTable; - } - - public TableDataWriteChannel getWriter() { - return writer; - } - - public WriteDisposition getSyncMode() { - return syncMode; - } - - public Schema getSchema() { - return schema; - } - - public GcsCsvWriter getGcsCsvWriter() { - return gcsCsvWriter; - } - - public GcsDestinationConfig getGcsDestinationConfig() { - return gcsDestinationConfig; - } - -} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/UploadingMethod.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/UploadingMethod.java new file mode 100644 index 0000000000000..38c85e4ea7c7b --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/UploadingMethod.java @@ -0,0 +1,10 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery; + +public enum UploadingMethod { + STANDARD, + GCS +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/BigQueryRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/BigQueryRecordFormatter.java new file mode 100644 index 0000000000000..ff7dbf3f38297 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/BigQueryRecordFormatter.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.formatter; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.cloud.bigquery.Schema; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The class formats incoming JsonSchema and AirbyteRecord in order to be inline with a corresponding uploader. 
+ */ +public abstract class BigQueryRecordFormatter { + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryRecordFormatter.class); + + private Schema bigQuerySchema; + private final Map> mapOfFailedFields = new HashMap<>(); + protected final StandardNameTransformer namingResolver; + protected final JsonNode jsonSchema; + + public BigQueryRecordFormatter(JsonNode jsonSchema, StandardNameTransformer namingResolver) { + this.namingResolver = namingResolver; + this.jsonSchema = formatJsonSchema(jsonSchema.deepCopy()); + } + + protected JsonNode formatJsonSchema(JsonNode jsonSchema) { + // Do nothing by default + return jsonSchema; + }; + + public abstract JsonNode formatRecord(AirbyteRecordMessage recordMessage); + + public Schema getBigQuerySchema() { + if (bigQuerySchema == null) { + bigQuerySchema = getBigQuerySchema(jsonSchema); + } + return bigQuerySchema; + } + + public JsonNode getJsonSchema() { + return jsonSchema; + } + + protected abstract Schema getBigQuerySchema(JsonNode jsonSchema); + + protected void logFieldFail(String error, String fieldName) { + mapOfFailedFields.putIfAbsent(error, new HashSet<>()); + mapOfFailedFields.get(error).add(fieldName); + } + + public void printAndCleanFieldFails() { + if (!mapOfFailedFields.isEmpty()) { + mapOfFailedFields.forEach( + (error, fieldNames) -> + LOGGER.warn( + "Field(s) fail with error {}. Fields : {} ", + error, + String.join(", ", fieldNames))); + mapOfFailedFields.clear(); + } else { + LOGGER.info("No field fails during record format."); + } + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryRecordFormatter.java new file mode 100644 index 0000000000000..4dfc0a2e83e33 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/DefaultBigQueryRecordFormatter.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.formatter; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.QueryParameterValue; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardSQLTypeName; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** + * Default BigQuery formatter. + * Represents default Airbyte schema (three columns). + * Note! Default formatter is used inside Direct uploader. 
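As a quick illustration of the default formatter introduced below, here is a hedged sketch of the three-column row it produces; the helper class and the choice of name transformer instance are assumptions, only the constructor and formatRecord signature come from this diff.

```java
import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.commons.json.Jsons;
import io.airbyte.integrations.destination.bigquery.BigQuerySQLNameTransformer;
import io.airbyte.integrations.destination.bigquery.formatter.DefaultBigQueryRecordFormatter;
import io.airbyte.protocol.models.AirbyteRecordMessage;

// Hypothetical illustration, not part of the patch.
class DefaultFormatterSketch {

  static JsonNode formatOne(final AirbyteRecordMessage record) {
    // The default formatter ignores the stream's JsonSchema, so an empty schema is enough here.
    final DefaultBigQueryRecordFormatter formatter =
        new DefaultBigQueryRecordFormatter(Jsons.deserialize("{}"), new BigQuerySQLNameTransformer());
    // The returned node has exactly three fields:
    //   _airbyte_ab_id      -> a random UUID string
    //   _airbyte_emitted_at -> emittedAt converted to microseconds and rendered as a BigQuery TIMESTAMP string
    //   _airbyte_data       -> the whole record data serialized to a JSON string
    return formatter.formatRecord(record);
  }

}
```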
+ */ +public class DefaultBigQueryRecordFormatter extends BigQueryRecordFormatter { + + private static final com.google.cloud.bigquery.Schema SCHEMA = com.google.cloud.bigquery.Schema.of( + Field.of(JavaBaseConstants.COLUMN_NAME_AB_ID, StandardSQLTypeName.STRING), + Field.of(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, StandardSQLTypeName.TIMESTAMP), + Field.of(JavaBaseConstants.COLUMN_NAME_DATA, StandardSQLTypeName.STRING)); + + public DefaultBigQueryRecordFormatter(JsonNode jsonSchema, StandardNameTransformer namingResolver) { + super(jsonSchema, namingResolver); + } + + @Override + public JsonNode formatRecord(AirbyteRecordMessage recordMessage) { + return Jsons.jsonNode(Map.of( + JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString(), + JavaBaseConstants.COLUMN_NAME_EMITTED_AT, getEmittedAtField(recordMessage), + JavaBaseConstants.COLUMN_NAME_DATA, getData(recordMessage)) + ); + } + + protected Object getEmittedAtField(AirbyteRecordMessage recordMessage) { + // Bigquery represents TIMESTAMP to the microsecond precision, so we convert to microseconds then + // use BQ helpers to string-format correctly. + final long emittedAtMicroseconds = TimeUnit.MICROSECONDS.convert(recordMessage.getEmittedAt(), TimeUnit.MILLISECONDS); + return QueryParameterValue.timestamp(emittedAtMicroseconds).getValue(); + } + + protected Object getData(AirbyteRecordMessage recordMessage) { + final JsonNode formattedData = StandardNameTransformer.formatJsonPath(recordMessage.getData()); + return Jsons.serialize(formattedData); + } + + @Override + public Schema getBigQuerySchema(JsonNode jsonSchema) { + return SCHEMA; + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsAvroBigQueryRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsAvroBigQueryRecordFormatter.java new file mode 100644 index 0000000000000..bb7d372ea4572 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsAvroBigQueryRecordFormatter.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.formatter; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.protocol.models.AirbyteRecordMessage; + +/** + * Formatter for GCS Avro uploader. + * Contains specific filling of default Airbyte attributes. 
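+ * Unlike the default formatter, _airbyte_emitted_at is left as epoch milliseconds and _airbyte_data as a plain
+ * JSON string; the Avro writer and the load job (which enables Avro logical types, see GcsAvroBigQueryUploader)
+ * are assumed to handle the conversion to the corresponding BigQuery types.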
+ */
+public class GcsAvroBigQueryRecordFormatter extends DefaultBigQueryRecordFormatter {
+
+  public GcsAvroBigQueryRecordFormatter(JsonNode jsonSchema, StandardNameTransformer namingResolver) {
+    super(jsonSchema, namingResolver);
+  }
+
+  @Override
+  protected Object getEmittedAtField(AirbyteRecordMessage recordMessage) {
+    return recordMessage.getEmittedAt();
+  }
+
+  @Override
+  protected Object getData(AirbyteRecordMessage recordMessage) {
+    return StandardNameTransformer.formatJsonPath(recordMessage.getData()).toString();
+  }
+}
diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsCsvBigQueryRecordFormatter.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsCsvBigQueryRecordFormatter.java
new file mode 100644
index 0000000000000..89462ecb6a4e5
--- /dev/null
+++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/formatter/GcsCsvBigQueryRecordFormatter.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.integrations.destination.bigquery.formatter;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import io.airbyte.integrations.destination.StandardNameTransformer;
+
+/**
+ * Formatter for the GCS CSV uploader. Contains specific handling of the default Airbyte attributes.
+ * Note: it may be extended as the CSV GCS integration evolves.
+ */
+public class GcsCsvBigQueryRecordFormatter extends DefaultBigQueryRecordFormatter {
+
+  public GcsCsvBigQueryRecordFormatter(JsonNode jsonSchema, StandardNameTransformer namingResolver) {
+    super(jsonSchema, namingResolver);
+  }
+
+}
diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/helpers/LoggerHelper.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/helpers/LoggerHelper.java
index 78448d7ce367b..aeeae8d3975df 100644
--- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/helpers/LoggerHelper.java
+++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/helpers/LoggerHelper.java
@@ -4,8 +4,13 @@
 package io.airbyte.integrations.destination.bigquery.helpers;
 
+import com.google.cloud.bigquery.BigQueryError;
+import com.google.cloud.bigquery.Job;
 import java.lang.management.ManagementFactory;
 import java.lang.management.MemoryMXBean;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -24,4 +29,12 @@ public static void printHeapMemoryConsumption() {
     LOGGER.info("Max Memory (xmx) : mb = {}", xmx);
   }
 
+  public static String getJobErrorMessage(List<BigQueryError> errors, Job job) {
+    if (!errors.isEmpty()) {
+      return String.format("An error occurred during execution of job: %s, \n For more details see the BigQuery error collection: %s:", job,
+          errors.stream().map(BigQueryError::toString).collect(Collectors.joining(",\n ")));
+    }
+    return StringUtils.EMPTY;
+  }
+
 }
diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadGCSStrategy.java
b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadGCSStrategy.java deleted file mode 100644 index bec251aca09e2..0000000000000 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadGCSStrategy.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. - */ - -package io.airbyte.integrations.destination.bigquery.strategy; - -import static com.amazonaws.util.StringUtils.UTF8; -import static io.airbyte.integrations.destination.bigquery.helpers.LoggerHelper.printHeapMemoryConsumption; - -import com.fasterxml.jackson.databind.JsonNode; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.BigQueryException; -import com.google.cloud.bigquery.CsvOptions; -import com.google.cloud.bigquery.Job; -import com.google.cloud.bigquery.JobInfo; -import com.google.cloud.bigquery.LoadJobConfiguration; -import com.google.cloud.bigquery.QueryParameterValue; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.TableId; -import io.airbyte.commons.json.Jsons; -import io.airbyte.integrations.destination.StandardNameTransformer; -import io.airbyte.integrations.destination.bigquery.BigQueryWriteConfig; -import io.airbyte.integrations.destination.gcs.csv.GcsCsvWriter; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class BigQueryUploadGCSStrategy implements BigQueryUploadStrategy { - - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryUploadGCSStrategy.class); - - private final BigQuery bigquery; - - public BigQueryUploadGCSStrategy(BigQuery bigquery) { - this.bigquery = bigquery; - } - - @Override - public void upload(BigQueryWriteConfig writer, AirbyteMessage airbyteMessage, ConfiguredAirbyteCatalog catalog) { - var airbyteRecordMessage = airbyteMessage.getRecord(); - var gcsCsvWriter = writer.getGcsCsvWriter(); - // Bigquery represents TIMESTAMP to the microsecond precision, so we convert to microseconds then - // use BQ helpers to string-format correctly. 
- final long emittedAtMicroseconds = TimeUnit.MICROSECONDS.convert(airbyteRecordMessage.getEmittedAt(), TimeUnit.MILLISECONDS); - final String formattedEmittedAt = QueryParameterValue.timestamp(emittedAtMicroseconds).getValue(); - final JsonNode formattedData = StandardNameTransformer.formatJsonPath(airbyteRecordMessage.getData()); - try { - gcsCsvWriter.getCsvPrinter().printRecord( - UUID.randomUUID().toString(), - formattedEmittedAt, - Jsons.serialize(formattedData)); - } catch (IOException e) { - e.printStackTrace(); - LOGGER.warn("An error occurred writing CSV file."); - } - } - - @Override - public void close(List writeConfigList, boolean hasFailed, AirbyteMessage lastStateMessage) { - if (!writeConfigList.isEmpty()) { - LOGGER.info("GCS connectors that need to be closed:" + writeConfigList); - writeConfigList.parallelStream().forEach(writer -> { - final GcsCsvWriter gcsCsvWriter = writer.getGcsCsvWriter(); - - try { - LOGGER.info("Closing connector:" + gcsCsvWriter); - gcsCsvWriter.close(hasFailed); - } catch (final IOException | RuntimeException e) { - LOGGER.error(String.format("Failed to close %s gcsWriter, \n details: %s", gcsCsvWriter, e.getMessage())); - printHeapMemoryConsumption(); - throw new RuntimeException(e); - } - }); - } - - // copy data from tmp gcs storage to bigquery tables - writeConfigList - .forEach(pair -> { - try { - loadCsvFromGcsTruncate(pair); - } catch (final Exception e) { - LOGGER.error("Failed to load data from GCS CSV file to BigQuery tmp table with reason: " + e.getMessage()); - throw new RuntimeException(e); - } - }); - } - - private void loadCsvFromGcsTruncate(final BigQueryWriteConfig bigQueryWriteConfig) - throws Exception { - try { - final TableId tmpTable = bigQueryWriteConfig.getTmpTable(); - final Schema schema = bigQueryWriteConfig.getSchema(); - final String csvFile = bigQueryWriteConfig.getGcsCsvWriter().getGcsCsvFileLocation(); - - // Initialize client that will be used to send requests. This client only needs to be created - // once, and can be reused for multiple requests. - LOGGER.info(String.format("Started copying data from %s GCS csv file to %s tmp BigQuery table with schema: \n %s", - csvFile, tmpTable, schema)); - - final var csvOptions = CsvOptions.newBuilder().setEncoding(UTF8).setSkipLeadingRows(1).build(); - - final LoadJobConfiguration configuration = - LoadJobConfiguration.builder(tmpTable, csvFile) - .setFormatOptions(csvOptions) - .setSchema(schema) - .setWriteDisposition(bigQueryWriteConfig.getSyncMode()) - .build(); - - // For more information on Job see: - // https://googleapis.dev/java/google-cloud-clients/latest/index.html?com/google/cloud/bigquery/package-summary.html - // Load the table - final Job loadJob = bigquery.create(JobInfo.of(configuration)); - - LOGGER.info("Created a new job GCS csv file to tmp BigQuery table: " + loadJob); - LOGGER.info("Waiting for job to complete..."); - - // Load data from a GCS parquet file into the table - // Blocks until this load table job completes its execution, either failing or succeeding. - final Job completedJob = loadJob.waitFor(); - - // Check for errors - if (completedJob == null) { - LOGGER.error("Job not executed since it no longer exists."); - throw new Exception("Job not executed since it no longer exists."); - } else if (completedJob.getStatus().getError() != null) { - // You can also look at queryJob.getStatus().getExecutionErrors() for all - // errors, not just the latest one. 
- final String msg = "BigQuery was unable to load into the table due to an error: \n" - + loadJob.getStatus().getError(); - LOGGER.error(msg); - throw new Exception(msg); - } - LOGGER.info("Table is successfully overwritten by CSV file loaded from GCS"); - } catch (final BigQueryException | InterruptedException e) { - LOGGER.error("Column not added during load append \n" + e.toString()); - throw new RuntimeException("Column not added during load append \n" + e.toString()); - } - } - -} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStandardStrategy.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStandardStrategy.java deleted file mode 100644 index b3b7a8778ebca..0000000000000 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStandardStrategy.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. - */ - -package io.airbyte.integrations.destination.bigquery.strategy; - -import static io.airbyte.integrations.destination.bigquery.helpers.LoggerHelper.printHeapMemoryConsumption; - -import com.fasterxml.jackson.databind.JsonNode; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.CopyJobConfiguration; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.Job; -import com.google.cloud.bigquery.JobInfo; -import com.google.cloud.bigquery.JobInfo.CreateDisposition; -import com.google.cloud.bigquery.JobInfo.WriteDisposition; -import com.google.cloud.bigquery.QueryJobConfiguration; -import com.google.cloud.bigquery.QueryParameterValue; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.TableDataWriteChannel; -import com.google.cloud.bigquery.TableId; -import com.google.common.base.Charsets; -import com.google.common.collect.ImmutableMap; -import io.airbyte.commons.json.Jsons; -import io.airbyte.commons.lang.Exceptions; -import io.airbyte.commons.string.Strings; -import io.airbyte.integrations.base.JavaBaseConstants; -import io.airbyte.integrations.destination.StandardNameTransformer; -import io.airbyte.integrations.destination.bigquery.BigQueryUtils; -import io.airbyte.integrations.destination.bigquery.BigQueryWriteConfig; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.AirbyteRecordMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class BigQueryUploadStandardStrategy implements BigQueryUploadStrategy { - - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryUploadStandardStrategy.class); - - private final BigQuery bigquery; - private final ConfiguredAirbyteCatalog catalog; - private final Consumer outputRecordCollector; - - public BigQueryUploadStandardStrategy(BigQuery bigquery, - ConfiguredAirbyteCatalog catalog, - Consumer outputRecordCollector) { - this.bigquery = bigquery; - this.catalog = catalog; - this.outputRecordCollector = outputRecordCollector; - } - - @Override - public void 
upload(BigQueryWriteConfig writer, AirbyteMessage airbyteMessage, ConfiguredAirbyteCatalog catalog) { - try { - writer.getWriter() - .write(ByteBuffer.wrap((Jsons.serialize(formatRecord(writer.getSchema(), airbyteMessage.getRecord())) + "\n").getBytes(Charsets.UTF_8))); - } catch (final IOException | RuntimeException e) { - LOGGER.error("Got an error while writing message: {}", e.getMessage(), e); - LOGGER.error(String.format( - "Failed to process a message for job: %s, \nStreams numbers: %s, \nSyncMode: %s, \nTableName: %s, \nTmpTableName: %s, \nAirbyteMessage: %s", - writer.getWriter().getJob(), catalog.getStreams().size(), writer.getSyncMode(), writer.getTable(), writer.getTmpTable(), - airbyteMessage.getRecord())); - printHeapMemoryConsumption(); - throw new RuntimeException(e); - } - } - - protected JsonNode formatRecord(final Schema schema, final AirbyteRecordMessage recordMessage) { - // Bigquery represents TIMESTAMP to the microsecond precision, so we convert to microseconds then - // use BQ helpers to string-format correctly. - final long emittedAtMicroseconds = TimeUnit.MICROSECONDS.convert(recordMessage.getEmittedAt(), TimeUnit.MILLISECONDS); - final String formattedEmittedAt = QueryParameterValue.timestamp(emittedAtMicroseconds).getValue(); - final JsonNode formattedData = StandardNameTransformer.formatJsonPath(recordMessage.getData()); - return Jsons.jsonNode(ImmutableMap.of( - JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString(), - JavaBaseConstants.COLUMN_NAME_DATA, Jsons.serialize(formattedData), - JavaBaseConstants.COLUMN_NAME_EMITTED_AT, formattedEmittedAt)); - } - - @Override - public void close(List writeConfigList, boolean hasFailed, AirbyteMessage lastStateMessage) { - try { - writeConfigList.parallelStream().forEach(bigQueryWriteConfig -> Exceptions.toRuntime(() -> { - final TableDataWriteChannel writer = bigQueryWriteConfig.getWriter(); - try { - writer.close(); - } catch (final IOException | RuntimeException e) { - LOGGER.error(String.format("Failed to close writer: %s, \nStreams numbers: %s", - writer.getJob(), catalog.getStreams().size())); - printHeapMemoryConsumption(); - throw new RuntimeException(e); - } - })); - - LOGGER.info("Waiting for jobs to be finished/closed"); - writeConfigList.forEach(bigQueryWriteConfig -> Exceptions.toRuntime(() -> { - if (bigQueryWriteConfig.getWriter().getJob() != null) { - try { - bigQueryWriteConfig.getWriter().getJob().waitFor(); - } catch (final RuntimeException e) { - LOGGER.error( - String.format("Failed to process a message for job: %s, \nStreams numbers: %s, \nSyncMode: %s, \nTableName: %s, \nTmpTableName: %s", - bigQueryWriteConfig.getWriter().getJob(), catalog.getStreams().size(), bigQueryWriteConfig.getSyncMode(), - bigQueryWriteConfig.getTable(), bigQueryWriteConfig.getTmpTable())); - printHeapMemoryConsumption(); - throw new RuntimeException(e); - } - } - })); - - if (!hasFailed) { - LOGGER.info("Replication finished with no explicit errors. Copying data from tmp tables to permanent"); - writeConfigList - .forEach( - bigQueryWriteConfig -> { - if (bigQueryWriteConfig.getSyncMode().equals(WriteDisposition.WRITE_APPEND)) { - partitionIfUnpartitioned(bigQueryWriteConfig, bigquery, bigQueryWriteConfig.getTable()); - } - copyTable(bigquery, bigQueryWriteConfig.getTmpTable(), bigQueryWriteConfig.getTable(), - bigQueryWriteConfig.getSyncMode()); - }); - // BQ is still all or nothing if a failure happens in the destination. 
- outputRecordCollector.accept(lastStateMessage); - } else { - LOGGER.warn("Had errors while replicating"); - } - } finally { - // clean up tmp tables; - LOGGER.info("Removing tmp tables..."); - writeConfigList.forEach(bigQueryWriteConfig -> bigquery.delete(bigQueryWriteConfig.getTmpTable())); - LOGGER.info("Finishing destination process...completed"); - } - } - - private void partitionIfUnpartitioned(final BigQueryWriteConfig bigQueryWriteConfig, - final BigQuery bigquery, - final TableId destinationTableId) { - try { - final QueryJobConfiguration queryConfig = QueryJobConfiguration - .newBuilder( - String.format("SELECT max(is_partitioning_column) as is_partitioned FROM `%s.%s.INFORMATION_SCHEMA.COLUMNS` WHERE TABLE_NAME = '%s';", - bigquery.getOptions().getProjectId(), - destinationTableId.getDataset(), - destinationTableId.getTable())) - .setUseLegacySql(false) - .build(); - final ImmutablePair result = BigQueryUtils.executeQuery(bigquery, queryConfig); - result.getLeft().getQueryResults().getValues().forEach(row -> { - if (!row.get("is_partitioned").isNull() && row.get("is_partitioned").getStringValue().equals("NO")) { - LOGGER.info("Partitioning existing destination table {}", destinationTableId); - final String tmpPartitionTable = Strings.addRandomSuffix("_airbyte_partitioned_table", "_", 5); - final TableId tmpPartitionTableId = TableId.of(destinationTableId.getDataset(), tmpPartitionTable); - // make sure tmpPartitionTable does not already exist - bigquery.delete(tmpPartitionTableId); - // Use BigQuery SQL to copy because java api copy jobs does not support creating a table from a - // select query, see: - // https://cloud.google.com/bigquery/docs/creating-partitioned-tables#create_a_partitioned_table_from_a_query_result - final QueryJobConfiguration partitionQuery = QueryJobConfiguration - .newBuilder( - getCreatePartitionedTableFromSelectQuery(bigQueryWriteConfig.getSchema(), bigquery.getOptions().getProjectId(), destinationTableId, - tmpPartitionTable)) - .setUseLegacySql(false) - .build(); - BigQueryUtils.executeQuery(bigquery, partitionQuery); - // Copying data from a partitioned tmp table into an existing non-partitioned table does not make it - // partitioned... thus, we force re-create from scratch by completely deleting and creating new - // table. 
- bigquery.delete(destinationTableId); - copyTable(bigquery, tmpPartitionTableId, destinationTableId, WriteDisposition.WRITE_EMPTY); - bigquery.delete(tmpPartitionTableId); - } - }); - } catch (final InterruptedException e) { - LOGGER.warn("Had errors while partitioning: ", e); - } - } - - // https://cloud.google.com/bigquery/docs/managing-tables#copying_a_single_source_table - private static void copyTable( - final BigQuery bigquery, - final TableId sourceTableId, - final TableId destinationTableId, - final WriteDisposition syncMode) { - final CopyJobConfiguration configuration = CopyJobConfiguration.newBuilder(destinationTableId, sourceTableId) - .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) - .setWriteDisposition(syncMode) - .build(); - - final Job job = bigquery.create(JobInfo.of(configuration)); - final ImmutablePair jobStringImmutablePair = BigQueryUtils.executeQuery(job); - if (jobStringImmutablePair.getRight() != null) { - LOGGER.error("Failed on copy tables with error:" + job.getStatus()); - throw new RuntimeException("BigQuery was unable to copy table due to an error: \n" + job.getStatus().getError()); - } - LOGGER.info("successfully copied table: {} to table: {}", sourceTableId, destinationTableId); - } - - protected String getCreatePartitionedTableFromSelectQuery(final Schema schema, - final String projectId, - final TableId destinationTableId, - final String tmpPartitionTable) { - return String.format("create table `%s.%s.%s` (", projectId, destinationTableId.getDataset(), tmpPartitionTable) - + schema.getFields().stream() - .map(field -> String.format("%s %s", field.getName(), field.getType())) - .collect(Collectors.joining(", ")) - + ") partition by date(" - + JavaBaseConstants.COLUMN_NAME_EMITTED_AT - + ") as select " - + schema.getFields().stream() - .map(Field::getName) - .collect(Collectors.joining(", ")) - + String.format(" from `%s.%s.%s`", projectId, destinationTableId.getDataset(), destinationTableId.getTable()); - } - -} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStrategy.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStrategy.java deleted file mode 100644 index 939ffb84a46f2..0000000000000 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/strategy/BigQueryUploadStrategy.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
- */ - -package io.airbyte.integrations.destination.bigquery.strategy; - -import io.airbyte.integrations.destination.bigquery.BigQueryWriteConfig; -import io.airbyte.protocol.models.AirbyteMessage; -import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; -import java.util.List; - -public interface BigQueryUploadStrategy { - - void upload(BigQueryWriteConfig writer, AirbyteMessage airbyteMessage, ConfiguredAirbyteCatalog catalog); - - void close(List writeConfigList, boolean hasFailed, AirbyteMessage lastStateMessage); - -} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractBigQueryUploader.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractBigQueryUploader.java new file mode 100644 index 0000000000000..cad581f262a42 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractBigQueryUploader.java @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import static io.airbyte.integrations.destination.bigquery.helpers.LoggerHelper.printHeapMemoryConsumption; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.CopyJobConfiguration; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.Job; +import com.google.cloud.bigquery.JobInfo; +import com.google.cloud.bigquery.JobInfo.WriteDisposition; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.TableId; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import io.airbyte.protocol.models.AirbyteMessage; +import java.io.IOException; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class AbstractBigQueryUploader { + + private static final Logger LOGGER = LoggerFactory.getLogger(AbstractBigQueryUploader.class); + + protected final TableId table; + protected final TableId tmpTable; + protected final WriteDisposition syncMode; + protected final T writer; + protected final BigQuery bigQuery; + protected final BigQueryRecordFormatter recordFormatter; + + AbstractBigQueryUploader(final TableId table, + final TableId tmpTable, + final T writer, + final WriteDisposition syncMode, + final BigQuery bigQuery, + final BigQueryRecordFormatter recordFormatter) { + this.table = table; + this.tmpTable = tmpTable; + this.writer = writer; + this.syncMode = syncMode; + this.bigQuery = bigQuery; + this.recordFormatter = recordFormatter; + } + + protected void postProcessAction(boolean hasFailed) throws Exception { + // Do nothing by default + } + + public void upload(AirbyteMessage airbyteMessage) { + try { + writer.write((recordFormatter.formatRecord(airbyteMessage.getRecord()))); + } catch (final IOException | RuntimeException e) { + LOGGER.error("Got an error while writing message: {}", e.getMessage(), e); + LOGGER.error(String.format( + "Failed to 
process a message for job: \n%s, \nAirbyteMessage: %s", + writer.toString(), + airbyteMessage.getRecord())); + printHeapMemoryConsumption(); + throw new RuntimeException(e); + } + } + + public void close(boolean hasFailed, Consumer outputRecordCollector, AirbyteMessage lastStateMessage) { + try { + LOGGER.info("Field fails during format : "); + recordFormatter.printAndCleanFieldFails(); + + LOGGER.info("Closing connector:" + this); + this.writer.close(hasFailed); + + if (!hasFailed) { + uploadData(outputRecordCollector, lastStateMessage); + } + this.postProcessAction(hasFailed); + LOGGER.info("Closed connector:" + this); + } catch (final Exception e) { + LOGGER.error(String.format("Failed to close %s writer, \n details: %s", this, e.getMessage())); + printHeapMemoryConsumption(); + throw new RuntimeException(e); + } + } + + protected void uploadData(Consumer outputRecordCollector, AirbyteMessage lastStateMessage) throws Exception { + try { + LOGGER.info("Uploading data from the tmp table {} to the source table {}.", tmpTable.getTable(), table.getTable()); + uploadDataToTableFromTmpTable(); + LOGGER.info("Data is successfully loaded to the source table {}!", table.getTable()); + outputRecordCollector.accept(lastStateMessage); + LOGGER.info("Final state message is accepted."); + } catch (Exception e) { + LOGGER.error("Upload data is failed!"); + throw e; + } finally { + dropTmpTable(); + } + } + + protected void dropTmpTable() { + try { + // clean up tmp tables; + LOGGER.info("Removing tmp tables..."); + bigQuery.delete(tmpTable); + LOGGER.info("Finishing destination process...completed"); + } catch (Exception e) { + LOGGER.error("Fail to tmp table drop table: " + e.getMessage()); + } + } + + protected void uploadDataToTableFromTmpTable() throws Exception { + LOGGER.info("Replication finished with no explicit errors. 
Copying data from tmp tables to permanent"); + if (syncMode.equals(JobInfo.WriteDisposition.WRITE_APPEND)) { + partitionIfUnpartitioned(table); + } + copyTable(tmpTable, table, + syncMode); + } + + private void partitionIfUnpartitioned(final TableId destinationTableId) { + try { + final QueryJobConfiguration queryConfig = QueryJobConfiguration + .newBuilder( + String.format("SELECT max(is_partitioning_column) as is_partitioned FROM `%s.%s.INFORMATION_SCHEMA.COLUMNS` WHERE TABLE_NAME = '%s';", + bigQuery.getOptions().getProjectId(), + destinationTableId.getDataset(), + destinationTableId.getTable())) + .setUseLegacySql(false) + .build(); + final ImmutablePair result = BigQueryUtils.executeQuery(bigQuery, queryConfig); + result.getLeft().getQueryResults().getValues().forEach(row -> { + if (!row.get("is_partitioned").isNull() && row.get("is_partitioned").getStringValue().equals("NO")) { + LOGGER.info("Partitioning existing destination table {}", destinationTableId); + final String tmpPartitionTable = Strings.addRandomSuffix("_airbyte_partitioned_table", "_", 5); + final TableId tmpPartitionTableId = TableId.of(destinationTableId.getDataset(), tmpPartitionTable); + // make sure tmpPartitionTable does not already exist + bigQuery.delete(tmpPartitionTableId); + // Use BigQuery SQL to copy because java api copy jobs does not support creating a table from a + // select query, see: + // https://cloud.google.com/bigquery/docs/creating-partitioned-tables#create_a_partitioned_table_from_a_query_result + final QueryJobConfiguration partitionQuery = QueryJobConfiguration + .newBuilder( + getCreatePartitionedTableFromSelectQuery(recordFormatter.getBigQuerySchema(), bigQuery.getOptions().getProjectId(), + destinationTableId, + tmpPartitionTable)) + .setUseLegacySql(false) + .build(); + BigQueryUtils.executeQuery(bigQuery, partitionQuery); + // Copying data from a partitioned tmp table into an existing non-partitioned table does not make it + // partitioned... thus, we force re-create from scratch by completely deleting and creating new + // table. 
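+          // For illustration (hypothetical table and fields): for destination table `dataset.users` with columns
+          // name STRING, _airbyte_ab_id STRING, _airbyte_emitted_at TIMESTAMP, the statement produced by
+          // getCreatePartitionedTableFromSelectQuery would look roughly like:
+          //   create table `project.dataset._airbyte_partitioned_table_abcde` (name STRING, _airbyte_ab_id STRING, _airbyte_emitted_at TIMESTAMP)
+          //   partition by date(_airbyte_emitted_at)
+          //   as select name, _airbyte_ab_id, _airbyte_emitted_at from `project.dataset.users`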
+ bigQuery.delete(destinationTableId); + copyTable(tmpPartitionTableId, destinationTableId, JobInfo.WriteDisposition.WRITE_EMPTY); + bigQuery.delete(tmpPartitionTableId); + } + }); + } catch (final InterruptedException e) { + LOGGER.warn("Had errors while partitioning: ", e); + } + } + + // https://cloud.google.com/bigquery/docs/managing-tables#copying_a_single_source_table + private void copyTable( + final TableId sourceTableId, + final TableId destinationTableId, + final JobInfo.WriteDisposition syncMode) { + final CopyJobConfiguration configuration = CopyJobConfiguration.newBuilder(destinationTableId, sourceTableId) + .setCreateDisposition(JobInfo.CreateDisposition.CREATE_IF_NEEDED) + .setWriteDisposition(syncMode) + .build(); + + final Job job = bigQuery.create(JobInfo.of(configuration)); + final ImmutablePair jobStringImmutablePair = BigQueryUtils.executeQuery(job); + if (jobStringImmutablePair.getRight() != null) { + LOGGER.error("Failed on copy tables with error:" + job.getStatus()); + throw new RuntimeException("BigQuery was unable to copy table due to an error: \n" + job.getStatus().getError()); + } + LOGGER.info("successfully copied table: {} to table: {}", sourceTableId, destinationTableId); + } + + protected String getCreatePartitionedTableFromSelectQuery(final Schema schema, + final String projectId, + final TableId destinationTableId, + final String tmpPartitionTable) { + return String.format("create table `%s.%s.%s` (", projectId, destinationTableId.getDataset(), tmpPartitionTable) + + schema.getFields().stream() + .map(field -> String.format("%s %s", field.getName(), field.getType())) + .collect(Collectors.joining(", ")) + + ") partition by date(" + + JavaBaseConstants.COLUMN_NAME_EMITTED_AT + + ") as select " + + schema.getFields().stream() + .map(Field::getName) + .collect(Collectors.joining(", ")) + + String.format(" from `%s.%s.%s`", projectId, destinationTableId.getDataset(), destinationTableId.getTable()); + } + + @Override + public String toString() { + return "AbstractBigQueryUploader{" + + "table=" + table.getTable() + + ", tmpTable=" + tmpTable.getTable() + + ", syncMode=" + syncMode + + ", writer=" + writer.getClass() + + ", recordFormatter=" + recordFormatter.getClass() + + '}'; + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractGscBigQueryUploader.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractGscBigQueryUploader.java new file mode 100644 index 0000000000000..0c2f1d03e8f49 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/AbstractGscBigQueryUploader.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.DeleteObjectsRequest; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.Job; +import com.google.cloud.bigquery.JobInfo; +import com.google.cloud.bigquery.JobInfo.WriteDisposition; +import com.google.cloud.bigquery.LoadJobConfiguration; +import com.google.cloud.bigquery.TableId; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.GcsS3Helper; +import io.airbyte.integrations.destination.gcs.writer.GscWriter; +import io.airbyte.protocol.models.AirbyteMessage; +import java.util.List; +import java.util.function.Consumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class AbstractGscBigQueryUploader extends AbstractBigQueryUploader { + + private static final Logger LOGGER = LoggerFactory.getLogger(AbstractGscBigQueryUploader.class); + + private final boolean isKeepFilesInGcs; + protected final GcsDestinationConfig gcsDestinationConfig; + + AbstractGscBigQueryUploader(final TableId table, + final TableId tmpTable, + final T writer, + final WriteDisposition syncMode, + final GcsDestinationConfig gcsDestinationConfig, + final BigQuery bigQuery, + final boolean isKeepFilesInGcs, + final BigQueryRecordFormatter recordFormatter) { + super(table, tmpTable, writer, syncMode, bigQuery, recordFormatter); + this.isKeepFilesInGcs = isKeepFilesInGcs; + this.gcsDestinationConfig = gcsDestinationConfig; + } + + @Override + public void postProcessAction(boolean hasFailed) throws Exception { + if (!isKeepFilesInGcs) { + deleteGcsFiles(); + } + } + + @Override + protected void uploadData(Consumer outputRecordCollector, AirbyteMessage lastStateMessage) throws Exception { + LOGGER.info("Uploading data to the tmp table {}.", tmpTable.getTable()); + uploadDataFromFileToTmpTable(); + super.uploadData(outputRecordCollector, lastStateMessage); + } + + protected void uploadDataFromFileToTmpTable() throws Exception { + try { + final String fileLocation = this.writer.getFileLocation(); + + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. + LOGGER.info(String.format("Started copying data from %s GCS " + getFileTypeName() + " file to %s tmp BigQuery table with schema: \n %s", + fileLocation, tmpTable, recordFormatter.getBigQuerySchema())); + + LoadJobConfiguration configuration = getLoadConfiguration(); + + // For more information on Job see: + // https://googleapis.dev/java/google-cloud-clients/latest/index.html?com/google/cloud/bigquery/package-summary.html + // Load the table + final Job loadJob = this.bigQuery.create(JobInfo.of(configuration)); + LOGGER.info("Created a new job GCS " + getFileTypeName() + " file to tmp BigQuery table: " + loadJob); + + // Load data from a GCS parquet file into the table + // Blocks until this load table job completes its execution, either failing or succeeding. 
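+      // Note: the staged file is actually Avro or CSV depending on the concrete uploader
+      // (see getLoadConfiguration()), not Parquet.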
+ BigQueryUtils.waitForJobFinish(loadJob); + + LOGGER.info("Table is successfully overwritten by " + getFileTypeName() + " file loaded from GCS"); + } catch (final BigQueryException | InterruptedException e) { + LOGGER.error("Column not added during load append \n" + e.toString()); + throw new RuntimeException("Column not added during load append \n" + e.toString()); + } + } + + abstract protected LoadJobConfiguration getLoadConfiguration(); + + private String getFileTypeName() { + return writer.getFileFormat().getFileExtension(); + } + + private void deleteGcsFiles() { + LOGGER.info("Deleting file {}", writer.getFileLocation()); + final GcsDestinationConfig gcsDestinationConfig = this.gcsDestinationConfig; + final AmazonS3 s3Client = GcsS3Helper.getGcsS3Client(gcsDestinationConfig); + + final String gcsBucketName = gcsDestinationConfig.getBucketName(); + final String gcs_bucket_path = gcsDestinationConfig.getBucketPath(); + + final List objects = s3Client + .listObjects(gcsBucketName, gcs_bucket_path) + .getObjectSummaries(); + + objects.stream().filter(s3ObjectSummary -> s3ObjectSummary.getKey().equals(writer.getOutputPath())).forEach(s3ObjectSummary -> { + s3Client.deleteObject(gcsBucketName, new DeleteObjectsRequest.KeyVersion(s3ObjectSummary.getKey()).getKey()); + LOGGER.info("File is deleted : " + s3ObjectSummary.getKey()); + }); + s3Client.shutdown(); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryDirectUploader.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryDirectUploader.java new file mode 100644 index 0000000000000..1b2a30c9fe521 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryDirectUploader.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import com.google.cloud.bigquery.*; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.writer.BigQueryTableWriter; +import io.airbyte.protocol.models.AirbyteMessage; +import java.util.function.Consumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BigQueryDirectUploader extends AbstractBigQueryUploader { + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDirectUploader.class); + + public BigQueryDirectUploader(TableId table, + TableId tmpTable, + BigQueryTableWriter writer, + JobInfo.WriteDisposition syncMode, + BigQuery bigQuery, + BigQueryRecordFormatter recordFormatter) { + super(table, tmpTable, writer, syncMode, bigQuery, recordFormatter); + } + + @Override + protected void uploadData(Consumer outputRecordCollector, AirbyteMessage lastStateMessage) throws Exception { + BigQueryUtils.waitForJobFinish(writer.getWriteChannel().getJob()); + super.uploadData(outputRecordCollector, lastStateMessage); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryUploaderFactory.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryUploaderFactory.java new file mode 100644 index 0000000000000..71e995d24339e --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/BigQueryUploaderFactory.java @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import static io.airbyte.integrations.destination.s3.avro.AvroConstants.JSON_CONVERTER; + +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.JsonNode; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.FormatOptions; +import com.google.cloud.bigquery.JobId; +import com.google.cloud.bigquery.JobInfo; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.TableDataWriteChannel; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.WriteChannelConfiguration; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; +import io.airbyte.integrations.destination.bigquery.UploadingMethod; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.uploader.config.UploaderConfig; +import io.airbyte.integrations.destination.bigquery.writer.BigQueryTableWriter; +import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.GcsS3Helper; +import io.airbyte.integrations.destination.gcs.avro.GcsAvroWriter; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.HashSet; +import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BigQueryUploaderFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryUploaderFactory.class); + + public static AbstractBigQueryUploader getUploader(UploaderConfig uploaderConfig) + throws IOException { + final String schemaName = + BigQueryUtils.getSchema(uploaderConfig.getConfig(), uploaderConfig.getConfigStream()); + final String datasetLocation = BigQueryUtils.getDatasetLocation(uploaderConfig.getConfig()); + final Set existingSchemas = new HashSet<>(); + + final boolean isGcsUploadingMode = + UploadingMethod.GCS.equals(BigQueryUtils.getLoadingMethod(uploaderConfig.getConfig())); + BigQueryRecordFormatter recordFormatter = + (isGcsUploadingMode + ? uploaderConfig.getFormatterMap().get(UploaderType.AVRO) + : uploaderConfig.getFormatterMap().get(UploaderType.STANDARD)); + Schema bigQuerySchema = recordFormatter.getBigQuerySchema(); + + BigQueryUtils.createSchemaAndTableIfNeeded( + uploaderConfig.getBigQuery(), + existingSchemas, + schemaName, + uploaderConfig.getTmpTableName(), + datasetLocation, + bigQuerySchema); + + final TableId targetTable = TableId.of(schemaName, uploaderConfig.getTargetTableName()); + final TableId tmpTable = TableId.of(schemaName, uploaderConfig.getTmpTableName()); + final JobInfo.WriteDisposition syncMode = + BigQueryUtils.getWriteDisposition( + uploaderConfig.getConfigStream().getDestinationSyncMode()); + + return (isGcsUploadingMode + ? 
getGcsBigQueryUploader( + uploaderConfig.getConfig(), + uploaderConfig.getConfigStream(), + targetTable, + tmpTable, + uploaderConfig.getBigQuery(), + syncMode, + recordFormatter, + uploaderConfig.isDefaultAirbyteTmpSchema()) + : getBigQueryDirectUploader( + uploaderConfig.getConfig(), + targetTable, + tmpTable, + uploaderConfig.getBigQuery(), + syncMode, + datasetLocation, + recordFormatter)); + } + + private static AbstractGscBigQueryUploader getGcsBigQueryUploader( + JsonNode config, + ConfiguredAirbyteStream configStream, + TableId targetTable, + TableId tmpTable, + BigQuery bigQuery, + JobInfo.WriteDisposition syncMode, + BigQueryRecordFormatter formatter, + boolean isDefaultAirbyteTmpSchema) + throws IOException { + + final GcsDestinationConfig gcsDestinationConfig = + GcsDestinationConfig.getGcsDestinationConfig( + BigQueryUtils.getGcsAvroJsonNodeConfig(config)); + JsonNode tmpTableSchema = + (isDefaultAirbyteTmpSchema ? null : formatter.getJsonSchema()); + final GcsAvroWriter gcsCsvWriter = + initGcsWriter(gcsDestinationConfig, configStream, tmpTableSchema); + gcsCsvWriter.initialize(); + + return new GcsAvroBigQueryUploader( + targetTable, + tmpTable, + gcsCsvWriter, + syncMode, + gcsDestinationConfig, + bigQuery, + BigQueryUtils.isKeepFilesInGcs(config), + formatter); + } + + private static GcsAvroWriter initGcsWriter( + final GcsDestinationConfig gcsDestinationConfig, + final ConfiguredAirbyteStream configuredStream, + final JsonNode bigQuerySchema) + throws IOException { + final Timestamp uploadTimestamp = new Timestamp(System.currentTimeMillis()); + + final AmazonS3 s3Client = GcsS3Helper.getGcsS3Client(gcsDestinationConfig); + return new GcsAvroWriter( + gcsDestinationConfig, + s3Client, + configuredStream, + uploadTimestamp, + JSON_CONVERTER, + bigQuerySchema); + } + + private static BigQueryDirectUploader getBigQueryDirectUploader( + JsonNode config, + TableId targetTable, + TableId tmpTable, + BigQuery bigQuery, + JobInfo.WriteDisposition syncMode, + String datasetLocation, + BigQueryRecordFormatter formatter) { + // https://cloud.google.com/bigquery/docs/loading-data-local#loading_data_from_a_local_data_source + final WriteChannelConfiguration writeChannelConfiguration = + WriteChannelConfiguration.newBuilder(tmpTable) + .setCreateDisposition(JobInfo.CreateDisposition.CREATE_IF_NEEDED) + .setSchema(formatter.getBigQuerySchema()) + .setFormatOptions(FormatOptions.json()) + .build(); // new-line delimited json. + + final JobId job = + JobId.newBuilder() + .setRandomJob() + .setLocation(datasetLocation) + .setProject(bigQuery.getOptions().getProjectId()) + .build(); + + final TableDataWriteChannel writer = bigQuery.writer(job, writeChannelConfiguration); + + // this this optional value. 
If not set - use default client's value (15MiG) + final Integer bigQueryClientChunkSizeFomConfig = + BigQueryUtils.getBigQueryClientChunkSize(config); + if (bigQueryClientChunkSizeFomConfig != null) { + writer.setChunkSize(bigQueryClientChunkSizeFomConfig); + } + + return new BigQueryDirectUploader( + targetTable, + tmpTable, + new BigQueryTableWriter(writer), + syncMode, + bigQuery, + formatter); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsAvroBigQueryUploader.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsAvroBigQueryUploader.java new file mode 100644 index 0000000000000..89227c8fa0dbd --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsAvroBigQueryUploader.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import com.google.cloud.bigquery.*; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.avro.GcsAvroWriter; + +public class GcsAvroBigQueryUploader extends AbstractGscBigQueryUploader { + + public GcsAvroBigQueryUploader(TableId table, + TableId tmpTable, + GcsAvroWriter writer, + JobInfo.WriteDisposition syncMode, + GcsDestinationConfig gcsDestinationConfig, + BigQuery bigQuery, + boolean isKeepFilesInGcs, + BigQueryRecordFormatter recordFormatter) { + super(table, tmpTable, writer, syncMode, gcsDestinationConfig, bigQuery, isKeepFilesInGcs, recordFormatter); + } + + @Override + protected LoadJobConfiguration getLoadConfiguration() { + return LoadJobConfiguration.builder(tmpTable, writer.getFileLocation()).setFormatOptions(FormatOptions.avro()) + .setSchema(recordFormatter.getBigQuerySchema()) + .setWriteDisposition(syncMode) + .setUseAvroLogicalTypes(true) + .build(); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsCsvBigQueryUploader.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsCsvBigQueryUploader.java new file mode 100644 index 0000000000000..887292c5f0a6d --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/GcsCsvBigQueryUploader.java @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +import static com.amazonaws.util.StringUtils.UTF8; + +import com.google.cloud.bigquery.*; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.csv.GcsCsvWriter; + +public class GcsCsvBigQueryUploader extends AbstractGscBigQueryUploader { + + public GcsCsvBigQueryUploader(TableId table, + TableId tmpTable, + GcsCsvWriter writer, + JobInfo.WriteDisposition syncMode, + GcsDestinationConfig gcsDestinationConfig, + BigQuery bigQuery, + boolean isKeepFilesInGcs, + BigQueryRecordFormatter recordFormatter) { + super(table, tmpTable, writer, syncMode, gcsDestinationConfig, bigQuery, isKeepFilesInGcs, recordFormatter); + } + + @Override + protected LoadJobConfiguration getLoadConfiguration() { + final var csvOptions = CsvOptions.newBuilder().setEncoding(UTF8).setSkipLeadingRows(1).build(); + + return LoadJobConfiguration.builder(tmpTable, writer.getFileLocation()) + .setFormatOptions(csvOptions) + .setSchema(recordFormatter.getBigQuerySchema()) + .setWriteDisposition(syncMode) + .build(); + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/UploaderType.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/UploaderType.java new file mode 100644 index 0000000000000..22f1b9e94c463 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/UploaderType.java @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.uploader; + +public enum UploaderType { + STANDARD, + AVRO, + CSV +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/config/UploaderConfig.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/config/UploaderConfig.java new file mode 100644 index 0000000000000..6d93eb0535803 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/uploader/config/UploaderConfig.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.bigquery.uploader.config; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.cloud.bigquery.BigQuery; +import io.airbyte.integrations.destination.bigquery.formatter.BigQueryRecordFormatter; +import io.airbyte.integrations.destination.bigquery.uploader.UploaderType; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import java.util.Map; +import lombok.Builder; +import lombok.Getter; + +@Builder +@Getter +public class UploaderConfig { + + private JsonNode config; + private ConfiguredAirbyteStream configStream; + private String targetTableName; + private String tmpTableName; + private BigQuery bigQuery; + private Map formatterMap; + private boolean isDefaultAirbyteTmpSchema; + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/writer/BigQueryTableWriter.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/writer/BigQueryTableWriter.java new file mode 100644 index 0000000000000..018cde5c8063a --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/writer/BigQueryTableWriter.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.bigquery.writer; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.cloud.bigquery.TableDataWriteChannel; +import com.google.common.base.Charsets; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BigQueryTableWriter implements CommonWriter { + + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryTableWriter.class); + + private final TableDataWriteChannel writeChannel; + + public BigQueryTableWriter(TableDataWriteChannel writeChannel) { + this.writeChannel = writeChannel; + } + + @Override + public void initialize() throws IOException {} + + @Override + public void write(JsonNode formattedData) throws IOException { + writeChannel.write(ByteBuffer.wrap((Jsons.serialize(formattedData) + "\n").getBytes(Charsets.UTF_8))); + } + + @Override + public void close(boolean hasFailed) throws Exception { + this.writeChannel.close(); + } + + public TableDataWriteChannel getWriteChannel() { + return writeChannel; + } + +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-bigquery/src/main/resources/spec.json index 8b0f34047561f..a92dc705469ff 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/resources/spec.json @@ -77,7 +77,7 @@ }, "transformation_priority": { "type": "string", - "description": "When running custom transformations or Basic normalization, running queries on interactive mode can hit BQ limits, choosing batch will solve those limitss.", + "description": "When running custom transformations or Basic normalization, running queries on interactive mode can hit BQ limits, choosing batch will solve those limits.", "title": "Transformation Query Run Type", "default": "interactive", "enum": ["interactive", "batch"] diff --git 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationAcceptanceTest.java index bfa687f414351..d67b634c3f28e 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationAcceptanceTest.java @@ -28,6 +28,7 @@ import io.airbyte.integrations.destination.StandardNameTransformer; import io.airbyte.integrations.standardtest.destination.DestinationAcceptanceTest; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -44,18 +45,18 @@ public class BigQueryDestinationAcceptanceTest extends DestinationAcceptanceTest private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDestinationAcceptanceTest.class); - private static final Path CREDENTIALS_PATH = Path.of("secrets/credentials.json"); + protected static final Path CREDENTIALS_PATH = Path.of("secrets/credentials.json"); - private static final String CONFIG_DATASET_ID = "dataset_id"; - private static final String CONFIG_PROJECT_ID = "project_id"; - private static final String CONFIG_DATASET_LOCATION = "dataset_location"; - private static final String CONFIG_CREDS = "credentials_json"; + protected static final String CONFIG_DATASET_ID = "dataset_id"; + protected static final String CONFIG_PROJECT_ID = "project_id"; + protected static final String CONFIG_DATASET_LOCATION = "dataset_location"; + protected static final String CONFIG_CREDS = "credentials_json"; - private BigQuery bigquery; - private Dataset dataset; - private boolean tornDown; - private JsonNode config; - private final StandardNameTransformer namingResolver = new StandardNameTransformer(); + protected BigQuery bigquery; + protected Dataset dataset; + protected boolean tornDown; + protected JsonNode config; + protected final StandardNameTransformer namingResolver = new StandardNameTransformer(); @Override protected String getImageName() { @@ -184,6 +185,10 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception { .put(CONFIG_DATASET_LOCATION, datasetLocation) .build()); + setupBigQuery(credentialsJson); + } + + protected void setupBigQuery(JsonNode credentialsJson) throws IOException { final ServiceAccountCredentials credentials = ServiceAccountCredentials .fromStream(new ByteArrayInputStream(credentialsJson.toString().getBytes())); @@ -213,7 +218,7 @@ protected void tearDown(final TestDestinationEnv testEnv) { tearDownBigQuery(); } - private void tearDownBigQuery() { + protected void tearDownBigQuery() { // allows deletion of a dataset that has contents final BigQuery.DatasetDeleteOption option = BigQuery.DatasetDeleteOption.deleteContents(); diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationAcceptanceTest.java index 256ff1ea9a407..58d2d76e12007 100644 --- 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationAcceptanceTest.java @@ -5,163 +5,16 @@ package io.airbyte.integrations.destination.bigquery; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.google.auth.oauth2.ServiceAccountCredentials; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.BigQueryOptions; -import com.google.cloud.bigquery.Dataset; -import com.google.cloud.bigquery.DatasetInfo; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.FieldList; -import com.google.cloud.bigquery.FieldValue; -import com.google.cloud.bigquery.FieldValueList; -import com.google.cloud.bigquery.Job; -import com.google.cloud.bigquery.JobId; -import com.google.cloud.bigquery.JobInfo; -import com.google.cloud.bigquery.QueryJobConfiguration; -import com.google.cloud.bigquery.TableResult; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; import io.airbyte.commons.json.Jsons; import io.airbyte.commons.string.Strings; -import io.airbyte.integrations.base.JavaBaseConstants; -import io.airbyte.integrations.destination.StandardNameTransformer; -import io.airbyte.integrations.standardtest.destination.DestinationAcceptanceTest; -import java.io.ByteArrayInputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -public class BigQueryGcsDestinationAcceptanceTest extends DestinationAcceptanceTest { - - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryGcsDestinationAcceptanceTest.class); +public class BigQueryGcsDestinationAcceptanceTest extends BigQueryDestinationAcceptanceTest { private static final Path CREDENTIALS_PATH = Path.of("secrets/credentials.json"); - private static final String CONFIG_DATASET_ID = "dataset_id"; - private static final String CONFIG_PROJECT_ID = "project_id"; - private static final String CONFIG_DATASET_LOCATION = "dataset_location"; - private static final String CONFIG_CREDS = "credentials_json"; - - private BigQuery bigquery; - private Dataset dataset; - private boolean tornDown; - private JsonNode config; - private final StandardNameTransformer namingResolver = new StandardNameTransformer(); - - @Override - protected String getImageName() { - return "airbyte/destination-bigquery:dev"; - } - - @Override - protected JsonNode getConfig() { - return config; - } - - @Override - protected JsonNode getFailCheckConfig() throws Exception { - ((ObjectNode) config).put(CONFIG_PROJECT_ID, "fake"); - return config; - } - - @Override - protected boolean supportsNormalization() { - return true; - } - - @Override - protected boolean supportsDBT() { - return true; - } - - @Override - protected boolean implementsNamespaces() { - return true; - } - - @Override - protected String getDefaultSchema(final JsonNode config) { - return config.get(CONFIG_DATASET_ID).asText(); - } - - @Override - protected List retrieveNormalizedRecords(final TestDestinationEnv testEnv, final String 
streamName, final String namespace) - throws Exception { - final String tableName = namingResolver.getIdentifier(streamName); - final String schema = namingResolver.getIdentifier(namespace); - return retrieveRecordsFromTable(tableName, schema); - } - - @Override - protected List retrieveRecords(final TestDestinationEnv env, - final String streamName, - final String namespace, - final JsonNode streamSchema) - throws Exception { - return retrieveRecordsFromTable(namingResolver.getRawTableName(streamName), namingResolver.getIdentifier(namespace)) - .stream() - .map(node -> node.get(JavaBaseConstants.COLUMN_NAME_DATA).asText()) - .map(Jsons::deserialize) - .collect(Collectors.toList()); - } - - @Override - protected List resolveIdentifier(final String identifier) { - final List result = new ArrayList<>(); - result.add(identifier); - result.add(namingResolver.getIdentifier(identifier)); - return result; - } - - private List retrieveRecordsFromTable(final String tableName, final String schema) throws InterruptedException { - final QueryJobConfiguration queryConfig = - QueryJobConfiguration - .newBuilder( - String.format("SELECT * FROM `%s`.`%s` order by %s asc;", schema, tableName, - JavaBaseConstants.COLUMN_NAME_EMITTED_AT)) - .setUseLegacySql(false).build(); - - final TableResult queryResults = executeQuery(bigquery, queryConfig).getLeft().getQueryResults(); - final FieldList fields = queryResults.getSchema().getFields(); - - return StreamSupport - .stream(queryResults.iterateAll().spliterator(), false) - .map(row -> { - final Map jsonMap = Maps.newHashMap(); - for (final Field field : fields) { - final Object value = getTypedFieldValue(row, field); - jsonMap.put(field.getName(), value); - } - return jsonMap; - }) - .map(Jsons::jsonNode) - .collect(Collectors.toList()); - } - - private Object getTypedFieldValue(final FieldValueList row, final Field field) { - final FieldValue fieldValue = row.get(field.getName()); - if (fieldValue.getValue() != null) { - return switch (field.getType().getStandardType()) { - case FLOAT64, NUMERIC -> fieldValue.getDoubleValue(); - case INT64 -> fieldValue.getNumericValue().intValue(); - case STRING -> fieldValue.getStringValue(); - case BOOL -> fieldValue.getBooleanValue(); - default -> fieldValue.getValue(); - }; - } else { - return null; - } - } - @Override protected void setup(final TestDestinationEnv testEnv) throws Exception { if (!Files.exists(CREDENTIALS_PATH)) { @@ -203,77 +56,7 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception { .put(BigQueryConsts.LOADING_METHOD, loadingMethod) .build()); - final ServiceAccountCredentials credentials = ServiceAccountCredentials - .fromStream(new ByteArrayInputStream(bigqueryConfigFromSecretFile.toString().getBytes())); - - bigquery = BigQueryOptions.newBuilder() - .setProjectId(config.get(CONFIG_PROJECT_ID).asText()) - .setCredentials(credentials) - .build() - .getService(); - - final DatasetInfo datasetInfo = - DatasetInfo.newBuilder(config.get(CONFIG_DATASET_ID).asText()).setLocation(config.get(CONFIG_DATASET_LOCATION).asText()).build(); - dataset = bigquery.create(datasetInfo); - - tornDown = false; - Runtime.getRuntime() - .addShutdownHook( - new Thread( - () -> { - if (!tornDown) { - tearDownBigQuery(); - } - })); - } - - @Override - protected void tearDown(final TestDestinationEnv testEnv) { - // gcs tmp files are supposed to be removed automatically by consumer - tearDownBigQuery(); - } - - private void tearDownBigQuery() { - // allows deletion of a dataset that has contents - final 
BigQuery.DatasetDeleteOption option = BigQuery.DatasetDeleteOption.deleteContents(); - - final boolean success = bigquery.delete(dataset.getDatasetId(), option); - if (success) { - LOGGER.info("BQ Dataset " + dataset + " deleted..."); - } else { - LOGGER.info("BQ Dataset cleanup for " + dataset + " failed!"); - } - - tornDown = true; - } - - // todo (cgardens) - figure out how to share these helpers. they are currently copied from - // BigQueryDestination. - private static ImmutablePair executeQuery(final BigQuery bigquery, final QueryJobConfiguration queryConfig) { - final JobId jobId = JobId.of(UUID.randomUUID().toString()); - final Job queryJob = bigquery.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build()); - return executeQuery(queryJob); - } - - private static ImmutablePair executeQuery(final Job queryJob) { - final Job completedJob = waitForQuery(queryJob); - if (completedJob == null) { - throw new RuntimeException("Job no longer exists"); - } else if (completedJob.getStatus().getError() != null) { - // You can also look at queryJob.getStatus().getExecutionErrors() for all - // errors, not just the latest one. - return ImmutablePair.of(null, (completedJob.getStatus().getError().toString())); - } - - return ImmutablePair.of(completedJob, null); - } - - private static Job waitForQuery(final Job queryJob) { - try { - return queryJob.waitFor(); - } catch (final Exception e) { - throw new RuntimeException(e); - } + setupBigQuery(bigqueryConfigFromSecretFile); } } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationTest.java index 383871404c1c8..3395cb82ad3d9 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryGcsDestinationTest.java @@ -299,7 +299,6 @@ void testWriteFailure() throws Exception { assertThrows(RuntimeException.class, () -> consumer.accept(spiedMessage)); consumer.accept(MESSAGE_USERS2); - assertThrows(RuntimeException.class, () -> consumer.close()); // check if fails when data was not loaded to GCS bucket by some reason final List tableNames = catalog.getStreams() .stream() diff --git a/airbyte-integrations/connectors/destination-gcs/Dockerfile b/airbyte-integrations/connectors/destination-gcs/Dockerfile index 14995b54941ca..23cfcd8c0633d 100644 --- a/airbyte-integrations/connectors/destination-gcs/Dockerfile +++ b/airbyte-integrations/connectors/destination-gcs/Dockerfile @@ -3,7 +3,9 @@ FROM airbyte/integration-base-java:dev WORKDIR /airbyte ENV APPLICATION destination-gcs -ADD build/distributions/${APPLICATION}*.tar /airbyte +COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar + +RUN tar xf ${APPLICATION}.tar --strip-components=1 LABEL io.airbyte.version=0.1.15 LABEL io.airbyte.name=airbyte/destination-gcs diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java index a75f29c60f2c4..d8452e9a9b2b3 100644 --- 
a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java @@ -7,10 +7,15 @@ import alex.mojaki.s3upload.MultiPartOutputStream; import alex.mojaki.s3upload.StreamTransferManager; import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; +import io.airbyte.integrations.destination.gcs.util.GcsUtils; import io.airbyte.integrations.destination.gcs.writer.BaseGcsWriter; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import io.airbyte.integrations.destination.gcs.writer.GscWriter; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.avro.AvroRecordFactory; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.avro.S3AvroFormatConfig; import io.airbyte.integrations.destination.s3.util.S3StreamTransferManagerHelper; import io.airbyte.integrations.destination.s3.writer.S3Writer; @@ -28,7 +33,7 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.JsonAvroConverter; -public class GcsAvroWriter extends BaseGcsWriter implements S3Writer { +public class GcsAvroWriter extends BaseGcsWriter implements S3Writer, GscWriter, CommonWriter { protected static final Logger LOGGER = LoggerFactory.getLogger(GcsAvroWriter.class); @@ -36,19 +41,34 @@ public class GcsAvroWriter extends BaseGcsWriter implements S3Writer { private final StreamTransferManager uploadManager; private final MultiPartOutputStream outputStream; private final DataFileWriter dataFileWriter; + private final String gcsFileLocation; private final String objectKey; public GcsAvroWriter(final GcsDestinationConfig config, final AmazonS3 s3Client, final ConfiguredAirbyteStream configuredStream, final Timestamp uploadTimestamp, - final Schema schema, final JsonAvroConverter converter) throws IOException { + this(config, s3Client, configuredStream, uploadTimestamp, converter, null); + } + + public GcsAvroWriter(final GcsDestinationConfig config, + final AmazonS3 s3Client, + final ConfiguredAirbyteStream configuredStream, + final Timestamp uploadTimestamp, + final JsonAvroConverter converter, + final JsonNode airbyteSchema) + throws IOException { super(config, s3Client, configuredStream); + Schema schema = (airbyteSchema == null ? 
GcsUtils.getDefaultAvroSchema(stream.getName(), stream.getNamespace(), true) + : new JsonToAvroSchemaConverter().getAvroSchema(airbyteSchema, stream.getName(), + stream.getNamespace(), true, false, false)); + LOGGER.info("Avro schema : {}", schema); final String outputFilename = BaseGcsWriter.getOutputFilename(uploadTimestamp, S3Format.AVRO); objectKey = String.join("/", outputPrefix, outputFilename); + gcsFileLocation = String.format("gs://%s/%s", config.getBucketName(), objectKey); LOGGER.info("Full GCS path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); @@ -72,6 +92,17 @@ public void write(final UUID id, final AirbyteRecordMessage recordMessage) throw dataFileWriter.append(avroRecordFactory.getAvroRecord(id, recordMessage)); } + @Override + public void write(JsonNode formattedData) throws IOException { + GenericData.Record record = avroRecordFactory.getAvroRecord(formattedData); + dataFileWriter.append(record); + } + + @Override + public String getFileLocation() { + return gcsFileLocation; + } + @Override protected void closeWhenSucceed() throws IOException { dataFileWriter.close(); @@ -86,6 +117,11 @@ protected void closeWhenFail() throws IOException { uploadManager.abort(); } + @Override + public S3Format getFileFormat() { + return S3Format.AVRO; + } + @Override public String getOutputPath() { return objectKey; diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/csv/GcsCsvWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/csv/GcsCsvWriter.java index d1fbc72fe6115..c92332fc510ba 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/csv/GcsCsvWriter.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/csv/GcsCsvWriter.java @@ -7,8 +7,11 @@ import alex.mojaki.s3upload.MultiPartOutputStream; import alex.mojaki.s3upload.StreamTransferManager; import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; import io.airbyte.integrations.destination.gcs.writer.BaseGcsWriter; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import io.airbyte.integrations.destination.gcs.writer.GscWriter; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.csv.CsvSheetGenerator; import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig; @@ -27,7 +30,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GcsCsvWriter extends BaseGcsWriter implements S3Writer { +public class GcsCsvWriter extends BaseGcsWriter implements S3Writer, GscWriter, CommonWriter { private static final Logger LOGGER = LoggerFactory.getLogger(GcsCsvWriter.class); @@ -35,7 +38,7 @@ public class GcsCsvWriter extends BaseGcsWriter implements S3Writer { private final StreamTransferManager uploadManager; private final MultiPartOutputStream outputStream; private final CSVPrinter csvPrinter; - private final String gcsCsvFileLocation; // this used in destination-bigquery (GCS upload type) + private final String gcsFileLocation; private final String objectKey; public GcsCsvWriter(final GcsDestinationConfig config, @@ -50,7 +53,7 @@ public GcsCsvWriter(final GcsDestinationConfig config, final String outputFilename = BaseGcsWriter.getOutputFilename(uploadTimestamp, S3Format.CSV); 
objectKey = String.join("/", outputPrefix, outputFilename); - gcsCsvFileLocation = String.format("gs://%s/%s", config.getBucketName(), objectKey); + gcsFileLocation = String.format("gs://%s/%s", config.getBucketName(), objectKey); LOGGER.info("Full GCS path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); @@ -69,6 +72,11 @@ public void write(final UUID id, final AirbyteRecordMessage recordMessage) throw csvPrinter.printRecord(csvSheetGenerator.getDataRow(id, recordMessage)); } + @Override + public void write(JsonNode formattedData) throws IOException { + csvPrinter.printRecord(csvSheetGenerator.getDataRow(formattedData)); + } + @Override protected void closeWhenSucceed() throws IOException { csvPrinter.close(); @@ -83,14 +91,20 @@ protected void closeWhenFail() throws IOException { uploadManager.abort(); } - public String getGcsCsvFileLocation() { - return gcsCsvFileLocation; + @Override + public String getFileLocation() { + return gcsFileLocation; } public CSVPrinter getCsvPrinter() { return csvPrinter; } + @Override + public S3Format getFileFormat() { + return S3Format.CSV; + } + @Override public String getOutputPath() { return objectKey; diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/jsonl/GcsJsonlWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/jsonl/GcsJsonlWriter.java index f98cd27c60ba6..1e1666e529af4 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/jsonl/GcsJsonlWriter.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/jsonl/GcsJsonlWriter.java @@ -7,6 +7,7 @@ import alex.mojaki.s3upload.MultiPartOutputStream; import alex.mojaki.s3upload.StreamTransferManager; import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; import io.airbyte.commons.jackson.MoreMappers; @@ -14,11 +15,14 @@ import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; import io.airbyte.integrations.destination.gcs.writer.BaseGcsWriter; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import io.airbyte.integrations.destination.gcs.writer.GscWriter; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.util.S3StreamTransferManagerHelper; import io.airbyte.integrations.destination.s3.writer.S3Writer; import io.airbyte.protocol.models.AirbyteRecordMessage; import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import java.io.IOException; import java.io.PrintWriter; import java.nio.charset.StandardCharsets; import java.sql.Timestamp; @@ -26,7 +30,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GcsJsonlWriter extends BaseGcsWriter implements S3Writer { +public class GcsJsonlWriter extends BaseGcsWriter implements S3Writer, GscWriter, CommonWriter { protected static final Logger LOGGER = LoggerFactory.getLogger(GcsJsonlWriter.class); @@ -35,6 +39,7 @@ public class GcsJsonlWriter extends BaseGcsWriter implements S3Writer { private final StreamTransferManager uploadManager; private final MultiPartOutputStream outputStream; private final PrintWriter printWriter; + private final String gcsFileLocation; private final String 
objectKey; public GcsJsonlWriter(final GcsDestinationConfig config, @@ -46,6 +51,7 @@ public GcsJsonlWriter(final GcsDestinationConfig config, final String outputFilename = BaseGcsWriter.getOutputFilename(uploadTimestamp, S3Format.JSONL); objectKey = String.join("/", outputPrefix, outputFilename); + gcsFileLocation = String.format("gs://%s/%s", config.getBucketName(), objectKey); LOGGER.info("Full GCS path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); this.uploadManager = S3StreamTransferManagerHelper.getDefault( @@ -65,6 +71,11 @@ public void write(final UUID id, final AirbyteRecordMessage recordMessage) { printWriter.println(Jsons.serialize(json)); } + @Override + public void write(JsonNode formattedData) throws IOException { + printWriter.println(Jsons.serialize(formattedData)); + } + @Override protected void closeWhenSucceed() { printWriter.close(); @@ -79,6 +90,16 @@ protected void closeWhenFail() { uploadManager.abort(); } + @Override + public String getFileLocation() { + return gcsFileLocation; + } + + @Override + public S3Format getFileFormat() { + return S3Format.JSONL; + } + @Override public String getOutputPath() { return objectKey; diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/parquet/GcsParquetWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/parquet/GcsParquetWriter.java index ae4373c12abf8..f3f61eb3f4a1c 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/parquet/GcsParquetWriter.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/parquet/GcsParquetWriter.java @@ -5,10 +5,13 @@ package io.airbyte.integrations.destination.gcs.parquet; import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import io.airbyte.integrations.destination.gcs.GcsDestinationConfig; import io.airbyte.integrations.destination.gcs.credential.GcsHmacKeyCredentialConfig; import io.airbyte.integrations.destination.gcs.writer.BaseGcsWriter; +import io.airbyte.integrations.destination.gcs.writer.CommonWriter; +import io.airbyte.integrations.destination.gcs.writer.GscWriter; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.avro.AvroRecordFactory; import io.airbyte.integrations.destination.s3.parquet.S3ParquetFormatConfig; @@ -32,13 +35,14 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.JsonAvroConverter; -public class GcsParquetWriter extends BaseGcsWriter implements S3Writer { +public class GcsParquetWriter extends BaseGcsWriter implements S3Writer, GscWriter, CommonWriter { private static final Logger LOGGER = LoggerFactory.getLogger(GcsParquetWriter.class); private static final ObjectMapper MAPPER = new ObjectMapper(); private final ParquetWriter parquetWriter; private final AvroRecordFactory avroRecordFactory; + private final String gcsFileLocation; private final String objectKey; public GcsParquetWriter(final GcsDestinationConfig config, @@ -54,7 +58,8 @@ public GcsParquetWriter(final GcsDestinationConfig config, objectKey = String.join("/", outputPrefix, outputFilename); LOGGER.info("Storage path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); - final URI uri = new URI(String.format("s3a://%s/%s/%s", 
config.getBucketName(), outputPrefix, outputFilename)); + gcsFileLocation = String.format("s3a://%s/%s/%s", config.getBucketName(), outputPrefix, outputFilename); + final URI uri = new URI(gcsFileLocation); final Path path = new Path(uri); LOGGER.info("Full GCS path for stream '{}': {}", stream.getName(), path); @@ -95,6 +100,11 @@ public void write(final UUID id, final AirbyteRecordMessage recordMessage) throw parquetWriter.write(avroRecordFactory.getAvroRecord(id, recordMessage)); } + @Override + public void write(JsonNode formattedData) throws IOException { + parquetWriter.write(avroRecordFactory.getAvroRecord(formattedData)); + } + @Override public void close(final boolean hasFailed) throws IOException { if (hasFailed) { @@ -113,4 +123,14 @@ public String getOutputPath() { return objectKey; } + @Override + public String getFileLocation() { + return gcsFileLocation; + } + + @Override + public S3Format getFileFormat() { + return S3Format.PARQUET; + } + } diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/util/GcsUtils.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/util/GcsUtils.java new file mode 100644 index 0000000000000..1e78e263ee421 --- /dev/null +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/util/GcsUtils.java @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.gcs.util; + +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.s3.avro.AvroConstants; +import javax.annotation.Nullable; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class GcsUtils { + + private static final Logger LOGGER = LoggerFactory.getLogger(GcsUtils.class); + + public static Schema getDefaultAvroSchema(final String name, + @Nullable final String namespace, + final boolean appendAirbyteFields) { + LOGGER.info("Default schema."); + final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); + SchemaBuilder.RecordBuilder builder = SchemaBuilder.record(stdName); + + if (namespace != null) { + builder = builder.namespace(namespace); + } + + SchemaBuilder.FieldAssembler assembler = builder.fields(); + + Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() + .addToSchema(Schema.create(Schema.Type.LONG)); + Schema UUID_SCHEMA = LogicalTypes.uuid() + .addToSchema(Schema.create(Schema.Type.STRING)); + + if (appendAirbyteFields) { + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_AB_ID).type(UUID_SCHEMA).noDefault(); + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_EMITTED_AT) + .type(TIMESTAMP_MILLIS_SCHEMA).noDefault(); + } + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_DATA).type().stringType().noDefault(); + + return assembler.endRecord(); + } + +} diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/BaseGcsWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/BaseGcsWriter.java index b5983f5eaa1fe..a70445369885f 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/BaseGcsWriter.java +++ 
b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/BaseGcsWriter.java @@ -33,7 +33,7 @@ *

 * <li>Create the bucket and prepare the bucket path.
  • * */ -public abstract class BaseGcsWriter implements S3Writer { +public abstract class BaseGcsWriter implements S3Writer, CommonWriter { private static final Logger LOGGER = LoggerFactory.getLogger(BaseGcsWriter.class); @@ -86,6 +86,7 @@ public void initialize() { LOGGER.info("Deleted {} file(s) for stream '{}'.", keysToDelete.size(), stream.getName()); } + LOGGER.info("Overwrite is finished"); } } diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/CommonWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/CommonWriter.java new file mode 100644 index 0000000000000..2789b7449bcf5 --- /dev/null +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/CommonWriter.java @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.gcs.writer; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; + +public interface CommonWriter { + + void initialize() throws IOException; + + void write(JsonNode formattedData) throws IOException; + + void close(boolean hasFailed) throws Exception; + +} diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/GscWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/GscWriter.java new file mode 100644 index 0000000000000..c043971ff4945 --- /dev/null +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/GscWriter.java @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.gcs.writer; + +import io.airbyte.integrations.destination.s3.S3Format; + +public interface GscWriter extends CommonWriter { + + String getFileLocation(); + + S3Format getFileFormat(); + + String getOutputPath(); + +} diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java index 45d1e334d2dcb..d3d87c457a6e6 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java @@ -37,14 +37,13 @@ public S3Writer create(final GcsDestinationConfig config, final AirbyteStream stream = configuredStream.getStream(); LOGGER.info("Json schema for stream {}: {}", stream.getName(), stream.getJsonSchema()); - final JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true); - - LOGGER.info("Avro schema for stream {}: {}", stream.getName(), avroSchema.toString(false)); - if (format == S3Format.AVRO) { - return new GcsAvroWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, AvroConstants.JSON_CONVERTER); + return new GcsAvroWriter(config, s3Client, configuredStream, uploadTimestamp, AvroConstants.JSON_CONVERTER, stream.getJsonSchema()); } else { + final JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); + final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true); + + LOGGER.info("Avro schema for stream {}: {}", stream.getName(), avroSchema.toString(false)); return new GcsParquetWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, AvroConstants.JSON_CONVERTER); } } diff --git a/airbyte-integrations/connectors/destination-gcs/src/test/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriterTest.java b/airbyte-integrations/connectors/destination-gcs/src/test/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriterTest.java index 75616d186e791..1200ca15555be 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/test/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriterTest.java +++ b/airbyte-integrations/connectors/destination-gcs/src/test/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriterTest.java @@ -17,7 +17,6 @@ import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; -import org.apache.avro.Schema; import org.junit.jupiter.api.Test; class GcsAvroWriterTest { @@ -37,7 +36,6 @@ public void generatesCorrectObjectPath() throws IOException { .withNamespace("fake-namespace") .withName("fake-stream")), Timestamp.from(Instant.ofEpochMilli(1234)), - mock(Schema.class), null); assertEquals("fake-bucketPath/fake_namespace/fake_stream/1970_01_01_1234_0.avro", writer.getOutputPath()); diff --git a/airbyte-integrations/connectors/destination-s3/Dockerfile b/airbyte-integrations/connectors/destination-s3/Dockerfile index 16773fda6dde0..8c09020fb4d75 100644 --- a/airbyte-integrations/connectors/destination-s3/Dockerfile +++ 
b/airbyte-integrations/connectors/destination-s3/Dockerfile @@ -3,7 +3,9 @@ FROM airbyte/integration-base-java:dev WORKDIR /airbyte ENV APPLICATION destination-s3 -ADD build/distributions/${APPLICATION}*.tar /airbyte +COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar + +RUN tar xf ${APPLICATION}.tar --strip-components=1 LABEL io.airbyte.version=0.2.0 LABEL io.airbyte.name=airbyte/destination-s3 diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java index 791df02105424..5f3a1ad6f2091 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java @@ -5,6 +5,7 @@ package io.airbyte.integrations.destination.s3.avro; import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -14,6 +15,8 @@ import java.util.UUID; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.JsonAvroConverter; public class AvroRecordFactory { @@ -38,4 +41,9 @@ public GenericData.Record getAvroRecord(final UUID id, final AirbyteRecordMessag return converter.convertToGenericDataRecord(WRITER.writeValueAsBytes(jsonRecord), schema); } + public GenericData.Record getAvroRecord(JsonNode formattedData) throws JsonProcessingException { + var bytes = WRITER.writeValueAsBytes(formattedData); + return converter.convertToGenericDataRecord(bytes, schema); + } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index bae78b808e06d..bb5b067ad2602 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -93,13 +93,22 @@ public Map getStandardizedNames() { return standardizedNames; } + public Schema getAvroSchema(final JsonNode jsonSchema, + final String name, + @Nullable final String namespace, + final boolean appendAirbyteFields) { + return getAvroSchema(jsonSchema, name, namespace, appendAirbyteFields, true, true); + } + /** * @return - Avro schema based on the input {@code jsonSchema}. 
*/ public Schema getAvroSchema(final JsonNode jsonSchema, final String name, @Nullable final String namespace, - final boolean appendAirbyteFields) { + final boolean appendAirbyteFields, + final boolean appendExtraProps, + final boolean addStringToLogicalTypes) { final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); RecordBuilder builder = SchemaBuilder.record(stdName); if (!stdName.equals(name)) { @@ -150,18 +159,20 @@ public Schema getAvroSchema(final JsonNode jsonSchema, AvroConstants.DOC_KEY_VALUE_DELIMITER, fieldName)); } - assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) + assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition, appendExtraProps, addStringToLogicalTypes)) .withDefault(null); } - // support additional properties in one field - assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) - .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); + if (appendExtraProps) { + // support additional properties in one field + assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) + .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); + } return assembler.endRecord(); } - Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition) { + Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition, final boolean appendExtraProps, final boolean addStringToLogicalTypes) { Preconditions .checkState(fieldType != JsonSchemaType.NULL, "Null types should have been filtered out"); @@ -178,6 +189,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType if (fieldDefinition.has("format")) { final String format = fieldDefinition.get("format").asText(); fieldSchema = switch (format) { + case "timestamp-micros" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); case "date-time" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); case "date" -> LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); case "time" -> LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); @@ -189,7 +201,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType } case COMBINED -> { final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); - final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get()); + final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get(), appendExtraProps, addStringToLogicalTypes); fieldSchema = Schema.createUnion(unionTypes); } case ARRAY -> { @@ -198,9 +210,9 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType LOGGER.warn("Source connector provided schema for ARRAY with missed \"items\", will assume that it's a String type"); fieldSchema = Schema.createArray(Schema.createUnion(NULL_SCHEMA, STRING_SCHEMA)); } else if (items.isObject()) { - fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items)); + fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items, appendExtraProps, addStringToLogicalTypes)); } else if (items.isArray()) { - final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items); + final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items, appendExtraProps, addStringToLogicalTypes); 
arrayElementTypes.add(0, NULL_SCHEMA); fieldSchema = Schema.createArray(Schema.createUnion(arrayElementTypes)); } else { @@ -208,18 +220,18 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType String.format("Array field %s has invalid items property: %s", fieldName, items)); } } - case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, null, false); + case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, null, false, appendExtraProps, addStringToLogicalTypes); default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } return fieldSchema; } - List getSchemasFromTypes(final String fieldName, final ArrayNode types) { + List getSchemasFromTypes(final String fieldName, final ArrayNode types, final boolean appendExtraProps, final boolean addStringToLogicalTypes) { return MoreIterators.toList(types.elements()) .stream() .flatMap(definition -> getNonNullTypes(fieldName, definition).stream().flatMap(type -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition); + final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition, appendExtraProps, addStringToLogicalTypes); if (singleFieldSchema.isUnion()) { return singleFieldSchema.getTypes().stream(); } else { @@ -233,12 +245,12 @@ List getSchemasFromTypes(final String fieldName, final ArrayNode types) /** * @param fieldDefinition - Json schema field definition. E.g. { type: "number" }. */ - Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition) { + Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition, final boolean appendExtraProps, final boolean addStringToLogicalTypes) { // Filter out null types, which will be added back in the end. final List nonNullFieldTypes = getNonNullTypes(fieldName, fieldDefinition) .stream() .flatMap(fieldType -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition); + final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition, appendExtraProps, addStringToLogicalTypes); if (singleFieldSchema.isUnion()) { return singleFieldSchema.getTypes().stream(); } else { @@ -260,7 +272,7 @@ Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinit // cannot be properly processed. 
if ((nonNullFieldTypes .stream().anyMatch(schema -> schema.getLogicalType() != null)) && - (!nonNullFieldTypes.contains(STRING_SCHEMA))) { + (!nonNullFieldTypes.contains(STRING_SCHEMA)) && addStringToLogicalTypes) { nonNullFieldTypes.add(STRING_SCHEMA); } return Schema.createUnion(nonNullFieldTypes); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/BaseSheetGenerator.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/BaseSheetGenerator.java index d779f43f9b26d..086e6182ba501 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/BaseSheetGenerator.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/BaseSheetGenerator.java @@ -24,6 +24,11 @@ public List getDataRow(final UUID id, final AirbyteRecordMessage recordM return data; } + @Override + public List getDataRow(JsonNode formattedData) { + return new LinkedList<>(getRecordColumns(formattedData)); + } + abstract List getRecordColumns(JsonNode json); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/CsvSheetGenerator.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/CsvSheetGenerator.java index 760da40ac6a22..f22c01d5b2b1f 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/CsvSheetGenerator.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/CsvSheetGenerator.java @@ -20,6 +20,8 @@ public interface CsvSheetGenerator { List getDataRow(UUID id, AirbyteRecordMessage recordMessage); + List getDataRow(JsonNode formattedData); + final class Factory { public static CsvSheetGenerator create(final JsonNode jsonSchema, final S3CsvFormatConfig formatConfig) { diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/StagingDatabaseCsvSheetGenerator.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/StagingDatabaseCsvSheetGenerator.java index 4310a074147b0..4f3e137f6186b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/StagingDatabaseCsvSheetGenerator.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/StagingDatabaseCsvSheetGenerator.java @@ -4,11 +4,14 @@ package io.airbyte.integrations.destination.s3.csv; +import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.protocol.models.AirbyteRecordMessage; import java.sql.Timestamp; import java.time.Instant; +import java.util.Collections; +import java.util.LinkedList; import java.util.List; import java.util.UUID; @@ -41,4 +44,9 @@ public List getDataRow(final UUID id, final AirbyteRecordMessage recordM Timestamp.from(Instant.ofEpochMilli(recordMessage.getEmittedAt()))); } + @Override + public List getDataRow(JsonNode formattedData) { + return new LinkedList<>(Collections.singletonList(Jsons.serialize(formattedData))); + } + } diff --git 
a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java index 2a90a03c25827..3c81df7418e3e 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java @@ -62,7 +62,8 @@ public static class GetFieldTypeTestCaseProvider implements ArgumentsProvider { @Override public Stream provideArguments(final ExtensionContext context) throws Exception { - final JsonNode testCases = Jsons.deserialize(MoreResources.readResource("parquet/json_schema_converter/type_conversion_test_cases.json")); + final JsonNode testCases = + Jsons.deserialize(MoreResources.readResource("parquet/json_schema_converter/type_conversion_test_cases.json")); return MoreIterators.toList(testCases.elements()).stream().map(testCase -> Arguments.of( testCase.get("fieldName").asText(), testCase.get("jsonFieldSchema"), @@ -76,7 +77,7 @@ public Stream provideArguments(final ExtensionContext conte public void testFieldTypeConversion(final String fieldName, final JsonNode jsonFieldSchema, final JsonNode avroFieldType) { assertEquals( avroFieldType, - Jsons.deserialize(SCHEMA_CONVERTER.getNullableFieldTypes(fieldName, jsonFieldSchema).toString()), + Jsons.deserialize(SCHEMA_CONVERTER.getNullableFieldTypes(fieldName, jsonFieldSchema, true, true).toString()), String.format("Test for %s failed", fieldName)); } diff --git a/docs/integrations/destinations/bigquery.md b/docs/integrations/destinations/bigquery.md index 4a534c30a27bf..0d1c66fef4779 100644 --- a/docs/integrations/destinations/bigquery.md +++ b/docs/integrations/destinations/bigquery.md @@ -154,32 +154,34 @@ Therefore, Airbyte BigQuery destination will convert any invalid characters into ### bigquery | Version | Date | Pull Request | Subject | -| :--- | :--- | :--- | :--- | -| 0.5.1 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations | -| 0.5.0 | 2021-10-26 | [\#7240](https://github.com/airbytehq/airbyte/issues/7240) | Output partitioned/clustered tables | -| 0.4.1 | 2021-10-04 | [\#6733](https://github.com/airbytehq/airbyte/issues/6733) | Support dataset starting with numbers | -| 0.4.0 | 2021-08-26 | [\#5296](https://github.com/airbytehq/airbyte/issues/5296) | Added GCS Staging uploading option | -| 0.3.12 | 2021-08-03 | [\#3549](https://github.com/airbytehq/airbyte/issues/3549) | Add optional arg to make a possibility to change the BigQuery client's chunk\buffer size | -| 0.3.11 | 2021-07-30 | [\#5125](https://github.com/airbytehq/airbyte/pull/5125) | Enable `additionalPropertities` in spec.json | -| 0.3.10 | 2021-07-28 | [\#3549](https://github.com/airbytehq/airbyte/issues/3549) | Add extended logs and made JobId filled with region and projectId | -| 0.3.9 | 2021-07-28 | [\#5026](https://github.com/airbytehq/airbyte/pull/5026) | Add sanitized json fields in raw tables to handle quotes in column names | -| 0.3.6 | 2021-06-18 | [\#3947](https://github.com/airbytehq/airbyte/issues/3947) | Service account credentials are now optional. 
| -| 0.3.4 | 2021-06-07 | [\#3277](https://github.com/airbytehq/airbyte/issues/3277) | Add dataset location option | +|:--------| :--- | :--- | :--- | +| 0.6.0 | 2021-12-17 | [\#8788](https://github.com/airbytehq/airbyte/issues/8788) | BigQuery/BigQuery Denormalized destinations: Add possibility to use different types of GCS files | +| 0.5.1 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations | +| 0.5.0 | 2021-10-26 | [\#7240](https://github.com/airbytehq/airbyte/issues/7240) | Output partitioned/clustered tables | +| 0.4.1 | 2021-10-04 | [\#6733](https://github.com/airbytehq/airbyte/issues/6733) | Support dataset starting with numbers | +| 0.4.0 | 2021-08-26 | [\#5296](https://github.com/airbytehq/airbyte/issues/5296) | Added GCS Staging uploading option | +| 0.3.12 | 2021-08-03 | [\#3549](https://github.com/airbytehq/airbyte/issues/3549) | Add optional arg to make a possibility to change the BigQuery client's chunk\buffer size | +| 0.3.11 | 2021-07-30 | [\#5125](https://github.com/airbytehq/airbyte/pull/5125) | Enable `additionalPropertities` in spec.json | +| 0.3.10 | 2021-07-28 | [\#3549](https://github.com/airbytehq/airbyte/issues/3549) | Add extended logs and made JobId filled with region and projectId | +| 0.3.9 | 2021-07-28 | [\#5026](https://github.com/airbytehq/airbyte/pull/5026) | Add sanitized json fields in raw tables to handle quotes in column names | +| 0.3.6 | 2021-06-18 | [\#3947](https://github.com/airbytehq/airbyte/issues/3947) | Service account credentials are now optional. | +| 0.3.4 | 2021-06-07 | [\#3277](https://github.com/airbytehq/airbyte/issues/3277) | Add dataset location option | ### bigquery-denormalized | Version | Date | Pull Request | Subject | -| :--- | :--- | :--- | :--- | -| 0.1.11 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations | -| 0.1.10 | 2021-11-09 | [\#7804](https://github.com/airbytehq/airbyte/pull/7804) | handle null values in fields described by a $ref definition | -| 0.1.9 | 2021-11-08 | [\#7736](https://github.com/airbytehq/airbyte/issues/7736) | Fixed the handling of ObjectNodes with $ref definition key | -| 0.1.8 | 2021-10-27 | [\#7413](https://github.com/airbytehq/airbyte/issues/7413) | Fixed DATETIME conversion for BigQuery | -| 0.1.7 | 2021-10-26 | [\#7240](https://github.com/airbytehq/airbyte/issues/7240) | Output partitioned/clustered tables | -| 0.1.6 | 2021-09-16 | [\#6145](https://github.com/airbytehq/airbyte/pull/6145) | BigQuery Denormalized support for date, datetime & timestamp types through the json "format" key | -| 0.1.5 | 2021-09-07 | [\#5881](https://github.com/airbytehq/airbyte/pull/5881) | BigQuery Denormalized NPE fix | -| 0.1.4 | 2021-09-04 | [\#5813](https://github.com/airbytehq/airbyte/pull/5813) | fix Stackoverflow error when receive a schema from source where "Array" type doesn't contain a required "items" element | -| 0.1.3 | 2021-08-07 | [\#5261](https://github.com/airbytehq/airbyte/pull/5261) | 🐛 Destination BigQuery\(Denormalized\): Fix processing arrays of records | -| 0.1.2 | 2021-07-30 | [\#5125](https://github.com/airbytehq/airbyte/pull/5125) | Enable `additionalPropertities` in spec.json | -| 0.1.1 | 2021-06-21 | [\#3555](https://github.com/airbytehq/airbyte/pull/3555) | Partial Success in BufferedStreamConsumer | -| 0.1.0 | 2021-06-21 | [\#4176](https://github.com/airbytehq/airbyte/pull/4176) | Destination using Typed Struct and Repeated fields | +|:--------| :--- | :--- | :--- | +| 0.2.0 | 2021-12-17 | 
[\#8788](https://github.com/airbytehq/airbyte/pull/8788) | BigQuery/BigQuery Denormalized destinations: Add possibility to use different types of GCS files | +| 0.1.11 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations | +| 0.1.10 | 2021-11-09 | [\#7804](https://github.com/airbytehq/airbyte/pull/7804) | handle null values in fields described by a $ref definition | +| 0.1.9 | 2021-11-08 | [\#7736](https://github.com/airbytehq/airbyte/issues/7736) | Fixed the handling of ObjectNodes with $ref definition key | +| 0.1.8 | 2021-10-27 | [\#7413](https://github.com/airbytehq/airbyte/issues/7413) | Fixed DATETIME conversion for BigQuery | +| 0.1.7 | 2021-10-26 | [\#7240](https://github.com/airbytehq/airbyte/issues/7240) | Output partitioned/clustered tables | +| 0.1.6 | 2021-09-16 | [\#6145](https://github.com/airbytehq/airbyte/pull/6145) | BigQuery Denormalized support for date, datetime & timestamp types through the json "format" key | +| 0.1.5 | 2021-09-07 | [\#5881](https://github.com/airbytehq/airbyte/pull/5881) | BigQuery Denormalized NPE fix | +| 0.1.4 | 2021-09-04 | [\#5813](https://github.com/airbytehq/airbyte/pull/5813) | fix Stackoverflow error when receive a schema from source where "Array" type doesn't contain a required "items" element | +| 0.1.3 | 2021-08-07 | [\#5261](https://github.com/airbytehq/airbyte/pull/5261) | 🐛 Destination BigQuery\(Denormalized\): Fix processing arrays of records | +| 0.1.2 | 2021-07-30 | [\#5125](https://github.com/airbytehq/airbyte/pull/5125) | Enable `additionalPropertities` in spec.json | +| 0.1.1 | 2021-06-21 | [\#3555](https://github.com/airbytehq/airbyte/pull/3555) | Partial Success in BufferedStreamConsumer | +| 0.1.0 | 2021-06-21 | [\#4176](https://github.com/airbytehq/airbyte/pull/4176) | Destination using Typed Struct and Repeated fields |