Skip to content

Commit

Permalink
fix(ingest): map bigquery nested types properly (datahub-project#10409)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and sleeperdeep committed Jun 25, 2024
1 parent 60ef784 commit c686c38
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ def cleanup(config: BigQueryV2Config) -> None:
)
class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
# Note: Nested BigQuery types are parsed with the hive schema parser
# (datahub/utilities/hive_schema_to_avro.py), which also holds some extra
# BigQuery-specific type mappings.
BIGQUERY_FIELD_TYPE_MAPPINGS: Dict[
str,
Type[
Expand Down Expand Up @@ -1264,7 +1266,6 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
type=SchemaFieldDataType(
self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)()
),
# NOTE: nativeDataType will not be in sync with older connector
nativeDataType=col.data_type,
description=col.comment,
nullable=col.is_nullable,
Expand Down
22 changes: 21 additions & 1 deletion metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,20 @@ class HiveColumnToAvroConverter:
"bigint": "long",
"varchar": "string",
"char": "string",
"bytes": "bytes",
}
# Supplementary BigQuery type names mapped to Avro primitive type names.
# _parse_basic_datatype_string consults this table only after its earlier
# branches (including the "timestamp"/"datetime" check) have not matched;
# names absent here fall through to the "null" Avro type.
# NOTE(review): BigQuery documents DECIMAL and BIGDECIMAL as aliases of NUMERIC
# and BIGNUMERIC, yet the aliases map to "double" while the canonical names map
# to "int"/"long" -- confirm whether this asymmetry is intentional.
_EXTRA_BIGQUERY_TYPE_TO_AVRO_TYPE = {
# A few extra types, purely to map BigQuery things correctly.
"bool": "boolean",
"decimal": "double",
"numeric": "int",
"bignumeric": "long",
"bigdecimal": "double",
"float64": "double",
"int64": "long",
"smallint": "int",
"tinyint": "int",
"byteint": "int",
}

_COMPLEX_TYPE = re.compile("^(struct|map|array|uniontype)")
Expand Down Expand Up @@ -180,13 +194,19 @@ def _parse_basic_datatype_string(s: str) -> Dict[str, object]:
"native_data_type": s,
"_nullable": True,
}
elif s == "timestamp":
elif s in {"timestamp", "datetime"}:
return {
"type": "int",
"logicalType": "timestamp-millis",
"native_data_type": s,
"_nullable": True,
}
elif s in HiveColumnToAvroConverter._EXTRA_BIGQUERY_TYPE_TO_AVRO_TYPE:
return {
"type": HiveColumnToAvroConverter._EXTRA_BIGQUERY_TYPE_TO_AVRO_TYPE[s],
"native_data_type": s,
"_nullable": True,
}
else:
return {"type": "null", "native_data_type": s, "_nullable": True}

Expand Down

0 comments on commit c686c38

Please sign in to comment.