feat(bigquery): add support for hive partitioning options configuration #9626

Merged · 4 commits · Nov 19, 2019
Changes from 3 commits
86 changes: 85 additions & 1 deletion bigquery/google/cloud/bigquery/external_config.py
@@ -564,6 +564,72 @@ def from_api_repr(cls, resource):
_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions)


class HivePartitioningOptions(object):
"""Options that configure hive partitioning.

See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#HivePartitioningOptions
"""

def __init__(self):
self._properties = {}

@property
def mode(self):
"""Optional[str]: When set, what mode of hive partitioning to use when reading data.

Two modes are supported: "AUTO" and "STRINGS".

See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#HivePartitioningOptions.FIELDS.mode
"""
return self._properties.get("mode")

@mode.setter
def mode(self, value):
self._properties["mode"] = value

@property
def source_uri_prefix(self):
"""Optional[str]: When hive partition detection is requested, a common prefix for
all source URIs is required.

See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#HivePartitioningOptions.FIELDS.source_uri_prefix
"""
return self._properties.get("sourceUriPrefix")

@source_uri_prefix.setter
def source_uri_prefix(self, value):
self._properties["sourceUriPrefix"] = value

def to_api_repr(self):
"""Build an API representation of this object.

Returns:
Dict[str, Any]: A dictionary in the format used by the BigQuery API.
"""
return copy.deepcopy(self._properties)

@classmethod
def from_api_repr(cls, resource):
"""Factory: construct a :class:`~.external_config.HivePartitioningOptions`
instance given its API representation.

Args:
resource (Dict[str, Any]):
Definition of a :class:`~.external_config.HivePartitioningOptions`
instance in the same representation as is returned from the
API.

Returns:
HivePartitioningOptions: Configuration parsed from ``resource``.
"""
config = cls()
config._properties = copy.deepcopy(resource)
return config


class ExternalConfig(object):
"""Description of an external data source.

@@ -592,7 +658,7 @@ def source_format(self):

@property
def options(self):
"""Dict[str, Any]: Source-specific options."""
"""Optional[Dict[str, Any]]: Source-specific options."""
return self._options

@property
Expand Down Expand Up @@ -624,6 +690,24 @@ def compression(self):
def compression(self, value):
self._properties["compression"] = value

@property
def hive_partitioning(self):
"""Optional[:class:`~.external_config.HivePartitioningOptions`]: When set, \
it configures hive partitioning support.

See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.hive_partitioning_options
"""
prop = self._properties.get("hivePartitioningOptions")
if prop is None:
return None
return HivePartitioningOptions.from_api_repr(prop)

@hive_partitioning.setter
def hive_partitioning(self, value):
prop = value.to_api_repr() if value is not None else None
self._properties["hivePartitioningOptions"] = prop

@property
def ignore_unknown_values(self):
"""bool: If :data:`True`, extra values that are not represented in the
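For context, a minimal usage sketch (not part of this diff) of how the new `HivePartitioningOptions` class and the `ExternalConfig.hive_partitioning` property could be combined to query hive-partitioned Parquet files as an external table. The bucket path, table alias, and query are placeholder assumptions, not values from this PR.

```python
from google.cloud import bigquery
from google.cloud.bigquery.external_config import HivePartitioningOptions

client = bigquery.Client()

# Configure hive partitioning: infer partition key types automatically and
# treat everything under the given prefix as one partitioned layout.
hive_partitioning = HivePartitioningOptions()
hive_partitioning.mode = "AUTO"
hive_partitioning.source_uri_prefix = "gs://example-bucket/sales/"  # placeholder bucket

# Describe the external data source and attach the partitioning options.
external_config = bigquery.ExternalConfig("PARQUET")
external_config.source_uris = ["gs://example-bucket/sales/*"]
external_config.hive_partitioning = hive_partitioning

# Register the external source under a temporary table name for the query.
job_config = bigquery.QueryJobConfig()
job_config.table_definitions = {"sales": external_config}

query_job = client.query("SELECT COUNT(*) FROM sales", job_config=job_config)
print(list(query_job.result()))
```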
28 changes: 28 additions & 0 deletions bigquery/google/cloud/bigquery/job.py
@@ -29,6 +29,7 @@
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery.external_config import HivePartitioningOptions
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.query import _query_param_from_api_repr
from google.cloud.bigquery.query import ArrayQueryParameter
@@ -1138,6 +1139,33 @@ def field_delimiter(self):
def field_delimiter(self, value):
self._set_sub_prop("fieldDelimiter", value)

@property
def hive_partitioning(self):
"""Optional[:class:`~.external_config.HivePartitioningOptions`]: When set, \
it configures hive partitioning support.

.. note::
**Experimental**. This feature is experimental and might change or
have limited support.

See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.hive_partitioning_options
"""
prop = self._get_sub_prop("hivePartitioningOptions")
if prop is None:
return None
return HivePartitioningOptions.from_api_repr(prop)

@hive_partitioning.setter
def hive_partitioning(self, value):
if value is not None:
if isinstance(value, HivePartitioningOptions):
value = value.to_api_repr()
else:
raise TypeError("Expected a HivePartitioningOptions instance or None.")

self._set_sub_prop("hivePartitioningOptions", value)

@property
def ignore_unknown_values(self):
"""bool: Ignore extra values not represented in the table schema.
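Similarly, a hedged sketch (again not part of this diff) of how the new `LoadJobConfig.hive_partitioning` property could be used when loading hive-partitioned files from GCS into a table. The bucket, project, dataset, and table IDs are placeholders.

```python
from google.cloud import bigquery
from google.cloud.bigquery.external_config import HivePartitioningOptions

client = bigquery.Client()

# Configure hive partitioning for the load: keep partition keys as strings
# and point at the common prefix shared by all source URIs.
hive_partitioning = HivePartitioningOptions()
hive_partitioning.mode = "STRINGS"
hive_partitioning.source_uri_prefix = "gs://example-bucket/logs/"  # placeholder bucket

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.PARQUET
job_config.hive_partitioning = hive_partitioning

load_job = client.load_table_from_uri(
    "gs://example-bucket/logs/*",
    "example-project.example_dataset.logs",  # placeholder destination table
    job_config=job_config,
)
load_job.result()  # block until the load job completes
```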
52 changes: 52 additions & 0 deletions bigquery/tests/unit/test_external_config.py
@@ -173,6 +173,58 @@ def test_to_api_repr_sheets(self):

self.assertEqual(got_resource, exp_resource)

def test_from_api_repr_hive_partitioning(self):
resource = _copy_and_update(
self.BASE_RESOURCE,
{
"sourceFormat": "FORMAT_FOO",
"hivePartitioningOptions": {
"sourceUriPrefix": "http://foo/bar",
"mode": "STRINGS",
},
},
)

ec = external_config.ExternalConfig.from_api_repr(resource)

self._verify_base(ec)
self.assertEqual(ec.source_format, "FORMAT_FOO")
self.assertIsInstance(
ec.hive_partitioning, external_config.HivePartitioningOptions
)
self.assertEqual(ec.hive_partitioning.source_uri_prefix, "http://foo/bar")
self.assertEqual(ec.hive_partitioning.mode, "STRINGS")

# converting back to API representation should yield the same result
got_resource = ec.to_api_repr()
self.assertEqual(got_resource, resource)

del resource["hivePartitioningOptions"]
ec = external_config.ExternalConfig.from_api_repr(resource)
self.assertIsNone(ec.hive_partitioning)

got_resource = ec.to_api_repr()
self.assertEqual(got_resource, resource)

def test_to_api_repr_hive_partitioning(self):
hive_partitioning = external_config.HivePartitioningOptions()
hive_partitioning.source_uri_prefix = "http://foo/bar"
hive_partitioning.mode = "STRINGS"

ec = external_config.ExternalConfig("FORMAT_FOO")
ec.hive_partitioning = hive_partitioning

got_resource = ec.to_api_repr()

expected_resource = {
"sourceFormat": "FORMAT_FOO",
"hivePartitioningOptions": {
"sourceUriPrefix": "http://foo/bar",
"mode": "STRINGS",
},
}
self.assertEqual(got_resource, expected_resource)

def test_from_api_repr_csv(self):
resource = _copy_and_update(
self.BASE_RESOURCE,
40 changes: 40 additions & 0 deletions bigquery/tests/unit/test_job.py
@@ -1448,6 +1448,46 @@ def test_field_delimiter_setter(self):
config.field_delimiter = field_delimiter
self.assertEqual(config._properties["load"]["fieldDelimiter"], field_delimiter)

def test_hive_partitioning_missing(self):
config = self._get_target_class()()
self.assertIsNone(config.hive_partitioning)

def test_hive_partitioning_hit(self):
from google.cloud.bigquery.external_config import HivePartitioningOptions

config = self._get_target_class()()
config._properties["load"]["hivePartitioningOptions"] = {
"sourceUriPrefix": "http://foo/bar",
"mode": "STRINGS",
}
result = config.hive_partitioning
self.assertIsInstance(result, HivePartitioningOptions)
self.assertEqual(result.source_uri_prefix, "http://foo/bar")
self.assertEqual(result.mode, "STRINGS")

def test_hive_partitioning_setter(self):
from google.cloud.bigquery.external_config import HivePartitioningOptions

hive_partitioning = HivePartitioningOptions()
hive_partitioning.source_uri_prefix = "http://foo/bar"
hive_partitioning.mode = "AUTO"

config = self._get_target_class()()
config.hive_partitioning = hive_partitioning
self.assertEqual(
config._properties["load"]["hivePartitioningOptions"],
{"sourceUriPrefix": "http://foo/bar", "mode": "AUTO"},
)

config.hive_partitioning = None
self.assertIsNone(config._properties["load"]["hivePartitioningOptions"])

def test_hive_partitioning_invalid_type(self):
config = self._get_target_class()()

with self.assertRaises(TypeError):
config.hive_partitioning = {"mode": "AUTO"}

def test_ignore_unknown_values_missing(self):
config = self._get_target_class()()
self.assertIsNone(config.ignore_unknown_values)