Commit ab73b70

allow bq base cursor methods run_extract, run_copy, run_load to all
take in source or destination table strings that include projects.

For backwards compatibility reasons, the project is not required.

This allows the execution of these methods to be decoupled from the
projects that own the data they access.
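
A minimal standalone sketch of the parsing behavior described above (the table names and the fallback project are illustrative placeholders, not values from this commit; in the commit itself this logic lives in the new _split_project_dataset_table_input helper on the hook):

def split_project_dataset_table(project_dataset_table, default_project):
    """Split '(<project>.)<dataset>.<table>' into a (project, dataset, table) tuple."""
    parts = project_dataset_table.split('.')
    assert len(parts) in (2, 3), \
        'Expected (<project>.)<dataset>.<table>, got: {}'.format(project_dataset_table)
    if len(parts) == 2:
        # No project given: fall back to the connection's project, so the
        # old <dataset>.<table> format keeps working.
        dataset, table = parts
        return default_project, dataset, table
    project, dataset, table = parts
    return project, dataset, table

print(split_project_dataset_table('my_dataset.my_table', 'fallback-project'))
# -> ('fallback-project', 'my_dataset', 'my_table')
print(split_project_dataset_table('other-project.my_dataset.my_table', 'fallback-project'))
# -> ('other-project', 'my_dataset', 'my_table')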
mtagle committed Mar 9, 2016
1 parent cbf139c commit ab73b70
Showing 4 changed files with 83 additions and 55 deletions.
89 changes: 57 additions & 32 deletions airflow/contrib/hooks/bigquery_hook.py
@@ -197,7 +197,7 @@ def run_query(self, bql, destination_dataset_table = False, write_disposition =

return self.run_with_configuration(configuration)

def run_extract(self, source_dataset_table, destination_cloud_storage_uris, compression='NONE', export_format='CSV', field_delimiter=',', print_header=True):
def run_extract(self, source_project_dataset_table, destination_cloud_storage_uris, compression='NONE', export_format='CSV', field_delimiter=',', print_header=True):
"""
Executes a BigQuery extract command to copy data from BigQuery to
Google Cloud Storage. See here:
@@ -206,8 +206,8 @@ def run_extract(self, source_dataset_table, destination_cloud_storage_uris, comp
For more details about these parameters.
:param source_dataset_table: The dotted <dataset>.<table> BigQuery table to use as the source data.
:type source_dataset_table: string
:param source_project_dataset_table: The dotted <dataset>.<table> BigQuery table to use as the source data.
:type source_project_dataset_table: string
:param destination_cloud_storage_uris: The destination Google Cloud
Storage URI (e.g. gs://some-bucket/some-file.txt). Follows
convention defined here:
@@ -222,14 +222,11 @@ def run_extract(self, source_dataset_table, destination_cloud_storage_uris, comp
:param print_header: Whether to print a header for a CSV file extract.
:type print_header: boolean
"""
assert '.' in source_dataset_table, \
'Expected source_dataset_table in the format of <dataset>.<table>. Got: {}'.format(source_dataset_table)

source_dataset, source_table = source_dataset_table.split('.', 1)
source_project, source_dataset, source_table = self._split_project_dataset_table_input('source_project_dataset_table', source_project_dataset_table)
configuration = {
'extract': {
'sourceTable': {
'projectId': self.project_id,
'projectId': source_project,
'datasetId': source_dataset,
'tableId': source_table,
},
@@ -248,7 +245,11 @@ def run_extract(self, source_dataset_table, destination_cloud_storage_uris, comp

return self.run_with_configuration(configuration)

def run_copy(self, source_dataset_tables, destination_project_dataset_table, write_disposition='WRITE_EMPTY', create_disposition='CREATE_IF_NEEDED'):
def run_copy(self,
source_project_dataset_tables,
destination_project_dataset_table,
write_disposition='WRITE_EMPTY',
reate_disposition='CREATE_IF_NEEDED'):

@WesleyBatista (Contributor) commented on Mar 11, 2016:

typo here?

"""
Executes a BigQuery copy command to copy data from one BigQuery table
to another. See here:
@@ -257,10 +258,11 @@ def run_copy(self, source_dataset_tables, destination_project_dataset_table, wri
For more details about these parameters.
:param source_dataset_tables: One or more dotted <dataset>.<table>
:param source_project_dataset_tables: One or more dotted (<project>.)<dataset>.<table>
BigQuery tables to use as the source data. Use a list if there are
multiple source tables.
:type source_dataset_tables: list|string
If <project> is not included, project will be the project defined in the connection json.
:type source_project_dataset_tables: list|string
:param destination_project_dataset_table: The destination BigQuery
table. Format is: <project>.<dataset>.<table>
:type destination_project_dataset_table: string
@@ -269,19 +271,16 @@ def run_copy(self, source_dataset_tables, destination_project_dataset_table, wri
:param create_disposition: The create disposition if the table doesn't exist.
:type create_disposition: string
"""
source_dataset_tables = [source_dataset_tables] if not isinstance(source_dataset_tables, list) else source_dataset_tables
source_project_dataset_tables = []
source_project_dataset_tables = [source_project_dataset_tables] if not isinstance(source_project_dataset_tables, list) else source_project_dataset_tables

for source_dataset_table in source_dataset_tables:
assert '.' in source_dataset_table, \
'Expected source_dataset_table in the format of <dataset>.<table>. Got: {}'.format(source_dataset_table)

source_dataset, source_table = source_dataset_table.split('.', 1)
source_project_dataset_tables.append({
'projectId': self.project_id,
'datasetId': source_dataset,
'tableId': source_table
})
source_project_dataset_tables_fixup = []
for source_project_dataset_table in source_project_dataset_tables:
source_project, source_dataset, source_table = self._split_project_dataset_table_input('source_project_dataset_table', source_project_dataset_table)
source_project_dataset_tables_fixup.append({
'projectId': source_project,
'datasetId': source_dataset,
'tableId': source_table
})

assert 3 == len(destination_project_dataset_table.split('.')), \
'Expected destination_project_dataset_table in the format of <project>.<dataset>.<table>. Got: {}'.format(destination_project_dataset_table)
@@ -291,7 +290,7 @@ def run_copy(self, source_dataset_tables, destination_project_dataset_table, wri
'copy': {
'createDisposition': create_disposition,
'writeDisposition': write_disposition,
'sourceTables': source_project_dataset_tables,
'sourceTables': source_project_dataset_tables_fixup,
'destinationTable': {
'projectId': destination_project,
'datasetId': destination_dataset,
@@ -302,7 +301,14 @@ def run_copy(self, source_dataset_tables, destination_project_dataset_table, wri

return self.run_with_configuration(configuration)

def run_load(self, destination_dataset_table, schema_fields, source_uris, source_format='CSV', create_disposition='CREATE_IF_NEEDED', skip_leading_rows=0, write_disposition='WRITE_EMPTY', field_delimiter=','):
def run_load(self,
destination_project_dataset_table,
schema_fields, source_uris,
source_format='CSV',
create_disposition='CREATE_IF_NEEDED',
skip_leading_rows=0,
write_disposition='WRITE_EMPTY',
field_delimiter=','):
"""
Executes a BigQuery load command to load data from Google Cloud Storage
to BigQuery. See here:
@@ -311,8 +317,9 @@ def run_load(self, destination_dataset_table, schema_fields, source_uris, source
For more details about these parameters.
:param destination_dataset_table: The dotted <dataset>.<table> BigQuery table to load data into.
:type destination_dataset_table: string
:param destination_project_dataset_table: The dotted (<project>.)<dataset>.<table> BigQuery table to load data into.
If <project> is not included, project will be the project defined in the connection json.
:type destination_project_dataset_table: string
:param schema_fields: The schema field list as defined here:
https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load
:type schema_fields: list
@@ -331,16 +338,13 @@ def run_load(self, destination_dataset_table, schema_fields, source_uris, source
:param field_delimiter: The delimiter to use when loading from a CSV.
:type field_delimiter: string
"""
assert '.' in destination_dataset_table, \
'Expected destination_dataset_table in the format of <dataset>.<table>. Got: {}'.format(destination_dataset_table)

destination_dataset, destination_table = destination_dataset_table.split('.', 1)
destination_project, destination_dataset, destination_table = self._split_project_dataset_table_input('destination_project_dataset_table', destination_project_dataset_table)

configuration = {
'load': {
'createDisposition': create_disposition,
'destinationTable': {
'projectId': self.project_id,
'projectId': destination_project,
'datasetId': destination_dataset,
'tableId': destination_table,
},
@@ -359,6 +363,27 @@ def run_load(self, destination_dataset_table, schema_fields, source_uris, source

return self.run_with_configuration(configuration)

def _split_project_dataset_table_input(self, var_name, project_dataset_table):
"""
:param var_name: the name of the variable input, for logging and erroring purposes.
:type var_name: str
:param project_dataset_table: input string in (<project>.)<dataset>.<table> format.
If <project> is not included in the string, self.project_id will be returned in the tuple.
:type project_dataset_table: str
:return: (project, dataset, table) tuple
"""
table_split = project_dataset_table.split('.')
assert len(table_split) == 2 or len(table_split) == 3, \
'Expected {var} in the format of (<project>.)<dataset>.<table>, got {input}'.format(var=var_name, input=project_dataset_table)

if len(table_split) == 2:
logging.info('project not included in {var}: {input}; using project "{project}"'.format(var=var_name, input=project_dataset_table, project=self.project_id))
dataset, table = table_split
return self.project_id, dataset, table
else:
project, dataset, table = table_split
return project, dataset, table

def run_with_configuration(self, configuration):
"""
Executes a BigQuery SQL query. See here:
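A hedged usage sketch of the updated cursor methods (connection id, table names, URIs, and schema below are placeholders; the keyword arguments follow the signatures shown in the diff above):

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

hook = BigQueryHook(bigquery_conn_id='bigquery_default')
cursor = hook.get_conn().cursor()

# Source table qualified with an explicit project.
cursor.run_extract(
    source_project_dataset_table='other-project.my_dataset.my_table',
    destination_cloud_storage_uris=['gs://some-bucket/some-file.csv'])

# Project omitted: the hook falls back to the connection's project.
cursor.run_copy(
    source_project_dataset_tables='my_dataset.my_table',
    destination_project_dataset_table='other-project.my_dataset.my_table_copy')

cursor.run_load(
    destination_project_dataset_table='other-project.my_dataset.my_table_load',
    schema_fields=[{'name': 'id', 'type': 'INTEGER'}],
    source_uris=['gs://some-bucket/some-file.csv'])
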
19 changes: 10 additions & 9 deletions airflow/contrib/operators/bigquery_to_bigquery.py
@@ -8,14 +8,14 @@ class BigQueryToBigQueryOperator(BaseOperator):
"""
Copy a BigQuery table to another BigQuery table.
"""
template_fields = ('source_dataset_tables','destination_project_dataset_table',)
template_fields = ('source_project_dataset_tables','destination_project_dataset_table',)
template_ext = ('.sql',)
ui_color = '#e6f0e4'

@apply_defaults
def __init__(
self,
source_dataset_tables,
source_project_dataset_tables,
destination_project_dataset_table,
write_disposition='WRITE_EMPTY',
create_disposition='CREATE_IF_NEEDED',
@@ -30,10 +30,11 @@ def __init__(
For more details about these parameters.
:param source_dataset_tables: One or more dotted <dataset>.<table>
BigQuery tables to use as the source data. Use a list if there are
multiple source tables.
:type source_dataset_tables: list|string
:param source_project_dataset_tables: One or more dotted (<project>.)<dataset>.<table>
BigQuery tables to use as the source data.
If <project> is not included, project will be the project defined in the connection json.
Use a list if there are multiple source tables.
:type source_project_dataset_tables: list|string
:param destination_project_dataset_table: The destination BigQuery
table. Format is: <project>.<dataset>.<table>
:type destination_project_dataset_table: string
@@ -48,20 +49,20 @@ def __init__(
:type delegate_to: string
"""
super(BigQueryToBigQueryOperator, self).__init__(*args, **kwargs)
self.source_dataset_tables = source_dataset_tables
self.source_project_dataset_tables = source_project_dataset_tables
self.destination_project_dataset_table = destination_project_dataset_table
self.write_disposition = write_disposition
self.create_disposition = create_disposition
self.bigquery_conn_id = bigquery_conn_id
self.delegate_to = delegate_to

def execute(self, context):
logging.info('Executing copy of %s into: %s', self.source_dataset_tables, self.destination_project_dataset_table)
logging.info('Executing copy of %s into: %s', self.source_project_dataset_tables, self.destination_project_dataset_table)
hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
conn = hook.get_conn()
cursor = conn.cursor()
cursor.run_copy(
self.source_dataset_tables,
self.source_project_dataset_tables,
self.destination_project_dataset_table,
self.write_disposition,
self.create_disposition)
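
A hedged sketch of the updated operator in a DAG (DAG id, dates, task id, connection id, and table names are illustrative placeholders, not part of this commit):

from datetime import datetime
from airflow import DAG
from airflow.contrib.operators.bigquery_to_bigquery import BigQueryToBigQueryOperator

dag = DAG('bq_copy_example', start_date=datetime(2016, 3, 1), schedule_interval=None)

copy_task = BigQueryToBigQueryOperator(
    task_id='copy_tables',
    # Sources may now mix project-qualified and unqualified tables; the
    # unqualified one resolves to the connection's project.
    source_project_dataset_tables=[
        'other-project.my_dataset.table_a',
        'my_dataset.table_b',
    ],
    destination_project_dataset_table='my-project.my_dataset.table_merged',
    write_disposition='WRITE_TRUNCATE',
    bigquery_conn_id='bigquery_default',
    dag=dag)
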
15 changes: 8 additions & 7 deletions airflow/contrib/operators/bigquery_to_gcs.py
@@ -8,14 +8,14 @@ class BigQueryToCloudStorageOperator(BaseOperator):
"""
Transfers a BigQuery table to a Google Cloud Storage bucket.
"""
template_fields = ('source_dataset_table','destination_cloud_storage_uris',)
template_fields = ('source_project_dataset_table','destination_cloud_storage_uris',)
template_ext = ('.sql',)
ui_color = '#e4e6f0'

@apply_defaults
def __init__(
self,
source_dataset_table,
source_project_dataset_table,
destination_cloud_storage_uris,
compression='NONE',
export_format='CSV',
@@ -33,8 +33,9 @@ def __init__(
For more details about these parameters.
:param source_dataset_table: The dotted <dataset>.<table> BigQuery table to use as the source data.
:type source_dataset_table: string
:param source_project_dataset_table: The dotted (<project>.)<dataset>.<table> BigQuery table to use as the
source data. If <project> is not included, project will be the project defined in the connection json.
:type source_project_dataset_table: string
:param destination_cloud_storage_uris: The destination Google Cloud
Storage URI (e.g. gs://some-bucket/some-file.txt). Follows
convention defined here:
@@ -55,7 +56,7 @@ def __init__(
:type delegate_to: string
"""
super(BigQueryToCloudStorageOperator, self).__init__(*args, **kwargs)
self.source_dataset_table = source_dataset_table
self.source_project_dataset_table = source_project_dataset_table
self.destination_cloud_storage_uris = destination_cloud_storage_uris
self.compression = compression
self.export_format = export_format
@@ -65,12 +66,12 @@ def __init__(
self.delegate_to = delegate_to

def execute(self, context):
logging.info('Executing extract of %s into: %s', self.source_dataset_table, self.destination_cloud_storage_uris)
logging.info('Executing extract of %s into: %s', self.source_project_dataset_table, self.destination_cloud_storage_uris)
hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
conn = hook.get_conn()
cursor = conn.cursor()
cursor.run_extract(
self.source_dataset_table,
self.source_project_dataset_table,
self.destination_cloud_storage_uris,
self.compression,
self.export_format,
15 changes: 8 additions & 7 deletions airflow/contrib/operators/gcs_to_bq.py
@@ -19,7 +19,7 @@ def __init__(
self,
bucket,
source_objects,
destination_dataset_table,
destination_project_dataset_table,
schema_fields=False,
schema_object=False,
source_format='CSV',
@@ -43,8 +43,9 @@ def __init__(
:type bucket: string
:param source_objects: List of Google cloud storage URIs to load from.
:type object: list
:param destination_dataset_table: The dotted <dataset>.<table> BigQuery table to load data into.
:type destination_dataset_table: string
:param destination_project_dataset_table: The dotted (<project>.)<dataset>.<table> BigQuery table to load data
into. If <project> is not included, project will be the project defined in the connection json.
:type destination_project_dataset_table: string
:param schema_fields: If set, the schema field list as defined here:
https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load
:type schema_fields: list
@@ -85,7 +86,7 @@ def __init__(
self.schema_object = schema_object

# BQ config
self.destination_dataset_table = destination_dataset_table
self.destination_project_dataset_table = destination_project_dataset_table
self.schema_fields = schema_fields
self.source_format = source_format
self.create_disposition = create_disposition
@@ -109,7 +110,7 @@ def execute(self, context):
conn = bq_hook.get_conn()
cursor = conn.cursor()
cursor.run_load(
destination_dataset_table=self.destination_dataset_table,
destination_project_dataset_table=self.destination_project_dataset_table,
schema_fields=schema_fields,
source_uris=source_uris,
source_format=self.source_format,
@@ -119,8 +120,8 @@ def execute(self, context):
field_delimiter=self.field_delimiter)

if self.max_id_key:
cursor.execute('SELECT MAX({}) FROM {}'.format(self.max_id_key, self.destination_dataset_table))
cursor.execute('SELECT MAX({}) FROM {}'.format(self.max_id_key, self.destination_project_dataset_table))
row = cursor.fetchone()
max_id = row[0] if row[0] else 0
logging.info('Loaded BQ data with max {}.{}={}'.format(self.destination_dataset_table, self.max_id_key, max_id))
logging.info('Loaded BQ data with max {}.{}={}'.format(self.destination_project_dataset_table, self.max_id_key, max_id))
return max_id
