Add Dataset integration tests - Tables, Folders #1391

Merged: 49 commits, Sep 13, 2024

Changes from 48 commits

Commits (49)
b188538
Add integration tests for datasets - basic queries and conftest
dlpzx Jul 1, 2024
45d1407
add list + get queries, add persistent datasets, begin create/update/…
noah-paige Jul 1, 2024
cd27097
Add integration test role in Environment stack + session in conftest …
dlpzx Jul 2, 2024
d04b525
simplified conftests for datasets
dlpzx Jul 2, 2024
5e5507e
create integration role with region in name
noah-paige Jul 2, 2024
fa69dde
New environment type: IntegrationTests + ssm param with tooling accou…
dlpzx Jul 3, 2024
3e19596
Error on cdk add_to_policy
dlpzx Jul 3, 2024
c05de67
Add filter term include tags datasets
noah-paige Jul 4, 2024
8f2a918
Add sample data and tests for dataset role access
noah-paige Jul 4, 2024
9b2c711
Add sample data and tests for dataset role access
noah-paige Jul 4, 2024
2dcd60f
Add assume role permissions to codebuild role
dlpzx Jul 8, 2024
c261da7
Add naming checks in clients + create table
dlpzx Jul 8, 2024
1e9732b
Add permissions, confidentiality and commented tests
dlpzx Jul 8, 2024
5ea8b6b
revert persistent environment
dlpzx Jul 8, 2024
520a34e
Fix check_stack_ready in dataset creation
dlpzx Jul 8, 2024
972c883
Revert session environment and add tests
dlpzx Jul 8, 2024
7b1c942
fix integration role datasets
noah-paige Jul 8, 2024
d9042dc
Fix presigned URL upload test
noah-paige Jul 9, 2024
928c3aa
Merge remote-tracking branch 'refs/remotes/origin/main' into feat/int…
dlpzx Jul 9, 2024
b633938
Uncomment drafted table/folder tests
dlpzx Jul 9, 2024
2330021
Merge branch 'refs/heads/main' into feat/integration-tests-datasets-pt2
dlpzx Sep 5, 2024
a857d0a
Ruff and readme
dlpzx Sep 5, 2024
5968fd3
Split dataset tests and added signature of each test for all APIs. Fi…
dlpzx Sep 5, 2024
052bc7e
Added all dataset query definitions and placeholders for tests
dlpzx Sep 6, 2024
9ad774e
Started parametrization of tests
dlpzx Sep 6, 2024
146c45e
Started parametrization of tests
dlpzx Sep 6, 2024
0907ea3
Started parametrization of tests
dlpzx Sep 6, 2024
7f68d3b
Started parametrization of tests
dlpzx Sep 6, 2024
98c7667
Added persistent tables and folders
dlpzx Sep 9, 2024
92f23e3
Remove unnecessary tests in folders
dlpzx Sep 9, 2024
3907001
Fix issues with KMS datasets
dlpzx Sep 9, 2024
3553d12
Temporary changes for persistent datasets
dlpzx Sep 9, 2024
9be25bb
Add paramtrization in profiling, confidentiality, fix issue in glue t…
dlpzx Sep 9, 2024
1709bb5
Fix s3_table tests, parametrized dataset tests
dlpzx Sep 9, 2024
6889c47
Retouch preview_table tests
dlpzx Sep 9, 2024
3d67e2e
Fixed profiling tables tests
dlpzx Sep 9, 2024
4fd56af
Fixed profiling tables tests
dlpzx Sep 9, 2024
56b12c5
Fix everything except for persistent-sse-s3 tests
dlpzx Sep 9, 2024
ade89e8
Fix API query to filter by tags + add README detail
dlpzx Sep 10, 2024
72259c4
Wrong SSM parameter in README
dlpzx Sep 10, 2024
3b74b03
Merge remote-tracking branch 'refs/remotes/origin/main' into feat/int…
dlpzx Sep 10, 2024
196fb6e
Moving fixture parameters to conftest
dlpzx Sep 10, 2024
d3bb8be
Update requisite in README
dlpzx Sep 10, 2024
ec38ec0
PR review comments - functions to create AWS imported resources, names
dlpzx Sep 11, 2024
ccb6887
PR review comments - 2
dlpzx Sep 11, 2024
01c65c8
Merge branch 'refs/heads/main' into feat/integration-tests-datasets-pt2
dlpzx Sep 11, 2024
5358677
Issue persistent buckets
dlpzx Sep 11, 2024
590909b
Rewrite if-clause existing infra and resource for imported dataset
dlpzx Sep 12, 2024
0674531
Small return issue
dlpzx Sep 12, 2024
6 changes: 4 additions & 2 deletions backend/dataall/core/environment/cdk/environment_stack.py
@@ -581,12 +581,11 @@ def create_integration_tests_role(self):
's3:CreateBucket',
's3:DeleteBucket',
's3:PutEncryptionConfiguration',
's3:List*',
's3:GetObject*',
's3:DeleteObject',
],
effect=iam.Effect.ALLOW,
resources=['arn:aws:s3:::dataalltesting*'],
resources=['arn:aws:s3:::dataalltesting*', 'arn:aws:s3:::dataalltesting*/*'],
)
)
self.test_role.add_to_policy(
@@ -607,6 +606,7 @@ def create_integration_tests_role(self):
'lakeformation:GrantPermissions',
'lakeformation:PutDataLakeSettings',
'lakeformation:GetDataLakeSettings',
'glue:GetDatabase',
'kms:CreateKey',
'kms:CreateAlias',
'kms:DeleteAlias',
@@ -615,7 +615,9 @@ def create_integration_tests_role(self):
'kms:PutKeyPolicy',
'kms:ScheduleKeyDeletion',
'kms:TagResource',
'kms:DescribeKey',
's3:GetBucketVersioning',
's3:List*',
],
effect=iam.Effect.ALLOW,
resources=['*'],
11 changes: 6 additions & 5 deletions backend/dataall/modules/datasets_base/db/dataset_repositories.py
@@ -65,8 +65,8 @@ def _query_all_user_datasets(session, username, groups, all_subqueries: List[Que
term = filter['term']
query = query.filter(
or_(
DatasetBase.description.ilike(term + '%%'),
DatasetBase.label.ilike(term + '%%'),
DatasetBase.label.ilike('%' + term + '%'),
DatasetBase.description.ilike('%' + term + '%'),
DatasetBase.tags.contains(f'{{{term}}}'),
)
)
@@ -90,10 +90,12 @@ def _query_user_datasets(session, username, groups, filter) -> Query:
)
)
if filter and filter.get('term'):
term = filter['term']
query = query.filter(
or_(
DatasetBase.description.ilike(filter.get('term') + '%%'),
DatasetBase.label.ilike(filter.get('term') + '%%'),
DatasetBase.label.ilike('%' + term + '%'),
DatasetBase.description.ilike('%' + term + '%'),
DatasetBase.tags.contains(f'{{{term}}}'),
)
)
return query.order_by(DatasetBase.label).distinct(DatasetBase.datasetUri, DatasetBase.label)
@@ -125,7 +127,6 @@ def _query_environment_datasets(session, uri, filter) -> Query:
DatasetBase.label.ilike('%' + term + '%'),
DatasetBase.description.ilike('%' + term + '%'),
DatasetBase.tags.contains(f'{{{term}}}'),
DatasetBase.region.ilike('%' + term + '%'),
)
)
return query.order_by(DatasetBase.label)
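
For context on the filter change above: the old `term + '%%'` pattern only matched labels and descriptions that start with the search term, while the new `'%' + term + '%'` pattern matches the term anywhere, and the extra `tags.contains` clause also searches tags. A small, purely illustrative sketch of the matching semantics (label and term are made up):

```
# Illustrative only - plain-Python analogue of the SQL patterns used above.
label = 'team-finance-data'
term = 'finance'

# Old prefix pattern 'finance%' -> matches only values that START with the term.
old_match = label.lower().startswith(term)   # False

# New substring pattern '%finance%' -> matches the term anywhere in the value.
new_match = term in label.lower()            # True

# The added tags.contains(f'{{{term}}}') clause additionally matches datasets
# whose tags array contains the term exactly (array containment in the database).
```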
13 changes: 10 additions & 3 deletions tests_new/integration_tests/README.md
@@ -10,8 +10,10 @@ Currently **we support only Cognito based deployments** but support for any IdP

## Pre-requisites

- A real deployment of data.all in AWS
- An SSM parameter (`/{resource_prefix/{env_name}/testdata`) with the following contents
- A real deployment of data.all in AWS.
- For this deployment, the `cdk.json` flag `enable_pivot_role_auto_create` must be set to `true`.
- For this deployment, the `config.json` flag `cdk_pivot_role_multiple_environments_same_account` must be set to `true` if an AWS account is going to be reused for multiple environments.
- An SSM parameter (`/dataall/{env_name}/testdata`) in the DEPLOYMENT ACCOUNT with the following contents:
```
{
"users": {
@@ -85,4 +87,9 @@ You can also run the tests locally by...

## Coverage

At the moment integration tests only cover Organizations module as an example.
At the moment integration tests cover:
- Organizations
- Environments
- S3 Datasets
- Notebooks
- Worksheets
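
As a small aid for the SSM pre-requisite above, the test runner must be able to read that parameter from the deployment (tooling) account. A minimal sketch, assuming default AWS credentials and an illustrative `env_name`/region (the helper name is not part of the PR):

```
import json

import boto3


# Hypothetical helper, for illustration only: fetch the integration-test
# configuration stored in SSM in the deployment account.
def load_testdata(env_name, region):
    ssm = boto3.client('ssm', region_name=region)
    parameter = ssm.get_parameter(Name=f'/dataall/{env_name}/testdata')
    return json.loads(parameter['Parameter']['Value'])


testdata = load_testdata('dev', 'eu-west-1')
print(list(testdata['users']))  # the configured test users
```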
10 changes: 5 additions & 5 deletions tests_new/integration_tests/client.py
@@ -8,17 +8,17 @@
ENVNAME = os.getenv('ENVNAME', 'dev')


def _retry_if_connection_error(exception):
"""Return True if we should retry, False otherwise"""
return isinstance(exception, requests.exceptions.ConnectionError) or isinstance(exception, requests.ReadTimeout)


class Client:
def __init__(self, username, password):
self.username = username
self.password = password
self.token = self._get_jwt_token()

@staticmethod
def _retry_if_connection_error(exception):
"""Return True if we should retry, False otherwise"""
return isinstance(exception, requests.exceptions.ConnectionError) or isinstance(exception, requests.ReadTimeout)

@retry(
retry_on_exception=_retry_if_connection_error,
stop_max_attempt_number=3,
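
For orientation, a hedged sketch of how this client is used by the tests (the credentials and the GraphQL payload below are illustrative; real payloads come from the query helper modules):

```
from integration_tests.client import Client

# Illustrative only - the username/password normally come from the SSM testdata.
client = Client('testUser1', 'example-password')  # obtains a Cognito JWT on init
response = client.query(
    query={
        'operationName': 'listDatasets',
        'variables': {'filter': {'term': 'my-session-id'}},
        'query': 'query listDatasets($filter: DatasetFilter) { listDatasets(filter: $filter) { count } }',
    }
)
print(response.data.listDatasets.count)
```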
52 changes: 52 additions & 0 deletions tests_new/integration_tests/modules/datasets_base/queries.py
@@ -59,3 +59,55 @@ def list_datasets(client, term=''):
}
response = client.query(query=query)
return response.data.listDatasets


def list_owned_datasets(client, term=''):
query = {
'operationName': 'listOwnedDatasets',
'variables': {'filter': {'term': term}},
'query': f"""
query listOwnedDatasets($filter: DatasetFilter) {{
listOwnedDatasets(filter: $filter) {{
count
page
pages
hasNext
hasPrevious
nodes {{
{DATASET_BASE_TYPE}
}}
}}
}}
""",
}
response = client.query(query=query)
return response.data.listOwnedDatasets


def list_datasets_created_in_environment(client, environment_uri, term=''):
query = {
'operationName': 'ListDatasetsCreatedInEnvironment',
'variables': {'environmentUri': environment_uri, 'filter': {'term': term}},
'query': f"""
query ListDatasetsCreatedInEnvironment(
$filter: DatasetFilter
$environmentUri: String!
) {{
listDatasetsCreatedInEnvironment(
environmentUri: $environmentUri
filter: $filter
) {{
count
page
pages
hasNext
hasPrevious
nodes {{
{DATASET_BASE_TYPE}
}}
}}
}}
""",
}
response = client.query(query=query)
return response.data.listDatasetsCreatedInEnvironment
44 changes: 44 additions & 0 deletions tests_new/integration_tests/modules/datasets_base/test_dataset.py
@@ -0,0 +1,44 @@
import logging
from assertpy import assert_that

from integration_tests.modules.datasets_base.queries import (
list_datasets,
list_owned_datasets,
list_datasets_created_in_environment,
)

log = logging.getLogger(__name__)


def test_list_datasets(
client1, session_s3_dataset1, session_imported_sse_s3_dataset1, session_imported_kms_s3_dataset1, session_id
):
assert_that(list_datasets(client1, term=session_id).nodes).is_length(3)

Review discussion on the hard-coded count:

Contributor: In case we have to add new datasets later, maybe it's better not to fix the number? Anyway, I think it's better to check the datasets by URI.

Contributor: The same comment applies to the other tests with a fixed number.

Contributor: Because it is a list, I wanted to check that listing the items returns the expected items. But it is true that it can cause issues with other existing test resources; that's why the term=session_id filter is applied.

Contributor: I already had this problem with my share tests, but in the table tests. I created another table for the sharing tests, so e.g. here I have 3 tables, not 2:

def test_list_dataset_tables(client1, dataset_fixture_name, tables_fixture_name, request):
    tables = request.getfixturevalue(tables_fixture_name)
    dataset = request.getfixturevalue(dataset_fixture_name)
    response = list_dataset_tables(client1, dataset.datasetUri)
    assert_that(response.tables.count).is_equal_to(2)

Contributor: Maybe I should reuse the existing datasets and not create others, but I'll revise it after this PR is merged. Anyway, in the future we may have to add more tables/folders/datasets.

Contributor: I understood that dataset test objects are controlled inside the dataset tests. Other modules can use the fixtures defined in global_conftest. So yes, I think in the share tests you should be using the datasets/tables and folders from the datasets module.
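
A hedged sketch of the URI-based variant suggested in this thread (it keeps the `term=session_id` filter but drops the hard-coded count; the test name is illustrative):

```
# Sketch only: assert on the session's dataset URIs instead of a fixed length.
def test_list_datasets_by_uri(
    client1, session_s3_dataset1, session_imported_sse_s3_dataset1, session_imported_kms_s3_dataset1, session_id
):
    expected_uris = {
        session_s3_dataset1.datasetUri,
        session_imported_sse_s3_dataset1.datasetUri,
        session_imported_kms_s3_dataset1.datasetUri,
    }
    listed_uris = {dataset.datasetUri for dataset in list_datasets(client1, term=session_id).nodes}
    assert_that(listed_uris).is_equal_to(expected_uris)
```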

def test_list_datasets_unauthorized(
client2, session_s3_dataset1, session_imported_sse_s3_dataset1, session_imported_kms_s3_dataset1, session_id
):
assert_that(list_datasets(client2, term=session_id).nodes).is_length(0)


def test_list_owned_datasets( # TODO
client1, session_s3_dataset1, session_imported_sse_s3_dataset1, session_imported_kms_s3_dataset1, session_id
):
assert_that(list_owned_datasets(client1, term=session_id).nodes).is_length(3)


def test_list_owned_datasets_unauthorized( # TODO
client2, session_s3_dataset1, session_imported_sse_s3_dataset1, session_imported_kms_s3_dataset1, session_id
):
assert_that(list_owned_datasets(client2, term=session_id).nodes).is_length(0)


def test_list_datasets_created_in_environment():
# TODO
pass


def test_list_datasets_created_in_environment_unauthorized():
# TODO
pass
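
One possible shape for the TODO tests above, assuming a `session_env1` fixture exposing `environmentUri` exists in conftest (the fixture name and the exact assertions are assumptions, not part of this PR):

```
# Hypothetical sketch; session_env1 is an assumed environment fixture.
def test_list_datasets_created_in_environment(
    client1, session_env1, session_s3_dataset1, session_imported_sse_s3_dataset1, session_id
):
    response = list_datasets_created_in_environment(client1, session_env1.environmentUri, term=session_id)
    listed_uris = {dataset.datasetUri for dataset in response.nodes}
    assert_that(listed_uris).contains(session_s3_dataset1.datasetUri)
```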
88 changes: 80 additions & 8 deletions tests_new/integration_tests/modules/s3_datasets/aws_clients.py
@@ -1,6 +1,7 @@
import logging
import json
import re
import os
from botocore.exceptions import ClientError

log = logging.getLogger(__name__)
@@ -12,18 +13,33 @@ def __init__(self, session, region):
self._resource = session.resource('s3', region_name=region)
self._region = region

def create_bucket(self, bucket_name, kms_key_id=None):
def bucket_exists(self, bucket_name):
"""
Check if an S3 bucket exists.
:param bucket_name: Name of the S3 bucket to check
:return: True if the bucket exists, False otherwise
"""
try:
self._client.head_bucket(Bucket=bucket_name)
return True
except ClientError as e:
if e.response['Error']['Code'] in ['400', '403', '404']:
return False
else:
raise Exception(f'Error checking if bucket {bucket_name} exists: {e}')

def create_bucket(self, bucket_name, kms_key_arn=None):
"""
Create an S3 bucket.
:param bucket_name: Name of the S3 bucket to be created
:param kms_key_id: KMS key ID to use for encryption if encryption_type is 'aws:kms'
:param kms_key_arn: KMS key Arn to use for encryption if encryption_type is 'aws:kms'
:return: None
"""
bucket_name = re.sub('[^a-zA-Z0-9-]', '', bucket_name).lower()

encryption_type = 'aws:kms' if kms_key_id else 'AES256'
encryption_type = 'aws:kms' if kms_key_arn else 'AES256'
encryption_config = (
{'SSEAlgorithm': encryption_type, 'KMSMasterKeyID': kms_key_id}
{'SSEAlgorithm': encryption_type, 'KMSMasterKeyID': kms_key_arn}
if encryption_type == 'aws:kms'
else {'SSEAlgorithm': encryption_type}
)
@@ -41,7 +57,7 @@ def create_bucket(self, bucket_name, kms_key_id=None):
Bucket=bucket_name,
ServerSideEncryptionConfiguration={
'Rules': [
{'ApplyServerSideEncryptionByDefault': encryption_config, 'BucketKeyEnabled': False},
{'ApplyServerSideEncryptionByDefault': encryption_config, 'BucketKeyEnabled': True},
]
},
)
@@ -67,12 +83,48 @@ def delete_bucket(self, bucket_name):
except ClientError as e:
log.exception(f'Error deleting S3 bucket: {e}')

def upload_file_to_prefix(self, local_file_path, s3_path):
"""
Upload a file from a local path to an S3 bucket with a specified prefix.

:param local_file_path: Path to the local file to be uploaded
:param s3_path: S3 path where the file should be uploaded, including the bucket name and prefix
:return: None
"""
try:
bucket_name, prefix = s3_path.split('/', 1)
object_key = f'{prefix}/{os.path.basename(local_file_path)}'
self._client.upload_file(local_file_path, bucket_name, object_key)
except ClientError as e:
logging.error(f'Error uploading file to S3: {e}')
raise


class KMSClient:
def __init__(self, session, account_id, region):
self._client = session.client('kms', region_name=region)
self._account_id = account_id

def get_key_alias(self, alias_name):
"""
Get the key alias for a given alias name.
:param alias_name: The alias name to look up
:return: the alias name if the alias exists, False otherwise
"""
try:
alias_name = alias_name.lower()
# describe_key accepts 'alias/<name>' as KeyId; if the call succeeds, the alias exists
self._client.describe_key(KeyId=f'alias/{alias_name}')
return alias_name
except ClientError as e:
if e.response['Error']['Code'] == 'NotFoundException':
return False
else:
raise Exception(f'Error getting key alias for {alias_name}: {e}')

def create_key_with_alias(self, alias_name):
try:
response = self._client.create_key()
@@ -155,6 +207,19 @@ class GlueClient:
def __init__(self, session, region):
self._client = session.client('glue', region_name=region)

def get_database(self, database_name):
"""
Get a Glue database if it exists.
:param database_name: Name of the Glue database to check
:return: the database metadata if it exists, False otherwise
"""
try:
database = self._client.get_database(Name=database_name)
return database
except ClientError as e:
log.exception(f'Database not found, exception: {e}')
return False

def create_database(self, database_name, bucket):
try:
database_name = re.sub('[^a-zA-Z0-9_]', '', database_name).lower()
@@ -172,16 +237,23 @@ def create_table(self, database_name, bucket, table_name):
'Description': 'integration tests',
'StorageDescriptor': {
'Columns': [
{'Name': 'column1', 'Type': 'string'},
{'Name': 'column1', 'Type': 'int'},
{'Name': 'column2', 'Type': 'string'},
{'Name': 'column3', 'Type': 'string'},
],
'Location': f's3://{bucket}/',
'Location': f's3://{bucket}/{table_name}/',
'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
'Compressed': False,
'SerdeInfo': {
'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
'Parameters': {'field.delim': ','},
},
},
},
)
except ClientError as e:
log.exception(f'Error creating Glue database: {e}')
log.exception(f'Error creating Glue table: {e}')

def delete_database(self, database_name):
"""
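
To show how the new existence checks could fit together when reusing imported-dataset infrastructure, a hedged sketch (bucket, key-alias and database names are placeholders; the real wiring lives in the dataset fixtures):

```
import boto3

# Illustrative only: create the imported resources only if they do not already exist.
session = boto3.Session()
s3 = S3Client(session, 'eu-west-1')
kms = KMSClient(session, '111122223333', 'eu-west-1')
glue = GlueClient(session, 'eu-west-1')

bucket_name = 'dataalltestingimportedbucket'
if not s3.bucket_exists(bucket_name):
    s3.create_bucket(bucket_name)  # SSE-S3 by default; pass kms_key_arn for SSE-KMS

if not kms.get_key_alias('dataalltesting-imported-key'):
    kms.create_key_with_alias('dataalltesting-imported-key')

if not glue.get_database('dataalltesting_imported_db'):
    glue.create_database('dataalltesting_imported_db', bucket_name)
    glue.create_table('dataalltesting_imported_db', bucket_name, 'integration_test_table')
```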