diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py
index 69b86e136..981e6d8e3 100644
--- a/schematic/manifest/generator.py
+++ b/schematic/manifest/generator.py
@@ -1904,6 +1904,8 @@ def get_manifest(
     # TODO: avoid explicitly exposing Synapse store functionality
     # just instantiate a Store class and let it decide at runtime/config
     # the store type
+    # TODO: determine which parts of the fileview are necessary for `get` operations
+    # and pass query parameters at object instantiation to avoid having to re-query
     if access_token:
         # for getting an existing manifest on AWS
         store = SynapseStorage(access_token=access_token)
diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py
index 9b13bebaf..56c9a3443 100644
--- a/schematic/models/validate_attribute.py
+++ b/schematic/models/validate_attribute.py
@@ -2119,7 +2119,9 @@ def filename_validation(
         where_clauses = []

-        dataset_clause = f"parentId='{dataset_scope}'"
+        dataset_clause = SynapseStorage.build_clause_from_dataset_id(
+            dataset_id=dataset_scope
+        )
         where_clauses.append(dataset_clause)

         self._login(
diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py
index 14c260cb4..e0e8017bd 100644
--- a/schematic/store/synapse.py
+++ b/schematic/store/synapse.py
@@ -19,12 +19,10 @@
 import numpy as np
 import pandas as pd
 import synapseclient
-import synapseutils
 from opentelemetry import trace
 from synapseclient import Annotations as OldAnnotations
 from synapseclient import (
     Column,
-    Entity,
     EntityViewSchema,
     EntityViewType,
     File,
@@ -416,6 +414,30 @@ def query_fileview(
         else:
             raise AccessCredentialsError(self.storageFileview)

+    @staticmethod
+    def build_clause_from_dataset_id(
+        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
+    ) -> str:
+        """
+        Build a where clause for a Synapse fileview query based on a dataset ID.
+        Because this is a static method, it can be used before a SynapseStorage
+        object is initialized.
+        Args:
+            dataset_id: Synapse ID of a dataset that should be used to limit the query
+            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
+        Returns:
+            Clause for the query, or an empty string if no dataset ID is provided
+        """
+        # Calling this method without specifying synIDs will complete but will not scope the view
+        if (not dataset_id) and (not dataset_folder_list):
+            return ""
+
+        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
+        if dataset_folder_list:
+            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
+            return f"parentId IN ({search_folders})"
+
+        # `dataset_id` should be provided when all files are stored directly under the dataset folder
+        return f"parentId='{dataset_id}'"
+
     def _build_query(
         self, columns: Optional[list] = None, where_clauses: Optional[list] = None
     ):
@@ -666,7 +688,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
     def getFilesInStorageDataset(
         self, datasetId: str, fileNames: List = None, fullpath: bool = True
     ) -> List[Tuple[str, str]]:
-        """Gets all files in a given dataset folder.
+        """Gets all files (excluding manifest files) in a given dataset folder.

         Args:
             datasetId: synapse ID of a storage dataset.
@@ -680,105 +702,58 @@ def getFilesInStorageDataset(
         Raises:
-            ValueError: Dataset ID not found.
+            ValueError: If the fileview is empty.
+            LookupError: If the dataset could not be found in the fileview.
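+
+        Example (illustrative only; the Synapse IDs and paths are hypothetical):
+            >>> store.getFilesInStorageDataset("syn12345678")
+            [('syn11111111', 'project/dataset/sample_A.txt'), ('syn22222222', 'project/dataset/sample_B.txt')]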
""" - # select all files within a given storage dataset folder (top level folder in - # a Synapse storage project or folder marked with contentType = 'dataset') - walked_path = synapseutils.walk( - self.syn, datasetId, includeTypes=["folder", "file"] - ) - - current_entity_location = self.synapse_entity_tracker.get( - synapse_id=datasetId, syn=self.syn, download_file=False - ) - - def walk_back_to_project( - current_location: Entity, location_prefix: str, skip_entry: bool - ) -> str: - """ - Recursively walk back up the project structure to get the paths of the - names of each of the directories where we started the walk function. - - Args: - current_location (Entity): The current entity location in the project structure. - location_prefix (str): The prefix to prepend to the path. - skip_entry (bool): Whether to skip the current entry in the path. When - this is True it means we are looking at our starting point. If our - starting point is the project itself we can go ahead and return - back the project as the prefix. - - Returns: - str: The path of the names of each of the directories up to the project root. - """ - if ( - skip_entry - and "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return f"{current_location.name}/{location_prefix}" + file_list = [] - updated_prefix = ( - location_prefix - if skip_entry - else f"{current_location.name}/{location_prefix}" - ) - if ( - "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return updated_prefix - current_location = self.synapse_entity_tracker.get( - synapse_id=current_location["parentId"], - syn=self.syn, - download_file=False, + # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view + if self.storageFileviewTable.empty: + raise ValueError( + f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." ) - return walk_back_to_project( - current_location=current_location, - location_prefix=updated_prefix, - skip_entry=False, + + child_path = self.storageFileviewTable.loc[ + self.storageFileviewTable["parentId"] == datasetId, "path" + ] + if child_path.empty: + raise LookupError( + f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 
+        child_path = child_path.iloc[0]

-        prefix = walk_back_to_project(
-            current_location=current_entity_location,
-            location_prefix="",
-            skip_entry=True,
-        )
+        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
+        parent = child_path.split("/")[:-1]
+        parent = "/".join(parent)

-        project_id = self.getDatasetProject(datasetId)
-        project = self.synapse_entity_tracker.get(
-            synapse_id=project_id, syn=self.syn, download_file=False
-        )
-        project_name = project.name
-        file_list = []
+        # Format dataset path to be used in table query
+        dataset_path = f"'{parent}/%'"

-        # iterate over all results
-        for dirpath, _, path_filenames in walked_path:
-            # iterate over all files in a folder
-            for path_filename in path_filenames:
-                if ("manifest" not in path_filename[0] and not fileNames) or (
-                    fileNames and path_filename[0] in fileNames
-                ):
-                    # don't add manifest to list of files unless it is specified in the
-                    # list of specified fileNames; return all found files
-                    # except the manifest if no fileNames have been specified
-                    # TODO: refactor for clarity/maintainability
-
-                    if fullpath:
-                        # append directory path to filename
-                        if dirpath[0].startswith(f"{project_name}/"):
-                            path_without_project_prefix = (
-                                dirpath[0] + "/"
-                            ).removeprefix(f"{project_name}/")
-                            path_filename = (
-                                prefix + path_without_project_prefix + path_filename[0],
-                                path_filename[1],
-                            )
-                        else:
-                            path_filename = (
-                                prefix + dirpath[0] + "/" + path_filename[0],
-                                path_filename[1],
-                            )
+        # When querying, restrict results to files, excluding folders and other entity types
+        where_clauses = [f"path like {dataset_path}", "type='file'"]
+
+        # Requery the fileview to specifically get the files in the given dataset
+        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
+
+        # Exclude manifest files
+        non_manifest_files = self.storageFileviewTable.loc[
+            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
+            :,
+        ]
+
+        # Remove all files that are not in the list of fileNames
+        if fileNames:
+            filename_regex = "|".join(fileNames)
+
+            matching_files = non_manifest_files["path"].str.contains(
+                filename_regex, case=False, regex=True
+            )
+
+            non_manifest_files = non_manifest_files.loc[matching_files, :]
+
+        # Truncate each path to the file's basename if the full path was not requested
+        if not fullpath:
+            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

-                    # add file name file id tuple, rearranged so that id is first and name follows
-                    file_list.append(path_filename[::-1])
+        # Return the list of (id, path) tuples as expected by other methods
+        file_list = list(non_manifest_files.itertuples(index=False, name=None))

         return file_list
diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py
index 07c97af90..c6412d349 100644
--- a/schematic/utils/cli_utils.py
+++ b/schematic/utils/cli_utils.py
@@ -4,10 +4,11 @@
 # pylint: disable=anomalous-backslash-in-string

 import logging
-
-from typing import Any, Mapping, Sequence, Union, Optional
-from functools import reduce
 import re
+from functools import reduce
+from typing import Any, Mapping, Optional, Sequence, Union
+
+from schematic.utils.general import SYN_ID_REGEX

 logger = logging.getLogger(__name__)

@@ -69,7 +70,7 @@ def parse_syn_ids(
     if not syn_ids:
         return None

-    project_regex = re.compile("(syn\d+\,?)+")
+    project_regex = re.compile(SYN_ID_REGEX)
     valid = project_regex.fullmatch(syn_ids)

     if not valid:
diff --git a/schematic/utils/general.py b/schematic/utils/general.py
index 997a193bf..cdc1be2f2 100644
--- a/schematic/utils/general.py
+++ b/schematic/utils/general.py
@@ -24,6 +24,8 @@

 T = TypeVar("T")

+SYN_ID_REGEX = r"(syn\d+\,?)+"
+
 def find_duplicates(_list: list[T]) -> set[T]:
     """Find duplicate items in a list"""
diff --git a/schematic_api/api/openapi/api.yaml b/schematic_api/api/openapi/api.yaml
index 8689a8ae2..7c32e8d90 100644
--- a/schematic_api/api/openapi/api.yaml
+++ b/schematic_api/api/openapi/api.yaml
@@ -692,8 +692,8 @@ paths:
         - Synapse Storage
   /storage/dataset/files:
     get:
-      summary: Get all files in a given dataset folder
-      description: Get all files in a given dataset folder
+      summary: Get all files (excluding manifest files) in a given dataset folder
+      description: Get all files (excluding manifest files) in a given dataset folder
       operationId: schematic_api.api.routes.get_files_storage_dataset
       security:
         - access_token: []
diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py
index 8658f6e6b..4367d817b 100644
--- a/tests/integration/test_commands.py
+++ b/tests/integration/test_commands.py
@@ -4,14 +4,14 @@
 import uuid
 from io import BytesIO

+import numpy as np
+import pandas as pd
 import pytest
 import requests
-from openpyxl import load_workbook
 from click.testing import CliRunner
-import pandas as pd
-import numpy as np
+from openpyxl import load_workbook

-from schematic.configuration.configuration import Configuration, CONFIG
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.manifest.commands import manifest
 from schematic.models.commands import model
 from tests.conftest import ConfigurationForTesting
@@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None:
         # command has no (python) errors, has exit code 0
         assert result.exit_code == 0
         # command output has success message
-        assert result.output.split("\n")[4] == (
+        result_list = result.output.split("\n")
+        assert (
             "Your manifest has been validated successfully. "
             "There are no errors in your manifest, "
             "and it can be submitted without any modifications."
-        )
+        ) in result_list
         # command output has no validation errors
-        for line in result.output.split("\n")[4]:
-            assert not line.startswith("error")
+        errors = [line for line in result_list if line.startswith("error")]
+        assert not errors

     def test_validate_invalid_manifest(self, runner: CliRunner) -> None:
         """
@@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest(
         os.remove("tests/data/example.Biospecimen.schema.json")

         # command output has excel file creation message
+        result_list = result.output.split("\n")
         assert (
-            result.output.split("\n")[7]
-            == "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
+            "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
+            in result_list
         )

         sheet1 = workbook["Sheet1"]
@@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest(
         # Reset config to its default values
         CONFIG.load_config("config_example.yml")

-        assert result.output.split("\n")[7] == (
-            "Find the manifest template using this Google Sheet URL:"
-        )
-        assert result.output.split("\n")[8].startswith(
-            "https://docs.google.com/spreadsheets/d/"
-        )
-        assert result.output.split("\n")[9] == (
+        result_list = result.output.split("\n")
+        assert "Find the manifest template using this Google Sheet URL:" in result_list
+        assert (
             "Find the manifest template using this CSV file path: "
             "./CLI_gs_bulk_rna.csv"
-        )
-
-        google_sheet_url = result.output.split("\n")[8]
+        ) in result_list
+        google_sheet_result = [
+            result
+            for result in result_list
+            if result.startswith("https://docs.google.com/spreadsheets/d/")
+        ]
+        assert len(google_sheet_result) == 1
+        google_sheet_url = google_sheet_result[0]

         # Download the Google Sheets content as an Excel file and load into openpyxl
         export_url = f"{google_sheet_url}/export?format=xlsx"
@@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations(
         os.remove("tests/data/example.BulkRNA-seqAssay.schema.json")
         os.remove("./CLI_gs_bulk_rna_annos.csv")

-        assert result.output.split("\n")[10] == (
-            "Find the manifest template using this Google Sheet URL:"
-        )
-        assert result.output.split("\n")[11].startswith(
-            "https://docs.google.com/spreadsheets/d/"
-        )
-        assert result.output.split("\n")[12] == (
+        result_list = result.output.split("\n")
+        assert "Find the manifest template using this Google Sheet URL:" in result_list
+        assert (
             "Find the manifest template using this CSV file path: "
             "./CLI_gs_bulk_rna_annos.csv"
-        )
-
-        google_sheet_url = result.output.split("\n")[11]
+        ) in result_list
+        google_sheet_result = [
+            result
+            for result in result_list
+            if result.startswith("https://docs.google.com/spreadsheets/d/")
+        ]
+        assert len(google_sheet_result) == 1
+        google_sheet_url = google_sheet_result[0]

         # Download the Google Sheets content as an Excel file and load into openpyxl
         export_url = f"{google_sheet_url}/export?format=xlsx"
@@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None
         # TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202
         # Reset config to its default values
         CONFIG.load_config("config_example.yml")
-
         # Command output has excel file message
-        assert result.output.split("\n")[8] == (
+        result_list = result.output.split("\n")
+        assert (
             "Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx"
+            in result_list
         )

         sheet1 = workbook["Sheet1"]
diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py
index 2178a83b8..2f604ed5d 100644
--- a/tests/integration/test_metadata_model.py
+++ b/tests/integration/test_metadata_model.py
@@ -139,7 +139,6 @@ async def test_submit_filebased_manifest_file_and_entities_valid_manifest_submit
             dir=create_temp_folder(path=tempfile.gettempdir()),
         ) as tmp_file:
             df.to_csv(tmp_file.name, index=False)
-
             # WHEN the manifest is submitted (Assertions are handled in the helper method)
             self._submit_and_verify_manifest(
                 helpers=helpers,
@@ -229,7 +228,6 @@ async def test_submit_filebased_manifest_file_and_entities_mock_filename(
             dir=create_temp_folder(path=tempfile.gettempdir()),
         ) as tmp_file:
             df.to_csv(tmp_file.name, index=False)
-
             # WHEN the manifest is submitted (Assertions are handled in the helper method)
             self._submit_and_verify_manifest(
                 helpers=helpers,
@@ -450,6 +448,11 @@ def _submit_and_verify_manifest(
             raise ValueError(
                 "expected_manifest_id or expected_manifest_name must be provided"
             )
+        # HACK: must requery the fileview to pick up new files, since SynapseStorage
+        # queries the last cached state of the fileview, which may not contain newly
+        # created folders. This is a workaround for fileviews not always containing
+        # the latest information, since the tests don't always follow a consistent
+        # process as testing resources are created and destroyed.
+        synapse_store.query_fileview(force_requery=True)

         # Spies
         if already_spied:
diff --git a/tests/integration/test_store_synapse.py b/tests/integration/test_store_synapse.py
index 085a48fc3..5c39acc3c 100644
--- a/tests/integration/test_store_synapse.py
+++ b/tests/integration/test_store_synapse.py
@@ -1,10 +1,17 @@
-from unittest.mock import MagicMock
+from re import compile as re_compile
+from unittest.mock import MagicMock, create_autospec, patch

 import numpy as np
+import pandas as pd
 import pytest
+from synapseclient import Synapse
+from synapseclient.core.cache import Cache
+from synapseclient.table import build_table

+from schematic.configuration.configuration import Configuration
 from schematic.schemas.data_model_graph import DataModelGraphExplorer
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.general import SYN_ID_REGEX
 from schematic.utils.validate_utils import comma_separated_list_regex

@@ -125,3 +132,327 @@ def test_process_row_annotations_get_validation(
         "value2",
         "value3",
     ]
+
+    @pytest.mark.parametrize(
+        "asset_view, dataset_id, expected_files",
+        [
+            (
+                "syn23643253",
+                "syn61374924",
+                [
+                    ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"),
+                    ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"),
+                    ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"),
+                    ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn25614635",
+                [
+                    (
+                        "syn25614636",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt",
+                    ),
+                    (
+                        "syn25614637",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt",
+                    ),
+                    (
+                        "syn25614638",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn63917487",
+                "syn63917494",
+                [
+                    (
+                        "syn63917520",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt",
+                    ),
+                    (
+                        "syn63917521",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt",
+                    ),
+                    (
+                        "syn63917522",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt",
+                    ),
+                    (
+                        "syn63917518",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn63927665",
+                [
+                    (
+                        "syn63927671",
+                        "schematic - main/BulkRNAseq nested files/data/txt1.txt",
+                    ),
+                    (
+                        "syn63927672",
+                        "schematic - main/BulkRNAseq nested files/data/txt2.txt",
+                    ),
+                    (
+                        "syn63927673",
+                        "schematic - main/BulkRNAseq nested files/data/txt3.txt",
+                    ),
+                    (
+                        "syn63927670",
+                        "schematic - main/BulkRNAseq nested files/data/txt4.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn63987067",
+                [
+                    (
+                        "syn63987072",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt",
+                    ),
+                    (
+                        "syn63987073",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt",
+                    ),
+                    (
+                        "syn63987074",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt",
+                    ),
+                    (
+                        "syn63987071",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt",
+                    ),
+                ],
+            ),
+        ],
+        ids=[
+            "top level folder",
+            "nested dataset",
+            "contentType:dataset annotation",
+            "nested data files",
+            "doubly nested data files",
+        ],
+    )
+    @pytest.mark.parametrize(
+        "filenames",
+        [None, ["txt1.txt", "txt2.txt"]],
+        ids=["no file filtering", "file filtering"],
+    )
+    def test_get_files_in_storage_dataset(
+        self, filenames, asset_view, dataset_id, expected_files, synapse_store
+    ):
+        # GIVEN a SynapseStorage object with the appropriate asset view
+        synapse_store.storageFileview = asset_view
+        # WHEN getFilesInStorageDataset is called for the given dataset
+        dataset_files = synapse_store.getFilesInStorageDataset(
+            datasetId=dataset_id, fileNames=filenames
+        )
+        # AND the filenames are filtered as appropriate
+        if filenames:
+            files_to_remove = []
+            for f in expected_files:
+                retain = False
+                for name in filenames:
+                    if name in f[1]:
+                        retain = True
+                if not retain:
+                    files_to_remove.append(f)
+
+            for file in files_to_remove:
+                expected_files.remove(file)
+
+        # THEN the expected files are returned
+        # AND there are no unexpected files
+        assert dataset_files == expected_files
+        # AND the (synId, path) order is correct
+        synapse_id_regex = re_compile(SYN_ID_REGEX)
+        if dataset_files:
+            assert synapse_id_regex.fullmatch(dataset_files[0][0])
+
+    @pytest.mark.parametrize(
+        "asset_view, dataset_id, exception, exception_message",
+        [
+            (
+                "syn23643253",
+                "syn64319379",
+                LookupError,
+                "Dataset syn64319379 could not be found",
+            ),
+            ("syn64367119", "syn61374924", ValueError, "Fileview syn64367119 is empty"),
+        ],
+        ids=[
+            "empty dataset",
+            "empty view",
+        ],
+    )
+    def test_get_files_in_storage_dataset_exception(
+        self, asset_view, dataset_id, exception, exception_message, synapse_store
+    ):
+        # GIVEN a SynapseStorage object with the appropriate asset view
+        synapse_store.storageFileview = asset_view
+        # AND the correct and up-to-date fileview
+        synapse_store.query_fileview(force_requery=True)
+        # WHEN getFilesInStorageDataset is called
+        # THEN the appropriate exception should be raised, with the appropriate message
+        with pytest.raises(exception, match=exception_message):
+            synapse_store.getFilesInStorageDataset(datasetId=dataset_id, fileNames=None)
+
+    @pytest.mark.parametrize(
+        "asset_view, dataset_id, expected_files",
+        [
+            (
+                "syn23643253",
+                "syn61374924",
+                [
+                    ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"),
+                    ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"),
+                    ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"),
+                    ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn25614635",
+                [
+                    (
+                        "syn25614636",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt",
+                    ),
+                    (
+                        "syn25614637",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt",
+                    ),
+                    (
+                        "syn25614638",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn63917487",
+                "syn63917494",
+                [
+                    (
+                        "syn63917520",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt",
+                    ),
+                    (
+                        "syn63917521",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt",
+                    ),
+                    (
+                        "syn63917522",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt",
+                    ),
+                    (
+                        "syn63917518",
+                        "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn63927665",
+                [
+                    (
+                        "syn63927671",
+                        "schematic - main/BulkRNAseq nested files/data/txt1.txt",
+                    ),
+                    (
+                        "syn63927672",
+                        "schematic - main/BulkRNAseq nested files/data/txt2.txt",
+                    ),
+                    (
+                        "syn63927673",
+                        "schematic - main/BulkRNAseq nested files/data/txt3.txt",
+                    ),
+                    (
+                        "syn63927670",
+                        "schematic - main/BulkRNAseq nested files/data/txt4.txt",
+                    ),
+                ],
+            ),
+            (
+                "syn23643253",
+                "syn63987067",
+                [
+                    (
+                        "syn63987072",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt",
+                    ),
+                    (
+                        "syn63987073",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt",
+                    ),
+                    (
+                        "syn63987074",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt",
+                    ),
+                    (
+                        "syn63987071",
+                        "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt",
+                    ),
+                ],
+            ),
+        ],
+        ids=[
+            "top level folder",
+            "nested dataset",
+            "contentType:dataset annotation",
+            "nested data files",
+            "doubly nested data files",
+        ],
+    )
+    @pytest.mark.parametrize(
+        "filenames",
+        [None, ["txt1.txt", "txt2.txt"]],
+        ids=["no file filtering", "file filtering"],
+    )
+    def test_mock_get_files_in_storage_dataset(
+        self,
+        synapse_store,
+        filenames,
+        asset_view,
+        dataset_id,
+        expected_files,
+    ):
+        # GIVEN a test configuration
+        TEST_CONFIG = Configuration()
+
+        with patch(
+            "schematic.store.synapse.CONFIG", return_value=TEST_CONFIG
+        ) as mock_config:
+            # AND the appropriate asset view
+            mock_config.synapse_master_fileview_id = asset_view
+            # AND the appropriately filtered filenames
+            if filenames:
+                files_to_remove = []
+                for f in expected_files:
+                    retain = False
+                    for name in filenames:
+                        if name in f[1]:
+                            retain = True
+                    if not retain:
+                        files_to_remove.append(f)
+
+                for file in files_to_remove:
+                    expected_files.remove(file)
+
+            # WHEN getFilesInStorageDataset is called for the given dataset
+            dataset_files = synapse_store.getFilesInStorageDataset(
+                datasetId=dataset_id, fileNames=filenames
+            )
+
+            # THEN the expected files are returned
+            # AND there are no unexpected files
+            assert dataset_files == expected_files
+            # AND the (synId, path) order is correct
+            synapse_id_regex = re_compile(SYN_ID_REGEX)
+            if dataset_files:
+                assert synapse_id_regex.fullmatch(dataset_files[0][0])
diff --git a/tests/test_manifest.py b/tests/test_manifest.py
index ade80fbe9..2358019d9 100644
--- a/tests/test_manifest.py
+++ b/tests/test_manifest.py
@@ -5,6 +5,7 @@

 import pandas as pd
 import pytest
+from pandas.testing import assert_series_equal

 from schematic.configuration.configuration import Configuration
 from schematic.manifest.generator import ManifestGenerator
@@ -103,6 +104,9 @@ def mock_create_blank_google_sheet():

 @pytest.fixture(params=[True, False], ids=["sheet_url", "data_frame"])
 def manifest(dataset_id, manifest_generator, request):
+    """
+    Only seems to be used for TestManifestGenerator.test_get_manifest_first_time
+    """
     # Rename request param for readability
     sheet_url = request.param
@@ -763,13 +767,11 @@ def test_create_manifests(
         assert all_results == expected_result

     @pytest.mark.parametrize(
-        "component,datasetId,expected_file_based,expected_rows,expected_files",
+        "component, datasetId, expected_rows, expected_files, expected_annotations",
         [
-            ("Biospecimen", "syn61260107", False, 4, None),
             (
                 "BulkRNA-seqAssay",
                 "syn61374924",
-                True,
                 4,
                 pd.Series(
                     [
                         "schematic - main/BulkRNASeq and files/txt1.txt",
                         "schematic - main/BulkRNASeq and files/txt2.txt",
                         "schematic - main/BulkRNASeq and files/txt3.txt",
                         "schematic - main/BulkRNASeq and files/txt4.txt",
                     ],
                     name="Filename",
                 ),
+                {
+                    "File Format": pd.Series(
+                        ["BAM", "CRAM", "CSV/TSV", ""], name="File Format"
+                    ),
+                    "Genome Build": pd.Series(
+                        ["GRCh37", "GRCh38", "GRCm38", ""], name="Genome Build"
+                    ),
+                },
+            ),
+            (
+                "BulkRNA-seqAssay",
+                "syn25614635",
+                3,
+                pd.Series(
+                    [
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt",
+                        "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt",
+                    ],
+                    name="Filename",
+                ),
+                {
+                    "File Format": pd.Series(
+                        ["txt", "csv", "fastq"], name="File Format"
+                    ),
+                    "Year of Birth": pd.Series(["1980", "", ""], name="Year of Birth"),
+                },
             ),
         ],
-        ids=["Record based", "File based"],
+        ids=[
+            "top level folder",
+            "nested dataset",
+        ],
     )
+    @pytest.mark.parametrize("use_annotations", [True, False])
     def test_get_manifest_with_files(
         self,
         helpers,
         component,
         datasetId,
-        expected_file_based,
         expected_rows,
         expected_files,
+        expected_annotations,
+        use_annotations,
     ):
         """
         Test to ensure that
@@ -813,27 +847,37 @@
             path_to_data_model=path_to_data_model,
             graph=graph_data_model,
             root=component,
-            use_annotations=True,
+            use_annotations=use_annotations,
         )

-        # WHEN a manifest is generated for the appropriate dataset as a dataframe
+        # WHEN a filebased manifest is generated for the appropriate dataset as a dataframe
         manifest = generator.get_manifest(
             dataset_id=datasetId, output_format="dataframe"
         )

-        # AND it is determined if the manifest is filebased
-        is_file_based = "Filename" in manifest.columns
-
         # AND the number of rows are checked
         n_rows = manifest.shape[0]

         # THEN the manifest should have the expected number of rows
         assert n_rows == expected_rows

-        # AND the manifest should be filebased or not as expected
-        assert is_file_based == expected_file_based
+        # AND the manifest should have the columns expected of filebased metadata
+        assert "Filename" in manifest.columns
+        assert "entityId" in manifest.columns
+
+        # AND the manifest should have the expected files from the dataset
+        assert_series_equal(
+            manifest["Filename"],
+            expected_files,
+            check_dtype=False,
+        )

-        # AND if the manifest is file based
-        if expected_file_based:
-            # THEN the manifest should have the expected files
-            assert manifest["Filename"].equals(expected_files)
+        # AND if annotations are used to generate the manifest
+        if use_annotations:
+            # THEN the annotations in the generated manifest should match the expected annotations
+            for attribute, annotations in expected_annotations.items():
+                assert_series_equal(
+                    manifest[attribute],
+                    annotations,
+                    check_dtype=False,
+                )
diff --git a/tests/test_store.py b/tests/test_store.py
index c9f32bec2..516b7805d 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -11,7 +11,7 @@
 import uuid
 from contextlib import nullcontext as does_not_raise
 from typing import Any, Callable, Generator
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch

 import pandas as pd
 import pytest
@@ -22,6 +22,7 @@
 from synapseclient.entity import File, Project
 from synapseclient.models import Annotations
 from synapseclient.models import Folder as FolderModel
+from synapseclient.table import build_table

 from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
@@ -463,46 +464,103 @@ def test_getDatasetProject(self, dataset_id, synapse_store):
             (
                 True,
                 [
-                    ("syn126", "schematic - main/parent_folder/test_file"),
+                    ("syn126", "syn_mock", "schematic - main/parent_folder/test_file"),
                     (
                         "syn125",
+                        "syn_mock",
                         "schematic - main/parent_folder/test_folder/test_file_2",
                     ),
                 ],
             ),
-            (False, [("syn126", "test_file"), ("syn125", "test_file_2")]),
+            (
+                False,
+                [
+                    ("syn126", "syn_mock", "test_file"),
+                    ("syn125", "syn_mock", "test_file_2"),
+                ],
+            ),
         ],
     )
     def test_getFilesInStorageDataset(self, synapse_store, full_path, expected):
-        mock_return = [
+        mock_table_dataframe_return = pd.DataFrame(
+            {
+                "id": ["syn126", "syn125"],
+                "parentId": ["syn_mock", "syn_mock"],
+                "path": [
+                    "schematic - main/parent_folder/test_file",
+                    "schematic - main/parent_folder/test_folder/test_file_2",
+                ],
+            }
+        )
+
+        with patch.object(
+            synapse_store, "storageFileviewTable", mock_table_dataframe_return
+        ), patch.object(synapse_store, "query_fileview") as mocked_query:
+            # query_fileview is patched so the requery inside getFilesInStorageDataset
+            # does not overwrite the mocked table
+            mocked_query.return_value = mock_table_dataframe_return
+
+            file_list = synapse_store.getFilesInStorageDataset(
+                datasetId="syn_mock", fileNames=None, fullpath=full_path
+            )
+            assert file_list == expected
+
+    @pytest.mark.parametrize(
+        "mock_table_dataframe, raised_exception, exception_message",
+        [
             (
-                ("parent_folder", "syn123"),
-                [("test_folder", "syn124")],
-                [("test_file", "syn126")],
+                pd.DataFrame(
+                    {
+                        "id": ["child_syn_mock"],
+                        "path": ["schematic - main/parent_folder/child_entity"],
+                        "parentId": ["wrong_syn_mock"],
+                    }
+                ),
+                LookupError,
+                "Dataset syn_mock could not be found",
             ),
             (
-                (
-                    os.path.join("schematic - main", "parent_folder", "test_folder"),
-                    "syn124",
+                pd.DataFrame(
+                    {
+                        "id": [],
+                        "path": [],
+                        "parentId": [],
+                    }
                 ),
-                [],
-                [("test_file_2", "syn125")],
+                ValueError,
+                "Fileview mock_table_id is empty",
             ),
-        ]
-        with patch(
-            "synapseutils.walk_functions._help_walk", return_value=mock_return
-        ) as mock_walk_patch, patch(
-            "schematic.store.synapse.SynapseStorage.getDatasetProject",
-            return_value="syn23643250",
-        ) as mock_project_id_patch, patch(
-            "synapseclient.entity.Entity.__getattr__", return_value="schematic - main"
-        ) as mock_project_name_patch, patch.object(
-            synapse_store.syn, "get", return_value=Project(name="schematic - main")
-        ):
-            file_list = synapse_store.getFilesInStorageDataset(
-                datasetId="syn_mock", fileNames=None, fullpath=full_path
-            )
-            assert file_list == expected
+        ],
+        ids=["missing_dataset", "empty_fileview"],
+    )
+    @pytest.mark.parametrize(
+        "full_path",
+        [
+            (True),
+            (False),
+        ],
+    )
+    def test_get_files_in_storage_dataset_exception(
+        self,
+        synapse_store,
+        mock_table_dataframe,
+        raised_exception,
+        exception_message,
+        full_path,
+    ):
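+        # Both the cached table and query_fileview are patched below so that no real
+        # Synapse call occurs and the mocked fileview state is exactly what
+        # getFilesInStorageDataset sees when it re-queries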
"storageFileviewTable", mock_table_dataframe + ), patch.object( + synapse_store, "storageFileview", "mock_table_id" + ) as mock_table_id, patch.object( + synapse_store, "query_fileview" + ) as mocked_query: + # query_fileview is the function called to get the fileview + mocked_query.return_value = mock_table_dataframe + + with pytest.raises(raised_exception, match=exception_message): + synapse_store.getFilesInStorageDataset( + datasetId="syn_mock", fileNames=None, fullpath=full_path + ) @pytest.mark.parametrize("downloadFile", [True, False]) def test_getDatasetManifest(self, synapse_store, downloadFile): @@ -926,6 +984,26 @@ async def mock_success_coro() -> None: await synapse_store._process_store_annos(new_tasks) mock_store_async.assert_not_called() + @pytest.mark.parametrize( + "dataset_id, dataset_folder_list, expected_clause", + [ + ("syn12345678", None, "parentId='syn12345678'"), + ( + "syn63927665", + ["syn63927665", "syn63927667"], + "parentId IN ('syn63927665', 'syn63927667')", + ), + (None, None, ""), + ], + ) + def test_build_clause_from_dataset_id( + self, dataset_id, dataset_folder_list, expected_clause + ): + dataset_clause = SynapseStorage.build_clause_from_dataset_id( + dataset_id=dataset_id, dataset_folder_list=dataset_folder_list + ) + assert dataset_clause == expected_clause + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store):