Skip to content

Commit

Permalink
[SCHEMATIC-183] Use paths from file view for manifest generation (#1529)
Browse files Browse the repository at this point in the history
Source manifest file paths from Synapse fileviews at generation
  • Loading branch information
GiaJordan authored Dec 10, 2024
1 parent c681346 commit ef87c39
Show file tree
Hide file tree
Showing 11 changed files with 622 additions and 180 deletions.
2 changes: 2 additions & 0 deletions schematic/manifest/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1904,6 +1904,8 @@ def get_manifest(
# TODO: avoid explicitly exposing Synapse store functionality
# just instantiate a Store class and let it decide at runtime/config
# the store type
# TODO: determine which parts of fileview are necessary for `get` operations
# and pass query parameters at object instantiation to avoid having to re-query
if access_token:
# for getting an existing manifest on AWS
store = SynapseStorage(access_token=access_token)
Expand Down
4 changes: 3 additions & 1 deletion schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -2119,7 +2119,9 @@ def filename_validation(

where_clauses = []

dataset_clause = f"parentId='{dataset_scope}'"
dataset_clause = SynapseStorage.build_clause_from_dataset_id(
dataset_id=dataset_scope
)
where_clauses.append(dataset_clause)

self._login(
Expand Down
165 changes: 70 additions & 95 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@
import numpy as np
import pandas as pd
import synapseclient
import synapseutils
from opentelemetry import trace
from synapseclient import Annotations as OldAnnotations
from synapseclient import (
Column,
Entity,
EntityViewSchema,
EntityViewType,
File,
Expand Down Expand Up @@ -416,6 +414,30 @@ def query_fileview(
else:
raise AccessCredentialsError(self.storageFileview)

@staticmethod
def build_clause_from_dataset_id(
dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
) -> str:
"""
Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
Args:
dataset_id: Synapse ID of a dataset that should be used to limit the query
dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:
clause for the query or an empty string if no dataset ID is provided
"""
# Calling this method without specifying synIDs will complete but will not scope the view
if (not dataset_id) and (not dataset_folder_list):
return ""

# This will be used to gather files under a dataset recursively with a fileview query instead of walking
if dataset_folder_list:
search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
return f"parentId IN ({search_folders})"

# `dataset_id` should be provided when all files are stored directly under the dataset folder
return f"parentId='{dataset_id}'"

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
):
Expand Down Expand Up @@ -666,7 +688,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
def getFilesInStorageDataset(
self, datasetId: str, fileNames: List = None, fullpath: bool = True
) -> List[Tuple[str, str]]:
"""Gets all files in a given dataset folder.
"""Gets all files (excluding manifest files) in a given dataset folder.
Args:
datasetId: synapse ID of a storage dataset.
Expand All @@ -680,105 +702,58 @@ def getFilesInStorageDataset(
Raises:
ValueError: Dataset ID not found.
"""
# select all files within a given storage dataset folder (top level folder in
# a Synapse storage project or folder marked with contentType = 'dataset')
walked_path = synapseutils.walk(
self.syn, datasetId, includeTypes=["folder", "file"]
)

current_entity_location = self.synapse_entity_tracker.get(
synapse_id=datasetId, syn=self.syn, download_file=False
)

def walk_back_to_project(
current_location: Entity, location_prefix: str, skip_entry: bool
) -> str:
"""
Recursively walk back up the project structure to get the paths of the
names of each of the directories where we started the walk function.
Args:
current_location (Entity): The current entity location in the project structure.
location_prefix (str): The prefix to prepend to the path.
skip_entry (bool): Whether to skip the current entry in the path. When
this is True it means we are looking at our starting point. If our
starting point is the project itself we can go ahead and return
back the project as the prefix.
Returns:
str: The path of the names of each of the directories up to the project root.
"""
if (
skip_entry
and "concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return f"{current_location.name}/{location_prefix}"
file_list = []

updated_prefix = (
location_prefix
if skip_entry
else f"{current_location.name}/{location_prefix}"
)
if (
"concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return updated_prefix
current_location = self.synapse_entity_tracker.get(
synapse_id=current_location["parentId"],
syn=self.syn,
download_file=False,
# Get the path to the dataset folder via its children, to cover cases where the dataset itself is the scope of the view
if self.storageFileviewTable.empty:
raise ValueError(
f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
)
return walk_back_to_project(
current_location=current_location,
location_prefix=updated_prefix,
skip_entry=False,

child_path = self.storageFileviewTable.loc[
self.storageFileviewTable["parentId"] == datasetId, "path"
]
if child_path.empty:
raise LookupError(
f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
)
child_path = child_path.iloc[0]

prefix = walk_back_to_project(
current_location=current_entity_location,
location_prefix="",
skip_entry=True,
)
# Get the dataset path by eliminating the child's portion of the path to account for nested datasets
parent = child_path.split("/")[:-1]
parent = "/".join(parent)

project_id = self.getDatasetProject(datasetId)
project = self.synapse_entity_tracker.get(
synapse_id=project_id, syn=self.syn, download_file=False
)
project_name = project.name
file_list = []
# Format dataset path to be used in table query
dataset_path = f"'{parent}/%'"

# iterate over all results
for dirpath, _, path_filenames in walked_path:
# iterate over all files in a folder
for path_filename in path_filenames:
if ("manifest" not in path_filename[0] and not fileNames) or (
fileNames and path_filename[0] in fileNames
):
# don't add manifest to list of files unless it is specified in the
# list of specified fileNames; return all found files
# except the manifest if no fileNames have been specified
# TODO: refactor for clarity/maintainability

if fullpath:
# append directory path to filename
if dirpath[0].startswith(f"{project_name}/"):
path_without_project_prefix = (
dirpath[0] + "/"
).removeprefix(f"{project_name}/")
path_filename = (
prefix + path_without_project_prefix + path_filename[0],
path_filename[1],
)
else:
path_filename = (
prefix + dirpath[0] + "/" + path_filename[0],
path_filename[1],
)
# When querying, restrict to type='file' so that folder entities and subdirectories are excluded
where_clauses = [f"path like {dataset_path}", "type='file'"]

# Requery the fileview to specifically get the files in the given dataset
self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)

# Exclude manifest files
non_manifest_files = self.storageFileviewTable.loc[
~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
:,
]

# Remove all files that are not in the list of fileNames
if fileNames:
filename_regex = "|".join(fileNames)

matching_files = non_manifest_files["path"].str.contains(
filename_regex, case=False, regex=True
)

non_manifest_files = non_manifest_files.loc[matching_files, :]

# Truncate path if necessary
if not fullpath:
non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

# add file name file id tuple, rearranged so that id is first and name follows
file_list.append(path_filename[::-1])
# Return list of files as expected by other methods
file_list = list(non_manifest_files.itertuples(index=False, name=None))

return file_list

Expand Down
9 changes: 5 additions & 4 deletions schematic/utils/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# pylint: disable=anomalous-backslash-in-string

import logging

from typing import Any, Mapping, Sequence, Union, Optional
from functools import reduce
import re
from functools import reduce
from typing import Any, Mapping, Optional, Sequence, Union

from schematic.utils.general import SYN_ID_REGEX

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,7 +70,7 @@ def parse_syn_ids(
if not syn_ids:
return None

project_regex = re.compile("(syn\d+\,?)+")
project_regex = re.compile(SYN_ID_REGEX)
valid = project_regex.fullmatch(syn_ids)

if not valid:
Expand Down
2 changes: 2 additions & 0 deletions schematic/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

T = TypeVar("T")

SYN_ID_REGEX = r"(syn\d+\,?)+"


def find_duplicates(_list: list[T]) -> set[T]:
"""Find duplicate items in a list"""
Expand Down
4 changes: 2 additions & 2 deletions schematic_api/api/openapi/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,8 @@ paths:
- Synapse Storage
/storage/dataset/files:
get:
summary: Get all files in a given dataset folder
description: Get all files in a given dataset folder
summary: Get all files (excluding manifest files) in a given dataset folder
description: Get all files (excluding manifest files) in a given dataset folder
operationId: schematic_api.api.routes.get_files_storage_dataset
security:
- access_token: []
Expand Down
68 changes: 36 additions & 32 deletions tests/integration/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import uuid
from io import BytesIO

import numpy as np
import pandas as pd
import pytest
import requests
from openpyxl import load_workbook
from click.testing import CliRunner
import pandas as pd
import numpy as np
from openpyxl import load_workbook

from schematic.configuration.configuration import Configuration, CONFIG
from schematic.configuration.configuration import CONFIG, Configuration
from schematic.manifest.commands import manifest
from schematic.models.commands import model
from tests.conftest import ConfigurationForTesting
Expand Down Expand Up @@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None:
# command has no (python) errors, has exit code 0
assert result.exit_code == 0
# command output has success message
assert result.output.split("\n")[4] == (
result_list = result.output.split("\n")
assert (
"Your manifest has been validated successfully. "
"There are no errors in your manifest, "
"and it can be submitted without any modifications."
)
) in result_list
# command output has no validation errors
for line in result.output.split("\n")[4]:
assert not line.startswith("error")
errors = [errors for result in result_list if result.startswith("error")]

def test_validate_invalid_manifest(self, runner: CliRunner) -> None:
"""
Expand Down Expand Up @@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest(
os.remove("tests/data/example.Biospecimen.schema.json")

# command output has excel file creation message
result_list = result.output.split("\n")
assert (
result.output.split("\n")[7]
== "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
"Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
Expand Down Expand Up @@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest(
# Reset config to it's default values
CONFIG.load_config("config_example.yml")

assert result.output.split("\n")[7] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[8].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[9] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna.csv"
)

google_sheet_url = result.output.split("\n")[8]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
Expand Down Expand Up @@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations(
os.remove("tests/data/example.BulkRNA-seqAssay.schema.json")
os.remove("./CLI_gs_bulk_rna_annos.csv")

assert result.output.split("\n")[10] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[11].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[12] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna_annos.csv"
)

google_sheet_url = result.output.split("\n")[11]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
Expand Down Expand Up @@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None
# TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202
# Reset config to it's default values
CONFIG.load_config("config_example.yml")

# Command output has excel file message
assert result.output.split("\n")[8] == (
result_list = result.output.split("\n")
assert (
"Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
Expand Down
Loading

0 comments on commit ef87c39

Please sign in to comment.