Skip to content

Commit

Permalink
[SCHEMATIC-183] Use paths from file view for manifest generation (#1529)
Browse files Browse the repository at this point in the history
Source manifest file paths from Synapse fileviews at generation
  • Loading branch information
GiaJordan authored Dec 10, 2024
1 parent c681346 commit ef87c39
Show file tree
Hide file tree
Showing 11 changed files with 622 additions and 180 deletions.
2 changes: 2 additions & 0 deletions schematic/manifest/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1904,6 +1904,8 @@ def get_manifest(
# TODO: avoid explicitly exposing Synapse store functionality
# just instantiate a Store class and let it decide at runtime/config
# the store type
# TODO: determine which parts of fileview are necessary for `get` operations
# and pass query parameters at object instantiation to avoid having to re-query
if access_token:
# for getting an existing manifest on AWS
store = SynapseStorage(access_token=access_token)
Expand Down
4 changes: 3 additions & 1 deletion schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -2119,7 +2119,9 @@ def filename_validation(

where_clauses = []

dataset_clause = f"parentId='{dataset_scope}'"
dataset_clause = SynapseStorage.build_clause_from_dataset_id(
dataset_id=dataset_scope
)
where_clauses.append(dataset_clause)

self._login(
Expand Down
165 changes: 70 additions & 95 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@
import numpy as np
import pandas as pd
import synapseclient
import synapseutils
from opentelemetry import trace
from synapseclient import Annotations as OldAnnotations
from synapseclient import (
Column,
Entity,
EntityViewSchema,
EntityViewType,
File,
Expand Down Expand Up @@ -416,6 +414,30 @@ def query_fileview(
else:
raise AccessCredentialsError(self.storageFileview)

@staticmethod
def build_clause_from_dataset_id(
dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
) -> str:
"""
Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
Args:
dataset_id: Synapse ID of a dataset that should be used to limit the query
dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:
clause for the query or an empty string if no dataset ID is provided
"""
# Calling this method without specifying synIDs will complete but will not scope the view
if (not dataset_id) and (not dataset_folder_list):
return ""

# This will be used to gather files under a dataset recursively with a fileview query instead of walking
if dataset_folder_list:
search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
return f"parentId IN ({search_folders})"

# `dataset_id` should be provided when all files are stored directly under the dataset folder
return f"parentId='{dataset_id}'"

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
):
Expand Down Expand Up @@ -666,7 +688,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
def getFilesInStorageDataset(
self, datasetId: str, fileNames: List = None, fullpath: bool = True
) -> List[Tuple[str, str]]:
"""Gets all files in a given dataset folder.
"""Gets all files (excluding manifest files) in a given dataset folder.
Args:
datasetId: synapse ID of a storage dataset.
Expand All @@ -680,105 +702,58 @@ def getFilesInStorageDataset(
Raises:
ValueError: Dataset ID not found.
"""
# select all files within a given storage dataset folder (top level folder in
# a Synapse storage project or folder marked with contentType = 'dataset')
walked_path = synapseutils.walk(
self.syn, datasetId, includeTypes=["folder", "file"]
)

current_entity_location = self.synapse_entity_tracker.get(
synapse_id=datasetId, syn=self.syn, download_file=False
)

def walk_back_to_project(
current_location: Entity, location_prefix: str, skip_entry: bool
) -> str:
"""
Recursively walk back up the project structure to get the paths of the
names of each of the directories where we started the walk function.
Args:
current_location (Entity): The current entity location in the project structure.
location_prefix (str): The prefix to prepend to the path.
skip_entry (bool): Whether to skip the current entry in the path. When
this is True it means we are looking at our starting point. If our
starting point is the project itself we can go ahead and return
back the project as the prefix.
Returns:
str: The path of the names of each of the directories up to the project root.
"""
if (
skip_entry
and "concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return f"{current_location.name}/{location_prefix}"
file_list = []

updated_prefix = (
location_prefix
if skip_entry
else f"{current_location.name}/{location_prefix}"
)
if (
"concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return updated_prefix
current_location = self.synapse_entity_tracker.get(
synapse_id=current_location["parentId"],
syn=self.syn,
download_file=False,
# Get the path to the dataset folder via its children, to cover cases where the dataset itself is the scope of the view
if self.storageFileviewTable.empty:
raise ValueError(
f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
)
return walk_back_to_project(
current_location=current_location,
location_prefix=updated_prefix,
skip_entry=False,

child_path = self.storageFileviewTable.loc[
self.storageFileviewTable["parentId"] == datasetId, "path"
]
if child_path.empty:
raise LookupError(
f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
)
child_path = child_path.iloc[0]

prefix = walk_back_to_project(
current_location=current_entity_location,
location_prefix="",
skip_entry=True,
)
# Get the dataset path by eliminating the child's portion of the path to account for nested datasets
parent = child_path.split("/")[:-1]
parent = "/".join(parent)

project_id = self.getDatasetProject(datasetId)
project = self.synapse_entity_tracker.get(
synapse_id=project_id, syn=self.syn, download_file=False
)
project_name = project.name
file_list = []
# Format dataset path to be used in table query
dataset_path = f"'{parent}/%'"

# iterate over all results
for dirpath, _, path_filenames in walked_path:
# iterate over all files in a folder
for path_filename in path_filenames:
if ("manifest" not in path_filename[0] and not fileNames) or (
fileNames and path_filename[0] in fileNames
):
# don't add manifest to list of files unless it is specified in the
# list of specified fileNames; return all found files
# except the manifest if no fileNames have been specified
# TODO: refactor for clarity/maintainability

if fullpath:
# append directory path to filename
if dirpath[0].startswith(f"{project_name}/"):
path_without_project_prefix = (
dirpath[0] + "/"
).removeprefix(f"{project_name}/")
path_filename = (
prefix + path_without_project_prefix + path_filename[0],
path_filename[1],
)
else:
path_filename = (
prefix + dirpath[0] + "/" + path_filename[0],
path_filename[1],
)
# When querying, restrict to type='file' so that folder entities and subdirectories are excluded
where_clauses = [f"path like {dataset_path}", "type='file'"]

# Requery the fileview to specifically get the files in the given dataset
self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)

# Exclude manifest files
non_manifest_files = self.storageFileviewTable.loc[
~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
:,
]

# Remove all files that are not in the list of fileNames
if fileNames:
filename_regex = "|".join(fileNames)

matching_files = non_manifest_files["path"].str.contains(
filename_regex, case=False, regex=True
)

non_manifest_files = non_manifest_files.loc[matching_files, :]

# Truncate path if necessary
if not fullpath:
non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

# add file name file id tuple, rearranged so that id is first and name follows
file_list.append(path_filename[::-1])
# Return list of files as expected by other methods
file_list = list(non_manifest_files.itertuples(index=False, name=None))

return file_list

Expand Down
9 changes: 5 additions & 4 deletions schematic/utils/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# pylint: disable=anomalous-backslash-in-string

import logging

from typing import Any, Mapping, Sequence, Union, Optional
from functools import reduce
import re
from functools import reduce
from typing import Any, Mapping, Optional, Sequence, Union

from schematic.utils.general import SYN_ID_REGEX

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,7 +70,7 @@ def parse_syn_ids(
if not syn_ids:
return None

project_regex = re.compile("(syn\d+\,?)+")
project_regex = re.compile(SYN_ID_REGEX)
valid = project_regex.fullmatch(syn_ids)

if not valid:
Expand Down
2 changes: 2 additions & 0 deletions schematic/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

T = TypeVar("T")

SYN_ID_REGEX = r"(syn\d+\,?)+"


def find_duplicates(_list: list[T]) -> set[T]:
"""Find duplicate items in a list"""
Expand Down
4 changes: 2 additions & 2 deletions schematic_api/api/openapi/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,8 @@ paths:
- Synapse Storage
/storage/dataset/files:
get:
summary: Get all files in a given dataset folder
description: Get all files in a given dataset folder
summary: Get all files (excluding manifest files) in a given dataset folder
description: Get all files (excluding manifest files) in a given dataset folder
operationId: schematic_api.api.routes.get_files_storage_dataset
security:
- access_token: []
Expand Down
68 changes: 36 additions & 32 deletions tests/integration/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import uuid
from io import BytesIO

import numpy as np
import pandas as pd
import pytest
import requests
from openpyxl import load_workbook
from click.testing import CliRunner
import pandas as pd
import numpy as np
from openpyxl import load_workbook

from schematic.configuration.configuration import Configuration, CONFIG
from schematic.configuration.configuration import CONFIG, Configuration
from schematic.manifest.commands import manifest
from schematic.models.commands import model
from tests.conftest import ConfigurationForTesting
Expand Down Expand Up @@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None:
# command has no (python) errors, has exit code 0
assert result.exit_code == 0
# command output has success message
assert result.output.split("\n")[4] == (
result_list = result.output.split("\n")
assert (
"Your manifest has been validated successfully. "
"There are no errors in your manifest, "
"and it can be submitted without any modifications."
)
) in result_list
# command output has no validation errors
for line in result.output.split("\n")[4]:
assert not line.startswith("error")
errors = [errors for result in result_list if result.startswith("error")]

def test_validate_invalid_manifest(self, runner: CliRunner) -> None:
"""
Expand Down Expand Up @@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest(
os.remove("tests/data/example.Biospecimen.schema.json")

# command output has excel file creation message
result_list = result.output.split("\n")
assert (
result.output.split("\n")[7]
== "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
"Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
Expand Down Expand Up @@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest(
# Reset config to it's default values
CONFIG.load_config("config_example.yml")

assert result.output.split("\n")[7] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[8].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[9] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna.csv"
)

google_sheet_url = result.output.split("\n")[8]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
Expand Down Expand Up @@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations(
os.remove("tests/data/example.BulkRNA-seqAssay.schema.json")
os.remove("./CLI_gs_bulk_rna_annos.csv")

assert result.output.split("\n")[10] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[11].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[12] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna_annos.csv"
)

google_sheet_url = result.output.split("\n")[11]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
Expand Down Expand Up @@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None
# TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202
# Reset config to it's default values
CONFIG.load_config("config_example.yml")

# Command output has excel file message
assert result.output.split("\n")[8] == (
result_list = result.output.split("\n")
assert (
"Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
Expand Down
Loading

0 comments on commit ef87c39

Please sign in to comment.