From fd42f88a5e2a3c61faae164a4877f756c0900c72 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 18 Jun 2024 19:56:10 -0700 Subject: [PATCH 01/34] Add support for unit test extraction + running --- .gitignore | 4 +- Makefile | 5 + lib.py | 65 ++++++-- run.py | 132 +++++++++++----- run_unit.py | 37 +++++ setup_unit_tests.py | 363 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 556 insertions(+), 50 deletions(-) create mode 100644 run_unit.py create mode 100644 setup_unit_tests.py diff --git a/.gitignore b/.gitignore index a05d4cd..10df9a1 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,6 @@ tmp/ _version_1.0.wdl _version_1.1.wdl _version_draft-2.wdl -datasets/ \ No newline at end of file +datasets/ +unit_tests/ +wdl-1.1-spec/ \ No newline at end of file diff --git a/Makefile b/Makefile index f20d4f0..0f0a9c7 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,7 @@ build: cromwell womtool clean-build: clean rm -rf build + clean: rm -f results*.json rm -rf miniwdl-logs @@ -27,6 +28,10 @@ clean: clean-csv: rm csv_output_* +clean-unit: + rm -rf wdl-1.1-spec + rm -rf unit_tests + cromwell: mkdir -p build if [ ! -f build/cromwell.jar ]; then \ diff --git a/lib.py b/lib.py index 4d90259..0e3733d 100644 --- a/lib.py +++ b/lib.py @@ -1,12 +1,13 @@ # generic helper functions import os +import re import subprocess from WDL.Type import Float as WDLFloat, String as WDLString, File as WDLFile, Int as WDLInt, Boolean as WDLBool, \ Array as WDLArray, Map as WDLMap, Pair as WDLPair, StructInstance as WDLStruct -from typing import Optional, Any +from typing import Optional, Any, Dict, Union from WDL.Type import Base as WDLBase @@ -215,6 +216,7 @@ def get_wdl_file(wdl_file: str, wdl_dir: str, version: str) -> str: def run_cmd(cmd, cwd): + print(" ".join(cmd)) p = subprocess.Popen(cmd, stdout=-1, stderr=-1, cwd=cwd) stdout, stderr = p.communicate() @@ -302,13 +304,14 @@ def get_test_indices(number_argument): return tests -def get_specific_tests(conformance_tests, tag_argument, number_argument, exclude_number_argument, id_argument): +def get_specific_tests(conformance_tests, tag_argument, number_argument, id_argument, exclude_number_argument=None, exclude_tags_argument=None): """ Given the expected tests, tag argument, and number argument, return a list of all test numbers/indices to run """ given_indices = get_test_indices(number_argument) exclude_indices = get_test_indices(exclude_number_argument) given_tags = get_tags(tag_argument) + exclude_tags = get_tags(exclude_tags_argument) ids_to_test = None if id_argument is None else set(id_argument.split(',')) tests = set() given_indices = given_indices or [] @@ -316,6 +319,8 @@ def get_specific_tests(conformance_tests, tag_argument, number_argument, exclude if exclude_indices is not None and test_number in exclude_indices: continue test_tags = conformance_tests[test_number]['tags'] + if exclude_tags is not None and not set(test_tags).isdisjoint(exclude_tags): + continue test_id = conformance_tests[test_number]['id'] if test_number in given_indices: tests.add(test_number) @@ -330,14 +335,15 @@ def get_specific_tests(conformance_tests, tag_argument, number_argument, exclude return sorted(list(tests)) -def verify_failure(ret_code: int) -> dict: +def verify_failure(expected: Dict[str, Any], ret_code: int) -> dict: """ Verify that the workflow did fail ret_code should be the status code WDL runner outputs when running the test + :param expected: Expected failure outpute, ex: return_code: 42 :param ret_code: return code from WDL runner - If ret_code is 
fail (>0 or True), then return success + If ret_code is fail (>0 or True) and expected return code matches or doesn't exist, then return success If ret_code is success (0 or False), then return failure """ # This currently only tests if the workflow simply failed to run or not. It cannot differentiate @@ -349,8 +355,22 @@ def verify_failure(ret_code: int) -> dict: return {'status': 'FAILED', 'reason': f"Workflow did not fail!"} - # proper failure, return success - return {'status': f'SUCCEEDED'} + # proper failure, if expected return code is empty, return success + expected_ret_code = expected["fail"].get("return_code") + success = {'status': f'SUCCEEDED'} + if expected_ret_code is None: + return success + if not isinstance(expected_ret_code, list): + expected_ret_code = [expected_ret_code] + for rc in expected_ret_code: + if rc == "*": + # this stands for any return code + return success + if rc == ret_code: + return success + # else, the workflow return code does not match the expected return code + return {'status': 'FAILED', + 'reason': f"Workflow did not return the correct return code! Got: {ret_code}. Expected: {','.join(expected_ret_code)}."} def py_type_of_wdl_class(wdl_type: WDLBase): @@ -372,7 +392,10 @@ def wdl_inner_type(wdl_type): Get the interior type of a WDL type. So "Array[String]" gives "String". """ if '[' in wdl_type: - return '['.join(wdl_type.split('[')[1:])[:-1] + remaining = '['.join(wdl_type.split('[')[1:]) # get remaining string starting from open bracket + end_idx = len(remaining) - remaining[::-1].index(']') - 1 # find index of closing bracket + # remove postfix quantifiers + return remaining[:end_idx] else: return wdl_type @@ -387,7 +410,7 @@ def wdl_outer_type(wdl_type): return wdl_type.split('[')[0] -def wdl_type_to_miniwdl_class(wdl_type: Any) -> Optional[WDLBase]: +def wdl_type_to_miniwdl_class(wdl_type: Union[Dict[str, Any], str]) -> Optional[WDLBase]: """ Given a WDL type name, return a MiniWDL class. @@ -398,6 +421,10 @@ def wdl_type_to_miniwdl_class(wdl_type: Any) -> Optional[WDLBase]: :param wdl_type: representation of WDL type """ + if isinstance(wdl_type, dict): + return WDLStruct + # remove postfix quantifiers + wdl_type = re.sub('[?+]', '', wdl_type) if wdl_type == 'File': return WDLFile elif wdl_type == 'Int': @@ -414,8 +441,6 @@ def wdl_type_to_miniwdl_class(wdl_type: Any) -> Optional[WDLBase]: return WDLMap elif wdl_type == 'Pair': return WDLPair - elif isinstance(wdl_type, dict): - return WDLStruct else: raise NotImplementedError # return None @@ -428,11 +453,14 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: :param wdl_type: representation of wdl type """ - outer_py_typ = wdl_type_to_miniwdl_class(wdl_outer_type(wdl_type)) + output_str_typ = wdl_outer_type(wdl_type) + outer_py_typ = wdl_type_to_miniwdl_class(output_str_typ) if outer_py_typ is WDLStruct: + optional = '?' in output_str_typ + # objects currently forced to be typed just like structs - struct_type = WDLStruct("Struct") + struct_type = WDLStruct("Struct", optional=optional) members = {} for k, v in wdl_type.items(): @@ -446,6 +474,8 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: return struct_type if outer_py_typ is WDLPair: + optional = '?' 
in output_str_typ + inner_type = wdl_inner_type(wdl_type) key_and_value_type = inner_type.split(',') @@ -456,9 +486,11 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: value_type = key_and_value_type[1].strip() left_type = convert_type(key_type) right_type = convert_type(value_type) - return WDLPair(left_type, right_type) + return WDLPair(left_type, right_type, optional=optional) if outer_py_typ is WDLMap: + optional = '?' in output_str_typ + inner_type = wdl_inner_type(wdl_type) key_and_value_type = inner_type.split(',') @@ -474,15 +506,18 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: if None in (converted_key_type, converted_value_type): return None - return WDLMap((converted_key_type, converted_value_type)) + return WDLMap((converted_key_type, converted_value_type), optional=optional) if outer_py_typ is WDLArray: + optional = '?' in output_str_typ + nonempty = '+' in output_str_typ + inner_type = wdl_inner_type(wdl_type) if inner_type in ('Array', ''): # no given inner type return None converted_inner_type = convert_type(inner_type) # if inner type conversion failed, then type is invalid, so return None - return outer_py_typ(converted_inner_type) if converted_inner_type is not None else None + return WDLArray(converted_inner_type, optional=optional, nonempty=nonempty) if converted_inner_type is not None else None # primitives remaining return wdl_type_to_miniwdl_class(wdl_type)() diff --git a/run.py b/run.py index 78a0e60..7f25ade 100755 --- a/run.py +++ b/run.py @@ -40,7 +40,7 @@ class WDLRunner: """ runner: str - def format_command(self, wdl_file, json_file, results_file, args, verbose, pre_args=None): + def format_command(self, wdl_file, json_file, json_string, results_file, args, verbose, pre_args=None): raise NotImplementedError @@ -48,8 +48,12 @@ class CromwellStyleWDLRunner(WDLRunner): def __init__(self, runner): self.runner = runner - def format_command(self, wdl_file, json_file, results_file, args, verbose, pre_args=None): - return list(filter(None, self.runner.split(" "))) + [wdl_file, "-i", json_file, "-m", results_file] + args + def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: + # One or the other is guaranteed to exist in the prior branch + json_input = json_file or json_string + json_arg = ["-i", json_input] if json_input is not None else [] + return list(filter(None, self.runner.split(" "))) + [wdl_file, "-m", results_file] + json_arg + args class CromwellWDLRunner(CromwellStyleWDLRunner): @@ -58,7 +62,8 @@ class CromwellWDLRunner(CromwellStyleWDLRunner): def __init__(self): super().__init__('cromwell') - def format_command(self, wdl_file, json_file, results_file, args, verbose, pre_args=None): + def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: if self.runner == 'cromwell' and not which('cromwell'): with CromwellWDLRunner.download_lock: if self.runner == 'cromwell': @@ -72,16 +77,19 @@ def format_command(self, wdl_file, json_file, results_file, args, verbose, pre_a run_cmd(cmd='make cromwell'.split(" "), cwd=os.getcwd()) self.runner = f'java {log_level} {pre_args} -jar {cromwell} run' - return super().format_command(wdl_file, json_file, results_file, args, verbose) + return super().format_command(wdl_file, json_file, json_string, results_file, args, verbose) 
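+# Illustrative example (hypothetical files): with runner "miniwdl run" and an inputs.json,
+# format_command below assembles roughly:
+#   miniwdl run test.wdl -o results-<uuid>.json -d miniwdl-logs --verbose -i inputs.json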
class MiniWDLStyleWDLRunner(WDLRunner): def __init__(self, runner): self.runner = runner - def format_command(self, wdl_file, json_file, results_file, args, verbose, pre_args=None): - return self.runner.split(" ") + [wdl_file, "-i", json_file, "-o", results_file, "-d", "miniwdl-logs", - "--verbose"] + args + def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: + json_input = json_file or json_string + json_arg = ["-i", json_input] if json_input is not None else [] + return self.runner.split(" ") + [wdl_file, "-o", results_file, "-d", "miniwdl-logs", + "--verbose"] + json_arg + args RUNNERS = { @@ -213,20 +221,20 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): f"Actual output: {result}"} return {'status': f'SUCCEEDED'} - def run_verify(self, expected: dict, results_file: str, ret_code: int) -> dict: + def run_verify(self, expected: dict, results_file: str, ret_code: int, exclude_outputs: Optional[list]) -> dict: """ Check either for proper output or proper success/failure of WDL program, depending on if 'fail' is included in the conformance test """ if 'fail' in expected.keys(): # workflow is expected to fail - response = verify_failure(ret_code) + response = verify_failure(expected, ret_code) else: # workflow is expected to run - response = self.verify_outputs(expected, results_file, ret_code) + response = self.verify_outputs(expected, results_file, ret_code, exclude_outputs) return response - def verify_outputs(self, expected: dict, results_file: str, ret_code: int) -> dict: + def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclude_outputs: Optional[list]) -> dict: """ Verify that the test result outputs are the same as the expected output from the conformance file @@ -245,12 +253,32 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int) -> di except json.JSONDecodeError: return {'status': 'FAILED', 'reason': f'Results file at {results_file} is not JSON'} - if len(test_results['outputs']) != len(expected): + if exclude_outputs is not None: + # I'm not sure if it is possible but the wdl-tests spec seems to say the type can also be a string + exclude_outputs = list(exclude_outputs) if not isinstance(exclude_outputs, list) else exclude_outputs + # remove the outputs that we are not allowed to compare + test_result_outputs = dict() + for k, v in test_results['outputs'].items(): + for excluded in exclude_outputs: + if k.endswith(excluded): + # we first check that an output variable ends with the excluded name + # ex: got: output.csvs, omit: csvs so omit + # then we check that the length of the two match + # or that the character right before the endswith match is a namespace separator + # ex: got: output.csvs, omit: output.csvs so omit + # ex: got: output.not_csvs, omit: csvs so don't omit as '_' != '.' 
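+                        # k[::-1][len(excluded)] reads the character immediately before the
+                        # matched suffix, e.g. k="output.csvs", excluded="csvs" -> "." at index 4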
+ if len(k) == len(excluded) or k[::-1][len(excluded)] == ".": + continue + test_result_outputs[k] = v + + else: + test_result_outputs = test_results['outputs'] + if len(test_result_outputs) != len(expected): return {'status': 'FAILED', 'reason': f"'outputs' section expected {len(expected)} results ({list(expected.keys())}), got " - f"{len(test_results['outputs'])} instead ({list(test_results['outputs'].keys())})"} + f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())})"} - result_outputs = test_results['outputs'] + result_outputs = test_result_outputs result = {'status': f'SUCCEEDED', 'reason': None} @@ -287,17 +315,27 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str inputs = test['inputs'] wdl_dir = inputs['dir'] wdl_input = inputs.get('wdl', f'{wdl_dir}.wdl') # default wdl name - json_input = inputs.get('json', f'{wdl_dir}.json') # default json name abs_wdl_dir = os.path.abspath(wdl_dir) if version in WDL_VERSIONS: wdl_input = f'{wdl_dir}/{wdl_input}' else: return {'status': 'FAILED', 'reason': f'WDL version {version} is not supported!'} + wdl_file = os.path.abspath(get_wdl_file(wdl_input, abs_wdl_dir, version)) - json_path = f'{wdl_dir}/{json_input}' # maybe return failing result if no json file found + json_input = inputs.get('json') + if json_input is None: + json_file = None + else: + json_path = f'{wdl_dir}/{json_input}' + json_file = os.path.abspath(json_path) + + json_string = json.dumps(inputs['json_string']) if inputs.get('json_string') else None + + # Only one of json_string or input json file can exist + if json_string is not None and json_file is not None: + return {'status': 'FAILED', 'reason': f'Config file specifies both a json string and json input file! Only one can be supplied! 
' + f'Check the input section for test id {test["id"]}.'} - wdl_file = os.path.abspath(get_wdl_file(wdl_input, abs_wdl_dir, version)) - json_file = os.path.abspath(json_path) test_args = args[runner].split(" ") if args[runner] is not None else [] unique_id = uuid4() # deal with jobstore_path argument for toil @@ -305,13 +343,14 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str unique_jobstore_path = os.path.join(jobstore_path, f"wdl-jobstore-{unique_id}") test_args.extend(["--jobstore", unique_jobstore_path]) outputs = test['outputs'] + exclude_outputs = test.get('exclude_output') results_file = os.path.abspath(f'results-{unique_id}.json') wdl_runner = RUNNERS[runner] # deal with cromwell arguments to define java system properties pre_args = None if runner == "cromwell": pre_args = args["cromwell_pre_args"] - cmd = wdl_runner.format_command(wdl_file, json_file, results_file, test_args, verbose, pre_args) + cmd = wdl_runner.format_command(wdl_file, json_file, json_string, results_file, test_args, verbose, pre_args) realtime = None if time: @@ -325,7 +364,11 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str if verbose: with self.LOG_LOCK: announce_test(test_index, test, version, runner) - response = self.run_verify(outputs, results_file, ret_code) + response = self.run_verify(outputs, results_file, ret_code, exclude_outputs) + + if response["status"] == "FAILED" and test.get("priority") == "optional": + # an optional test can be reported as a warning if it fails + response["status"] = "WARNING" if time: response['time'] = {"real": realtime} @@ -343,6 +386,13 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio Returns a result that can have status SKIPPED, SUCCEEDED, or FAILED. """ response = {'description': test.get('description'), 'number': test_index, 'id': test.get('id')} + if test.get("priority") == "ignore": + # config specifies to ignore this test, so skip + if progress: + print(f"Ignoring test {test_index} (ID: {test['id']}) with runner {runner} on WDL version {version}.") + response.update({'status': 'IGNORED'}) + if verbose: + response.update({'reason': f'Priority of the test is set to ignore.'}) if version not in test['versions']: # Test to skip, if progress is true, then output if progress: @@ -365,8 +415,9 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio def run_and_generate_tests_args(self, tags: Optional[str], numbers: Optional[str], versions: str, runner: str, threads: int = 1, time: bool = False, verbose: bool = False, quiet: bool = False, args: Optional[Dict[str, Any]] = None, jobstore_path: Optional[str] = None, - exclude_numbers: Optional[str] = None, ids: Optional[str] = None, - repeat: Optional[int] = None, progress: bool = False) -> Tuple[List[Any], bool]: + exclude_numbers: Optional[str] = None, exclude_tags: Optional[str] = None, + ids: Optional[str] = None, repeat: Optional[int] = None, progress: bool = False)\ + -> Tuple[List[Any], bool]: # Get all the versions to test. # Unlike with CWL, WDL requires a WDL file to declare a specific version, # and prohibits mixing file versions in a workflow, although some runners @@ -374,10 +425,13 @@ def run_and_generate_tests_args(self, tags: Optional[str], numbers: Optional[str # But the tests all need to be for single WDL versions. 
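+        # e.g. versions="draft-2,1.0,1.1" selects three single-version runs of each chosen test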
versions_to_test = set(versions.split(',')) selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=tags, number_argument=numbers, - exclude_number_argument=exclude_numbers, id_argument=ids) + id_argument=ids, exclude_number_argument=exclude_numbers, exclude_tags_argument=exclude_tags) selected_tests_amt = len(selected_tests) * len(versions_to_test) * repeat successes = 0 skips = 0 + ignored = 0 + warnings = 0 + failed = 0 test_responses = list() print(f'Testing runner {runner} on WDL versions: {",".join(versions_to_test)}\n') with ProcessPoolExecutor(max_workers=threads) as executor: # process instead of thread so realtime works @@ -427,19 +481,27 @@ def run_and_generate_tests_args(self, tags: Optional[str], numbers: Optional[str successes += 1 elif response['status'] == 'SKIPPED': skips += 1 + elif response['status'] == 'IGNORED': + ignored += 1 + elif response['status'] == 'WARNING': + warnings += 1 + elif response['status'] == 'FAILED': + failed += 1 print( - f'{selected_tests_amt - skips} tests run, {successes} succeeded, {selected_tests_amt - skips - successes} ' - f'failed, {skips} skipped' + f'{selected_tests_amt - skips} tests run, {successes} succeeded, {failed} ' + f'failed, {skips} skipped, {ignored} ignored, {warnings} warnings' ) - if successes < selected_tests_amt - skips: - # identify the failing tests - failed_ids = [str(response['number']) for response in test_responses if - response['status'] not in {'SUCCEEDED', 'SKIPPED'}] - print(f"\tFailures: {','.join(failed_ids)}") + + # identify the failing tests + failed_ids = [str(response['number']) for response in test_responses if + response['status'] not in {'SUCCEEDED', 'SKIPPED'}] + print(f"\tFailures: {','.join(failed_ids)}") + if len(failed_ids) > 0: return test_responses, False - return test_responses, True + else: + return test_responses, True def run_and_generate_tests(self, options: argparse.Namespace) -> Tuple[List[Any], bool]: """ @@ -458,7 +520,8 @@ def run_and_generate_tests(self, options: argparse.Namespace) -> Tuple[List[Any] runner=options.runner, time=options.time, verbose=options.verbose, quiet=options.quiet, threads=options.threads, args=args, jobstore_path=options.jobstore_path, - exclude_numbers=options.exclude_numbers, ids=options.id, + exclude_numbers=options.exclude_numbers, + exclude_tags=options.exclude_tags, ids=options.id, repeat=options.repeat, progress=options.progress) @@ -484,6 +547,7 @@ def add_options(parser) -> None: help="Time the conformance test run.") parser.add_argument("--quiet", default=False, action="store_true") parser.add_argument("--exclude-numbers", default=None, help="Exclude certain test numbers.") + parser.add_argument("--exclude-tags", default=None, help="Exclude certain test tags.") parser.add_argument("--toil-args", default=None, help="Arguments to pass into toil-wdl-runner. Ex: " "--toil-args=\"caching=False\"") parser.add_argument("--miniwdl-args", default=None, help="Arguments to pass into miniwdl. 
Ex: " @@ -498,7 +562,7 @@ def add_options(parser) -> None: parser.add_argument("--id", default=None, help="Specify WDL tests by ID.") parser.add_argument("--repeat", default=1, type=int, help="Specify how many times to run each test.") # This is to deal with jobstores being created in the /data/tmp directory on Phoenix, which appears to be unique - # per worker, thus causing JobstoreNotFound exceptions when delegating to many workers at a time + # per worker, thus causing JobstoreNotFound exceptions when delegating too many workers at a time parser.add_argument("--jobstore-path", "-j", default=None, help="Specify the PARENT directory for the jobstores to " "be created in.") # Test responses are collected and sorted, so this option allows the script to print out the current progress diff --git a/run_unit.py b/run_unit.py new file mode 100644 index 0000000..3fd40cd --- /dev/null +++ b/run_unit.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK +import argparse +import sys + +import argcomplete + +from run import WDLConformanceTestRunner, add_options + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + add_options(parser) + + unit_conformance_path = "unit_tests/test_config.yaml" + + parser.add_argument("--config", "-c", default=unit_conformance_path, + help="Specify the path of the conformance config file.") + parser.set_defaults(version="1.1") # override default to 1.1 + parser.set_defaults(runner="toil-wdl-runner") # cromwell doesn't support 1.1+ + + argcomplete.autocomplete(parser) + args = parser.parse_args() + + if args.versions not in ["1.1", "1.2"]: + raise RuntimeError(f"WDL version is not valid; unit tests are only supported on WDL 1.1+.") + + conformance_runner = WDLConformanceTestRunner(conformance_file=args.config) + _, successful_run = conformance_runner.run_and_generate_tests(args) + + if not successful_run: + # Fail the program overall if tests failed. + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/setup_unit_tests.py b/setup_unit_tests.py new file mode 100644 index 0000000..5fd7d46 --- /dev/null +++ b/setup_unit_tests.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK + +""" +Extract unit tests from the WDL spec (1.1+) and create a conformance file that is conformance to our representation for this test suite +""" +import subprocess +import regex as re +import os +import glob + +import json +from ruamel.yaml import YAML +import argparse +import argcomplete +from pathlib import Path +import re +import shutil +from typing import Optional, Union, List, Set, Callable, Any, Dict + +import WDL + +TEST_RE = re.compile( + r"^
<details>\s*<summary>\s*Example: (.+?)\s*```wdl(.+?)```\s*</summary>\s*(?:<p>\s*(?:Example input:\s*```json(.*?)```)?\s*(?:Example output:\s*```json(.*?)```)?\s*(?:Test config:\s*```json(.*)```)?\s*</p>\s*)?</details>
$", + re.I | re.S, +) +FILENAME_RE = re.compile(r"(.+?)(_fail)?(_task)?.wdl") +VERSION_RE = re.compile(r"version ([\d.]+)") + +regex_output_section_str = r"(?<=output)(?:\s*{\s*)([\s\S]+?)(?:})" +regex_var_types_str = r"([\w\[\]+?]+)\s(\w+)(?:[\s\S]*?(?= =))" + + +def extract_output_types_defer(wdl_file): + """ + The main issue with using miniwdl's parser is parsing the wdl file is dependent on miniwdl, + so if the spec changes and miniwdl is unable to support it, this test framework won't be able to run. + The simplest way around this (that isn't creating another parser, maybe pyparsing?) is probably something like this; + regex won't work as regular languages are too basic. + + Thus use a simple iterator to detect all variable names instead of miniwdl's parser + To be used if a wdl file has a type that miniwdl doesn't support, ex Object + """ + regex_var_types = re.compile(regex_var_types_str) + + with open(wdl_file, "r") as f: + wdl_contents = f.read() + + var_types = {} + # can't regex this as regex cannot handle nested constructs + # this is a little fragile if the entire declaration isn't on the same line + # find all var declarations + for line in wdl_contents.split("\n"): + line = line.strip() + if "=" in line: + match = re.search(regex_var_types, line) + if match is None: + # this line is not a var declaration + continue + name = match.group(2) + typ = match.group(1) + var_types[name] = typ + + return var_types + + +def wdl_type_to_string(wdl_type: WDL.Type.Base) -> Union[Dict[str, Any], str]: + if isinstance(wdl_type, WDL.Type.StructInstance): + # this is the only construct that we need to decipher to string form + dictified = {} + for key, subtype in wdl_type.members.items(): + dictified[key] = wdl_type_to_string(subtype) + return dictified + return str(wdl_type) + + +def extract_output_types(wdl_file, fail): + """ + Use miniwdl's parser to extract the output types from a wdl file + """ + if fail: + # since we expect this workflow to fail, there should be no outputs + return {} + + try: + document: WDL.Tree.Document = WDL.load(uri=str(wdl_file)) + except WDL.Error.InvalidType as e: + if "Unknown type Object" in str(e): + # one of the tests wants to test if Objects are supported, but miniwdl doesn't support objects + # defer to the other more simpler parser + return extract_output_types_defer(wdl_file) + else: + raise + + target: Union[WDL.Tree.Workflow, WDL.Tree.Task] + + if document.workflow: + target = document.workflow + elif len(document.tasks) == 1: + target = document.tasks[0] + else: + raise Exception( + "Multiple tasks founds with no workflow!") # this in theory shouldn't be hit, but the source repo has a suspicious --tasks argument passed to miniwdl sometimes + + var_types = dict() + if target.outputs is not None: + for output in target.outputs: + var_types[output.name] = wdl_type_to_string(output.type) + + return var_types + + +def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any])\ + -> Union[Dict[Any, Any], List[Any], bool, int, str, float, None]: + if isinstance(json_obj, dict): + return {k: recursive_json_apply(v, func) for k, v in json_obj.items()} + elif isinstance(json_obj, list): + return [recursive_json_apply(v, func) for v in json_obj] + elif json_obj is None: + return None + else: + # primitive type + return func(json_obj) + + +def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], output_data_dir: Optional[Path], config: list): + """ + 
Given the regex, create the corresponding config entry for conformance.yaml. + + Separated from write_test_file as miniwdl's parser requires all imported files to exist, + and this is not necessary the case if iterating the spec file top to bottom. And we + can't use a simple regex as the type of grammar is too basic for things like nested constructs + """ + file_name, wdl, input_json, output_json, config_json = m.groups() + + f = FILENAME_RE.match(file_name) + + wdl_file = output_dir / file_name + + if config_json is not None: + config_entry = json.loads(config_json) + else: + config_entry = {} + + target, is_fail, is_task = f.groups() + + is_fail = is_fail or config_entry.get("fail", False) # failure can also be specified in the json + + config_entry["id"] = target + + wdl_dir, wdl_base = os.path.split(wdl_file) + + # the spec assumes all input files are the basenames + # if we detect a filename in the input, change to a proper relative path + input_json_dict = {} + if input_json is not None: + def if_file_convert(maybe_file): + if maybe_file in all_data_files: + # this is a file + return os.path.join(output_data_dir, maybe_file) # output_data_dir should already be a relative path dir to the repo TLD + else: + return maybe_file + + for k, v in json.loads(input_json.strip()).items(): + input_json_dict[k] = recursive_json_apply(v, if_file_convert) + + config_entry["inputs"] = { + "dir": wdl_dir, + "wdl": wdl_base, + "json_string": input_json_dict + } + + if output_json is not None: + config_entry["outputs"] = json.loads(output_json.strip()) + else: + config_entry["outputs"] = {} + + output_var_types = extract_output_types(wdl_file, bool(is_fail)) + + if "return_code" not in config_entry: + config_entry["return_code"] = "*" + elif isinstance(config_entry["return_code"], str): + config_entry["return_code"] = [config_entry["return_code"]] + + if bool(is_fail): + config_entry["outputs"] = {"fail": {}} + config_entry["outputs"]["fail"] = {"return_code": config_entry["return_code"]} + else: + if output_json is not None: + for k, v in json.loads(output_json.strip()).items(): + k_base = k.split(".")[-1] + config_entry["outputs"][k] = { + "type": output_var_types[k_base], + "value": v + } + + if "type" not in config_entry: + config_entry["type"] = "task" if is_task else "workflow" + if "target" not in config_entry: + config_entry["target"] = target + if "priority" not in config_entry: + config_entry["priority"] = "required" + if "exclude_output" not in config_entry: + config_entry["exclude_output"] = [] + elif isinstance(config_entry["exclude_output"], str): + config_entry["exclude_output"] = [config_entry["exclude_output"]] + if "dependencies" not in config_entry: + config_entry["dependencies"] = [] + elif isinstance(config_entry["dependencies"], str): + config_entry["dependencies"] = [config_entry["dependencies"]] + if "tags" not in config_entry: + config_entry["tags"] = [] + elif isinstance(config_entry["tags"], str): + config_entry["tags"] = [config_entry["tags"]] + + config_entry["tags"].append("unit") + + config_entry["description"] = f"Unit test for {file_name}" + + config_entry["versions"] = [version] + + config.append(config_entry) + + +def write_test_files(m: re.Match, output_dir: Path, version: str): + """ + Given the parsed regex, write the test file into the output directory. + Checks if the version and the file to be written match. Also ensures that the file + does not exist beforehand. 
+ """ + file_name, wdl, input_json, output_json, config_json = m.groups() + + if file_name is None: + raise Exception("Missing file name") + f = FILENAME_RE.match(file_name) + if f is None: + raise Exception(f"Invalid file name: {file_name}") + + wdl = wdl.strip() + v = VERSION_RE.search(wdl) + if v is None: + raise Exception("WDL does not contain version statement") + elif v.group(1) != version: + raise Exception(f"Invalid WDL version {wdl}") + + wdl_file = output_dir / file_name + if wdl_file.exists(): + raise Exception(f"Test file already exists: {wdl_file}") + with open(wdl_file, "w") as o: + o.write(wdl) + + +def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, version: str, output_type: str): + if not output_dir.exists(): + output_dir.mkdir(parents=True) + + config = [] + all_m = [] + with open(spec) as s: + buf = None + for line in s: + if buf is None and "
" in line: + buf = [line] + elif buf is not None: + buf.append(line) + if "
" in line: + ex = "".join(buf) + buf = None + m = TEST_RE.match(ex) + if m is None: + raise Exception(f"Regex does not match example {ex}") + else: + try: + all_m.append(m) + write_test_files(m, output_dir, version) + except Exception as e: + raise Exception( + f"Error writing files for example {ex}" + ) from e + + all_data_files = None + if data_dir is not None: + all_data_files = set(os.path.basename(x) for x in glob.glob(str(data_dir / "**"), recursive=True)) + + if data_dir is not None: + output_data_dir = output_dir / "data" + else: + output_data_dir = None + + for m in all_m: + generate_config_file(m, output_dir, version, all_data_files, output_data_dir, config) + + if output_type == "json": + config_file = output_dir / "test_config.json" + with open(config_file, "w") as o: + json.dump(config, o, indent=2) + else: + config_file = output_dir / "test_config.yaml" + with open(config_file, "w") as o: + yaml = YAML() + yaml.dump(config, o) + + if data_dir is not None and data_dir.exists(): + shutil.copytree(data_dir, output_data_dir, symlinks=True, dirs_exist_ok=False) + + +def main(): + parser = argparse.ArgumentParser( + usage="%(prog)s [options]", + description=( + "Extracts the unit tests from the spec to be runnable by this test suite;" + "test files will be outputted into the `unit_tests` directory and a new conformance file will be created." + "To run the unit tests, see [insert other script here]" + ) + ) + parser.add_argument( + "--version", + "-v", + default="1.1", + help="Version of the spec to grab unit tests out of. Only WDL 1.1 and above has support for built in unit tests.", + choices=["1.1", "1.2"] + ) + parser.add_argument( + "--output-type", + default="yaml", + choices=["yaml", "json"], + help="Specify conformance file output type." + ) + parser.add_argument( + "--force-pull", + default=False, + action="store_true", + help="Default behavior does not pull the spec repo if it was already pulled. Specify this argument to force a refresh." 
+ ) + argcomplete.autocomplete(parser) + args = parser.parse_args() + + spec_dir = f"wdl-{args.version}-spec" + if not os.path.exists(spec_dir) or args.force_pull is True: + # while the WDL spec has its bugs, use a fixed version + # see openwdl issues #653, #654, #661, #662, #663, #664, #665, #666 + # cmd = f"git clone https://github.com/openwdl/wdl.git {spec_dir}" + cmd = f"git clone https://github.com/stxue1/wdl.git" + subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) + + os.chdir(spec_dir) + + # may be fragile if WDL changes their branch naming scheme + # todo: remove -fixes suffix after wdl fixes their spec, see comment above + cmd = f"git checkout wdl-{args.version}-fixes" + subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) + + os.chdir("..") + + # temp + cmd = f"rm -rf unit_tests" + subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) + + extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type) + + +if __name__ == "__main__": + main() From 207db8fed579f73ffe9ddc506fa3b1ffe90f67d3 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 18 Jun 2024 20:31:09 -0700 Subject: [PATCH 02/34] Add some fixes for optionals and remove print statements --- lib.py | 15 +++------- run.py | 90 ++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 73 insertions(+), 32 deletions(-) diff --git a/lib.py b/lib.py index 0e3733d..27d0155 100644 --- a/lib.py +++ b/lib.py @@ -216,7 +216,6 @@ def get_wdl_file(wdl_file: str, wdl_dir: str, version: str) -> str: def run_cmd(cmd, cwd): - print(" ".join(cmd)) p = subprocess.Popen(cmd, stdout=-1, stderr=-1, cwd=cwd) stdout, stderr = p.communicate() @@ -456,9 +455,10 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: output_str_typ = wdl_outer_type(wdl_type) outer_py_typ = wdl_type_to_miniwdl_class(output_str_typ) - if outer_py_typ is WDLStruct: - optional = '?' in output_str_typ + optional = '?' in output_str_typ + nonempty = '+' in output_str_typ + if outer_py_typ is WDLStruct: # objects currently forced to be typed just like structs struct_type = WDLStruct("Struct", optional=optional) members = {} @@ -474,8 +474,6 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: return struct_type if outer_py_typ is WDLPair: - optional = '?' in output_str_typ - inner_type = wdl_inner_type(wdl_type) key_and_value_type = inner_type.split(',') @@ -489,8 +487,6 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: return WDLPair(left_type, right_type, optional=optional) if outer_py_typ is WDLMap: - optional = '?' in output_str_typ - inner_type = wdl_inner_type(wdl_type) key_and_value_type = inner_type.split(',') @@ -509,9 +505,6 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: return WDLMap((converted_key_type, converted_value_type), optional=optional) if outer_py_typ is WDLArray: - optional = '?' 
in output_str_typ - nonempty = '+' in output_str_typ - inner_type = wdl_inner_type(wdl_type) if inner_type in ('Array', ''): # no given inner type @@ -520,4 +513,4 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: # if inner type conversion failed, then type is invalid, so return None return WDLArray(converted_inner_type, optional=optional, nonempty=nonempty) if converted_inner_type is not None else None # primitives remaining - return wdl_type_to_miniwdl_class(wdl_type)() + return wdl_type_to_miniwdl_class(wdl_type)(optional=optional) diff --git a/run.py b/run.py index 7f25ade..6fad018 100755 --- a/run.py +++ b/run.py @@ -30,7 +30,6 @@ from lib import run_cmd, py_type_of_wdl_class, verify_failure, announce_test, print_response, convert_type, run_setup, \ get_specific_tests, get_wdl_file - WDL_VERSIONS = ["draft-2", "1.0", "1.1"] @@ -149,9 +148,10 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): # WDLStruct also represents WDLObject # Objects in conformance will be forced to be typed, same as Structs if isinstance(typ, WDLStruct): + nonoptional_result = {k: v for k, v in result.items() if not typ.members[k].optional} try: - if len(expected) != len(result): - return {'status': 'FAILED', 'reason': f"Size of expected and result do not match!\n" + if len(expected) < len(nonoptional_result) or len(expected) > len(result): + return {'status': 'FAILED', 'reason': f"Size of expected and result are invalid! The outputs likely don't match.\n" f"Expected output: {expected}\n" f"Actual output: {result}!"} for key in expected.keys(): @@ -241,6 +241,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu :param expected: expected value object :param results_file: filepath of resulting output file from the WDL runner :param ret_code: return code from WDL runner + :param exclude_outputs: outputs to exclude when comparing """ if ret_code: return {'status': 'FAILED', 'reason': f"Workflow failed to run!"} @@ -253,7 +254,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu except json.JSONDecodeError: return {'status': 'FAILED', 'reason': f'Results file at {results_file} is not JSON'} - if exclude_outputs is not None: + if exclude_outputs is not None and len(exclude_outputs) > 0: # I'm not sure if it is possible but the wdl-tests spec seems to say the type can also be a string exclude_outputs = list(exclude_outputs) if not isinstance(exclude_outputs, list) else exclude_outputs # remove the outputs that we are not allowed to compare @@ -278,12 +279,10 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu 'reason': f"'outputs' section expected {len(expected)} results ({list(expected.keys())}), got " f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())})"} - result_outputs = test_result_outputs - result = {'status': f'SUCCEEDED', 'reason': None} # compare expected output to result output - for identifier, output in result_outputs.items(): + for identifier, output in test_result_outputs.items(): try: python_type = convert_type(expected[identifier]['type']) except KeyError: @@ -412,12 +411,63 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio response["repeat"] = repeat return response - def run_and_generate_tests_args(self, tags: Optional[str], numbers: Optional[str], versions: str, runner: str, - threads: int = 1, time: bool = False, verbose: bool = False, quiet: bool = False, - args: Optional[Dict[str, Any]] = None, jobstore_path: 
Optional[str] = None, - exclude_numbers: Optional[str] = None, exclude_tags: Optional[str] = None, - ids: Optional[str] = None, repeat: Optional[int] = None, progress: bool = False)\ - -> Tuple[List[Any], bool]: + def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> None: + tags = options.tags + numbers = options.numbers + versions = options.versions + runner = options.runner + time = options.time + verbose = options.verbose + quiet = options.quiet + jobstore_path = options.jobstore_path + exclude_numbers = options.exclude_numbers + exclude_tags = options.exclude_tags + ids = options.id + repeat = options.repeat + progress = options.progress + # To be used with pycharm's debugger which is currently broken if there is concurrency + versions_to_test = set(versions.split(',')) + selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=tags, number_argument=numbers, + id_argument=ids, exclude_number_argument=exclude_numbers, exclude_tags_argument=exclude_tags) + print(f"===DEBUG===") + print(f'Testing runner {runner} on WDL versions: {",".join(versions_to_test)}\n') + for test_index in selected_tests: + try: + test = self.tests[test_index] + except KeyError: + print(f'ERROR: Provided test [{test_index}] do not exist.') + sys.exit(1) + for version in versions_to_test: + for iteration in range(repeat): + # Handle each test as a concurrent job + self.handle_test( + test_index, + test, + runner, + version, + time, + verbose, + quiet, + args, + jobstore_path, + iteration + 1 if repeat is not None else None, + progress) + + def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> Tuple[List[Any], bool]: + tags = options.tags + numbers = options.numbers + versions = options.versions + runner = options.runner + time = options.time + verbose = options.verbose + quiet = options.quiet + threads = options.threads + jobstore_path = options.jobstore_path + exclude_numbers = options.exclude_numbers + exclude_tags = options.exclude_tags + ids = options.id + repeat = options.repeat + progress = options.progress # Get all the versions to test. 
# Unlike with CWL, WDL requires a WDL file to declare a specific version, # and prohibits mixing file versions in a workflow, although some runners @@ -493,7 +543,6 @@ def run_and_generate_tests_args(self, tags: Optional[str], numbers: Optional[str f'failed, {skips} skipped, {ignored} ignored, {warnings} warnings' ) - # identify the failing tests failed_ids = [str(response['number']) for response in test_responses if response['status'] not in {'SUCCEEDED', 'SKIPPED'}] @@ -516,13 +565,11 @@ def run_and_generate_tests(self, options: argparse.Namespace) -> Tuple[List[Any] if runner == "cromwell": args[runner] = options.cromwell_args args["cromwell_pre_args"] = options.cromwell_pre_args - return self.run_and_generate_tests_args(tags=options.tags, numbers=options.numbers, versions=options.versions, - runner=options.runner, time=options.time, verbose=options.verbose, - quiet=options.quiet, threads=options.threads, args=args, - jobstore_path=options.jobstore_path, - exclude_numbers=options.exclude_numbers, - exclude_tags=options.exclude_tags, ids=options.id, - repeat=options.repeat, progress=options.progress) + if options.debug: + self._run_debug(options=options, args=args) + return [], False + + return self.run_and_generate_tests_args(options=options, args=args) def add_options(parser) -> None: @@ -568,6 +615,7 @@ def add_options(parser) -> None: # Test responses are collected and sorted, so this option allows the script to print out the current progress parser.add_argument("--progress", default=False, action="store_true", help="Print the progress of the test suite " "as it runs.") + parser.add_argument("--debug", default=False) def main(argv=None): From e34399afea06362fcbf56fce7a83b3416ed46a53 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 25 Jun 2024 20:29:41 -0700 Subject: [PATCH 03/34] Fix map key type checks + format + debug to print command --- lib.py | 52 ++++++------ run.py | 195 +++++++++++++++++++++++++------------------- setup_unit_tests.py | 71 ++++++++++++++-- 3 files changed, 203 insertions(+), 115 deletions(-) diff --git a/lib.py b/lib.py index 27d0155..800d375 100644 --- a/lib.py +++ b/lib.py @@ -7,7 +7,7 @@ from WDL.Type import Float as WDLFloat, String as WDLString, File as WDLFile, Int as WDLInt, Boolean as WDLBool, \ Array as WDLArray, Map as WDLMap, Pair as WDLPair, StructInstance as WDLStruct -from typing import Optional, Any, Dict, Union +from typing import Optional, Any, Dict, Union, List, Type from WDL.Type import Base as WDLBase @@ -215,7 +215,9 @@ def get_wdl_file(wdl_file: str, wdl_dir: str, version: str) -> str: return generate_wdl(wdl_file, wdl_dir, version, outfile_name=outfile_name) -def run_cmd(cmd, cwd): +def run_cmd(cmd: List[str], cwd: str, debug: bool = False): + if debug: + print(" ".join(cmd)) p = subprocess.Popen(cmd, stdout=-1, stderr=-1, cwd=cwd) stdout, stderr = p.communicate() @@ -333,13 +335,25 @@ def get_specific_tests(conformance_tests, tag_argument, number_argument, id_argu tests.add(test_number) return sorted(list(tests)) +def verify_return_code(expected_ret_code: Union[int, List[int], str], got_ret_code: int): + if not isinstance(expected_ret_code, list): + expected_ret_code = [expected_ret_code] + success = {'status': 'SUCCEEDED'} + for rc in expected_ret_code: + if rc == "*": + # this stands for any return code + return success + if got_ret_code == rc: + return success + return {'status': 'FAILED', + 'reason': f"Workflow did not return the correct return code! Got: {got_ret_code}. 
Expected: {','.join(expected_ret_code)}."} + -def verify_failure(expected: Dict[str, Any], ret_code: int) -> dict: +def verify_failure(ret_code: int) -> dict: """ Verify that the workflow did fail ret_code should be the status code WDL runner outputs when running the test - :param expected: Expected failure outpute, ex: return_code: 42 :param ret_code: return code from WDL runner If ret_code is fail (>0 or True) and expected return code matches or doesn't exist, then return success @@ -354,25 +368,10 @@ def verify_failure(expected: Dict[str, Any], ret_code: int) -> dict: return {'status': 'FAILED', 'reason': f"Workflow did not fail!"} - # proper failure, if expected return code is empty, return success - expected_ret_code = expected["fail"].get("return_code") - success = {'status': f'SUCCEEDED'} - if expected_ret_code is None: - return success - if not isinstance(expected_ret_code, list): - expected_ret_code = [expected_ret_code] - for rc in expected_ret_code: - if rc == "*": - # this stands for any return code - return success - if rc == ret_code: - return success - # else, the workflow return code does not match the expected return code - return {'status': 'FAILED', - 'reason': f"Workflow did not return the correct return code! Got: {ret_code}. Expected: {','.join(expected_ret_code)}."} + return {'status': f'SUCCEEDED'} -def py_type_of_wdl_class(wdl_type: WDLBase): +def py_type_of_wdl_class(wdl_type: WDLBase) -> Union[Type[int], Type[float], Type[bool], Type[str]]: """ Return python equivalent type for a given WDL.Type class """ @@ -393,7 +392,7 @@ def wdl_inner_type(wdl_type): if '[' in wdl_type: remaining = '['.join(wdl_type.split('[')[1:]) # get remaining string starting from open bracket end_idx = len(remaining) - remaining[::-1].index(']') - 1 # find index of closing bracket - # remove postfix quantifiers + # remove outer type postfix quantifiers return remaining[:end_idx] else: return wdl_type @@ -406,7 +405,12 @@ def wdl_outer_type(wdl_type): # deal with structs if isinstance(wdl_type, dict): return wdl_type - return wdl_type.split('[')[0] + if '[' in wdl_type: + reverse_idx = wdl_type[::-1].index(']') + postfix = wdl_type[len(wdl_type) - reverse_idx:] + return wdl_type.split('[')[0] + postfix + else: + return wdl_type def wdl_type_to_miniwdl_class(wdl_type: Union[Dict[str, Any], str]) -> Optional[WDLBase]: @@ -440,6 +444,8 @@ def wdl_type_to_miniwdl_class(wdl_type: Union[Dict[str, Any], str]) -> Optional[ return WDLMap elif wdl_type == 'Pair': return WDLPair + elif wdl_type == 'Object': + return WDLString else: raise NotImplementedError # return None diff --git a/run.py b/run.py index 6fad018..602bdd5 100755 --- a/run.py +++ b/run.py @@ -28,7 +28,7 @@ from WDL.Type import Base as WDLBase from lib import run_cmd, py_type_of_wdl_class, verify_failure, announce_test, print_response, convert_type, run_setup, \ - get_specific_tests, get_wdl_file + get_specific_tests, get_wdl_file, verify_return_code WDL_VERSIONS = ["draft-2", "1.0", "1.1"] @@ -116,6 +116,9 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): :param result: result value object from WDL runner :param typ: type of output from conformance file """ + if typ.optional and expected is None and result is None: + # an optional result does not need to exist + return {'status': f'SUCCEEDED'} if isinstance(typ, WDLArray): try: if len(expected) != len(result): @@ -137,8 +140,15 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): return {'status': 'FAILED', 'reason': f"Size of expected and 
result do not match!\n" f"Expected output: {expected}\n" f"Actual result was: {result}!"} - for key in expected.keys(): - status_result = self.compare_outputs(expected[key], result[key], typ.item_type[1]) + # compare both the key and values of the map + expected_map_keys, result_map_keys = list(expected.keys()), list(result.keys()) + for i in range(len(expected)): + expected_key = expected_map_keys[i] + result_key = result_map_keys[i] + status_result = self.compare_outputs(expected_key, result_key, typ.item_type[0]) + if status_result['status'] == 'FAILED': + return status_result + status_result = self.compare_outputs(expected[expected_key], result[result_key], typ.item_type[1]) if status_result['status'] == 'FAILED': return status_result except (KeyError, TypeError): @@ -169,12 +179,38 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): f"Expected output: {expected}\n" f"Actual output: {result}!"} # check that output types are correct - if not isinstance(expected, py_type_of_wdl_class(typ)) or not isinstance(result, - py_type_of_wdl_class( - typ)): - return {'status': 'FAILED', 'reason': f"Incorrect types!\n" - f"Expected output: {expected}\n" - f"Actual output: {result}!"} + if not isinstance(expected, py_type_of_wdl_class(typ)) or not isinstance(result, py_type_of_wdl_class(typ)): + # When outputting in both miniwdl and toil, Map keys are always strings regardless of the specified type + # When Map[Int, Int], the output will be {"1": 1} + # Or when Map[Float, Int], the output will be {"1.000000": 1} + # This only applies to Int and Float types, as Boolean key types don't seem to be supported in miniwdl + if isinstance(typ, WDLInt): + try: + # ensure the stringified version is equivalent to an int + py_type_of_wdl_class(typ)(result) + py_type_of_wdl_class(typ)(expected) + except ValueError: + # the string representation does not represent the right type + return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}! Most likely a Map key type is incorrect.\n" + f"Expected output: {expected}\n" + f"Actual output: {result}!"} + elif isinstance(typ, WDLFloat): + try: + # ensure the stringified version is equivalent to an int + py_type_of_wdl_class(typ)(result) + py_type_of_wdl_class(typ)(expected) + if not ("." in result and "." in expected): + # the string representation is not a float but an int + raise ValueError + except ValueError: + # the string representation does not represent the right type + return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}! 
Most likely a Map key type is incorrect.\n" + f"Expected output: {expected}\n" + f"Actual output: {result}!"} + else: + return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}!\n" + f"Expected output: {expected}\n" + f"Actual output: {result}!"} if isinstance(typ, WDLFile): # check file path exists @@ -221,17 +257,25 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): f"Actual output: {result}"} return {'status': f'SUCCEEDED'} - def run_verify(self, expected: dict, results_file: str, ret_code: int, exclude_outputs: Optional[list]) -> dict: + def run_verify(self, expected: dict, results_file: str, ret_code: int) -> dict: """ Check either for proper output or proper success/failure of WDL program, depending on if 'fail' is included in the conformance test """ - if 'fail' in expected.keys(): + outputs = expected['outputs'] + exclude_outputs = expected.get('exclude_output') + + if expected.get("fail"): # workflow is expected to fail - response = verify_failure(expected, ret_code) + response = verify_failure(ret_code) else: # workflow is expected to run - response = self.verify_outputs(expected, results_file, ret_code, exclude_outputs) + response = self.verify_outputs(outputs, results_file, ret_code, exclude_outputs) + + if response["status"] == "SUCCEEDED" and expected.get("return_code") is not None: + # check return code if it exists + response.update(verify_return_code(expected["return_code"], ret_code)) + return response def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclude_outputs: Optional[list]) -> dict: @@ -243,14 +287,14 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu :param ret_code: return code from WDL runner :param exclude_outputs: outputs to exclude when comparing """ - if ret_code: - return {'status': 'FAILED', 'reason': f"Workflow failed to run!"} - try: with open(results_file, 'r') as f: test_results = json.load(f) except OSError: - return {'status': 'FAILED', 'reason': f'Results file at {results_file} cannot be opened'} + # some runners won't create a results file on workflow failure + # this may not necessarily be an error; failure is only ensured when the config specifies failure + # and a nonzero exit code is possible on both a failing and successful task + test_results = {} except json.JSONDecodeError: return {'status': 'FAILED', 'reason': f'Results file at {results_file} is not JSON'} @@ -259,7 +303,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu exclude_outputs = list(exclude_outputs) if not isinstance(exclude_outputs, list) else exclude_outputs # remove the outputs that we are not allowed to compare test_result_outputs = dict() - for k, v in test_results['outputs'].items(): + for k, v in test_results.get('outputs', {}).items(): for excluded in exclude_outputs: if k.endswith(excluded): # we first check that an output variable ends with the excluded name @@ -273,11 +317,11 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu test_result_outputs[k] = v else: - test_result_outputs = test_results['outputs'] + test_result_outputs = test_results.get('outputs', {}) if len(test_result_outputs) != len(expected): return {'status': 'FAILED', 'reason': f"'outputs' section expected {len(expected)} results ({list(expected.keys())}), got " - f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())})"} + f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())}) with return code 
{ret_code}"} result = {'status': f'SUCCEEDED', 'reason': None} @@ -303,7 +347,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu LOG_LOCK = threading.Lock() def run_single_test(self, test_index: int, test: dict, runner: str, version: str, time: bool, verbose: bool, - quiet: bool, args: Optional[Dict[str, Any]], jobstore_path: Optional[str]) -> dict: + quiet: bool, args: Optional[Dict[str, Any]], jobstore_path: Optional[str], debug: bool) -> dict: """ Run a test and log success or failure. @@ -341,8 +385,6 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str if runner == "toil-wdl-runner" and jobstore_path is not None: unique_jobstore_path = os.path.join(jobstore_path, f"wdl-jobstore-{unique_id}") test_args.extend(["--jobstore", unique_jobstore_path]) - outputs = test['outputs'] - exclude_outputs = test.get('exclude_output') results_file = os.path.abspath(f'results-{unique_id}.json') wdl_runner = RUNNERS[runner] # deal with cromwell arguments to define java system properties @@ -354,16 +396,16 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str realtime = None if time: realtime_start = timeit.default_timer() - (ret_code, stdout, stderr) = run_cmd(cmd=cmd, cwd=os.path.dirname(os.path.abspath(__file__))) + (ret_code, stdout, stderr) = run_cmd(cmd=cmd, cwd=os.path.dirname(os.path.abspath(__file__)), debug=debug) realtime_end = timeit.default_timer() realtime = realtime_end - realtime_start else: - (ret_code, stdout, stderr) = run_cmd(cmd=cmd, cwd=os.path.dirname(os.path.abspath(__file__))) + (ret_code, stdout, stderr) = run_cmd(cmd=cmd, cwd=os.path.dirname(os.path.abspath(__file__)), debug=debug) if verbose: with self.LOG_LOCK: announce_test(test_index, test, version, runner) - response = self.run_verify(outputs, results_file, ret_code, exclude_outputs) + response = self.run_verify(test, results_file, ret_code) if response["status"] == "FAILED" and test.get("priority") == "optional": # an optional test can be reported as a warning if it fails @@ -378,7 +420,7 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, version: str, time: bool, verbose: bool, quiet: bool, args: Optional[Dict[str, Any]], jobstore_path: Optional[str], - repeat: Optional[int] = None, progress: bool = False) -> Dict[str, Any]: + repeat: Optional[int] = None, progress: bool = False, debug: bool = False) -> Dict[str, Any]: """ Decide if the test should be skipped. If not, run it. 
@@ -406,31 +448,18 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio if progress: print(f"Running test {test_index} (ID: {test['id']}) with runner {runner} on WDL version {version}.") response.update( - self.run_single_test(test_index, test, runner, version, time, verbose, quiet, args, jobstore_path)) + self.run_single_test(test_index, test, runner, version, time, verbose, quiet, args, jobstore_path, debug)) if repeat is not None: response["repeat"] = repeat return response def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> None: - tags = options.tags - numbers = options.numbers - versions = options.versions - runner = options.runner - time = options.time - verbose = options.verbose - quiet = options.quiet - jobstore_path = options.jobstore_path - exclude_numbers = options.exclude_numbers - exclude_tags = options.exclude_tags - ids = options.id - repeat = options.repeat - progress = options.progress # To be used with pycharm's debugger which is currently broken if there is concurrency - versions_to_test = set(versions.split(',')) - selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=tags, number_argument=numbers, - id_argument=ids, exclude_number_argument=exclude_numbers, exclude_tags_argument=exclude_tags) + versions_to_test = set(options.versions.split(',')) + selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=options.tags, number_argument=options.numbers, + id_argument=options.id, exclude_number_argument=options.exclude_numbers, exclude_tags_argument=options.exclude_tags) print(f"===DEBUG===") - print(f'Testing runner {runner} on WDL versions: {",".join(versions_to_test)}\n') + print(f'Testing runner {options.runner} on WDL versions: {",".join(versions_to_test)}\n') for test_index in selected_tests: try: test = self.tests[test_index] @@ -438,53 +467,40 @@ def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]] print(f'ERROR: Provided test [{test_index}] do not exist.') sys.exit(1) for version in versions_to_test: - for iteration in range(repeat): + for iteration in range(options.repeat): # Handle each test as a concurrent job self.handle_test( test_index, test, - runner, + options.runner, version, - time, - verbose, - quiet, + options.time, + options.verbose, + options.quiet, args, - jobstore_path, - iteration + 1 if repeat is not None else None, - progress) + options.jobstore_path, + iteration + 1 if options.repeat is not None else None, + options.progress, + options.debug) def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> Tuple[List[Any], bool]: - tags = options.tags - numbers = options.numbers - versions = options.versions - runner = options.runner - time = options.time - verbose = options.verbose - quiet = options.quiet - threads = options.threads - jobstore_path = options.jobstore_path - exclude_numbers = options.exclude_numbers - exclude_tags = options.exclude_tags - ids = options.id - repeat = options.repeat - progress = options.progress # Get all the versions to test. # Unlike with CWL, WDL requires a WDL file to declare a specific version, # and prohibits mixing file versions in a workflow, although some runners # might allow it. # But the tests all need to be for single WDL versions. 
- versions_to_test = set(versions.split(',')) - selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=tags, number_argument=numbers, - id_argument=ids, exclude_number_argument=exclude_numbers, exclude_tags_argument=exclude_tags) - selected_tests_amt = len(selected_tests) * len(versions_to_test) * repeat + versions_to_test = set(options.versions.split(',')) + selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=options.tags, number_argument=options.numbers, + id_argument=options.id, exclude_number_argument=options.exclude_numbers, exclude_tags_argument=options.exclude_tags) + selected_tests_amt = len(selected_tests) * len(versions_to_test) * options.repeat successes = 0 skips = 0 ignored = 0 warnings = 0 failed = 0 test_responses = list() - print(f'Testing runner {runner} on WDL versions: {",".join(versions_to_test)}\n') - with ProcessPoolExecutor(max_workers=threads) as executor: # process instead of thread so realtime works + print(f'Testing runner {options.runner} on WDL versions: {",".join(versions_to_test)}\n') + with ProcessPoolExecutor(max_workers=options.threads) as executor: # process instead of thread so realtime works pending_futures = [] for test_index in selected_tests: try: @@ -493,20 +509,21 @@ def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optiona print(f'ERROR: Provided test [{test_index}] do not exist.') sys.exit(1) for version in versions_to_test: - for iteration in range(repeat): + for iteration in range(options.repeat): # Handle each test as a concurrent job result_future = executor.submit(self.handle_test, test_index, test, - runner, + options.runner, version, - time, - verbose, - quiet, + options.time, + options.verbose, + options.quiet, args, - jobstore_path, - iteration + 1 if repeat is not None else None, - progress) + options.jobstore_path, + iteration + 1 if options.repeat is not None else None, + options.progress, + options.debug) pending_futures.append(result_future) completed_count = 0 for result_future in as_completed(pending_futures): @@ -514,7 +531,7 @@ def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optiona # Go get each result result = result_future.result() test_responses.append(result) - if progress: + if options.progress: # if progress is true, then print a summarized output of the completed test and current status print( f"{completed_count}/{selected_tests_amt}. 
Test {result['number']} (ID: {result['id']}) completed " @@ -545,8 +562,16 @@ def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optiona # identify the failing tests failed_ids = [str(response['number']) for response in test_responses if - response['status'] not in {'SUCCEEDED', 'SKIPPED'}] - print(f"\tFailures: {','.join(failed_ids)}") + response['status'] in {'FAILED'}] + warn_ids = [str(response['number']) for response in test_responses if + response['status'] in {'WARNING'}] + if len(failed_ids) > 0: + print(f"\tFailures: {','.join(failed_ids)}") + else: + print("\tNo failures!") + if len(warn_ids) > 0: + print(f"\tWarnings: {','.join(warn_ids)}") + if len(failed_ids) > 0: return test_responses, False else: @@ -615,7 +640,7 @@ def add_options(parser) -> None: # Test responses are collected and sorted, so this option allows the script to print out the current progress parser.add_argument("--progress", default=False, action="store_true", help="Print the progress of the test suite " "as it runs.") - parser.add_argument("--debug", default=False) + parser.add_argument("--debug", action="store_true", default=False) def main(argv=None): diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 5fd7d46..4172938 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -4,6 +4,7 @@ """ Extract unit tests from the WDL spec (1.1+) and create a conformance file that is conformance to our representation for this test suite """ +import hashlib import subprocess import regex as re import os @@ -18,6 +19,8 @@ import shutil from typing import Optional, Union, List, Set, Callable, Any, Dict +from lib import convert_type + import WDL TEST_RE = re.compile( @@ -110,7 +113,61 @@ def extract_output_types(wdl_file, fail): return var_types -def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any])\ +def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: WDL.Type.Base, data_dir: Optional[Path]): + """ + Get the expected output values with respect to the output types. + This is done to get the md5sum of files when converting unit tests. 
+
+    Ex: If given hello.txt and type File, try to get the md5sum of the file
+    If given "hello" and type String, return string
+    """
+    if output_values is None and output_type.optional:
+        return output_values
+    if isinstance(output_type, WDL.Type.File):
+        # this will only populate md5sum if the files exist recursively in the data directory
+        for path in glob.glob(str(data_dir / "**"), recursive=True):
+            if path.endswith(output_values) and os.path.exists(path):
+                with open(path, "rb") as f:
+                    md5sum = hashlib.md5(f.read()).hexdigest()
+                return {'md5sum': md5sum}
+    if isinstance(output_type, WDL.Type.StructInstance):
+        converted_output = dict()
+        for i, (output_key, output_value) in enumerate(output_values.items()):
+            try:
+                output_value_type = output_type.members[output_key]
+            except KeyError:
+                # coercion from map to struct can cause some weird type behavior
+                # since dictionaries passt python 3.6 are ordered, find the corresponding type from the current output's position
+                output_value_type = list(output_type.members.values())[i]
+            converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir)
+        return converted_output
+    if isinstance(output_type, WDL.Type.Map):
+        converted_output = dict()
+        for output_key, output_value in output_values.items():
+            new_output_key = convert_typed_output_values(output_key, output_type.item_type[0], data_dir)
+            new_output_value = convert_typed_output_values(output_value, output_type.item_type[1], data_dir)
+            converted_output[new_output_key] = new_output_value
+        return converted_output
+    if isinstance(output_type, WDL.Type.Pair):
+        converted_output = dict()
+        # key should be left or right
+        converted_output["left"] = convert_typed_output_values(output_values["left"], output_type.left_type, data_dir)
+        converted_output["right"] = convert_typed_output_values(output_values["right"], output_type.right_type, data_dir)
+        return converted_output
+    if isinstance(output_type, WDL.Type.Array):
+        converted_output = list()
+        for output in output_values:
+            converted_output.append(convert_typed_output_values(output, output_type.item_type, data_dir))
+        return converted_output
+    # else this is a primitive type
+    return output_values
+
+
+def convert_typed_output_values_from_string(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: Union[str, Dict[str, Any]], data_dir: Optional[Path]):
+    output_type_wdl = convert_type(output_type)
+    return convert_typed_output_values(output_values, output_type_wdl, data_dir)
+
+
+def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any]) \
         -> Union[Dict[Any, Any], List[Any], bool, int, str, float, None]:
     if isinstance(json_obj, dict):
         return {k: recursive_json_apply(v, func) for k, v in json_obj.items()}
@@ -123,7 +180,7 @@ def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, s
     return func(json_obj)
 
 
-def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], output_data_dir: Optional[Path], config: list):
+def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], data_dir: Optional[Path], output_data_dir: Optional[Path], config: list):
     """
     Given the regex, create the corresponding config entry for conformance.yaml. 
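
As a quick illustration of the conversion above (a sketch, assuming `setup_unit_tests.py` is importable, miniwdl is installed, and `hello.txt` exists under the spec's data directory), compound WDL types are walked recursively and only `File` leaves are rewritten into portable `md5sum` stanzas:

```python
from pathlib import Path

import WDL
from setup_unit_tests import convert_typed_output_values

# Pair[File, Array[Int]] built with miniwdl's type constructors
pair_type = WDL.Type.Pair(WDL.Type.File(), WDL.Type.Array(WDL.Type.Int()))
value = {"left": "hello.txt", "right": [1, 2]}

# Files become {'md5sum': ...}; the Ints fall through the primitive case unchanged
print(convert_typed_output_values(value, pair_type, Path("wdl-1.1-spec/tests/data")))
# e.g. {'left': {'md5sum': '...'}, 'right': [1, 2]}
```
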
@@ -183,15 +240,15 @@ def if_file_convert(maybe_file): config_entry["return_code"] = [config_entry["return_code"]] if bool(is_fail): - config_entry["outputs"] = {"fail": {}} - config_entry["outputs"]["fail"] = {"return_code": config_entry["return_code"]} + config_entry["outputs"] = {} else: if output_json is not None: for k, v in json.loads(output_json.strip()).items(): k_base = k.split(".")[-1] + output_type = output_var_types[k_base] config_entry["outputs"][k] = { - "type": output_var_types[k_base], - "value": v + "type": output_type, + "value": convert_typed_output_values_from_string(v, output_type, data_dir) } if "type" not in config_entry: @@ -288,7 +345,7 @@ def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, versio output_data_dir = None for m in all_m: - generate_config_file(m, output_dir, version, all_data_files, output_data_dir, config) + generate_config_file(m, output_dir, version, all_data_files, data_dir, output_data_dir, config) if output_type == "json": config_file = output_dir / "test_config.json" From 7eca2fd1f7066f330f27ccee73f3945cba75c7bd Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 25 Jun 2024 20:39:51 -0700 Subject: [PATCH 04/34] Update license --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 53c5d00..cdf1a65 100644 --- a/LICENSE +++ b/LICENSE @@ -204,7 +204,7 @@ limitations under the License. All files in this repository excluding files under the integration/gatk directory are under the Apache license. -All files under the integration/gatk directory are under the BSD 3-Clause license from OpenWDL, copied below. +All files under the integration/gatk directory and the setup_unit_tests.py script are under the BSD 3-Clause license from OpenWDL, copied below. 
BSD 3-Clause License From 2a5cc35e2c17cfbbe1b34a969dc2d255c7087577 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 25 Jun 2024 20:40:30 -0700 Subject: [PATCH 05/34] Update conformance.yaml to have new fail representation that is more consistent with the wdl-test spec --- conformance.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/conformance.yaml b/conformance.yaml index c61d7bb..374b7ed 100644 --- a/conformance.yaml +++ b/conformance.yaml @@ -210,8 +210,8 @@ dir: tests/basic_fail wdl: basic_fail.wdl json: basic_fail.json - outputs: - fail: {} + fail: true + outputs: {} - description: | Basic ceil() test tags: ["ceil", "standard_library"] @@ -802,8 +802,8 @@ dir: tests/range_as_input wdl: range_as_input.wdl json: range_invalid.json - outputs: - fail: {} + fail: true + outputs: {} - description: | Legacy test for range_as_input, range=0 tags: ["range", "legacy"] @@ -865,8 +865,8 @@ dir: tests/length_as_input_with_map wdl: length_as_input_with_map.wdl json: length.json - outputs: - fail: {} + fail: true + outputs: {} - description: | Legacy test for length_as_input, fail tags: ["length", "legacy", "fail"] @@ -876,8 +876,8 @@ dir: tests/length_as_input wdl: length_as_input.wdl json: length_invalid.json - outputs: - fail: {} + fail: true + outputs: {} - description: | Legacy test for zip_as_input tags: ["zip", "legacy"] @@ -1133,8 +1133,8 @@ dir: tests/array_coerce wdl: array_coerce.wdl json: empty.json - outputs: - fail: {} + fail: true + outputs: {} - description: | Test ENCODE versions: ["draft-2", "1.0"] From be5dbc05689564d14b7dac0baac2f591c85aee05 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 27 Jun 2024 20:29:10 -0700 Subject: [PATCH 06/34] Add extra metadata support (to use for file hashes/regexes) --- run.py | 2 +- run_unit.py | 12 ++++++ setup_unit_tests.py | 81 +++++++++++++++++++++++++++++++--------- unit_tests_metadata.yaml | 11 ++++++ 4 files changed, 87 insertions(+), 19 deletions(-) create mode 100644 unit_tests_metadata.yaml diff --git a/run.py b/run.py index 602bdd5..2ae0b3b 100755 --- a/run.py +++ b/run.py @@ -321,7 +321,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu if len(test_result_outputs) != len(expected): return {'status': 'FAILED', 'reason': f"'outputs' section expected {len(expected)} results ({list(expected.keys())}), got " - f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())}) with return code {ret_code}"} + f"{len(test_result_outputs)} instead ({list(test_result_outputs.keys())}) with exit code {ret_code}"} result = {'status': f'SUCCEEDED', 'reason': None} diff --git a/run_unit.py b/run_unit.py index 3fd40cd..0ff2745 100644 --- a/run_unit.py +++ b/run_unit.py @@ -16,12 +16,24 @@ def main(): parser.add_argument("--config", "-c", default=unit_conformance_path, help="Specify the path of the conformance config file.") + parser.add_argument("--reset", default=False, action="store_true", + help="Specify whether to run the setup script again.") + parser.add_argument("--force-pull", default=False, action="store_true", + help="Specify whether to use the cached SPEC or to force a pull." 
+ "The setup script will be run as well.") parser.set_defaults(version="1.1") # override default to 1.1 parser.set_defaults(runner="toil-wdl-runner") # cromwell doesn't support 1.1+ argcomplete.autocomplete(parser) args = parser.parse_args() + if args.reset: + from setup_unit_tests import main as setup_unit_tests_main + argv = ["--version=1.1", "--output-type=yaml"] + if args.force_pull: + argv.append("--force-pull") + setup_unit_tests_main(argv) + if args.versions not in ["1.1", "1.2"]: raise RuntimeError(f"WDL version is not valid; unit tests are only supported on WDL 1.1+.") diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 4172938..b61a294 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -6,6 +6,8 @@ """ import hashlib import subprocess +import sys + import regex as re import os import glob @@ -34,6 +36,17 @@ regex_var_types_str = r"([\w\[\]+?]+)\s(\w+)(?:[\s\S]*?(?= =))" +def get_not_none(d: Optional[Dict[Any, Any]], k: str) -> Any: + # same as d.get(k) but return None if d is None + if d is not None: + return d.get(k) + return None + +def index_not_none(l: Optional[List[Any]], i: int) -> Any: + if l is not None: + return l[i] + return None + def extract_output_types_defer(wdl_file): """ The main issue with using miniwdl's parser is parsing the wdl file is dependent on miniwdl, @@ -113,7 +126,8 @@ def extract_output_types(wdl_file, fail): return var_types -def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: WDL.Type.Base, data_dir: Optional[Path]): +def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: WDL.Type.Base, + data_dir: Optional[Path], metadata: Union[List[Any], Dict[str, Any]]): """ Get the expected output values with respect to the output types. This is done to get the md5sum of files when converting unit tests. 
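
The `get_not_none`/`index_not_none` helpers introduced above exist because most tests carry no extra metadata at all, so every lookup has to tolerate `None`. A quick sketch of how they behave (the metadata values here are hypothetical):

```python
from setup_unit_tests import get_not_none, index_not_none

metadata = {"md5sum": "c4ca4238a0b923820dcc509a6f75849b"}
print(get_not_none(metadata, "md5sum"))  # 'c4ca4238a0b923820dcc509a6f75849b'
print(get_not_none(None, "md5sum"))      # None rather than an AttributeError
print(index_not_none(None, 0))           # None rather than a TypeError
```
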
@@ -130,6 +144,12 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any],
             with open(path, "rb") as f:
                 md5sum = hashlib.md5(f.read()).hexdigest()
             return {'md5sum': md5sum}
+    # else check if there is extra metadata to read from
+    if get_not_none(metadata, "md5sum") is not None:
+        return {'md5sum': metadata.get("md5sum")}
+    if get_not_none(metadata, "regex") is not None:
+        return {'regex': metadata.get("regex")}
+
     if isinstance(output_type, WDL.Type.StructInstance):
         converted_output = dict()
         for i, (output_key, output_value) in enumerate(output_values.items()):
@@ -137,34 +157,35 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any],
             output_value_type = output_type.members[output_key]
         except KeyError:
             # coercion from map to struct can cause some weird type behavior
-            # since dictionaries passt python 3.6 are ordered, find the corresponding type from the current output's position
+            # since dictionaries past python 3.6 are ordered, find the corresponding type from the current output's position
             output_value_type = list(output_type.members.values())[i]
-            converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir)
+            converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir, get_not_none(metadata, output_value))
         return converted_output
     if isinstance(output_type, WDL.Type.Map):
         converted_output = dict()
         for output_key, output_value in output_values.items():
-            new_output_key = convert_typed_output_values(output_key, output_type.item_type[0], data_dir)
-            new_output_value = convert_typed_output_values(output_value, output_type.item_type[1], data_dir)
+            new_output_key = convert_typed_output_values(output_key, output_type.item_type[0], data_dir, get_not_none(metadata, output_key))
+            new_output_value = convert_typed_output_values(output_value, output_type.item_type[1], data_dir, get_not_none(metadata, output_value))
             converted_output[new_output_key] = new_output_value
         return converted_output
     if isinstance(output_type, WDL.Type.Pair):
         converted_output = dict()
         # key should be left or right
-        converted_output["left"] = convert_typed_output_values(output_values["left"], output_type.left_type, data_dir)
-        converted_output["right"] = convert_typed_output_values(output_values["right"], output_type.right_type, data_dir)
+        converted_output["left"] = convert_typed_output_values(output_values["left"], output_type.left_type, data_dir, get_not_none(metadata, output_values["left"]))
+        converted_output["right"] = convert_typed_output_values(output_values["right"], output_type.right_type, data_dir, get_not_none(metadata, output_values["right"]))
         return converted_output
     if isinstance(output_type, WDL.Type.Array):
         converted_output = list()
-        for output in output_values:
-            converted_output.append(convert_typed_output_values(output, output_type.item_type, data_dir))
+        for i, output in enumerate(output_values):
+            converted_output.append(convert_typed_output_values(output, output_type.item_type, data_dir, index_not_none(metadata, i)))
         return converted_output
     # else this is a primitive type
     return output_values
 
 
-def convert_typed_output_values_from_string(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: Union[str, Dict[str, Any]], data_dir: Optional[Path]):
+def convert_typed_output_values_from_string(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: Union[str, Dict[str, Any]],
+                                            data_dir: Optional[Path], metadata: Optional[Dict[str, Any]]):
     output_type_wdl = convert_type(output_type)
-    return convert_typed_output_values(output_values, output_type_wdl, data_dir)
+    return convert_typed_output_values(output_values, output_type_wdl, data_dir, metadata)
 
 
 def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any]) \
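
For file outputs that a task produces at runtime (rather than files shipped with the spec's data directory), the md5sum cannot be computed at extraction time, which is what the metadata fallback above is for. A sketch of the effect (the hash is hypothetical, and this assumes `setup_unit_tests.py` is importable and no `stdout.txt` exists under the data directory):

```python
from pathlib import Path

import WDL
from setup_unit_tests import convert_typed_output_values

# The file is not found by the data-directory glob, so the md5sum
# supplied by unit_tests_metadata.yaml is used instead.
print(convert_typed_output_values("stdout.txt", WDL.Type.File(),
                                  Path("wdl-1.1-spec/tests/data"),
                                  {"md5sum": "c4ca4238a0b923820dcc509a6f75849b"}))
# {'md5sum': 'c4ca4238a0b923820dcc509a6f75849b'}
```
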
@@ -180,7 +201,8 @@ def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, s
     return func(json_obj)
 
 
-def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], data_dir: Optional[Path], output_data_dir: Optional[Path], config: list):
+def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], data_dir: Optional[Path],
+                         output_data_dir: Optional[Path], config: list, metadata: Optional[Dict[str, Any]]) -> None:
     """
     Given the regex, create the corresponding config entry for conformance.yaml.
 
@@ -205,6 +227,12 @@ def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_f
 
     config_entry["id"] = target
 
+    target_metadata = None
+    for metadata_entry in metadata or []:
+        if metadata_entry.get("id") == target:
+            target_metadata = metadata_entry
+            break
+
     wdl_dir, wdl_base = os.path.split(wdl_file)
 
     # the spec assumes all input files are the basenames
@@ -243,12 +271,13 @@ def if_file_convert(maybe_file):
         config_entry["outputs"] = {}
     else:
         if output_json is not None:
+            target_output_metadata = get_not_none(target_metadata, "outputs")
             for k, v in json.loads(output_json.strip()).items():
                 k_base = k.split(".")[-1]
                 output_type = output_var_types[k_base]
                 config_entry["outputs"][k] = {
                     "type": output_type,
                     "value": convert_typed_output_values_from_string(v, output_type, data_dir, get_not_none(get_not_none(target_output_metadata, k), "value"))
                 }
 
     if "type" not in config_entry:
@@ -307,7 +336,7 @@ def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_f
         o.write(wdl)
 
 
-def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, version: str, output_type: str):
+def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, version: str, output_type: str, extra_metadata: Optional[Path]):
     if not output_dir.exists():
         output_dir.mkdir(parents=True)
 
@@ -344,8 +373,14 @@ def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, versio
     else:
         output_data_dir = None
 
+    metadata = None
+    if extra_metadata is not None:
+        with open(extra_metadata, "r") as e:
+            yaml = YAML()
+            metadata = yaml.load(e)
+
     for m in all_m:
-        generate_config_file(m, output_dir, version, all_data_files, data_dir, output_data_dir, config)
+        generate_config_file(m, output_dir, version, all_data_files, data_dir, output_data_dir, config, metadata)
 
     if output_type == "json":
         config_file = output_dir / "test_config.json"
@@ -361,7 +396,9 @@ def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, versio
         shutil.copytree(data_dir, output_data_dir, symlinks=True, dirs_exist_ok=False)
 
 
-def main():
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv[1:]
     parser = argparse.ArgumentParser(
         usage="%(prog)s [options]",
         description=(
@@ -389,11 +426,18 @@ def main():
         action="store_true",
         help="Default behavior does not pull the spec repo if it was already pulled. Specify this argument to force a refresh."
     )
+    parser.add_argument(
+        "--extra-metadata", "-e",
+        default="unit_tests_metadata.yaml",
+        help="Include extra metadata when extracting the unit tests into the config. 
" + "Since we have our own method of testing file outputs, this is mostly used for file hashes/regexes." + ) argcomplete.autocomplete(parser) - args = parser.parse_args() + args = parser.parse_args(argv) spec_dir = f"wdl-{args.version}-spec" if not os.path.exists(spec_dir) or args.force_pull is True: + print("Pulling SPEC from repo...") # while the WDL spec has its bugs, use a fixed version # see openwdl issues #653, #654, #661, #662, #663, #664, #665, #666 # cmd = f"git clone https://github.com/openwdl/wdl.git {spec_dir}" @@ -413,7 +457,8 @@ def main(): cmd = f"rm -rf unit_tests" subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) - extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type) + print("Extracting tests...") + extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type, args.extra_metadata) if __name__ == "__main__": diff --git a/unit_tests_metadata.yaml b/unit_tests_metadata.yaml new file mode 100644 index 0000000..2f144e8 --- /dev/null +++ b/unit_tests_metadata.yaml @@ -0,0 +1,11 @@ +# this should have the same structure as the expected yaml configs, ex conformance.yaml +- id: optional_output + outputs: + optional_output.example1: + type: File + value: { md5sum: c4ca4238a0b923820dcc509a6f75849b } + optional_output.file_array: + type: Array[File?] + value: + - md5sum: c4ca4238a0b923820dcc509a6f75849b + - \ No newline at end of file From cefb6886339e9cbaaebd54b1bd1717b6c9ef0ebe Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 2 Jul 2024 17:08:02 -0700 Subject: [PATCH 07/34] Add json key filepath conversion, basic dependency support, various scripts for setup --- Makefile | 6 ++- lib.py | 81 +++++++++++++++++++++++++++++++++++++++- run.py | 12 +++++- setup_unit_tests.py | 27 ++++++++++++-- unit_tests_metadata.yaml | 9 ++++- unit_tests_script.sh | 4 ++ 6 files changed, 131 insertions(+), 8 deletions(-) create mode 100755 unit_tests_script.sh diff --git a/Makefile b/Makefile index 0f0a9c7..e7395fc 100644 --- a/Makefile +++ b/Makefile @@ -28,10 +28,14 @@ clean: clean-csv: rm csv_output_* -clean-unit: +clean-unit: clean-unit-setup rm -rf wdl-1.1-spec rm -rf unit_tests +clean-unit-setup: + rm -rf /mnt/outputs + rm -rf /mnt/tmp + cromwell: mkdir -p build if [ ! 
-f build/cromwell.jar ]; then \
diff --git a/lib.py b/lib.py
index 800d375..251dd89 100644
--- a/lib.py
+++ b/lib.py
@@ -71,7 +71,7 @@ def generate_change_container_specifier(lines, to_replace="container", replace_w
         if line.strip() == "}":
             in_runtime = False
         if i > 0:
-            yield line[:i] + replace_with + line[i+len(to_replace):]
+            yield line[:i] + replace_with + line[i + len(to_replace):]
         else:
             yield line
 
@@ -335,6 +335,7 @@ def get_specific_tests(conformance_tests, tag_argument, number_argument, id_argu
         tests.add(test_number)
     return sorted(list(tests))
 
+
 def verify_return_code(expected_ret_code: Union[int, List[int], str], got_ret_code: int):
     if not isinstance(expected_ret_code, list):
         expected_ret_code = [expected_ret_code]
@@ -520,3 +521,81 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]:
         return WDLArray(converted_inner_type, optional=optional, nonempty=nonempty) if converted_inner_type is not None else None
     # primitives remaining
     return wdl_type_to_miniwdl_class(wdl_type)(optional=optional)
+
+
+def test_gpu_available():
+    if bool(os.getenv("WDL_CONFORMANCE_TESTS__GPU")):
+        # override
+        return True
+    try:
+        p = subprocess.run("nvidia-smi".split(" "))
+    except (FileNotFoundError, subprocess.SubprocessError):
+        pass
+    else:
+        # we have an nvidia gpu
+        if p.returncode == 0:
+            return True
+    # This is copied from how Toil checks for amd gpus
+    # This may not work, I'm not sure what the current conventions for getting amd gpu data are
+    # see comment in src/toil/lib/accelerators.py::count_amd_gpus
+    try:
+        p = subprocess.run(["amd-smi", "static"])
+    except (FileNotFoundError, subprocess.SubprocessError):
+        pass
+    else:
+        if p.returncode == 0:
+            # we (maybe) have an amd gpu
+            return True
+    return False
+
+
+IGNORE_DEPENDENCIES = ["docker", "root", "singularity"]
+
+
+def test_dependencies(dependencies: Optional[List[str]], response: Dict[str, Any]) -> None:
+    """
+    Given a set of dependencies for a test, see if any of those dependencies are violated.
+    If so, change a failing test to a warning and update the reason.
+    """
+    # todo: maybe there is a better way to deal with dependencies
+    if dependencies is None:
+        return
+    for d in dependencies:
+        if d == "gpu":
+            if not test_gpu_available() and response['status'] == 'FAILED':
+                response["status"] = "WARNING"
+                response["reason"] = (f"Some GPU dependency is necessary but is not available on machine. "
+                                      f"Either run the test on a GPU supported system or set 'WDL_CONFORMANCE_TESTS__GPU=True' to override. "
+                                      f"\nFailing reason:\n") + response["reason"]
+        elif d == "disks":
+            # todo: I'm not really sure how to test for a disk specific error
+            # I could try to check if all mount disks are available but that requires the mount points to be given in advance
+            # toil will return NotImplementedError for a missing mount point, so just catch that as miniwdl does not support this functionality yet
+            # and cromwell is stuck at wdl 1.0
+            if response['status'] == 'FAILED' and "NotImplementedError" in response['stderr']:
+                response["status"] = "WARNING"
+                response["reason"] = (f"Some disk dependency is necessary but is not available on machine. "
+                                      f"Check that all the mount points specified in the WDL tasks exist." 
+                                      f"\nFailing reason:\n") + response["reason"]
+        elif d == "cpu":
+            # todo: figure out better way to detect cpu errors too
+            if response['status'] == 'FAILED':
+                # miniwdl adjusts cpus to host limit so it'll never error
+                # so just deal with toil
+                to_match = re.compile(r"is requesting [0-9.]+ cores, more than the maximum of [0-9.]+ cores")
+                if re.search(to_match, response['stderr']):
+                    response["status"] = "WARNING"
+                    response["reason"] = (f"Some CPU dependency is necessary but is not available on machine. "
+                                          f"A WDL task likely requested more CPUs than available."
+                                          f"\nFailing reason:\n") + response["reason"]
+        elif d == "memory":
+            # todo: better, same as cpu above
+            if response['status'] == 'FAILED':
+                to_match = re.compile(r"is requesting [0-9.]+ bytes of memory, more than the maximum of [0-9.]+ bytes of memory")
+                if re.search(to_match, response['stderr']):
+                    response["status"] = "WARNING"
+                    response["reason"] = (f"Some memory dependency is necessary but is not available on machine. "
+                                          f"A WDL task likely requested more memory than available."
+                                          f"\nFailing reason:\n") + response["reason"]
+        elif d not in IGNORE_DEPENDENCIES:
+            print(f"Warning: Test framework encountered unsupported dependency {d}. Ignoring...")
diff --git a/run.py b/run.py
index 2ae0b3b..e6b7ffe 100755
--- a/run.py
+++ b/run.py
@@ -28,7 +28,7 @@
 from WDL.Type import Base as WDLBase
 
 from lib import run_cmd, py_type_of_wdl_class, verify_failure, announce_test, print_response, convert_type, run_setup, \
-    get_specific_tests, get_wdl_file, verify_return_code
+    get_specific_tests, get_wdl_file, verify_return_code, test_dependencies
 
 WDL_VERSIONS = ["draft-2", "1.0", "1.1"]
 
@@ -391,6 +391,14 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str
         pre_args = None
         if runner == "cromwell":
             pre_args = args["cromwell_pre_args"]
+
+        # todo: it seems odd that I'm looking for a dependency when the test spec says its supposed to be used to turn failing tests into warnings
+        # this also isn't the most efficient
+        if test.get('dependencies') is not None and runner == "toil-wdl-runner":
+            if "docker" in test["dependencies"]:
+                test_args.extend(["--container", "docker"])
+            if "singularity" in test["dependencies"]:
+                test_args.extend(["--container", "singularity"])
         cmd = wdl_runner.format_command(wdl_file, json_file, json_string, results_file, test_args, verbose, pre_args)
 
         realtime = None
@@ -451,6 +459,8 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio
                 self.run_single_test(test_index, test, runner, version, time, verbose, quiet, args, jobstore_path, debug))
         if repeat is not None:
             response["repeat"] = repeat
+        # Turn failing tests to warnings if they violate a test dependency
+        test_dependencies(dependencies=test.get("dependencies"), response=response)
        return response
 
     def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> None:
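
To make the dependency handling above concrete, here is a small sketch (illustrative only; the stderr line mimics the shape of a Toil resource error, and it assumes lib.py is importable):

```python
from lib import test_dependencies

# A failing test whose stderr matches the memory-limit pattern test_dependencies greps for
response = {
    "status": "FAILED",
    "reason": "Workflow did not succeed",
    "stderr": "Job is requesting 17179869184 bytes of memory, more than the maximum of 8589934592 bytes of memory",
}
test_dependencies(dependencies=["memory"], response=response)
print(response["status"])  # WARNING, since the failure is explained by the declared dependency
```
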
diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index b61a294..6dd0639 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -106,7 +106,9 @@
         # defer to the other more simpler parser
         return extract_output_types_defer(wdl_file)
     else:
-        raise
+        raise RuntimeError(f"Likely unsupported type. Failed to parse WDL file {str(wdl_file)}!") from e
+    except Exception as e:
+        raise RuntimeError(f"Failed to parse WDL file {str(wdl_file)}!") from e
 
     target: Union[WDL.Tree.Workflow, WDL.Tree.Task]
 
@@ -191,7 +193,7 @@ def convert_typed_output_values_from_string(output_values: Union[None, str, Dict
 def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any]) \
         -> Union[Dict[Any, Any], List[Any], bool, int, str, float, None]:
     if isinstance(json_obj, dict):
-        return {k: recursive_json_apply(v, func) for k, v in json_obj.items()}
+        return {func(k): recursive_json_apply(v, func) for k, v in json_obj.items()}
     elif isinstance(json_obj, list):
         return [recursive_json_apply(v, func) for v in json_obj]
     elif json_obj is None:
@@ -227,7 +229,7 @@ def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_f
 
     config_entry["id"] = target
 
-    target_metadata = None
+    target_metadata: Optional[Dict[str, Any]] = None
     for metadata_entry in metadata or []:
         if metadata_entry.get("id") == target:
             target_metadata = metadata_entry
@@ -247,7 +249,7 @@ def if_file_convert(maybe_file):
             return maybe_file
 
     for k, v in json.loads(input_json.strip()).items():
-        input_json_dict[k] = recursive_json_apply(v, if_file_convert)
+        input_json_dict[if_file_convert(k)] = recursive_json_apply(v, if_file_convert)
 
     config_entry["inputs"] = {
         "dir": wdl_dir,
@@ -294,6 +296,9 @@ def if_file_convert(maybe_file):
         config_entry["dependencies"] = []
     elif isinstance(config_entry["dependencies"], str):
         config_entry["dependencies"] = [config_entry["dependencies"]]
+    # add metadata dependencies
+    if get_not_none(target_metadata, "dependencies") is not None:
+        config_entry["dependencies"].extend(target_metadata["dependencies"])
     if "tags" not in config_entry:
         config_entry["tags"] = []
     elif isinstance(config_entry["tags"], str):
@@ -432,6 +437,13 @@ def main(argv=None):
         help="Include extra metadata when extracting the unit tests into the config. "
             "Since we have our own method of testing file outputs, this is mostly used for file hashes/regexes."
     )
+    parser.add_argument(
+        "--script",
+        # default="unit_tests_script.sh",
+        default=None,
+        help="Bash script to run alongside. This is to set up the environment for the test suite, ex mount points and files tests may depend on."
+        "(Probably need to run with root)."
+    )
     argcomplete.autocomplete(parser)
     args = parser.parse_args(argv)
 
@@ -460,6 +472,13 @@ def main(argv=None):
     print("Extracting tests...")
     extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type, args.extra_metadata)
 
+    if args.script is not None:
+        # assume it needs to run as root
+        if os.geteuid() == 0:
+            subprocess.run(["bash", args.script])
+        else:
+            subprocess.run(["sudo", "bash", args.script])
+
 
 if __name__ == "__main__":
     main()
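
Since `recursive_json_apply` now rewrites dictionary keys as well as values, input JSON whose keys happen to be file paths gets the same treatment as file-valued inputs. A sketch of the effect (the `if_file_convert` below is only a stand-in for the real closure in `generate_config_file`, which rewrites known data-file basenames):

```python
from setup_unit_tests import recursive_json_apply

def if_file_convert(maybe_file):
    # stand-in: the real closure checks membership in the extracted data files
    known_data_files = {"hello.txt"}
    return f"unit_tests/{maybe_file}" if maybe_file in known_data_files else maybe_file

inputs = {"wf.t.infile": "hello.txt", "hello.txt": "not_a_file"}
print(recursive_json_apply(inputs, if_file_convert))
# {'wf.t.infile': 'unit_tests/hello.txt', 'unit_tests/hello.txt': 'not_a_file'}
```
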
diff --git a/unit_tests_metadata.yaml b/unit_tests_metadata.yaml
index 2f144e8..486fab7 100644
--- a/unit_tests_metadata.yaml
+++ b/unit_tests_metadata.yaml
@@ -8,4 +8,11 @@
     type: Array[File?]
     value:
       - md5sum: c4ca4238a0b923820dcc509a6f75849b
-      - \ No newline at end of file
+      -
+
+- id: multi_mount_points
+  dependencies:
+    - docker # for some reason singularity defaults to 64MB on / with findmnt
+- id: relative_and_absolute
+  dependencies:
+    - root # this test depends on reading something from a root owned file
\ No newline at end of file
diff --git a/unit_tests_script.sh b/unit_tests_script.sh
new file mode 100755
index 0000000..af43cb8
--- /dev/null
+++ b/unit_tests_script.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# need to run with root
+mkdir /mnt/outputs/
+mkdir /mnt/tmp/
\ No newline at end of file

From 5ff882905f6a9a914ac740f9447ec8694f780eae Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 3 Jul 2024 16:28:29 -0700
Subject: [PATCH 08/34] Documentation

---
 README.md   |  3 +++
 run_unit.py | 11 ++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 64e29fb..774ec7d 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@ Runners this has been evaluated against:
 - [miniwdl](https://github.com/chanzuckerberg/miniwdl)
 - [Cromwell](https://github.com/broadinstitute/cromwell)
 
+## WDL Specification Unit Tests
+For information about running specification tests from the WDL spec, calling a separate script is required. [See `README_UNIT.md`](README_UNIT.md)
+
 ## Note
 
 The tests in this directory are incomplete and are in the process of being added to. If you'd like
diff --git a/run_unit.py b/run_unit.py
index 0ff2745..80d8724 100644
--- a/run_unit.py
+++ b/run_unit.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
 # PYTHON_ARGCOMPLETE_OK
+"""
+run_unit.py: Run the WDL specification/unit tests.
+"""
 import argparse
+import os
 import sys
 
 import argcomplete
@@ -21,15 +25,16 @@ def main():
     parser.add_argument("--force-pull", default=False, action="store_true",
                         help="Specify whether to use the cached SPEC or to force a pull."
                              "The setup script will be run as well.")
-    parser.set_defaults(version="1.1")  # override default to 1.1
+    parser.set_defaults(versions="1.1")  # override default to 1.1
     parser.set_defaults(runner="toil-wdl-runner")  # cromwell doesn't support 1.1+
 
     argcomplete.autocomplete(parser)
 
     args = parser.parse_args()
 
-    if args.reset:
+    spec_dir = f"wdl-{args.versions}-spec"  # there may be a better way to detect that setup_unit_tests.py was run before than checking a hard coded directory
+    if args.reset or not os.path.exists(spec_dir):
         from setup_unit_tests import main as setup_unit_tests_main
-        argv = ["--version=1.1", "--output-type=yaml"]
+        argv = [f"--version={args.versions}", "--output-type=yaml"]
         if args.force_pull:
             argv.append("--force-pull")
         setup_unit_tests_main(argv)
From 357f162f36dd7342756dd1d9c17310fab56ad605 Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 3 Jul 2024 18:01:45 -0700
Subject: [PATCH 09/34] Argument for repo

---
 setup_unit_tests.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index 6dd0639..0969767 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -444,16 +444,20 @@ def main(argv=None):
         help="Bash script to run alongside. This is to set up the environment for the test suite, ex mount points and files tests may depend on."
         "(Probably need to run with root)."
     )
+    parser.add_argument(
+        "--repo",
+        # while the WDL spec has its bugs, use a fixed version
+        # see openwdl issues #653, #654, #661, #662, #663, #664, #665, #666
+        default="https://github.com/stxue1/wdl.git",
+        help="Repository to pull from." 
+ ) argcomplete.autocomplete(parser) args = parser.parse_args(argv) spec_dir = f"wdl-{args.version}-spec" if not os.path.exists(spec_dir) or args.force_pull is True: - print("Pulling SPEC from repo...") - # while the WDL spec has its bugs, use a fixed version - # see openwdl issues #653, #654, #661, #662, #663, #664, #665, #666 - # cmd = f"git clone https://github.com/openwdl/wdl.git {spec_dir}" - cmd = f"git clone https://github.com/stxue1/wdl.git" + print(f"Pulling SPEC from repo {args.repo}...") + cmd = f"git clone {args.repo}" subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) os.chdir(spec_dir) From 6787a55d74ad8b253cd362681daab5eca94ee31d Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 3 Jul 2024 18:02:12 -0700 Subject: [PATCH 10/34] Add readme for unit tests --- README_UNIT.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 README_UNIT.md diff --git a/README_UNIT.md b/README_UNIT.md new file mode 100644 index 0000000..36ab4c4 --- /dev/null +++ b/README_UNIT.md @@ -0,0 +1,61 @@ +# Unit tests +The WDL spec includes an extractable set of specification tests as per [their specification](https://github.com/openwdl/wdl-tests). +This test suite allows for those tests to be extracted and tested against a variety of runners. + +## Notes +- Specification tests are only available for [WDL versions 1.1.1 and above](https://github.com/openwdl/wdl-tests/blob/58ff36209586ed69c9a64d3e0b151a343f12a4eb/README.md?plain=1#L7). + > Starting with WDL 1.1.1, nearly all the examples in the WDL specification are also test cases that conform to the WDL markdown test specification. +- If running locally, it is **not** recommended to run test IDs `hisat2` and `gatk_haplotype_caller` (test indices 66 and 67). These tests will take a lot of network bandwidth and require a large amount of resources. +- Cromwell is not supported as Cromwell cannot run WDL 1.1+ +- MiniWDL cannot run all tests, at least until [this issue is resolved](https://github.com/chanzuckerberg/miniwdl/issues/693) + +## Usage +### Extracting Tests +First, use the script `setup_unit_tests.py` to pull the WDL specification and to extract the tests. +Calling the script with no arguments `python setup_unit_tests.py` is recommended, but passing in arguments will not drastically change the behavior of the script. +```commandline +usage: setup_unit_tests.py [options] + +Extracts the unit tests from the spec to be runnable by this test suite;test files will be outputted into the `unit_tests` directory and a new conformance file will be created.To run the +unit tests, see [insert other script here] + +options: + -h, --help show this help message and exit + --version {1.1,1.2}, -v {1.1,1.2} + Version of the spec to grab unit tests out of. Only WDL 1.1 and above has support for built in unit tests. + --output-type {yaml,json} + Specify conformance file output type. + --force-pull Default behavior does not pull the spec repo if it was already pulled. Specify this argument to force a refresh. + --extra-metadata EXTRA_METADATA, -e EXTRA_METADATA + Include extra metadata when extracting the unit tests into the config. Since we have our own method of testing file outputs, this is mostly used for file + hashes/regexes. + --script SCRIPT Bash script to run alongside. This is to set up the environment for the test suite, ex mount points and files tests may depend on.(Probably need to run with root). + --repo REPO Repository to pull from. 
+```
+
+By default, WDL version 1.1 is used, extra metadata from `unit_tests_metadata.yaml` is used, and `yaml` is the default output type.
+All other arguments default to none.
+
+The script will pull the GitHub repository for the specific version of WDL at `wdl-[version]-spec`. The unit tests and configuration file will be extracted into the `unit_tests/` folder. The configuration file will be at `unit_tests/test_config.yaml`, which is used for the `run_unit.py` script.
+
+Before running the test suite, the shell script `unit_tests_script.sh` may need to be executed. This shell script creates several directories that the specification tests will require to exist.
+To remove these directories and anything else that the script may create, call `make clean-unit-setup`.
+
+### Running Tests
+Then, use the script `run_unit.py` to actually run the tests according to the configuration file. If the default setup was performed,
+the script can be used as if using `run.py`.
+
+For example: `python run_unit.py --runner toil-wdl-runner --version 1.1 -n 105-109 --quiet --threads=4 --progress`
+
+There are extra arguments specific to `run_unit.py` that may be useful.
+```commandline
+  --config CONFIG, -c CONFIG
+                        Specify the path of the conformance config file.
+  --reset               Specify whether to run the setup script again.
+  --force-pull          Specify whether to use the cached SPEC or to force a pull.The setup script will be run as well.
+```
+The config argument by default is `unit_tests/test_config.yaml`, but can be overridden by passing in the path to another config file.
+Resetting the script will extract the tests from the specification file again with the default behavior, essentially calling `setup_unit_tests.py` with no arguments.
+Forcing a pull will pull the WDL repository, overwriting the existing WDL repository and specification file, before extracting the tests again. `--reset` must be included for `--force-pull` to work.
+
+To remove the WDL repository and the unit tests folder, call `make clean-unit`.
\ No newline at end of file

From 5f2f9696347c2846d4035c658af5e4a64553428d Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 3 Jul 2024 18:17:20 -0700
Subject: [PATCH 11/34] Add note for root test

---
 README_UNIT.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README_UNIT.md b/README_UNIT.md
index 36ab4c4..44efa9d 100644
--- a/README_UNIT.md
+++ b/README_UNIT.md
@@ -6,6 +6,7 @@ This test suite allows for those tests to be extracted and tested against a vari
 - Specification tests are only available for [WDL versions 1.1.1 and above](https://github.com/openwdl/wdl-tests/blob/58ff36209586ed69c9a64d3e0b151a343f12a4eb/README.md?plain=1#L7).
   > Starting with WDL 1.1.1, nearly all the examples in the WDL specification are also test cases that conform to the WDL markdown test specification.
 - If running locally, it is **not** recommended to run test IDs `hisat2` and `gatk_haplotype_caller` (test indices 66 and 67). These tests will take a lot of network bandwidth and require a large amount of resources. 
+- The test ID `relative_and_absolute` (test index 52) needs to be run with root as it will read a root owned file - Cromwell is not supported as Cromwell cannot run WDL 1.1+ - MiniWDL cannot run all tests, at least until [this issue is resolved](https://github.com/chanzuckerberg/miniwdl/issues/693) From 919d41607dcb49d5f9c397b3bc6fe89d045bab49 Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Mon, 19 Aug 2024 21:10:30 -0700 Subject: [PATCH 12/34] Update run.py Co-authored-by: Adam Novak --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index e6b7ffe..c7a57db 100755 --- a/run.py +++ b/run.py @@ -300,7 +300,7 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu if exclude_outputs is not None and len(exclude_outputs) > 0: # I'm not sure if it is possible but the wdl-tests spec seems to say the type can also be a string - exclude_outputs = list(exclude_outputs) if not isinstance(exclude_outputs, list) else exclude_outputs + exclude_outputs = [exclude_outputs] if not isinstance(exclude_outputs, list) else exclude_outputs # remove the outputs that we are not allowed to compare test_result_outputs = dict() for k, v in test_results.get('outputs', {}).items(): From 9b6fdd4a241839599b4fb66756be20048b65c9f3 Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Mon, 19 Aug 2024 21:11:26 -0700 Subject: [PATCH 13/34] Update run.py Co-authored-by: Adam Novak --- run.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/run.py b/run.py index c7a57db..eff007a 100755 --- a/run.py +++ b/run.py @@ -302,19 +302,8 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu # I'm not sure if it is possible but the wdl-tests spec seems to say the type can also be a string exclude_outputs = [exclude_outputs] if not isinstance(exclude_outputs, list) else exclude_outputs # remove the outputs that we are not allowed to compare - test_result_outputs = dict() - for k, v in test_results.get('outputs', {}).items(): - for excluded in exclude_outputs: - if k.endswith(excluded): - # we first check that an output variable ends with the excluded name - # ex: got: output.csvs, omit: csvs so omit - # then we check that the length of the two match - # or that the character right before the endswith match is a namespace separator - # ex: got: output.csvs, omit: output.csvs so omit - # ex: got: output.not_csvs, omit: csvs so don't omit as '_' != '.' 
- if len(k) == len(excluded) or k[::-1][len(excluded)] == ".": - continue - test_result_outputs[k] = v + excluded = set(excluded_outputs) + test_result_outputs = {k: v for k, v in test_results.get('outputs', {}).items() if k.split(".")[-1] not in excluded} else: test_result_outputs = test_results.get('outputs', {}) From 6e740602e9cab1d0529815ea53eb23cc5da5c52d Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 19 Aug 2024 21:12:40 -0700 Subject: [PATCH 14/34] update to address some comments --- run.py | 65 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/run.py b/run.py index e6b7ffe..4a1345d 100755 --- a/run.py +++ b/run.py @@ -24,7 +24,7 @@ from WDL.Type import Float as WDLFloat, String as WDLString, File as WDLFile, Int as WDLInt, Boolean as WDLBool, \ Array as WDLArray, Map as WDLMap, Pair as WDLPair, StructInstance as WDLStruct -from typing import Optional, Any, Dict, Tuple, List +from typing import Optional, Any, Dict, Tuple, List, Union from WDL.Type import Base as WDLBase from lib import run_cmd, py_type_of_wdl_class, verify_failure, announce_test, print_response, convert_type, run_setup, \ @@ -39,7 +39,7 @@ class WDLRunner: """ runner: str - def format_command(self, wdl_file, json_file, json_string, results_file, args, verbose, pre_args=None): + def format_command(self, wdl_file, json_input, results_file, args, verbose, pre_args=None): raise NotImplementedError @@ -47,10 +47,10 @@ class CromwellStyleWDLRunner(WDLRunner): def __init__(self, runner): self.runner = runner - def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + def format_command(self, wdl_file: str, json_input: Union[str, Dict[str, Any]], results_file: str, args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: - # One or the other is guaranteed to exist in the prior branch - json_input = json_file or json_string + if isinstance(json_input, dict): + json_input = json.dumps(json_input) json_arg = ["-i", json_input] if json_input is not None else [] return list(filter(None, self.runner.split(" "))) + [wdl_file, "-m", results_file] + json_arg + args @@ -61,7 +61,7 @@ class CromwellWDLRunner(CromwellStyleWDLRunner): def __init__(self): super().__init__('cromwell') - def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + def format_command(self, wdl_file: str, json_input: Union[str, Dict[str, Any]], results_file: str, args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: if self.runner == 'cromwell' and not which('cromwell'): with CromwellWDLRunner.download_lock: @@ -76,16 +76,17 @@ def format_command(self, wdl_file: str, json_file: Optional[str], json_string: O run_cmd(cmd='make cromwell'.split(" "), cwd=os.getcwd()) self.runner = f'java {log_level} {pre_args} -jar {cromwell} run' - return super().format_command(wdl_file, json_file, json_string, results_file, args, verbose) + return super().format_command(wdl_file, json_input, results_file, args, verbose) class MiniWDLStyleWDLRunner(WDLRunner): def __init__(self, runner): self.runner = runner - def format_command(self, wdl_file: str, json_file: Optional[str], json_string: Optional[str], results_file: str, + def format_command(self, wdl_file: str, json_input: Union[str, Dict[str, Any]], results_file: str, args: List[str], verbose: bool, pre_args: Optional[List[str]] = None) -> List[str]: - json_input = json_file or 
json_string
+        if isinstance(json_input, dict):
+            json_input = json.dumps(json_input)
         json_arg = ["-i", json_input] if json_input is not None else []
         return self.runner.split(" ") + [wdl_file, "-o", results_file, "-d", "miniwdl-logs", "--verbose"] + json_arg + args
@@ -112,7 +113,7 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase):
 
         In the future, make this return where it failed instead to give less generic error messages
 
-        :param expected: expected value object
+        :param expected: expected value object from conformance file
         :param result: result value object from WDL runner
         :param typ: type of output from conformance file
         """
@@ -179,38 +180,42 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase):
                                                   f"Expected output: {expected}\n"
                                                   f"Actual output: {result}!"}
         # check that output types are correct
-        if not isinstance(expected, py_type_of_wdl_class(typ)) or not isinstance(result, py_type_of_wdl_class(typ)):
+        expected_type = py_type_of_wdl_class(typ)
+        if not isinstance(expected, expected_type) or not isinstance(result, expected_type):
             # When outputting in both miniwdl and toil, Map keys are always strings regardless of the specified type
             # When Map[Int, Int], the output will be {"1": 1}
             # Or when Map[Float, Int], the output will be {"1.000000": 1}
             # This only applies to Int and Float types, as Boolean key types don't seem to be supported in miniwdl
             if isinstance(typ, WDLInt):
                 try:
-                    # ensure the stringified version is equivalent to an int
-                    py_type_of_wdl_class(typ)(result)
-                    py_type_of_wdl_class(typ)(expected)
+                    # ensure the stringified version of the runner output is equivalent to an int
+                    expected_type(result)
                 except ValueError:
                     # the string representation does not represent the right type
-                    return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}! Most likely a Map key type is incorrect.\n"
-                                                          f"Expected output: {expected}\n"
-                                                          f"Actual output: {result}!"}
+                    return {'status': 'FAILED', 'reason': f"Runner output {result} is not type Int from the conformance file."}
+                try:
+                    # ensure the stringified version of the conformance output is equivalent to an int
+                    expected_type(expected)
+                except ValueError:
+                    return {'status': 'FAILED', 'reason': f"Conformance output and type do not match. Expected output {expected} with expected type Int"}
+
             elif isinstance(typ, WDLFloat):
                 try:
-                    # ensure the stringified version is equivalent to an int
-                    py_type_of_wdl_class(typ)(result)
-                    py_type_of_wdl_class(typ)(expected)
-                    if not ("." in result and "." in expected):
-                        # the string representation is not a float but an int
-                        raise ValueError
+                    # ensure the stringified version of the runner output is equivalent to a float
+                    expected_type(result)
                 except ValueError:
                     # the string representation does not represent the right type
-                    return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}! Most likely a Map key type is incorrect.\n"
-                                                          f"Expected output: {expected}\n"
-                                                          f"Actual output: {result}!"}
+                    return {'status': 'FAILED', 'reason': f"Runner output {result} is not type Float from the conformance file."}
+                try:
+                    # ensure the stringified version of the conformance output is equivalent to a float
+                    expected_type(expected)
+                except ValueError:
+                    return {'status': 'FAILED', 'reason': f"Conformance output and type do not match. Expected output {expected} with expected type Float"}
             else:
-                return {'status': 'FAILED', 'reason': f"Incorrect types when expecting type {typ}!\n"
-                                                      f"Expected output: {expected}\n"
-                                                      f"Actual output: {result}!"}
+                if not isinstance(expected, expected_type):
+                    return {'status': 'FAILED', 'reason': f"Incorrect types! Conformance output {expected} is not of type {str(typ)}\n"}
+                elif not isinstance(result, expected_type):
+                    return {'status': 'FAILED', 'reason': f"Incorrect types! Runner output {result} is not of type {str(typ)}\n"}
 
         if isinstance(typ, WDLFile):
             # check file path exists
From f9a4b503c5326a47ce4ec71c29de586b043e59ba Mon Sep 17 00:00:00 2001
From: stxue1 <122345910+stxue1@users.noreply.github.com>
Date: Mon, 19 Aug 2024 21:55:40 -0700
Subject: [PATCH 15/34] Update run.py

Co-authored-by: Adam Novak
---
 run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.py b/run.py
index eff007a..5d79bdc 100755
--- a/run.py
+++ b/run.py
@@ -383,7 +383,7 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str
 
         # todo: it seems odd that I'm looking for a dependency when the test spec says its supposed to be used to turn failing tests into warnings
         # this also isn't the most efficient
-        if test.get('dependencies') is not None and runner == "toil-wdl-runner":
+        if runner == "toil-wdl-runner" and "dependencies" in test:
             if "docker" in test["dependencies"]:
                 test_args.extend(["--container", "docker"])
             if "singularity" in test["dependencies"]:
From edd7a1b789ee7d3e18157e79915a114039bb6bcf Mon Sep 17 00:00:00 2001
From: stxue1
Date: Mon, 19 Aug 2024 23:23:44 -0700
Subject: [PATCH 16/34] Address comments

---
 lib.py              |  25 ++-
 run.py              |  21 +-
 setup_unit_tests.py | 211 +++++++++++-------
 ...etadata.yaml => unit_tests_patch_data.yaml |   0
 4 files changed, 159 insertions(+), 98 deletions(-)
 rename unit_tests_metadata.yaml => unit_tests_patch_data.yaml (100%)

diff --git a/lib.py b/lib.py
index 251dd89..4837d86 100644
--- a/lib.py
+++ b/lib.py
@@ -3,6 +3,8 @@
 import os
 import re
 import subprocess
+from argparse import Namespace
+from distutils.util import strtobool
 
 from WDL.Type import Float as WDLFloat, String as WDLString, File as WDLFile, Int as WDLInt, Boolean as WDLBool, \
     Array as WDLArray, Map as WDLMap, Pair as WDLPair, StructInstance as WDLStruct
 
@@ -305,10 +307,15 @@ def get_test_indices(number_argument):
     return tests
 
 
-def get_specific_tests(conformance_tests, tag_argument, number_argument, id_argument, exclude_number_argument=None, exclude_tags_argument=None):
+def get_specific_tests(conformance_tests, options: Namespace):
     """
     Given the expected tests, tag argument, and number argument, return a list of all test numbers/indices to run
     """
+    tag_argument = options.tags
+    number_argument = options.numbers
+    id_argument = options.id
+    exclude_number_argument = options.exclude_numbers
+    exclude_tags_argument = options.exclude_tags
     given_indices = get_test_indices(number_argument)
     exclude_indices = get_test_indices(exclude_number_argument)
     given_tags = get_tags(tag_argument)
@@ -446,6 +453,9 
@@ def wdl_type_to_miniwdl_class(wdl_type: Union[Dict[str, Any], str]) -> Optional[ elif wdl_type == 'Pair': return WDLPair elif wdl_type == 'Object': + # MiniWDL doesn't support objects + # See https://github.com/chanzuckerberg/miniwdl/issues/694 + # So replace with a placeholder type so the file will at least parse return WDLString else: raise NotImplementedError @@ -524,9 +534,10 @@ def convert_type(wdl_type: Any) -> Optional[WDLBase]: def test_gpu_available(): - if bool(os.getenv("WDL_CONFORMANCE_TESTS__GPU")): + gpu_env_var = os.getenv("WDL_CONFORMANCE_TESTS_GPU") + if gpu_env_var is not None: # override - return True + return bool(strtobool(gpu_env_var)) try: p = subprocess.run("nvidia-smi".split(" ")) except (FileNotFoundError, subprocess.SubprocessError): @@ -552,14 +563,17 @@ def test_gpu_available(): IGNORE_DEPENDENCIES = ["docker", "root", "singularity"] -def test_dependencies(dependencies: Optional[List[str]], response: Dict[str, Any]) -> None: +def test_dependencies(dependencies: Optional[List[str]]) -> Dict[str, Any]: """ Given a set of dependencies for a test, see if any of those dependencies are violated. If so, change a failing test to a warning and update the reason. + + The list of dependencies are at https://github.com/openwdl/wdl-tests/blob/main/docs/Specification.md#test-configuration """ # todo: maybe there is a better way to deal with dependencies + response = {} if dependencies is None: - return + return response for d in dependencies: if d == "gpu": if not test_gpu_available() and response['status'] == 'FAILED': @@ -599,3 +613,4 @@ def test_dependencies(dependencies: Optional[List[str]], response: Dict[str, Any f"\nFailing reason:\n") + response["reason"] elif d not in IGNORE_DEPENDENCIES: print(f"Warning: Test framework encountered unsupported dependency {d}. Ignoring...") + return response diff --git a/run.py b/run.py index 91e984c..769604e 100755 --- a/run.py +++ b/run.py @@ -303,11 +303,12 @@ def verify_outputs(self, expected: dict, results_file: str, ret_code: int, exclu except json.JSONDecodeError: return {'status': 'FAILED', 'reason': f'Results file at {results_file} is not JSON'} + # todo: simplify logic into one branch if exclude_outputs is not None and len(exclude_outputs) > 0: # I'm not sure if it is possible but the wdl-tests spec seems to say the type can also be a string exclude_outputs = [exclude_outputs] if not isinstance(exclude_outputs, list) else exclude_outputs # remove the outputs that we are not allowed to compare - excluded = set(excluded_outputs) + excluded = set(exclude_outputs) test_result_outputs = {k: v for k, v in test_results.get('outputs', {}).items() if k.split(".")[-1] not in excluded} else: @@ -370,8 +371,8 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str # Only one of json_string or input json file can exist if json_string is not None and json_file is not None: - return {'status': 'FAILED', 'reason': f'Config file specifies both a json string and json input file! Only one can be supplied! ' - f'Check the input section for test id {test["id"]}.'} + raise RuntimeError(f'Config file specifies both a json string and json input file! Only one can be supplied! 
' + f'Check the input section for test id {test["id"]}.') test_args = args[runner].split(" ") if args[runner] is not None else [] unique_id = uuid4() @@ -386,8 +387,9 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str if runner == "cromwell": pre_args = args["cromwell_pre_args"] - # todo: it seems odd that I'm looking for a dependency when the test spec says its supposed to be used to turn failing tests into warnings - # this also isn't the most efficient + # todo: it seems odd that I'm looking for a dependency before running when the test spec says test frameworks are supposed to be used to turn failing tests into warnings + # this is mainly because the docker and singularity strings are custom dependencies that aren't (explicitly) part of the WDL test spec + # they mainly tell how toil should run to pass the test if runner == "toil-wdl-runner" and "dependencies" in test: if "docker" in test["dependencies"]: test_args.extend(["--container", "docker"]) @@ -429,6 +431,7 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio Returns a result that can have status SKIPPED, SUCCEEDED, or FAILED. """ response = {'description': test.get('description'), 'number': test_index, 'id': test.get('id')} + # priority flag is defined in https://github.com/openwdl/wdl-tests/blob/main/docs/Specification.md#test-configuration if test.get("priority") == "ignore": # config specifies to ignore this test, so skip if progress: @@ -494,8 +497,7 @@ def run_and_generate_tests_args(self, options: argparse.Namespace, args: Optiona # might allow it. # But the tests all need to be for single WDL versions. versions_to_test = set(options.versions.split(',')) - selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=options.tags, number_argument=options.numbers, - id_argument=options.id, exclude_number_argument=options.exclude_numbers, exclude_tags_argument=options.exclude_tags) + selected_tests = get_specific_tests(conformance_tests=self.tests, options=options) selected_tests_amt = len(selected_tests) * len(versions_to_test) * options.repeat successes = 0 skips = 0 @@ -638,13 +640,14 @@ def add_options(parser) -> None: parser.add_argument("--id", default=None, help="Specify WDL tests by ID.") parser.add_argument("--repeat", default=1, type=int, help="Specify how many times to run each test.") # This is to deal with jobstores being created in the /data/tmp directory on Phoenix, which appears to be unique - # per worker, thus causing JobstoreNotFound exceptions when delegating too many workers at a time + # per worker, thus causing JobstoreNotFound exceptions when running across multiple workers parser.add_argument("--jobstore-path", "-j", default=None, help="Specify the PARENT directory for the jobstores to " "be created in.") # Test responses are collected and sorted, so this option allows the script to print out the current progress parser.add_argument("--progress", default=False, action="store_true", help="Print the progress of the test suite " "as it runs.") - parser.add_argument("--debug", action="store_true", default=False) + parser.add_argument("--debug", action="store_true", default=False, help="Specifically to be used with Pycharm's debugger (or a pgdb based debugger)." 
+ "This makes everything run on the same thread.") def main(argv=None): diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 0969767..6c6b3be 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -3,6 +3,7 @@ """ Extract unit tests from the WDL spec (1.1+) and create a conformance file that is conformance to our representation for this test suite +A good portion of this code is taken and modified from https://github.com/openwdl/wdl-tests/blob/main/scripts/extract_tests.py """ import hashlib import subprocess @@ -32,55 +33,36 @@ FILENAME_RE = re.compile(r"(.+?)(_fail)?(_task)?.wdl") VERSION_RE = re.compile(r"version ([\d.]+)") -regex_output_section_str = r"(?<=output)(?:\s*{\s*)([\s\S]+?)(?:})" +# Regex for finding the WDL type of a variable by reading the WDL file per line +# For example, given the wdl workflow: +# output { +# File example1 = "example1.txt" +# File? example2 = "example2.txt" +# Array[File?] file_array = ["example1.txt", "example2.txt"] +# Int file_array_len = length(select_all(file_array)) +# } +# For each declaration, the regex will identify type File with variable name example1, etc regex_var_types_str = r"([\w\[\]+?]+)\s(\w+)(?:[\s\S]*?(?= =))" -def get_not_none(d: Optional[Dict[Any, Any]], k: str) -> Any: +def get_from(d: Optional[Dict[Any, Any]], k: str) -> Any: # same as d.get(k) but return None if d is None if d is not None: return d.get(k) return None -def index_not_none(l: Optional[List[Any]], i: int) -> Any: +def index_from(l: Optional[List[Any]], i: int) -> Any: if l is not None: return l[i] return None -def extract_output_types_defer(wdl_file): - """ - The main issue with using miniwdl's parser is parsing the wdl file is dependent on miniwdl, - so if the spec changes and miniwdl is unable to support it, this test framework won't be able to run. - The simplest way around this (that isn't creating another parser, maybe pyparsing?) is probably something like this; - regex won't work as regular languages are too basic. - Thus use a simple iterator to detect all variable names instead of miniwdl's parser - To be used if a wdl file has a type that miniwdl doesn't support, ex Object +def wdl_type_to_string(wdl_type: WDL.Type.Base) -> Union[Dict[str, Any], str]: """ - regex_var_types = re.compile(regex_var_types_str) - - with open(wdl_file, "r") as f: - wdl_contents = f.read() - - var_types = {} - # can't regex this as regex cannot handle nested constructs - # this is a little fragile if the entire declaration isn't on the same line - # find all var declarations - for line in wdl_contents.split("\n"): - line = line.strip() - if "=" in line: - match = re.search(regex_var_types, line) - if match is None: - # this line is not a var declaration - continue - name = match.group(2) - typ = match.group(1) - var_types[name] = typ - - return var_types + Convert a MiniWDL type object to its string representation. 
-
-def wdl_type_to_string(wdl_type: WDL.Type.Base) -> Union[Dict[str, Any], str]:
+
+    If a struct is provided, a dictionary will be returned with the name of the member and the associated type
+    """
     if isinstance(wdl_type, WDL.Type.StructInstance):
         # this is the only construct that we need to decipher to string form
         dictified = {}
@@ -93,6 +75,8 @@ def wdl_type_to_string(wdl_type: WDL.Type.Base) -> Union[Dict[str, Any], str]:
 def extract_output_types(wdl_file, fail):
     """
     Use miniwdl's parser to extract the output types from a wdl file
+
+    If that fails, it falls back to a simpler regex parser
     """
     if fail:
         # since we expect this workflow to fail, there should be no outputs
@@ -104,7 +88,8 @@ def extract_output_types(wdl_file, fail):
         if "Unknown type Object" in str(e):
             # one of the tests wants to test if Objects are supported, but miniwdl doesn't support objects
             # defer to the other more simpler parser
-            return extract_output_types_defer(wdl_file)
+            # The majority of the time (currently everything except one test), the miniwdl parser will be used
+            return extract_output_types_regex(wdl_file)
         else:
             raise RuntimeError("Likely unsupported type. Failed to parse WDL file %s!", wdl_file) from e
     except Exception as e:
@@ -128,29 +113,74 @@ def extract_output_types(wdl_file, fail):
     return var_types
 
+
+def extract_output_types_regex(wdl_file) -> Dict[str, str]:
+    """
+    Given a WDL file, extract all output variable names and their associated types with a regex. Returns a dictionary of variable name to type.
+    Ex:
+    File filename = "path/to/file"
+    Will get {"filename": "File"}
+
+    Ideally, the parsing should be handled by MiniWDL. This mainly gets around the issue of MiniWDL not supporting objects.
+    """
+    # The main issue with using miniwdl's parser is that parsing the wdl file is dependent on miniwdl,
+    # so if the spec changes and miniwdl is unable to support it, this test framework won't be able to run.
+    # The simplest way around this (that isn't creating another parser, maybe pyparsing?) is probably something like this;
+    # regex won't work as regular languages are too basic.
+ # + # Thus use a simple iterator to detect all variable names instead of miniwdl's parser + # To be used if a wdl file has a type that miniwdl doesn't support, ex Object + regex_var_types = re.compile(regex_var_types_str) + + with open(wdl_file, "r") as f: + wdl_contents = f.read() + + var_types = {} + # can't regex the entire wdl file in one as regex cannot handle nested constructs + # so iterate over each line instead + # this is a little fragile if the entire declaration isn't on the same line + # This also technically grabs the variable names/types for all declarations in the file rather than just the output + # todo: deal with when a task and a workflow have two variables (one in the task and one in the workflow) with the same variable name + # within the same workflow/task, this is not an issue as multiple declarations with the same variable name is not allowed + for line in wdl_contents.split("\n"): + line = line.strip() + if "=" in line: + match = re.search(regex_var_types, line) + if match is None: + # this line is not a var declaration + continue + name = match.group(2) + typ = match.group(1) + var_types[name] = typ + + return var_types + + def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: WDL.Type.Base, - data_dir: Optional[Path], metadata: Union[List[Any], Dict[str, Any]]): + data_dir: Optional[Path], extra_patch_data: Union[List[Any], Dict[str, Any]]): """ - Get the expected output values with respect to the output types. + Get the expected output object (in our conformance representation) with respect to the output types. Do so by searching + the extra yaml/configuration file for an associated key value pair. Can also test if the + wanted file exists recursively in data_dir. Descend recursively down the type + This is done to get the md5sum of files when converting unit tests. - Ex: If given hello.txt and type File, try to get the regex of the file - If given "hello" and type String, return string + Ex: If given hello.txt and type File, try to get the md5sum of the file. + If given "hello" and type String, don't do anything and just return the string value "hello" """ if output_values is None and output_type.optional: return output_values if isinstance(output_type, WDL.Type.File): # this will only populate md5sum if the files exist recursively in the data directory for path in glob.glob(str(data_dir / "**"), recursive=True): + if get_from(extra_patch_data, "md5sum") is not None: + return {'md5sum': extra_patch_data.get("md5sum")} + if get_from(extra_patch_data, "regex") is not None: + return {'regex': extra_patch_data.get("regex")} + # The above methods should be used, else this is a fallback. 
This isn't guaranteed to find the right file if multiple same filenames exist under data_dir if path.endswith(output_values) and os.path.exists(path): with open(path, "rb") as f: md5sum = hashlib.md5(f.read()).hexdigest() return {'md5sum': md5sum} - # else check if there is extra metadata to read from - if get_not_none(metadata, "md5sum") is not None: - return {'md5sum': metadata.get("md5sum")} - if get_not_none(metadata, "regex") is not None: - return {'regex': metadata.get("regex")} if isinstance(output_type, WDL.Type.StructInstance): converted_output = dict() @@ -158,40 +188,49 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], try: output_value_type = output_type.members[output_key] except KeyError: - # coercion from map to struct can cause some weird type behavior - # since dictionaries past python 3.6 are ordered, find the corresponding type from the current output's position + # coercion from map to struct can cause some weird type behavior when parsing with miniwdl + # in the map_to_struct.wdl test, when parsing the output types, the miniwdl parser outputs a struct with the right types + # but the wrong keys (right member types but wrong member names) + # So the key lookup will fail even though the corresponding types are actually there + # Since dictionaries past python 3.6 are ordered, find the corresponding type from the current output's position output_value_type = list(output_type.members.values())[i] - converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir, get_not_none(metadata, output_value)) + converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir, get_from(extra_patch_data, output_value)) return converted_output if isinstance(output_type, WDL.Type.Map): converted_output = dict() for output_key, output_value in output_values.items(): - new_output_key = convert_typed_output_values(output_key, output_type.item_type[0], data_dir, get_not_none(metadata, output_key)) - new_output_value = convert_typed_output_values(output_value, output_type.item_type[1], data_dir, get_not_none(metadata, output_value)) + new_output_key = convert_typed_output_values(output_key, output_type.item_type[0], data_dir, get_from(extra_patch_data, output_key)) + new_output_value = convert_typed_output_values(output_value, output_type.item_type[1], data_dir, get_from(extra_patch_data, output_value)) converted_output[new_output_key] = new_output_value if isinstance(output_type, WDL.Type.Pair): converted_output = dict() # key should be left or right - converted_output["left"] = convert_typed_output_values(output_values["left"], output_type.left_type, data_dir, get_not_none(metadata, output_values["left"])) - converted_output["right"] - convert_typed_output_values(output_values["right"], output_type.right_type, data_dir, get_not_none(metadata, output_values["right"])) + converted_output["left"] = convert_typed_output_values(output_values["left"], output_type.left_type, data_dir, get_from(extra_patch_data, output_values["left"])) + converted_output["right"] = convert_typed_output_values(output_values["right"], output_type.right_type, data_dir, get_from(extra_patch_data, output_values["right"])) return converted_output if isinstance(output_type, WDL.Type.Array): converted_output = list() for i, output in enumerate(output_values): - converted_output.append(convert_typed_output_values(output, output_type.item_type, data_dir, index_not_none(metadata, i))) + 
converted_output.append(convert_typed_output_values(output, output_type.item_type, data_dir, index_from(extra_patch_data, i))) return converted_output # else this is a primitive type return output_values def convert_typed_output_values_from_string(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: Union[str, Dict[str, Any]], - data_dir: Optional[Path], metadata: Optional[Dict[str, Any]]): + data_dir: Optional[Path], extra_patch_data: Optional[Dict[str, Any]]): output_type_wdl = convert_type(output_type) - return convert_typed_output_values(output_values, output_type_wdl, data_dir, metadata) + return convert_typed_output_values(output_values, output_type_wdl, data_dir, extra_patch_data) + +JSON_PARSEABLE = Union[Dict[Any, Any], List[Any], bool, int, str, float, None] +def recursive_json_apply(json_obj: JSON_PARSEABLE, func: Callable[[Any], Any]) \ + -> JSON_PARSEABLE: + """ + Transform every primitive type of a JSON object. -def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, str, float, None], func: Callable[[Any], Any]) \ - -> Union[Dict[Any, Any], List[Any], bool, int, str, float, None]: + Will recursively descend down until the "leaf" nodes and run a function on the value. + """ if isinstance(json_obj, dict): return {func(k): recursive_json_apply(v, func) for k, v in json_obj.items()} elif isinstance(json_obj, list): @@ -204,12 +243,13 @@ def recursive_json_apply(json_obj: Union[Dict[Any, Any], List[Any], bool, int, s def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_files: Optional[Set[str]], data_dir: Optional[Path], - output_data_dir: Optional[Path], config: list, metadata: Optional[Dict[str, Any]]) -> None: + output_data_dir: Optional[Path], config: list, extra_patch_data: Optional[Dict[str, Any]]) -> None: + # Modified from the WDL test extraction example """ - Given the regex, create the corresponding config entry for conformance.yaml. + Given the regex match object, create the corresponding config entry for conformance.yaml. Adds the config entry to the config argument (which is a list) Separated from write_test_file as miniwdl's parser requires all imported files to exist, - and this is not necessary the case if iterating the spec file top to bottom. And we + and this is not necessarily true if iterating the spec file top to bottom. 
And we can't use a simple regex as the type of grammar is too basic for things like nested constructs """ file_name, wdl, input_json, output_json, config_json = m.groups() @@ -229,10 +269,10 @@ def generate_config_file(m: re.Match, output_dir: Path, version: str, all_data_f config_entry["id"] = target - target_metadata: Optional[Dict[str, Any]] = None - for m in metadata: + target_data: Optional[Dict[str, Any]] = None + for m in extra_patch_data: if m.get("id") == target: - target_metadata = m + target_data = m break wdl_dir, wdl_base = os.path.split(wdl_file) @@ -261,7 +301,6 @@ def if_file_convert(maybe_file): config_entry["outputs"] = json.loads(output_json.strip()) else: config_entry["outputs"] = {} - output_var_types = extract_output_types(wdl_file, bool(is_fail)) if "return_code" not in config_entry: @@ -273,13 +312,13 @@ def if_file_convert(maybe_file): config_entry["outputs"] = {} else: if output_json is not None: - target_output_metadata = get_not_none(target_metadata, "outputs") + target_output_data = get_from(target_data, "outputs") for k, v in json.loads(output_json.strip()).items(): k_base = k.split(".")[-1] output_type = output_var_types[k_base] config_entry["outputs"][k] = { "type": output_type, - "value": convert_typed_output_values_from_string(v, output_type, data_dir, get_not_none(get_not_none(target_output_metadata, k), "value")) + "value": convert_typed_output_values_from_string(v, output_type, data_dir, get_from(get_from(target_output_data, k), "value")) } if "type" not in config_entry: @@ -296,9 +335,9 @@ def if_file_convert(maybe_file): config_entry["dependencies"] = [] elif isinstance(config_entry["dependencies"], str): config_entry["dependencies"] = [config_entry["dependencies"]] - # add metadata dependencies - if get_not_none(target_metadata, "dependencies") is not None: - config_entry["dependencies"].extend(target_metadata["dependencies"]) + # add extra_patch_data dependencies + if get_from(target_data, "dependencies") is not None: + config_entry["dependencies"].extend(target_data["dependencies"]) if "tags" not in config_entry: config_entry["tags"] = [] elif isinstance(config_entry["tags"], str): @@ -315,7 +354,7 @@ def if_file_convert(maybe_file): def write_test_files(m: re.Match, output_dir: Path, version: str): """ - Given the parsed regex, write the test file into the output directory. + Given the regex match, write the test file into the output directory. Checks if the version and the file to be written match. Also ensures that the file does not exist beforehand. 
""" @@ -341,7 +380,7 @@ def write_test_files(m: re.Match, output_dir: Path, version: str): o.write(wdl) -def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, version: str, output_type: str, extra_metadata: Optional[Path]): +def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, version: str, output_type: str, extra_patch_data_path: Optional[Path]): if not output_dir.exists(): output_dir.mkdir(parents=True) @@ -378,14 +417,14 @@ def extract_tests(spec: Path, data_dir: Optional[Path], output_dir: Path, versio else: output_data_dir = None - metadata = None - if extra_metadata is not None: - with open(extra_metadata, "r") as e: + extra_patch_data = None + if extra_patch_data_path is not None: + with open(extra_patch_data_path, "r") as e: yaml = YAML() - metadata = yaml.load(e) + extra_patch_data = yaml.load(e) for m in all_m: - generate_config_file(m, output_dir, version, all_data_files, data_dir, output_data_dir, config, metadata) + generate_config_file(m, output_dir, version, all_data_files, data_dir, output_data_dir, config, extra_patch_data) if output_type == "json": config_file = output_dir / "test_config.json" @@ -407,7 +446,7 @@ def main(argv=None): parser = argparse.ArgumentParser( usage="%(prog)s [options]", description=( - "Extracts the unit tests from the spec to be runnable by this test suite;" + "Extracts the unit tests from the spec to be runnable by this test suite (does not run them. See run_unit.py);" "test files will be outputted into the `unit_tests` directory and a new conformance file will be created." "To run the unit tests, see [insert other script here]" ) @@ -432,23 +471,23 @@ def main(argv=None): help="Default behavior does not pull the spec repo if it was already pulled. Specify this argument to force a refresh." ) parser.add_argument( - "--extra-metadata", "-e", - default="unit_tests_metadata.yaml", - help="Include extra metadata when extracting the unit tests into the config. " + "--extra-patch_data", "-e", + default="unit_tests_patch_data.yaml", + help="Include extra patch data when extracting the unit tests into the config. " "Since we have our own method of testing file outputs, this is mostly used for file hashes/regexes." ) parser.add_argument( "--script", # default="unit_tests_script.sh", default=None, - help="Bash script to run alongside. This is to set up the environment for the test suite, ex mount points and files tests may depend on." + help="Bash script to run after extracting the tests. Runs relative to this file. This is to set up the environment for the test suite, ex mount points and files tests may depend on." "(Probably need to run with root)." ) parser.add_argument( "--repo", # while the WDL spec has its bugs, use a fixed version # see openwdl issues #653, #654, #661, #662, #663, #664, #665, #666 - default="https://github.com/stxue1/wdl.git", + default="https://github.com/openwdl/wdl.git", help="Repository to pull from." 
) argcomplete.autocomplete(parser) @@ -457,14 +496,18 @@ def main(argv=None): spec_dir = f"wdl-{args.version}-spec" if not os.path.exists(spec_dir) or args.force_pull is True: print(f"Pulling SPEC from repo {args.repo}...") - cmd = f"git clone {args.repo}" + cmd = f"git clone {args.repo} {spec_dir}" subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) os.chdir(spec_dir) # may be fragile if WDL changes their branch naming scheme # todo: remove -fixes suffix after wdl fixes their spec, see comment above - cmd = f"git checkout wdl-{args.version}-fixes" + if args.version == "1.1": + repo_version = "1.1.3" + else: + repo_version = args.version + cmd = f"git checkout wdl-{repo_version}" subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) os.chdir("..") @@ -474,7 +517,7 @@ def main(argv=None): subprocess.run(cmd.split(), stderr=subprocess.PIPE, stdout=subprocess.PIPE) print("Extracting tests...") - extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type, args.extra_metadata) + extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type, args.extra_patch_data) if args.script is not None: # assume it needs to run as root diff --git a/unit_tests_metadata.yaml b/unit_tests_patch_data.yaml similarity index 100% rename from unit_tests_metadata.yaml rename to unit_tests_patch_data.yaml From db89c776dce49d9c806a0b4382fe2232865fa561 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 20 Aug 2024 10:32:28 -0700 Subject: [PATCH 17/34] Move script argument to run_unit --- run_unit.py | 15 +++++++++++++++ setup_unit_tests.py | 14 -------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/run_unit.py b/run_unit.py index 80d8724..9ef4cba 100644 --- a/run_unit.py +++ b/run_unit.py @@ -5,6 +5,7 @@ """ import argparse import os +import subprocess import sys import argcomplete @@ -25,6 +26,13 @@ def main(): parser.add_argument("--force-pull", default=False, action="store_true", help="Specify whether to use the cached SPEC or to force a pull." "The setup script will be run as well.") + parser.add_argument( + "--script", + # default="unit_tests_script.sh", + default=None, + help="Bash script to run after extracting the tests. Runs relative to this file. This is to set up the environment for the test suite," + "ex mount points and files tests may depend on. (Probably need to run with root)." + ) parser.set_defaults(versions="1.1") # override default to 1.1 parser.set_defaults(runner="toil-wdl-runner") # cromwell doesn't support 1.1+ @@ -42,6 +50,13 @@ def main(): if args.versions not in ["1.1", "1.2"]: raise RuntimeError(f"WDL version is not valid; unit tests are only supported on WDL 1.1+.") + if args.script is not None: + # assume it needs to run as root + if os.geteuid() == 0: + subprocess.run(["bash", "{args.script}"], shell=True) + else: + subprocess.run(["sudo", "bash", "{args.script}"], shell=True) + conformance_runner = WDLConformanceTestRunner(conformance_file=args.config) _, successful_run = conformance_runner.run_and_generate_tests(args) diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 6c6b3be..6d3c588 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -476,13 +476,6 @@ def main(argv=None): help="Include extra patch data when extracting the unit tests into the config. " "Since we have our own method of testing file outputs, this is mostly used for file hashes/regexes." 
     )
-    parser.add_argument(
-        "--script",
-        # default="unit_tests_script.sh",
-        default=None,
-        help="Bash script to run after extracting the tests. Runs relative to this file. This is to set up the environment for the test suite, ex mount points and files tests may depend on."
-             "(Probably need to run with root)."
-    )
     parser.add_argument(
         "--repo",
         # while the WDL spec has its bugs, use a fixed version
@@ -519,13 +512,6 @@ def main(argv=None):
     print("Extracting tests...")
     extract_tests(Path(spec_dir) / Path("SPEC.md"), Path(spec_dir) / Path("tests/data"), Path("unit_tests"), args.version, args.output_type, args.extra_patch_data)
 
-    if args.script is not None:
-        # assume it needs to run as root
-        if os.geteuid() == 0:
-            subprocess.run(["bash", "{args.script}"], shell=True)
-        else:
-            subprocess.run(["sudo", "bash", "{args.script}"], shell=True)
-
 
 if __name__ == "__main__":
     main()

From c89324e555421622c9265a21d2662a0427dba214 Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 14:36:02 -0700
Subject: [PATCH 18/34] Adjust comment on weird dependency patch behavior

---
 run.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/run.py b/run.py
index 769604e..37eab92 100755
--- a/run.py
+++ b/run.py
@@ -388,7 +388,9 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str
         pre_args = args["cromwell_pre_args"]
 
         # todo: it seems odd that I'm looking for a dependency before running when the test spec says test frameworks are supposed to be used to turn failing tests into warnings
-        # this is mainly because the docker and singularity strings are custom dependencies that aren't (explicitly) part of the WDL test spec
+        # this is mainly because the docker and singularity strings are custom dependency values that aren't (explicitly) part of the WDL test spec
+        # I use them to carry data through on how to run the runner rather than to turn failing tests into warnings
+        # (one of the tests depends on findmnt on /, which differs between singularity and docker)
         # they mainly tell how toil should run to pass the test
         if runner == "toil-wdl-runner" and "dependencies" in test:
             if "docker" in test["dependencies"]:
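The dependency-to-flag translation described in the comment above boils down to a few lines. A rough sketch, assuming a conformance-style `test` dict (the real logic lives inline in `run_single_test` in run.py):

```python
from typing import Any, Dict, List

def container_args(test: Dict[str, Any], runner: str) -> List[str]:
    # Translate the custom container "dependencies" into toil-wdl-runner flags.
    # These strings are not part of the WDL test spec; they only tell toil
    # which container engine the test expects.
    extra_args: List[str] = []
    if runner == "toil-wdl-runner" and "dependencies" in test:
        if "docker" in test["dependencies"]:
            extra_args.extend(["--container", "docker"])
        if "singularity" in test["dependencies"]:
            extra_args.extend(["--container", "singularity"])
    return extra_args

# e.g. container_args({"dependencies": ["singularity"]}, "toil-wdl-runner")
# -> ["--container", "singularity"]
```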

From d2cbc62bfb7f7cb11d2552c865521a52209b7ffd Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 14:39:17 -0700
Subject: [PATCH 19/34] Fix mismatched arguments for get_specific_tests

---
 create_graph.py | 5 +----
 run.py          | 3 +--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/create_graph.py b/create_graph.py
index 48cf639..73bf868 100755
--- a/create_graph.py
+++ b/create_graph.py
@@ -299,10 +299,7 @@ def create_graph(from_file: str, options: argparse.Namespace) -> None:
     with open(options.conformance_file, "r") as f:
         data = yaml.safe_load(f)
     # this also allows specifying which tests to graph by tag/id/numbers
-    all_test_idx_to_graph = get_specific_tests(conformance_tests=data, tag_argument=options.tags,
-                                               number_argument=options.numbers,
-                                               exclude_number_argument=options.exclude_numbers,
-                                               id_argument=options.id)
+    all_test_idx_to_graph = get_specific_tests(conformance_tests=data, options=options)
     all_test_ids_to_graph = list_of_idx_to_ids(data, all_test_idx_to_graph)
     num_unique_tests_to_display = len(all_test_ids_to_graph)
     iterations = num_unique_tests_to_display // number_of_entries_per_graph + (
diff --git a/run.py b/run.py
index 37eab92..cb72dd8 100755
--- a/run.py
+++ b/run.py
@@ -465,8 +465,7 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio
 
     def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> None:
         # To be used with pycharm's debugger which is currently broken if there is concurrency
         versions_to_test = set(options.versions.split(','))
-        selected_tests = get_specific_tests(conformance_tests=self.tests, tag_argument=options.tags, number_argument=options.numbers,
-                                            id_argument=options.id, exclude_number_argument=options.exclude_numbers, exclude_tags_argument=options.exclude_tags)
+        selected_tests = get_specific_tests(conformance_tests=self.tests, options=options)
         print(f"===DEBUG===")
         print(f'Testing runner {options.runner} on WDL versions: {",".join(versions_to_test)}\n')
         for test_index in selected_tests:

From 96360d6c9076efc8fa161fa31c510b213b1b7f5b Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 14:53:08 -0700
Subject: [PATCH 20/34] Better docstring for object conversion in setup

---
 setup_unit_tests.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index 6d3c588..35dd409 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -156,16 +156,20 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], List[Any]], output_type: WDL.Type.Base,
-                                data_dir: Optional[Path], extra_patch_data: Union[List[Any], Dict[str, Any]]):
+                                data_dir: Optional[Path], extra_patch_data: Union[List[Any], Dict[str, Any]]) -> Optional[Union[str, Dict[str, Any], List[Any]]]:
     """
-    Get the expected output object (in our conformance representation) with respect to the output types. Do so by searching
-    the extra yaml/configuration file for an associated key value pair. Can also test if the
-    wanted file exists recursively in data_dir. Descend recursively down the type
+    Get the expected output object (in our conformance representation) with respect to the output types. Convert/add extra fields when needed
+    to matched our conformance test suite.
+    Do so by searching the extra yaml/configuration file for an associated key value pair. Can also test if the
+    wanted file exists recursively in data_dir. Descends recursively down the type alongside the object.
 
-    This is done to get the md5sum of files when converting unit tests.
+    This is done to get and add the md5sum of files when converting unit tests.
 
-    Ex: If given hello.txt and type File, try to get the md5sum of the file.
+    Ex: If given hello.txt and type File, try to get the md5sum of the file. Then returns the md5sum as {"md5sum": md5sum}
     If given "hello" and type String, don't do anything and just return the string value "hello"
+
+    Returns a json/yaml parseable object with the expected values represented as a string or a compound type with leafs of strings.
+
     """
     if output_values is None and output_type.optional:
         return output_values

From 80c8d2bd04780641618893f12cfefd514c806f7c Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 15:04:03 -0700
Subject: [PATCH 21/34] Add github issue in comment

---
 setup_unit_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index 35dd409..ac72520 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -196,6 +196,7 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any],
             # in the map_to_struct.wdl test, when parsing the output types, the miniwdl parser outputs a struct with the right types
             # but the wrong keys (right member types but wrong member names)
             # So the key lookup will fail even though the corresponding types are actually there
+            # See https://github.com/chanzuckerberg/miniwdl/issues/712
             # Since dictionaries past python 3.6 are ordered, find the corresponding type from the current output's position
             output_value_type = list(output_type.members.values())[i]
         converted_output[output_key] = convert_typed_output_values(output_value, output_value_type, data_dir, get_from(extra_patch_data, output_value))

From ceec5ebabd471dbba309cb9dc569f5045d18fc3c Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 15:14:37 -0700
Subject: [PATCH 22/34] fix inconsistent underscore

---
 setup_unit_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index ac72520..569b833 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -476,7 +476,7 @@ def main(argv=None):
         help="Default behavior does not pull the spec repo if it was already pulled. Specify this argument to force a refresh."
     )
     parser.add_argument(
-        "--extra-patch_data", "-e",
+        "--extra-patch-data", "-e",
        default="unit_tests_patch_data.yaml",
        help="Include extra patch data when extracting the unit tests into the config. "
             "Since we have our own method of testing file outputs, this is mostly used for file hashes/regexes."
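For reference, the extra patch data consulted here is a list of per-test entries matched by `id`. The exact schema is whatever unit_tests_patch_data.yaml defines; the entry below is only a hypothetical sketch of the shape the `get_from` lookups assume:

```python
# Hypothetical patch-data entry (field names inferred from the lookups:
# get_from(target_data, "outputs"), get_from(..., "value"), "md5sum"/"regex",
# and get_from(target_data, "dependencies")).
extra_patch_data = [
    {
        "id": "example_file_test",   # matched against the test's id
        "dependencies": ["docker"],  # appended to the test's dependency list
        "outputs": {
            "example_file_test.out": {
                # md5 of an empty file, as a placeholder digest
                "value": {"md5sum": "d41d8cd98f00b204e9800998ecf8427e"}
            }
        },
    },
]

# The config generator picks the entry whose id matches the current test:
target_data = next((m for m in extra_patch_data if m.get("id") == "example_file_test"), None)
```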

From 59730193e62f7679894e7849ba29ad356a9ab6aa Mon Sep 17 00:00:00 2001
From: stxue1 <122345910+stxue1@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:16:50 -0700
Subject: [PATCH 23/34] Update README_UNIT.md

Co-authored-by: Adam Novak
---
 README_UNIT.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README_UNIT.md b/README_UNIT.md
index 44efa9d..efe97b1 100644
--- a/README_UNIT.md
+++ b/README_UNIT.md
@@ -43,7 +43,7 @@ Before running the test suite, the shell script `unit_tests_script.sh` may need
 To remove these directories and anything else that the script may create, call `make clean-unit-setup`.
 
 ### Running Tests
-THen, use the script `run_unit.py` to actually run the tests according to the configuration file. If the default setup was performed,
+Then, use the script `run_unit.py` to actually run the tests according to the configuration file. If the default setup was performed,
 the script can be used as if using `run.py`.
 
 For example:
 `python run_unit.py --runner toil-wdl-runner --version 1.1 -n 105-109 --quiet --threads=4 --progress`

From 1a462497b741d329e10667da0c9bea6442474a6b Mon Sep 17 00:00:00 2001
From: stxue1 <122345910+stxue1@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:17:12 -0700
Subject: [PATCH 24/34] Update lib.py

Co-authored-by: Adam Novak
---
 lib.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib.py b/lib.py
index 4837d86..beff16d 100644
--- a/lib.py
+++ b/lib.py
@@ -344,6 +344,11 @@ def get_specific_tests(conformance_tests, options: Namespace):
 
 
 def verify_return_code(expected_ret_code: Union[int, List[int], str], got_ret_code: int):
+    """
+    Return a test result dict that SUCCEEDED if the return code is on the list, and FAILED otherwise.
+
+    A "*" string matches any return code.
+    """
     if not isinstance(expected_ret_code, list):
         expected_ret_code = [expected_ret_code]
     success = {'status': 'SUCCEEDED'}

From ebd091476f09aa0b12098e42a80b354e76756bea Mon Sep 17 00:00:00 2001
From: stxue1
Date: Wed, 21 Aug 2024 15:20:21 -0700
Subject: [PATCH 25/34] Add docstring to WDL runner format_command

---
 run.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/run.py b/run.py
index cb72dd8..51170b5 100755
--- a/run.py
+++ b/run.py
@@ -49,6 +49,11 @@ def __init__(self, runner):
 
     def format_command(self, wdl_file: str, json_input: Union[str, Dict[str, Any]], results_file: str, args: List[str], verbose: bool,
                        pre_args: Optional[List[str]] = None) -> List[str]:
+        """
+        Make a list of command parts to invoke a WDL runner that uses Cromwell-style arguments.
+
+        :param json_input: Can either be a JSON string or a dictionary that is JSON serializable
+        """
         if isinstance(json_input, dict):
             json_input = json.dumps(json_input)
         json_arg = ["-i", json_input] if json_input is not None else []

From 16fb4957d28f548528771bcafbf6612691225ecf Mon Sep 17 00:00:00 2001
From: stxue1 <122345910+stxue1@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:24:22 -0700
Subject: [PATCH 26/34] Update setup_unit_tests.py

Co-authored-by: Adam Novak
---
 setup_unit_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup_unit_tests.py b/setup_unit_tests.py
index 569b833..89b00f3 100644
--- a/setup_unit_tests.py
+++ b/setup_unit_tests.py
@@ -26,6 +26,7 @@
 
 import WDL
 
+# Use the same parsing regex as https://github.com/openwdl/wdl-tests/blob/58ff36209586ed69c9a64d3e0b151a343f12a4eb/scripts/extract_tests.py#L10-L13
 TEST_RE = re.compile(
     r"^
<details>\s*<summary>\s*Example: (.+?)\s*```wdl(.+?)```\s*</summary>\s*(?:<p>\s*(?:Example input:\s*```json(.*?)```)?\s*(?:Example output:\s*```json(.*?)```)?\s*(?:Test config:\s*```json(.*)```)?\s*</p>\s*)?</details>
$", re.I | re.S, From 2f2fa81f119f1924cdffe7395b0551f27dcfbee7 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 21 Aug 2024 15:26:39 -0700 Subject: [PATCH 27/34] Add regex to conversion output function too --- setup_unit_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 89b00f3..03222c7 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -166,7 +166,7 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], This is done to get and add the md5sum of files when converting unit tests. - Ex: If given hello.txt and type File, try to get the md5sum of the file. Then returns the md5sum as {"md5sum": md5sum} + Ex: If given hello.txt and type File, try to get the md5sum of the file. Then returns the md5sum as {"md5sum": md5sum} (If regex is in the patch file, can use that too) If given "hello" and type String, don't do anything and just return the string value "hello" Returns a json/yaml parseable object with the expected values represented as a string or a compound type with leafs of strings. From 58ce9b6fa7c8e46588d3da00389770ae99894933 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 21 Aug 2024 15:30:20 -0700 Subject: [PATCH 28/34] Get rid of script setup --- Makefile | 4 ---- run_unit.py | 15 --------------- unit_tests_script.sh | 4 ---- 3 files changed, 23 deletions(-) delete mode 100755 unit_tests_script.sh diff --git a/Makefile b/Makefile index e7395fc..ca16c9a 100644 --- a/Makefile +++ b/Makefile @@ -32,10 +32,6 @@ clean-unit: clean-unit-setup rm -rf wdl-1.1-spec rm -rf unit_tests -clean-unit-setup: - rm -rf /mnt/outputs - rm -rf /mnt/tmp - cromwell: mkdir -p build if [ ! -f build/cromwell.jar ]; then \ diff --git a/run_unit.py b/run_unit.py index 9ef4cba..1883a5c 100644 --- a/run_unit.py +++ b/run_unit.py @@ -26,13 +26,6 @@ def main(): parser.add_argument("--force-pull", default=False, action="store_true", help="Specify whether to use the cached SPEC or to force a pull." "The setup script will be run as well.") - parser.add_argument( - "--script", - # default="unit_tests_script.sh", - default=None, - help="Bash script to run after extracting the tests. Runs relative to this file. This is to set up the environment for the test suite," - "ex mount points and files tests may depend on. (Probably need to run with root)." 
- ) parser.set_defaults(versions="1.1") # override default to 1.1 parser.set_defaults(runner="toil-wdl-runner") # cromwell doesn't support 1.1+ @@ -49,14 +42,6 @@ def main(): if args.versions not in ["1.1", "1.2"]: raise RuntimeError(f"WDL version is not valid; unit tests are only supported on WDL 1.1+.") - - if args.script is not None: - # assume it needs to run as root - if os.geteuid() == 0: - subprocess.run(["bash", "{args.script}"], shell=True) - else: - subprocess.run(["sudo", "bash", "{args.script}"], shell=True) - conformance_runner = WDLConformanceTestRunner(conformance_file=args.config) _, successful_run = conformance_runner.run_and_generate_tests(args) diff --git a/unit_tests_script.sh b/unit_tests_script.sh deleted file mode 100755 index af43cb8..0000000 --- a/unit_tests_script.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -# need to run with root -mkdir /mnt/outputs/ -mkdir /mnt/tmp/ \ No newline at end of file From 0a702c2d23dbbb4389f410cddad3b9159f91ea22 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 21 Aug 2024 15:31:49 -0700 Subject: [PATCH 29/34] Added comment for type checking runner output and expected output --- run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/run.py b/run.py index 51170b5..3554686 100755 --- a/run.py +++ b/run.py @@ -192,12 +192,15 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): # Or when Map[Float, Int], the output will be {"1.000000": 1} # This only applies to Int and Float types, as Boolean key types don't seem to be supported in miniwdl if isinstance(typ, WDLInt): + # Ensure the actual result is parseable as the correct type try: # ensure the stringified version of the runner output is equivalent to an int expected_type(result) except ValueError: # the string representation does not represent the right type return {'status': 'FAILED', 'reason': f"Runner output {result} is not type Int from the conformance file."} + # For good measure, ensure the expected result is as well. + # TODO: If a test fails because of this it is really a bug in the test definition. (also mirrored below) try: # ensure the stringified version of the conformance output is equivalent to an int expected_type(expected) @@ -205,12 +208,14 @@ def compare_outputs(self, expected: Any, result: Any, typ: WDLBase): return {'status': 'FAILED', 'reason': f"Conformance output and type does not match. Expected output {expected} with expected type Int"} elif isinstance(typ, WDLFloat): + # Ensure the actual result is parseable as the correct type try: # ensure the stringified version of the runner output is equivalent to an float expected_type(result) except ValueError: # the string representation does not represent the right type return {'status': 'FAILED', 'reason': f"Runner output {result} is not type Float from the conformance file."} + # For good measure, ensure the expected result is as well. 
try: # ensure the stringified version of the conformance output is equivalent to an float expected_type(expected) From 02bfe53393524187df6ee569bca43931a96db3a0 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 21 Aug 2024 15:33:53 -0700 Subject: [PATCH 30/34] Removed deleted makefile target that was still a prereq --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca16c9a..0f0a9c7 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ clean: clean-csv: rm csv_output_* -clean-unit: clean-unit-setup +clean-unit: rm -rf wdl-1.1-spec rm -rf unit_tests From cbf9aa20ae1ec339f39c64e7432e39d3003d88be Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 22 Aug 2024 01:10:50 -0700 Subject: [PATCH 31/34] Fix mismatched arguments --- run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index ebaa3ba..306b5de 100755 --- a/run.py +++ b/run.py @@ -407,7 +407,8 @@ def run_single_test(self, test_index: int, test: dict, runner: str, version: str test_args.extend(["--container", "docker"]) if "singularity" in test["dependencies"]: test_args.extend(["--container", "singularity"]) - cmd = wdl_runner.format_command(wdl_file, json_file, json_string, results_file, test_args, verbose, pre_args) + json_input = json_string or json_file + cmd = wdl_runner.format_command(wdl_file, json_input, results_file, test_args, verbose, pre_args) realtime = None if time: From 8364c4a1942786f4a919651d8f305dbbc225c04a Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 22 Aug 2024 01:19:13 -0700 Subject: [PATCH 32/34] Fix missing passthroughed response causing key errors --- lib.py | 24 +++++++++++++----------- run.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/lib.py b/lib.py index beff16d..9a009fe 100644 --- a/lib.py +++ b/lib.py @@ -568,12 +568,14 @@ def test_gpu_available(): IGNORE_DEPENDENCIES = ["docker", "root", "singularity"] -def test_dependencies(dependencies: Optional[List[str]]) -> Dict[str, Any]: +def test_dependencies(dependencies: Optional[List[str]], current_result: Dict[str, Any]) -> Dict[str, Any]: """ Given a set of dependencies for a test, see if any of those dependencies are violated. If so, change a failing test to a warning and update the reason. The list of dependencies are at https://github.com/openwdl/wdl-tests/blob/main/docs/Specification.md#test-configuration + + Pass in the dictionary of the current processed result of the test run. """ # todo: maybe there is a better way to deal with dependencies response = {} @@ -581,41 +583,41 @@ def test_dependencies(dependencies: Optional[List[str]]) -> Dict[str, Any]: return response for d in dependencies: if d == "gpu": - if not test_gpu_available() and response['status'] == 'FAILED': + if not test_gpu_available() and current_result['status'] == 'FAILED': response["status"] = "WARNING" response["reason"] = (f"Some GPU dependency is necessary but is not available on machine. " f"Either run the test on a GPU supported system or set 'WDL_CONFORMANCE_TESTS__GPU=True' to override. 
" - f"\nFailing reason:\n") + response["reason"] + f"\nFailing reason:\n") + current_result["reason"] elif d == "disks": # todo: I'm not really sure how to test for a disk specific error # i could try to check if all mount disks are available but that requires the mount points to be given in advance # toil will return NotImplementedError for a missing mount point, so just catch that as miniwdl does not support this functionality yet # and cromwell is stuck at wdl 1.0 - if response['status'] == 'FAILED' and "NotImplementedError" in response['stderr']: + if current_result['status'] == 'FAILED' and "NotImplementedError" in current_result['stderr']: response["status"] = "WARNING" response["reason"] = (f"Some disk dependency is necessary but is not available on machine." f"Check that all the mount points specified in the WDL tasks exist." - f"\nFailing reason:\n") + response["reason"] + f"\nFailing reason:\n") + current_result["reason"] elif d == "cpu": # todo: figure out better way to detect cpu errors too - if response['status'] == 'FAILED': + if current_result['status'] == 'FAILED': # miniwdl adjusts cpus to host limit so itll never error # so just deal with toil to_match = re.compile(r"is requesting [0-9.]+ cores, more than the maximum of [0-9.]+ cores") - if re.search(to_match, response['stderr']): + if re.search(to_match, current_result['stderr']): response["status"] = "WARNING" response["reason"] = (f"Some CPU dependency is necessary but is not available on machine." f"A WDL task likely requested more CPUs than available." - f"\nFailing reason:\n") + response["reason"] + f"\nFailing reason:\n") + current_result["reason"] elif d == "memory": # todo: better, same as cpu above - if response['status'] == 'FAILED': + if current_result['status'] == 'FAILED': to_match = re.compile(r"is requesting [0-9.]+ bytes of memory, more than the maximum of [0-9.]+ bytes of memory") - if re.search(to_match, response['stderr']): + if re.search(to_match, current_result['stderr']): response["status"] = "WARNING" response["reason"] = (f"Some memory dependency is necessary but is not available on machine." f"A WDL task likely requested more memory than available." - f"\nFailing reason:\n") + response["reason"] + f"\nFailing reason:\n") + current_result["reason"] elif d not in IGNORE_DEPENDENCIES: print(f"Warning: Test framework encountered unsupported dependency {d}. 
Ignoring...") return response diff --git a/run.py b/run.py index 306b5de..f2729f5 100755 --- a/run.py +++ b/run.py @@ -470,7 +470,7 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio if repeat is not None: response["repeat"] = repeat # Turn failing tests to warnings if they violate a test dependency - response.update(test_dependencies(dependencies=test.get("dependencies"))) + response.update(test_dependencies(dependencies=test.get("dependencies"), current_result=response)) return response def _run_debug(self, options: argparse.Namespace, args: Optional[Dict[str, Any]]) -> None: From 68322c010b9038df99ef15b9cfee2976aad62d4a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 12 Sep 2024 11:37:53 -0400 Subject: [PATCH 33/34] Revise dependency comment --- run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index f2729f5..c084739 100755 --- a/run.py +++ b/run.py @@ -469,7 +469,8 @@ def handle_test(self, test_index: int, test: Dict[str, Any], runner: str, versio self.run_single_test(test_index, test, runner, version, time, verbose, quiet, args, jobstore_path, debug)) if repeat is not None: response["repeat"] = repeat - # Turn failing tests to warnings if they violate a test dependency + # Turn failing tests to warnings if any of the tests' dependencies were not + # actually available. response.update(test_dependencies(dependencies=test.get("dependencies"), current_result=response)) return response From 136cf6589de10cd5a8cb24902897115c1ecf1c8f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 12 Sep 2024 11:38:02 -0400 Subject: [PATCH 34/34] Fix spelling --- setup_unit_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup_unit_tests.py b/setup_unit_tests.py index 03222c7..6c548ce 100644 --- a/setup_unit_tests.py +++ b/setup_unit_tests.py @@ -160,7 +160,7 @@ def convert_typed_output_values(output_values: Union[None, str, Dict[str, Any], data_dir: Optional[Path], extra_patch_data: Union[List[Any], Dict[str, Any]]) -> Optional[Union[str, Dict[str, Any], List[Any]]]: """ Get the expected output object (in our conformance representation) with respect to the output types. Convert/add extra fields when needed - to matched our conformance test suite. + to match our conformance test suite. Do so by searching the extra yaml/configuration file for an associated key value pair. Can also test if the wanted file exists recursively in data_dir. Descends recursively down the type alongside the object.
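
As an illustration of the conversion that docstring describes: for a plain `File` output, the expected value is reduced to an md5sum entry rather than a raw path. A minimal, hypothetical sketch (the real `convert_typed_output_values` also consults the patch data, searches `data_dir`, and recurses through compound types):

```python
import hashlib
from pathlib import Path

def file_output_to_conformance(path: str) -> dict:
    # Reduce a File output to our conformance representation:
    # hash the file's bytes and record the digest instead of the path.
    digest = hashlib.md5(Path(path).read_bytes()).hexdigest()
    return {"md5sum": digest}

# A file containing exactly "hello" would yield:
# {"md5sum": "5d41402abc4b2a76b9719d911017c592"}
```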